From 415546a0f23e30a4f8cfc94e5079d29b04284032 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 25 Feb 2026 18:23:43 -0500 Subject: [PATCH 01/17] Replace Playwright with Kernel native API in OpenAI CUA templates Both TypeScript and Python OpenAI CUA templates now use Kernel's native computer control API (screenshot, click, type, scroll, batch, etc.) instead of Playwright over CDP. This enables the batch_computer_actions tool which executes multiple actions in a single API call for lower latency. Key changes: - New KernelComputer class wrapping Kernel SDK for all computer actions - Added batch_computer_actions function tool with system instructions - Navigation (goto/back/forward) via Kernel's playwright.execute endpoint - Local test scripts create remote Kernel browsers without app deployment - Removed playwright-core, sharp (TS) and playwright (Python) dependencies - Bumped @onkernel/sdk to ^0.38.0 and kernel to >=0.38.0 Made-with: Cursor --- .../python/openai-computer-use/.env.example | 3 +- .../python/openai-computer-use/README.md | 23 +- .../python/openai-computer-use/agent/agent.py | 122 ++++- .../openai-computer-use/computers/__init__.py | 8 +- .../openai-computer-use/computers/computer.py | 14 +- .../openai-computer-use/computers/config.py | 6 +- .../computers/default/__init__.py | 2 - .../computers/default/kernel.py | 48 -- .../computers/default/local_playwright.py | 54 --- .../computers/kernel_computer.py | 178 ++++++++ .../computers/shared/__init__.py | 0 .../computers/shared/base_playwright.py | 154 ------- .../python/openai-computer-use/main.py | 92 ++-- .../python/openai-computer-use/pyproject.toml | 5 +- .../python/openai-computer-use/test_local.py | 70 +++ .../python/openai-computer-use/utils.py | 29 +- .../python/openai-computer-use/uv.lock | 191 +------- .../openai-computer-use/.env.example | 3 +- .../typescript/openai-computer-use/README.md | 25 +- .../typescript/openai-computer-use/index.ts | 20 +- .../openai-computer-use/lib/agent.ts | 215 
+++++---- .../openai-computer-use/lib/computers.ts | 28 -- .../lib/kernel-computer.ts | 243 ++++++++++ .../lib/playwright/base.ts | 242 ---------- .../lib/playwright/kernel.ts | 43 -- .../lib/playwright/local.ts | 43 -- .../openai-computer-use/lib/toolset.ts | 64 ++- .../openai-computer-use/lib/utils.ts | 17 +- .../openai-computer-use/package.json | 10 +- .../openai-computer-use/pnpm-lock.yaml | 421 +++++++++--------- .../openai-computer-use/test.local.ts | 100 +++-- 31 files changed, 1190 insertions(+), 1283 deletions(-) delete mode 100644 pkg/templates/python/openai-computer-use/computers/default/__init__.py delete mode 100644 pkg/templates/python/openai-computer-use/computers/default/kernel.py delete mode 100644 pkg/templates/python/openai-computer-use/computers/default/local_playwright.py create mode 100644 pkg/templates/python/openai-computer-use/computers/kernel_computer.py delete mode 100644 pkg/templates/python/openai-computer-use/computers/shared/__init__.py delete mode 100644 pkg/templates/python/openai-computer-use/computers/shared/base_playwright.py create mode 100644 pkg/templates/python/openai-computer-use/test_local.py delete mode 100644 pkg/templates/typescript/openai-computer-use/lib/computers.ts create mode 100644 pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts delete mode 100644 pkg/templates/typescript/openai-computer-use/lib/playwright/base.ts delete mode 100644 pkg/templates/typescript/openai-computer-use/lib/playwright/kernel.ts delete mode 100644 pkg/templates/typescript/openai-computer-use/lib/playwright/local.ts diff --git a/pkg/templates/python/openai-computer-use/.env.example b/pkg/templates/python/openai-computer-use/.env.example index b74e0a29..3ff84207 100644 --- a/pkg/templates/python/openai-computer-use/.env.example +++ b/pkg/templates/python/openai-computer-use/.env.example @@ -1,2 +1,3 @@ -# Copy this file to .env and fill in your API key +# Copy this file to .env and fill in your API keys 
OPENAI_API_KEY=your_openai_api_key_here +KERNEL_API_KEY=your_kernel_api_key_here diff --git a/pkg/templates/python/openai-computer-use/README.md b/pkg/templates/python/openai-computer-use/README.md index e45b15d4..f0227f8f 100644 --- a/pkg/templates/python/openai-computer-use/README.md +++ b/pkg/templates/python/openai-computer-use/README.md @@ -1,7 +1,24 @@ # Kernel Python Sample App - OpenAI Computer Use -This is a Kernel application that demonstrates using the Computer Use Agent (CUA) from OpenAI. +This is a Kernel application that demonstrates using the Computer Use Agent (CUA) from OpenAI with Kernel's native browser control API. -It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation. +It uses Kernel's computer control endpoints (screenshot, click, type, scroll, batch, etc.) instead of Playwright, and includes a `batch_computer_actions` tool that executes multiple actions in a single API call for lower latency. -See the [docs](https://www.kernel.sh/docs/quickstart) for more information. \ No newline at end of file +## Local testing + +You can test against a remote Kernel browser without deploying: + +```bash +cp .env.example .env +# Fill in OPENAI_API_KEY and KERNEL_API_KEY in .env +uv run test_local.py +``` + +## Deploy to Kernel + +```bash +kernel deploy main.py --env-file .env +kernel invoke python-openai-cua cua-task -p '{"task":"go to https://news.ycombinator.com and list top 5 articles"}' +``` + +See the [docs](https://www.kernel.sh/docs/quickstart) for more information. 
diff --git a/pkg/templates/python/openai-computer-use/agent/agent.py b/pkg/templates/python/openai-computer-use/agent/agent.py index d7f4267f..4a6dc5d1 100644 --- a/pkg/templates/python/openai-computer-use/agent/agent.py +++ b/pkg/templates/python/openai-computer-use/agent/agent.py @@ -1,4 +1,6 @@ -from computers import Computer +import json +from typing import Callable +from computers.kernel_computer import KernelComputer from utils import ( create_response, show_image, @@ -6,27 +8,75 @@ sanitize_message, check_blocklisted_url, ) -import json -from typing import Callable +BATCH_FUNC_NAME = "batch_computer_actions" -class Agent: - """ - A sample agent class that can be used to interact with a computer. +BATCH_INSTRUCTIONS = """You have two ways to perform actions: +1. The standard computer tool — use for single actions when you need screenshot feedback after each step. +2. batch_computer_actions — use to execute multiple actions at once when you can predict the outcome. + +ALWAYS prefer batch_computer_actions when performing predictable sequences like: +- Clicking a text field, typing text, and pressing Enter +- Typing a URL and pressing Enter +- Any sequence where you don't need to see intermediate results""" + +BATCH_TOOL = { + "type": "function", + "name": BATCH_FUNC_NAME, + "description": ( + "Execute multiple computer actions in sequence without waiting for " + "screenshots between them. Use this when you can predict the outcome of a " + "sequence of actions without needing intermediate visual feedback. 
After all " + "actions execute, a single screenshot is taken and returned.\n\n" + "PREFER this over individual computer actions when:\n" + "- Typing text followed by pressing Enter\n" + "- Clicking a field and then typing into it\n" + "- Any sequence where intermediate screenshots are not needed" + ), + "parameters": { + "type": "object", + "properties": { + "actions": { + "type": "array", + "description": "Ordered list of actions to execute", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["click", "double_click", "type", "keypress", "scroll", "move", "drag", "wait"], + }, + "x": {"type": "number"}, + "y": {"type": "number"}, + "text": {"type": "string"}, + "keys": {"type": "array", "items": {"type": "string"}}, + "button": {"type": "string"}, + "scroll_x": {"type": "number"}, + "scroll_y": {"type": "number"}, + }, + "required": ["type"], + }, + }, + }, + "required": ["actions"], + }, + "strict": False, +} - (See simple_cua_loop.py for a simple example without an agent.) 
- """ + +class Agent: + """An agent that uses OpenAI CUA with Kernel's native computer control API.""" def __init__( self, model="computer-use-preview", - computer: Computer = None, + computer: KernelComputer = None, tools: list[dict] = [], acknowledge_safety_check_callback: Callable = lambda message: False, ): self.model = model self.computer = computer - self.tools = tools + self.tools = list(tools) self.print_steps = True self.debug = False self.show_images = False @@ -41,6 +91,7 @@ def __init__( "display_height": dimensions[1], "environment": computer.get_environment(), }, + BATCH_TOOL, { "type": "function", "name": "back", @@ -75,6 +126,28 @@ def debug_print(self, *args): if self.debug: pp(*args) + def _execute_computer_action(self, action_type, action_args): + if action_type == "click": + self.computer.click(**action_args) + elif action_type == "double_click": + self.computer.double_click(**action_args) + elif action_type == "type": + self.computer.type(**action_args) + elif action_type == "keypress": + self.computer.keypress(**action_args) + elif action_type == "scroll": + self.computer.scroll(**action_args) + elif action_type == "move": + self.computer.move(**action_args) + elif action_type == "drag": + self.computer.drag(**action_args) + elif action_type == "wait": + self.computer.wait(**action_args) + elif action_type == "screenshot": + pass + else: + print(f"Warning: unknown action type: {action_type}") + def handle_item(self, item): """Handle each item; may cause a computer action + screenshot.""" if item["type"] == "message": @@ -86,14 +159,17 @@ def handle_item(self, item): if self.print_steps: print(f"{name}({args})") - if hasattr(self.computer, name): # if function exists on computer, call it + if name == BATCH_FUNC_NAME: + return self._handle_batch_call(item["call_id"], args) + + if hasattr(self.computer, name): method = getattr(self.computer, name) method(**args) return [ { "type": "function_call_output", "call_id": item["call_id"], - "output": 
"success", # hard-coded output for demo + "output": "success", } ] @@ -104,14 +180,12 @@ def handle_item(self, item): if self.print_steps: print(f"{action_type}({action_args})") - method = getattr(self.computer, action_type) - method(**action_args) + self._execute_computer_action(action_type, action_args) screenshot_base64 = self.computer.screenshot() if self.show_images: show_image(screenshot_base64) - # if user doesn't ack all safety checks exit with error pending_checks = item.get("pending_safety_checks", []) for check in pending_checks: message = check["message"] @@ -130,7 +204,6 @@ def handle_item(self, item): }, } - # additional URL safety checks for browser environments if self.computer.get_environment() == "browser": current_url = self.computer.get_current_url() check_blocklisted_url(current_url) @@ -139,6 +212,21 @@ def handle_item(self, item): return [call_output] return [] + def _handle_batch_call(self, call_id, args): + actions = args.get("actions", []) + self.computer.batch_actions(actions) + screenshot_base64 = self.computer.screenshot() + return [ + { + "type": "function_call_output", + "call_id": call_id, + "output": json.dumps([ + {"type": "text", "text": "Actions executed successfully."}, + {"type": "image_url", "image_url": f"data:image/png;base64,{screenshot_base64}"}, + ]), + } + ] + def run_full_turn( self, input_items, print_steps=True, debug=False, show_images=False ): @@ -147,7 +235,6 @@ def run_full_turn( self.show_images = show_images new_items = [] - # keep looping until we get a final response while new_items[-1].get("role") != "assistant" if new_items else True: self.debug_print([sanitize_message(msg) for msg in input_items + new_items]) @@ -156,6 +243,7 @@ def run_full_turn( input=input_items + new_items, tools=self.tools, truncation="auto", + instructions=BATCH_INSTRUCTIONS, ) self.debug_print(response) diff --git a/pkg/templates/python/openai-computer-use/computers/__init__.py 
b/pkg/templates/python/openai-computer-use/computers/__init__.py index 0e8c132d..843071d0 100644 --- a/pkg/templates/python/openai-computer-use/computers/__init__.py +++ b/pkg/templates/python/openai-computer-use/computers/__init__.py @@ -1,11 +1,7 @@ -from . import default -from . import contrib +from .kernel_computer import KernelComputer from .computer import Computer -from .config import computers_config __all__ = [ - "default", - "contrib", + "KernelComputer", "Computer", - "computers_config", ] diff --git a/pkg/templates/python/openai-computer-use/computers/computer.py b/pkg/templates/python/openai-computer-use/computers/computer.py index 80986509..8b389459 100644 --- a/pkg/templates/python/openai-computer-use/computers/computer.py +++ b/pkg/templates/python/openai-computer-use/computers/computer.py @@ -1,8 +1,8 @@ -from typing import Protocol, List, Literal, Dict +from typing import Protocol, List, Literal, Dict, Any class Computer(Protocol): - """Defines the 'shape' (methods/properties) our loop expects.""" + """Defines the shape (methods/properties) the agent loop expects.""" def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: ... @@ -26,4 +26,12 @@ def keypress(self, keys: List[str]) -> None: ... def drag(self, path: List[Dict[str, int]]) -> None: ... - def get_current_url() -> str: ... + def batch_actions(self, actions: List[Dict[str, Any]]) -> None: ... + + def goto(self, url: str) -> None: ... + + def back(self) -> None: ... + + def forward(self) -> None: ... + + def get_current_url(self) -> str: ... 
diff --git a/pkg/templates/python/openai-computer-use/computers/config.py b/pkg/templates/python/openai-computer-use/computers/config.py index 4bf314c4..28a9b7ee 100644 --- a/pkg/templates/python/openai-computer-use/computers/config.py +++ b/pkg/templates/python/openai-computer-use/computers/config.py @@ -1,7 +1,5 @@ -from .default import * -from .contrib import * +from .kernel_computer import KernelComputer computers_config = { - "local-playwright": LocalPlaywrightBrowser, - "kernel": KernelPlaywrightBrowser, + "kernel": KernelComputer, } diff --git a/pkg/templates/python/openai-computer-use/computers/default/__init__.py b/pkg/templates/python/openai-computer-use/computers/default/__init__.py deleted file mode 100644 index 5e168f70..00000000 --- a/pkg/templates/python/openai-computer-use/computers/default/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .local_playwright import LocalPlaywrightBrowser -from .kernel import KernelPlaywrightBrowser diff --git a/pkg/templates/python/openai-computer-use/computers/default/kernel.py b/pkg/templates/python/openai-computer-use/computers/default/kernel.py deleted file mode 100644 index 5fbb7e5b..00000000 --- a/pkg/templates/python/openai-computer-use/computers/default/kernel.py +++ /dev/null @@ -1,48 +0,0 @@ -from playwright.sync_api import Browser, Page -from ..shared.base_playwright import BasePlaywrightComputer - -class KernelPlaywrightBrowser(BasePlaywrightComputer): - """ - Connects to a remote Chromium instance using a provided CDP URL. - Expects a dict as input: {'cdp_ws_url': ..., 'width': ..., 'height': ...} - Width and height are optional, defaulting to 1024x768. 
- """ - - def __init__(self, config: dict): - super().__init__() - self.cdp_ws_url = config.get("cdp_ws_url") - if not self.cdp_ws_url: - raise ValueError("cdp_ws_url must be provided in config dict") - self.width = config.get("width", 1024) - self.height = config.get("height", 768) - self.dimensions = (self.width, self.height) - - def get_dimensions(self): - return self.dimensions - - def _get_browser_and_page(self) -> tuple[Browser, Page]: - # Connect to the remote browser using the CDP URL - browser = self._playwright.chromium.connect_over_cdp(self.cdp_ws_url) - context = browser.contexts[0] if browser.contexts else browser.new_context() - page = context.pages[0] if context.pages else context.new_page() - page.set_viewport_size({"width": self.width, "height": self.height}) - page.on("close", self._handle_page_close) - # Optionally, navigate to a default page - # page.goto("about:blank") - return browser, page - - def _handle_new_page(self, page: Page): - """Handle the creation of a new page.""" - print("New page created") - self._page = page - page.on("close", self._handle_page_close) - - def _handle_page_close(self, page: Page): - """Handle the closure of a page.""" - print("Page closed") - if hasattr(self, "_browser") and self._page == page: - if self._browser.contexts[0].pages: - self._page = self._browser.contexts[0].pages[-1] - else: - print("Warning: All pages have been closed.") - self._page = None diff --git a/pkg/templates/python/openai-computer-use/computers/default/local_playwright.py b/pkg/templates/python/openai-computer-use/computers/default/local_playwright.py deleted file mode 100644 index 6810f34b..00000000 --- a/pkg/templates/python/openai-computer-use/computers/default/local_playwright.py +++ /dev/null @@ -1,54 +0,0 @@ -from playwright.sync_api import Browser, Page -from ..shared.base_playwright import BasePlaywrightComputer - - -class LocalPlaywrightBrowser(BasePlaywrightComputer): - """Launches a local Chromium instance using Playwright.""" 
- - def __init__(self, headless: bool = False): - super().__init__() - self.headless = headless - - def _get_browser_and_page(self) -> tuple[Browser, Page]: - width, height = self.get_dimensions() - launch_args = [ - f"--window-size={width},{height}", - "--disable-extensions", - "--disable-file-system", - ] - browser = self._playwright.chromium.launch( - chromium_sandbox=True, - headless=self.headless, - args=launch_args, - env={"DISPLAY": ":0"}, - ) - - context = browser.contexts[0] if browser.contexts else browser.new_context() - - - # Add event listeners for page creation and closure - context.on("page", self._handle_new_page) - - page = context.pages[0] if context.pages else context.new_page() - page.set_viewport_size({"width": width, "height": height}) - page.on("close", self._handle_page_close) - - # page.goto("about:blank") - - return browser, page - - def _handle_new_page(self, page: Page): - """Handle the creation of a new page.""" - print("New page created") - self._page = page - page.on("close", self._handle_page_close) - - def _handle_page_close(self, page: Page): - """Handle the closure of a page.""" - print("Page closed") - if self._page == page: - if self._browser.contexts[0].pages: - self._page = self._browser.contexts[0].pages[-1] - else: - print("Warning: All pages have been closed.") - self._page = None diff --git a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py new file mode 100644 index 00000000..1c2e1936 --- /dev/null +++ b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py @@ -0,0 +1,178 @@ +import base64 +import json +import time +from typing import List, Dict, Any + +from kernel import Kernel + +# CUA model key names -> X11 keysym names for the Kernel computer API +KEYSYM_MAP = { + "ENTER": "Return", + "Enter": "Return", + "RETURN": "Return", + "BACKSPACE": "BackSpace", + "Backspace": "BackSpace", + "DELETE": "Delete", + "TAB": 
"Tab", + "ESCAPE": "Escape", + "Escape": "Escape", + "ESC": "Escape", + "SPACE": "space", + "Space": "space", + "UP": "Up", + "DOWN": "Down", + "LEFT": "Left", + "RIGHT": "Right", + "HOME": "Home", + "END": "End", + "PAGEUP": "Prior", + "PAGE_UP": "Prior", + "PageUp": "Prior", + "PAGEDOWN": "Next", + "PAGE_DOWN": "Next", + "PageDown": "Next", + "CAPS_LOCK": "Caps_Lock", + "CapsLock": "Caps_Lock", + "CTRL": "Control_L", + "Ctrl": "Control_L", + "CONTROL": "Control_L", + "Control": "Control_L", + "ALT": "Alt_L", + "Alt": "Alt_L", + "SHIFT": "Shift_L", + "Shift": "Shift_L", + "META": "Super_L", + "Meta": "Super_L", + "SUPER": "Super_L", + "Super": "Super_L", + "CMD": "Super_L", + "COMMAND": "Super_L", + "F1": "F1", "F2": "F2", "F3": "F3", "F4": "F4", + "F5": "F5", "F6": "F6", "F7": "F7", "F8": "F8", + "F9": "F9", "F10": "F10", "F11": "F11", "F12": "F12", + "INSERT": "Insert", + "Insert": "Insert", + "PRINT": "Print", + "SCROLLLOCK": "Scroll_Lock", + "PAUSE": "Pause", + "NUMLOCK": "Num_Lock", +} + + +def _translate_keys(keys: List[str]) -> List[str]: + return [KEYSYM_MAP.get(k, k) for k in keys] + + +def _normalize_button(button) -> str: + if button is None: + return "left" + if isinstance(button, int): + return {1: "left", 2: "middle", 3: "right"}.get(button, "left") + return str(button) + + +def _translate_cua_action(action: Dict[str, Any]) -> Dict[str, Any]: + action_type = action.get("type", "") + if action_type == "click": + return { + "type": "click_mouse", + "click_mouse": { + "x": action.get("x", 0), + "y": action.get("y", 0), + "button": _normalize_button(action.get("button")), + }, + } + elif action_type == "double_click": + return { + "type": "click_mouse", + "click_mouse": { + "x": action.get("x", 0), + "y": action.get("y", 0), + "num_clicks": 2, + }, + } + elif action_type == "type": + return {"type": "type_text", "type_text": {"text": action.get("text", "")}} + elif action_type == "keypress": + return {"type": "press_key", "press_key": {"keys": 
_translate_keys(action.get("keys", []))}} + elif action_type == "scroll": + return { + "type": "scroll", + "scroll": { + "x": action.get("x", 0), + "y": action.get("y", 0), + "delta_x": action.get("scroll_x", 0), + "delta_y": action.get("scroll_y", 0), + }, + } + elif action_type == "move": + return {"type": "move_mouse", "move_mouse": {"x": action.get("x", 0), "y": action.get("y", 0)}} + elif action_type == "drag": + path = [[p["x"], p["y"]] for p in action.get("path", [])] + return {"type": "drag_mouse", "drag_mouse": {"path": path}} + elif action_type == "wait": + return {"type": "sleep", "sleep": {"duration_ms": action.get("ms", 1000)}} + else: + raise ValueError(f"Unknown CUA action type: {action_type}") + + +class KernelComputer: + """Wraps Kernel's native computer control API for browser automation.""" + + def __init__(self, client: Kernel, session_id: str): + self.client = client + self.session_id = session_id + + def get_environment(self): + return "browser" + + def get_dimensions(self): + return (1024, 768) + + def screenshot(self) -> str: + resp = self.client.browsers.computer.capture_screenshot(self.session_id) + return base64.b64encode(resp.read()).decode("utf-8") + + def click(self, x: int, y: int, button="left") -> None: + self.client.browsers.computer.click_mouse(self.session_id, x=x, y=y, button=_normalize_button(button)) + + def double_click(self, x: int, y: int) -> None: + self.client.browsers.computer.click_mouse(self.session_id, x=x, y=y, num_clicks=2) + + def type(self, text: str) -> None: + self.client.browsers.computer.type_text(self.session_id, text=text) + + def keypress(self, keys: List[str]) -> None: + self.client.browsers.computer.press_key(self.session_id, keys=_translate_keys(keys)) + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + self.client.browsers.computer.scroll(self.session_id, x=x, y=y, delta_x=scroll_x, delta_y=scroll_y) + + def move(self, x: int, y: int) -> None: + 
self.client.browsers.computer.move_mouse(self.session_id, x=x, y=y) + + def drag(self, path: List[Dict[str, int]]) -> None: + p = [[pt["x"], pt["y"]] for pt in path] + self.client.browsers.computer.drag_mouse(self.session_id, path=p) + + def wait(self, ms: int = 1000) -> None: + time.sleep(ms / 1000) + + def batch_actions(self, actions: List[Dict[str, Any]]) -> None: + translated = [_translate_cua_action(a) for a in actions] + self.client.browsers.computer.batch(self.session_id, actions=translated) + + def goto(self, url: str) -> None: + self.client.browsers.playwright.execute( + self.session_id, code=f"await page.goto({json.dumps(url)})" + ) + + def back(self) -> None: + self.client.browsers.playwright.execute(self.session_id, code="await page.goBack()") + + def forward(self) -> None: + self.client.browsers.playwright.execute(self.session_id, code="await page.goForward()") + + def get_current_url(self) -> str: + result = self.client.browsers.playwright.execute(self.session_id, code="return page.url()") + return result.result if result.result else "" diff --git a/pkg/templates/python/openai-computer-use/computers/shared/__init__.py b/pkg/templates/python/openai-computer-use/computers/shared/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pkg/templates/python/openai-computer-use/computers/shared/base_playwright.py b/pkg/templates/python/openai-computer-use/computers/shared/base_playwright.py deleted file mode 100644 index 0c38e24f..00000000 --- a/pkg/templates/python/openai-computer-use/computers/shared/base_playwright.py +++ /dev/null @@ -1,154 +0,0 @@ -import time -import base64 -from typing import List, Dict, Literal -from playwright.sync_api import sync_playwright, Browser, Page -from utils import check_blocklisted_url - -# Optional: key mapping if your model uses "CUA" style keys -CUA_KEY_TO_PLAYWRIGHT_KEY = { - "/": "Divide", - "\\": "Backslash", - "alt": "Alt", - "arrowdown": "ArrowDown", - "arrowleft": "ArrowLeft", - "arrowright": 
"ArrowRight", - "arrowup": "ArrowUp", - "backspace": "Backspace", - "capslock": "CapsLock", - "cmd": "Meta", - "ctrl": "Control", - "delete": "Delete", - "end": "End", - "enter": "Enter", - "esc": "Escape", - "home": "Home", - "insert": "Insert", - "option": "Alt", - "pagedown": "PageDown", - "pageup": "PageUp", - "shift": "Shift", - "space": " ", - "super": "Meta", - "tab": "Tab", - "win": "Meta", -} - - -class BasePlaywrightComputer: - """ - Abstract base for Playwright-based computers: - - - Subclasses override `_get_browser_and_page()` to do local or remote connection, - returning (Browser, Page). - - This base class handles context creation (`__enter__`/`__exit__`), - plus standard "Computer" actions like click, scroll, etc. - - We also have extra browser actions: `goto(url)` and `back()`. - """ - - def get_environment(self): - return "browser" - - def get_dimensions(self): - return (1024, 768) - - def __init__(self): - self._playwright = None - self._browser: Browser | None = None - self._page: Page | None = None - - def __enter__(self): - # Start Playwright and call the subclass hook for getting browser/page - self._playwright = sync_playwright().start() - self._browser, self._page = self._get_browser_and_page() - - # Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS - def handle_route(route, request): - - url = request.url - if check_blocklisted_url(url): - print(f"Flagging blocked domain: {url}") - route.abort() - else: - route.continue_() - - self._page.route("**/*", handle_route) - - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self._browser: - self._browser.close() - if self._playwright: - self._playwright.stop() - - def get_current_url(self) -> str: - return self._page.url - - # --- Common "Computer" actions --- - def screenshot(self) -> str: - """Capture only the viewport (not full_page).""" - png_bytes = self._page.screenshot(full_page=False) - return base64.b64encode(png_bytes).decode("utf-8") - - 
def click(self, x: int, y: int, button: str = "left") -> None: - match button: - case "back": - self.back() - case "forward": - self.forward() - case "wheel": - self._page.mouse.wheel(x, y) - case _: - button_mapping = {"left": "left", "right": "right"} - button_type = button_mapping.get(button, "left") - self._page.mouse.click(x, y, button=button_type) - - def double_click(self, x: int, y: int) -> None: - self._page.mouse.dblclick(x, y) - - def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: - self._page.mouse.move(x, y) - self._page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})") - - def type(self, text: str) -> None: - self._page.keyboard.type(text) - - def wait(self, ms: int = 1000) -> None: - time.sleep(ms / 1000) - - def move(self, x: int, y: int) -> None: - self._page.mouse.move(x, y) - - def keypress(self, keys: List[str]) -> None: - mapped_keys = [CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) for key in keys] - for key in mapped_keys: - self._page.keyboard.down(key) - for key in reversed(mapped_keys): - self._page.keyboard.up(key) - - def drag(self, path: List[Dict[str, int]]) -> None: - if not path: - return - self._page.mouse.move(path[0]["x"], path[0]["y"]) - self._page.mouse.down() - for point in path[1:]: - self._page.mouse.move(point["x"], point["y"]) - self._page.mouse.up() - - # --- Extra browser-oriented actions --- - def goto(self, url: str) -> None: - try: - return self._page.goto(url) - except Exception as e: - print(f"Error navigating to {url}: {e}") - - def back(self) -> None: - return self._page.go_back() - - def forward(self) -> None: - return self._page.go_forward() - - # --- Subclass hook --- - def _get_browser_and_page(self) -> tuple[Browser, Page]: - """Subclasses must implement, returning (Browser, Page).""" - raise NotImplementedError diff --git a/pkg/templates/python/openai-computer-use/main.py b/pkg/templates/python/openai-computer-use/main.py index 6ab17b17..77c6964b 100644 --- 
a/pkg/templates/python/openai-computer-use/main.py +++ b/pkg/templates/python/openai-computer-use/main.py @@ -5,7 +5,7 @@ import kernel from agent import Agent -from computers.default import KernelPlaywrightBrowser +from computers.kernel_computer import KernelComputer from kernel import Kernel """ @@ -43,8 +43,6 @@ async def cua_task( ctx: kernel.KernelContext, payload: CuaInput, ) -> CuaOutput: - # A function that processes a user task using the kernel browser and agent - if not payload or not payload.get("task"): raise ValueError("task is required") @@ -52,55 +50,49 @@ async def cua_task( client.browsers.create, invocation_id=ctx.invocation_id, stealth=True ) print("Kernel browser live view url: ", kernel_browser.browser_live_view_url) - cdp_ws_url = kernel_browser.cdp_ws_url def run_agent(): - with KernelPlaywrightBrowser({"cdp_ws_url": cdp_ws_url}) as computer: - # Navigate to DuckDuckGo as starting page (less likely to trigger captchas than Google) - computer.goto("https://duckduckgo.com") - - # messages to provide to the agent - items = [ - { - "role": "system", - "content": f"- Current date and time: {datetime.datetime.utcnow().isoformat()} ({datetime.datetime.utcnow().strftime('%A')})", - }, - {"role": "user", "content": payload["task"]}, - ] - - # setup the agent - agent = Agent( - computer=computer, - tools=[], # can provide additional tools to the agent - acknowledge_safety_check_callback=lambda message: ( - print(f"> agent : safety check message (skipping): {message}") - or True - ), # safety check function , now defaults to true - ) - - # run the agent - response_items = agent.run_full_turn( - items, - debug=True, - show_images=False, - ) - - if not response_items or "content" not in response_items[-1]: - raise ValueError("No response from agent") - # The content may be a list of blocks, get the first text block - content = response_items[-1]["content"] - if ( - isinstance(content, list) - and content - and isinstance(content[0], dict) - and "text" in 
content[0] - ): - result = content[0]["text"] - elif isinstance(content, str): - result = content - else: - result = str(content) - return {"result": result} + computer = KernelComputer(client, kernel_browser.session_id) + computer.goto("https://duckduckgo.com") + + items = [ + { + "role": "system", + "content": f"- Current date and time: {datetime.datetime.utcnow().isoformat()} ({datetime.datetime.utcnow().strftime('%A')})", + }, + {"role": "user", "content": payload["task"]}, + ] + + agent = Agent( + computer=computer, + tools=[], + acknowledge_safety_check_callback=lambda message: ( + print(f"> agent : safety check message (skipping): {message}") + or True + ), + ) + + response_items = agent.run_full_turn( + items, + debug=True, + show_images=False, + ) + + if not response_items or "content" not in response_items[-1]: + raise ValueError("No response from agent") + content = response_items[-1]["content"] + if ( + isinstance(content, list) + and content + and isinstance(content[0], dict) + and "text" in content[0] + ): + result = content[0]["text"] + elif isinstance(content, str): + result = content + else: + result = str(content) + return {"result": result} try: return await asyncio.to_thread(run_agent) diff --git a/pkg/templates/python/openai-computer-use/pyproject.toml b/pkg/templates/python/openai-computer-use/pyproject.toml index 3ea73870..47e45577 100644 --- a/pkg/templates/python/openai-computer-use/pyproject.toml +++ b/pkg/templates/python/openai-computer-use/pyproject.toml @@ -6,10 +6,7 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "httpx>=0.28.1", - "pillow>=12.0.0", - "kernel>=0.23.0", - "playwright>=1.56.0", - "pydantic>=2.12.5", + "kernel>=0.38.0", "python-dotenv>=1.2.1", "requests>=2.32.5", ] diff --git a/pkg/templates/python/openai-computer-use/test_local.py b/pkg/templates/python/openai-computer-use/test_local.py new file mode 100644 index 00000000..7897cd35 --- /dev/null +++ 
b/pkg/templates/python/openai-computer-use/test_local.py @@ -0,0 +1,70 @@ +""" +Local test script that creates a remote Kernel browser and runs the CUA agent. +No Kernel app deployment needed. + +Usage: + KERNEL_API_KEY=... OPENAI_API_KEY=... uv run test_local.py +""" + +import datetime +import os +import json + +from dotenv import load_dotenv + +load_dotenv(override=True) + +from kernel import Kernel +from agent import Agent +from computers.kernel_computer import KernelComputer + + +def main(): + if not os.getenv("KERNEL_API_KEY"): + raise ValueError("KERNEL_API_KEY is not set") + if not os.getenv("OPENAI_API_KEY"): + raise ValueError("OPENAI_API_KEY is not set") + + client = Kernel(api_key=os.getenv("KERNEL_API_KEY")) + browser = client.browsers.create(timeout_seconds=300) + print(f"> Browser session: {browser.session_id}") + print(f"> Live view: {browser.browser_live_view_url}") + + computer = KernelComputer(client, browser.session_id) + + try: + computer.goto("https://duckduckgo.com") + + items = [ + { + "role": "system", + "content": f"- Current date and time: {datetime.datetime.utcnow().isoformat()} ({datetime.datetime.utcnow().strftime('%A')})", + }, + { + "role": "user", + "content": "go to ebay.com and look up oberheim ob-x prices and give me a report", + }, + ] + + agent = Agent( + computer=computer, + tools=[], + acknowledge_safety_check_callback=lambda message: ( + print(f"> safety check: {message}") or True + ), + ) + + response_items = agent.run_full_turn( + items, + debug=True, + show_images=False, + ) + + print(json.dumps(response_items, indent=2, default=str)) + finally: + client.browsers.delete_by_id(browser.session_id) + print("> Browser session deleted") + + +if __name__ == "__main__": + main() diff --git a/pkg/templates/python/openai-computer-use/utils.py b/pkg/templates/python/openai-computer-use/utils.py index b17ee811..fe795ad2 100644 --- a/pkg/templates/python/openai-computer-use/utils.py +++ 
b/pkg/templates/python/openai-computer-use/utils.py @@ -2,10 +2,6 @@ import requests from dotenv import load_dotenv import json -import base64 -from PIL import Image -from io import BytesIO -import io from urllib.parse import urlparse load_dotenv(override=True) @@ -21,19 +17,19 @@ def pp(obj): - print(json.dumps(obj, indent=4)) + print(json.dumps(obj, indent=4, default=str)) def show_image(base_64_image): - image_data = base64.b64decode(base_64_image) - image = Image.open(BytesIO(image_data)) - image.show() - - -def calculate_image_dimensions(base_64_image): - image_data = base64.b64decode(base_64_image) - image = Image.open(io.BytesIO(image_data)) - return image.size + import base64 + from io import BytesIO + try: + from PIL import Image + image_data = base64.b64decode(base_64_image) + image = Image.open(BytesIO(image_data)) + image.show() + except ImportError: + print("[show_image] PIL not installed, skipping image display") def sanitize_message(msg: dict) -> dict: @@ -68,7 +64,10 @@ def create_response(**kwargs): def check_blocklisted_url(url: str) -> None: """Raise ValueError if the given URL (including subdomains) is in the blocklist.""" - hostname = urlparse(url).hostname or "" + try: + hostname = urlparse(url).hostname or "" + except Exception: + return if any( hostname == blocked or hostname.endswith(f".{blocked}") for blocked in BLOCKED_DOMAINS diff --git a/pkg/templates/python/openai-computer-use/uv.lock b/pkg/templates/python/openai-computer-use/uv.lock index 5ab5090c..42620637 100644 --- a/pkg/templates/python/openai-computer-use/uv.lock +++ b/pkg/templates/python/openai-computer-use/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.11" [[package]] @@ -115,53 +115,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 
20277, upload-time = "2023-12-24T09:54:30.421Z" }, ] -[[package]] -name = "greenlet" -version = "3.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c7/e5/40dbda2736893e3e53d25838e0f19a2b417dfc122b9989c91918db30b5d3/greenlet-3.3.0.tar.gz", hash = "sha256:a82bb225a4e9e4d653dd2fb7b8b2d36e4fb25bc0165422a11e48b88e9e6f78fb", size = 190651, upload-time = "2025-12-04T14:49:44.05Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1f/cb/48e964c452ca2b92175a9b2dca037a553036cb053ba69e284650ce755f13/greenlet-3.3.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e29f3018580e8412d6aaf5641bb7745d38c85228dacf51a73bd4e26ddf2a6a8e", size = 274908, upload-time = "2025-12-04T14:23:26.435Z" }, - { url = "https://files.pythonhosted.org/packages/28/da/38d7bff4d0277b594ec557f479d65272a893f1f2a716cad91efeb8680953/greenlet-3.3.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a687205fb22794e838f947e2194c0566d3812966b41c78709554aa883183fb62", size = 577113, upload-time = "2025-12-04T14:50:05.493Z" }, - { url = "https://files.pythonhosted.org/packages/3c/f2/89c5eb0faddc3ff014f1c04467d67dee0d1d334ab81fadbf3744847f8a8a/greenlet-3.3.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4243050a88ba61842186cb9e63c7dfa677ec146160b0efd73b855a3d9c7fcf32", size = 590338, upload-time = "2025-12-04T14:57:41.136Z" }, - { url = "https://files.pythonhosted.org/packages/80/d7/db0a5085035d05134f8c089643da2b44cc9b80647c39e93129c5ef170d8f/greenlet-3.3.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:670d0f94cd302d81796e37299bcd04b95d62403883b24225c6b5271466612f45", size = 601098, upload-time = "2025-12-04T15:07:11.898Z" }, - { url = "https://files.pythonhosted.org/packages/dc/a6/e959a127b630a58e23529972dbc868c107f9d583b5a9f878fb858c46bc1a/greenlet-3.3.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:6cb3a8ec3db4a3b0eb8a3c25436c2d49e3505821802074969db017b87bc6a948", size = 590206, upload-time = "2025-12-04T14:26:01.254Z" }, - { url = "https://files.pythonhosted.org/packages/48/60/29035719feb91798693023608447283b266b12efc576ed013dd9442364bb/greenlet-3.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2de5a0b09eab81fc6a382791b995b1ccf2b172a9fec934747a7a23d2ff291794", size = 1550668, upload-time = "2025-12-04T15:04:22.439Z" }, - { url = "https://files.pythonhosted.org/packages/0a/5f/783a23754b691bfa86bd72c3033aa107490deac9b2ef190837b860996c9f/greenlet-3.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4449a736606bd30f27f8e1ff4678ee193bc47f6ca810d705981cfffd6ce0d8c5", size = 1615483, upload-time = "2025-12-04T14:27:28.083Z" }, - { url = "https://files.pythonhosted.org/packages/1d/d5/c339b3b4bc8198b7caa4f2bd9fd685ac9f29795816d8db112da3d04175bb/greenlet-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:7652ee180d16d447a683c04e4c5f6441bae7ba7b17ffd9f6b3aff4605e9e6f71", size = 301164, upload-time = "2025-12-04T14:42:51.577Z" }, - { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" }, - { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" }, - { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", 
size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, - { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, - { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, - { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" }, - { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, - { url = "https://files.pythonhosted.org/packages/6c/79/3912a94cf27ec503e51ba493692d6db1e3cd8ac7ac52b0b47c8e33d7f4f9/greenlet-3.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7a34b13d43a6b78abf828a6d0e87d3385680eaf830cd60d20d52f249faabf39", size = 301964, upload-time = "2025-12-04T14:36:58.316Z" }, - { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" }, - { url = 
"https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" }, - { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, - { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, - { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, - { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" }, - { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, - { url = 
"https://files.pythonhosted.org/packages/7e/71/ba21c3fb8c5dce83b8c01f458a42e99ffdb1963aeec08fff5a18588d8fd7/greenlet-3.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:9ee1942ea19550094033c35d25d20726e4f1c40d59545815e1128ac58d416d38", size = 301833, upload-time = "2025-12-04T14:32:23.929Z" }, - { url = "https://files.pythonhosted.org/packages/d7/7c/f0a6d0ede2c7bf092d00bc83ad5bafb7e6ec9b4aab2fbdfa6f134dc73327/greenlet-3.3.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:60c2ef0f578afb3c8d92ea07ad327f9a062547137afe91f38408f08aacab667f", size = 275671, upload-time = "2025-12-04T14:23:05.267Z" }, - { url = "https://files.pythonhosted.org/packages/44/06/dac639ae1a50f5969d82d2e3dd9767d30d6dbdbab0e1a54010c8fe90263c/greenlet-3.3.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a5d554d0712ba1de0a6c94c640f7aeba3f85b3a6e1f2899c11c2c0428da9365", size = 646360, upload-time = "2025-12-04T14:50:10.026Z" }, - { url = "https://files.pythonhosted.org/packages/e0/94/0fb76fe6c5369fba9bf98529ada6f4c3a1adf19e406a47332245ef0eb357/greenlet-3.3.0-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3a898b1e9c5f7307ebbde4102908e6cbfcb9ea16284a3abe15cab996bee8b9b3", size = 658160, upload-time = "2025-12-04T14:57:45.41Z" }, - { url = "https://files.pythonhosted.org/packages/93/79/d2c70cae6e823fac36c3bbc9077962105052b7ef81db2f01ec3b9bf17e2b/greenlet-3.3.0-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:dcd2bdbd444ff340e8d6bdf54d2f206ccddbb3ccfdcd3c25bf4afaa7b8f0cf45", size = 671388, upload-time = "2025-12-04T15:07:15.789Z" }, - { url = "https://files.pythonhosted.org/packages/b8/14/bab308fc2c1b5228c3224ec2bf928ce2e4d21d8046c161e44a2012b5203e/greenlet-3.3.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5773edda4dc00e173820722711d043799d3adb4f01731f40619e07ea2750b955", size = 660166, upload-time = "2025-12-04T14:26:05.099Z" }, - { url = 
"https://files.pythonhosted.org/packages/4b/d2/91465d39164eaa0085177f61983d80ffe746c5a1860f009811d498e7259c/greenlet-3.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ac0549373982b36d5fd5d30beb8a7a33ee541ff98d2b502714a09f1169f31b55", size = 1615193, upload-time = "2025-12-04T15:04:27.041Z" }, - { url = "https://files.pythonhosted.org/packages/42/1b/83d110a37044b92423084d52d5d5a3b3a73cafb51b547e6d7366ff62eff1/greenlet-3.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d198d2d977460358c3b3a4dc844f875d1adb33817f0613f663a656f463764ccc", size = 1683653, upload-time = "2025-12-04T14:27:32.366Z" }, - { url = "https://files.pythonhosted.org/packages/7c/9a/9030e6f9aa8fd7808e9c31ba4c38f87c4f8ec324ee67431d181fe396d705/greenlet-3.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:73f51dd0e0bdb596fb0417e475fa3c5e32d4c83638296e560086b8d7da7c4170", size = 305387, upload-time = "2025-12-04T14:26:51.063Z" }, - { url = "https://files.pythonhosted.org/packages/a0/66/bd6317bc5932accf351fc19f177ffba53712a202f9df10587da8df257c7e/greenlet-3.3.0-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:d6ed6f85fae6cdfdb9ce04c9bf7a08d666cfcfb914e7d006f44f840b46741931", size = 282638, upload-time = "2025-12-04T14:25:20.941Z" }, - { url = "https://files.pythonhosted.org/packages/30/cf/cc81cb030b40e738d6e69502ccbd0dd1bced0588e958f9e757945de24404/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d9125050fcf24554e69c4cacb086b87b3b55dc395a8b3ebe6487b045b2614388", size = 651145, upload-time = "2025-12-04T14:50:11.039Z" }, - { url = "https://files.pythonhosted.org/packages/9c/ea/1020037b5ecfe95ca7df8d8549959baceb8186031da83d5ecceff8b08cd2/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:87e63ccfa13c0a0f6234ed0add552af24cc67dd886731f2261e46e241608bee3", size = 654236, upload-time = "2025-12-04T14:57:47.007Z" }, - { url = 
"https://files.pythonhosted.org/packages/69/cc/1e4bae2e45ca2fa55299f4e85854606a78ecc37fead20d69322f96000504/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2662433acbca297c9153a4023fe2161c8dcfdcc91f10433171cf7e7d94ba2221", size = 662506, upload-time = "2025-12-04T15:07:16.906Z" }, - { url = "https://files.pythonhosted.org/packages/57/b9/f8025d71a6085c441a7eaff0fd928bbb275a6633773667023d19179fe815/greenlet-3.3.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3c6e9b9c1527a78520357de498b0e709fb9e2f49c3a513afd5a249007261911b", size = 653783, upload-time = "2025-12-04T14:26:06.225Z" }, - { url = "https://files.pythonhosted.org/packages/f6/c7/876a8c7a7485d5d6b5c6821201d542ef28be645aa024cfe1145b35c120c1/greenlet-3.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:286d093f95ec98fdd92fcb955003b8a3d054b4e2cab3e2707a5039e7b50520fd", size = 1614857, upload-time = "2025-12-04T15:04:28.484Z" }, - { url = "https://files.pythonhosted.org/packages/4f/dc/041be1dff9f23dac5f48a43323cd0789cb798342011c19a248d9c9335536/greenlet-3.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c10513330af5b8ae16f023e8ddbfb486ab355d04467c4679c5cfe4659975dd9", size = 1676034, upload-time = "2025-12-04T14:27:33.531Z" }, -] - [[package]] name = "h11" version = "0.16.0" @@ -210,7 +163,7 @@ wheels = [ [[package]] name = "kernel" -version = "0.23.0" +version = "0.38.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -220,115 +173,9 @@ dependencies = [ { name = "sniffio" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9f/84/917ef7d15d8b05660d72728771e662c870b9ab0adcc8eaf3bc64a3809b95/kernel-0.23.0.tar.gz", hash = "sha256:2cea5de91ddb4fc0882e2dadaa1c62e659d23c8acafd5c7df814c36007f73eb9", size = 170960, upload-time = "2025-12-11T20:19:26.62Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/2b/e2/04abb962657c06b87d3469fd0bf355470588d12ecfa57f7bafa96aa7d10b/kernel-0.23.0-py3-none-any.whl", hash = "sha256:c5b7055bfc4bef6b36d984a870a3c779eb1018766ab0fe2845b11f130e88d83d", size = 199616, upload-time = "2025-12-11T20:19:24.24Z" }, -] - -[[package]] -name = "pillow" -version = "12.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/cace85a1b0c9775a9f8f5d5423c8261c858760e2466c79b2dd184638b056/pillow-12.0.0.tar.gz", hash = "sha256:87d4f8125c9988bfbed67af47dd7a953e2fc7b0cc1e7800ec6d2080d490bb353", size = 47008828, upload-time = "2025-10-15T18:24:14.008Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0e/5a/a2f6773b64edb921a756eb0729068acad9fc5208a53f4a349396e9436721/pillow-12.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0fd00cac9c03256c8b2ff58f162ebcd2587ad3e1f2e397eab718c47e24d231cc", size = 5289798, upload-time = "2025-10-15T18:21:47.763Z" }, - { url = "https://files.pythonhosted.org/packages/2e/05/069b1f8a2e4b5a37493da6c5868531c3f77b85e716ad7a590ef87d58730d/pillow-12.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3475b96f5908b3b16c47533daaa87380c491357d197564e0ba34ae75c0f3257", size = 4650589, upload-time = "2025-10-15T18:21:49.515Z" }, - { url = "https://files.pythonhosted.org/packages/61/e3/2c820d6e9a36432503ead175ae294f96861b07600a7156154a086ba7111a/pillow-12.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:110486b79f2d112cf6add83b28b627e369219388f64ef2f960fef9ebaf54c642", size = 6230472, upload-time = "2025-10-15T18:21:51.052Z" }, - { url = "https://files.pythonhosted.org/packages/4f/89/63427f51c64209c5e23d4d52071c8d0f21024d3a8a487737caaf614a5795/pillow-12.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5269cc1caeedb67e6f7269a42014f381f45e2e7cd42d834ede3c703a1d915fe3", size = 8033887, upload-time = "2025-10-15T18:21:52.604Z" }, - { url = 
"https://files.pythonhosted.org/packages/f6/1b/c9711318d4901093c15840f268ad649459cd81984c9ec9887756cca049a5/pillow-12.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa5129de4e174daccbc59d0a3b6d20eaf24417d59851c07ebb37aeb02947987c", size = 6343964, upload-time = "2025-10-15T18:21:54.619Z" }, - { url = "https://files.pythonhosted.org/packages/41/1e/db9470f2d030b4995083044cd8738cdd1bf773106819f6d8ba12597d5352/pillow-12.0.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bee2a6db3a7242ea309aa7ee8e2780726fed67ff4e5b40169f2c940e7eb09227", size = 7034756, upload-time = "2025-10-15T18:21:56.151Z" }, - { url = "https://files.pythonhosted.org/packages/cc/b0/6177a8bdd5ee4ed87cba2de5a3cc1db55ffbbec6176784ce5bb75aa96798/pillow-12.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:90387104ee8400a7b4598253b4c406f8958f59fcf983a6cea2b50d59f7d63d0b", size = 6458075, upload-time = "2025-10-15T18:21:57.759Z" }, - { url = "https://files.pythonhosted.org/packages/bc/5e/61537aa6fa977922c6a03253a0e727e6e4a72381a80d63ad8eec350684f2/pillow-12.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc91a56697869546d1b8f0a3ff35224557ae7f881050e99f615e0119bf934b4e", size = 7125955, upload-time = "2025-10-15T18:21:59.372Z" }, - { url = "https://files.pythonhosted.org/packages/1f/3d/d5033539344ee3cbd9a4d69e12e63ca3a44a739eb2d4c8da350a3d38edd7/pillow-12.0.0-cp311-cp311-win32.whl", hash = "sha256:27f95b12453d165099c84f8a8bfdfd46b9e4bda9e0e4b65f0635430027f55739", size = 6298440, upload-time = "2025-10-15T18:22:00.982Z" }, - { url = "https://files.pythonhosted.org/packages/4d/42/aaca386de5cc8bd8a0254516957c1f265e3521c91515b16e286c662854c4/pillow-12.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b583dc9070312190192631373c6c8ed277254aa6e6084b74bdd0a6d3b221608e", size = 6999256, upload-time = "2025-10-15T18:22:02.617Z" }, - { url = 
"https://files.pythonhosted.org/packages/ba/f1/9197c9c2d5708b785f631a6dfbfa8eb3fb9672837cb92ae9af812c13b4ed/pillow-12.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:759de84a33be3b178a64c8ba28ad5c135900359e85fb662bc6e403ad4407791d", size = 2436025, upload-time = "2025-10-15T18:22:04.598Z" }, - { url = "https://files.pythonhosted.org/packages/2c/90/4fcce2c22caf044e660a198d740e7fbc14395619e3cb1abad12192c0826c/pillow-12.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:53561a4ddc36facb432fae7a9d8afbfaf94795414f5cdc5fc52f28c1dca90371", size = 5249377, upload-time = "2025-10-15T18:22:05.993Z" }, - { url = "https://files.pythonhosted.org/packages/fd/e0/ed960067543d080691d47d6938ebccbf3976a931c9567ab2fbfab983a5dd/pillow-12.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:71db6b4c1653045dacc1585c1b0d184004f0d7e694c7b34ac165ca70c0838082", size = 4650343, upload-time = "2025-10-15T18:22:07.718Z" }, - { url = "https://files.pythonhosted.org/packages/e7/a1/f81fdeddcb99c044bf7d6faa47e12850f13cee0849537a7d27eeab5534d4/pillow-12.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2fa5f0b6716fc88f11380b88b31fe591a06c6315e955c096c35715788b339e3f", size = 6232981, upload-time = "2025-10-15T18:22:09.287Z" }, - { url = "https://files.pythonhosted.org/packages/88/e1/9098d3ce341a8750b55b0e00c03f1630d6178f38ac191c81c97a3b047b44/pillow-12.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:82240051c6ca513c616f7f9da06e871f61bfd7805f566275841af15015b8f98d", size = 8041399, upload-time = "2025-10-15T18:22:10.872Z" }, - { url = "https://files.pythonhosted.org/packages/a7/62/a22e8d3b602ae8cc01446d0c57a54e982737f44b6f2e1e019a925143771d/pillow-12.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55f818bd74fe2f11d4d7cbc65880a843c4075e0ac7226bc1a23261dbea531953", size = 6347740, upload-time = "2025-10-15T18:22:12.769Z" }, - { url = 
"https://files.pythonhosted.org/packages/4f/87/424511bdcd02c8d7acf9f65caa09f291a519b16bd83c3fb3374b3d4ae951/pillow-12.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b87843e225e74576437fd5b6a4c2205d422754f84a06942cfaf1dc32243e45a8", size = 7040201, upload-time = "2025-10-15T18:22:14.813Z" }, - { url = "https://files.pythonhosted.org/packages/dc/4d/435c8ac688c54d11755aedfdd9f29c9eeddf68d150fe42d1d3dbd2365149/pillow-12.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c607c90ba67533e1b2355b821fef6764d1dd2cbe26b8c1005ae84f7aea25ff79", size = 6462334, upload-time = "2025-10-15T18:22:16.375Z" }, - { url = "https://files.pythonhosted.org/packages/2b/f2/ad34167a8059a59b8ad10bc5c72d4d9b35acc6b7c0877af8ac885b5f2044/pillow-12.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:21f241bdd5080a15bc86d3466a9f6074a9c2c2b314100dd896ac81ee6db2f1ba", size = 7134162, upload-time = "2025-10-15T18:22:17.996Z" }, - { url = "https://files.pythonhosted.org/packages/0c/b1/a7391df6adacf0a5c2cf6ac1cf1fcc1369e7d439d28f637a847f8803beb3/pillow-12.0.0-cp312-cp312-win32.whl", hash = "sha256:dd333073e0cacdc3089525c7df7d39b211bcdf31fc2824e49d01c6b6187b07d0", size = 6298769, upload-time = "2025-10-15T18:22:19.923Z" }, - { url = "https://files.pythonhosted.org/packages/a2/0b/d87733741526541c909bbf159e338dcace4f982daac6e5a8d6be225ca32d/pillow-12.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe611163f6303d1619bbcb653540a4d60f9e55e622d60a3108be0d5b441017a", size = 7001107, upload-time = "2025-10-15T18:22:21.644Z" }, - { url = "https://files.pythonhosted.org/packages/bc/96/aaa61ce33cc98421fb6088af2a03be4157b1e7e0e87087c888e2370a7f45/pillow-12.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:7dfb439562f234f7d57b1ac6bc8fe7f838a4bd49c79230e0f6a1da93e82f1fad", size = 2436012, upload-time = "2025-10-15T18:22:23.621Z" }, - { url = 
"https://files.pythonhosted.org/packages/62/f2/de993bb2d21b33a98d031ecf6a978e4b61da207bef02f7b43093774c480d/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:0869154a2d0546545cde61d1789a6524319fc1897d9ee31218eae7a60ccc5643", size = 4045493, upload-time = "2025-10-15T18:22:25.758Z" }, - { url = "https://files.pythonhosted.org/packages/0e/b6/bc8d0c4c9f6f111a783d045310945deb769b806d7574764234ffd50bc5ea/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:a7921c5a6d31b3d756ec980f2f47c0cfdbce0fc48c22a39347a895f41f4a6ea4", size = 4120461, upload-time = "2025-10-15T18:22:27.286Z" }, - { url = "https://files.pythonhosted.org/packages/5d/57/d60d343709366a353dc56adb4ee1e7d8a2cc34e3fbc22905f4167cfec119/pillow-12.0.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1ee80a59f6ce048ae13cda1abf7fbd2a34ab9ee7d401c46be3ca685d1999a399", size = 3576912, upload-time = "2025-10-15T18:22:28.751Z" }, - { url = "https://files.pythonhosted.org/packages/a4/a4/a0a31467e3f83b94d37568294b01d22b43ae3c5d85f2811769b9c66389dd/pillow-12.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c50f36a62a22d350c96e49ad02d0da41dbd17ddc2e29750dbdba4323f85eb4a5", size = 5249132, upload-time = "2025-10-15T18:22:30.641Z" }, - { url = "https://files.pythonhosted.org/packages/83/06/48eab21dd561de2914242711434c0c0eb992ed08ff3f6107a5f44527f5e9/pillow-12.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5193fde9a5f23c331ea26d0cf171fbf67e3f247585f50c08b3e205c7aeb4589b", size = 4650099, upload-time = "2025-10-15T18:22:32.73Z" }, - { url = "https://files.pythonhosted.org/packages/fc/bd/69ed99fd46a8dba7c1887156d3572fe4484e3f031405fcc5a92e31c04035/pillow-12.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bde737cff1a975b70652b62d626f7785e0480918dece11e8fef3c0cf057351c3", size = 6230808, upload-time = "2025-10-15T18:22:34.337Z" }, - { url = 
"https://files.pythonhosted.org/packages/ea/94/8fad659bcdbf86ed70099cb60ae40be6acca434bbc8c4c0d4ef356d7e0de/pillow-12.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6597ff2b61d121172f5844b53f21467f7082f5fb385a9a29c01414463f93b07", size = 8037804, upload-time = "2025-10-15T18:22:36.402Z" }, - { url = "https://files.pythonhosted.org/packages/20/39/c685d05c06deecfd4e2d1950e9a908aa2ca8bc4e6c3b12d93b9cafbd7837/pillow-12.0.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b817e7035ea7f6b942c13aa03bb554fc44fea70838ea21f8eb31c638326584e", size = 6345553, upload-time = "2025-10-15T18:22:38.066Z" }, - { url = "https://files.pythonhosted.org/packages/38/57/755dbd06530a27a5ed74f8cb0a7a44a21722ebf318edbe67ddbd7fb28f88/pillow-12.0.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f4f1231b7dec408e8670264ce63e9c71409d9583dd21d32c163e25213ee2a344", size = 7037729, upload-time = "2025-10-15T18:22:39.769Z" }, - { url = "https://files.pythonhosted.org/packages/ca/b6/7e94f4c41d238615674d06ed677c14883103dce1c52e4af16f000338cfd7/pillow-12.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e51b71417049ad6ab14c49608b4a24d8fb3fe605e5dfabfe523b58064dc3d27", size = 6459789, upload-time = "2025-10-15T18:22:41.437Z" }, - { url = "https://files.pythonhosted.org/packages/9c/14/4448bb0b5e0f22dd865290536d20ec8a23b64e2d04280b89139f09a36bb6/pillow-12.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d120c38a42c234dc9a8c5de7ceaaf899cf33561956acb4941653f8bdc657aa79", size = 7130917, upload-time = "2025-10-15T18:22:43.152Z" }, - { url = "https://files.pythonhosted.org/packages/dd/ca/16c6926cc1c015845745d5c16c9358e24282f1e588237a4c36d2b30f182f/pillow-12.0.0-cp313-cp313-win32.whl", hash = "sha256:4cc6b3b2efff105c6a1656cfe59da4fdde2cda9af1c5e0b58529b24525d0a098", size = 6302391, upload-time = "2025-10-15T18:22:44.753Z" }, - { url = 
"https://files.pythonhosted.org/packages/6d/2a/dd43dcfd6dae9b6a49ee28a8eedb98c7d5ff2de94a5d834565164667b97b/pillow-12.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:4cf7fed4b4580601c4345ceb5d4cbf5a980d030fd5ad07c4d2ec589f95f09905", size = 7007477, upload-time = "2025-10-15T18:22:46.838Z" }, - { url = "https://files.pythonhosted.org/packages/77/f0/72ea067f4b5ae5ead653053212af05ce3705807906ba3f3e8f58ddf617e6/pillow-12.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:9f0b04c6b8584c2c193babcccc908b38ed29524b29dd464bc8801bf10d746a3a", size = 2435918, upload-time = "2025-10-15T18:22:48.399Z" }, - { url = "https://files.pythonhosted.org/packages/f5/5e/9046b423735c21f0487ea6cb5b10f89ea8f8dfbe32576fe052b5ba9d4e5b/pillow-12.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7fa22993bac7b77b78cae22bad1e2a987ddf0d9015c63358032f84a53f23cdc3", size = 5251406, upload-time = "2025-10-15T18:22:49.905Z" }, - { url = "https://files.pythonhosted.org/packages/12/66/982ceebcdb13c97270ef7a56c3969635b4ee7cd45227fa707c94719229c5/pillow-12.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f135c702ac42262573fe9714dfe99c944b4ba307af5eb507abef1667e2cbbced", size = 4653218, upload-time = "2025-10-15T18:22:51.587Z" }, - { url = "https://files.pythonhosted.org/packages/16/b3/81e625524688c31859450119bf12674619429cab3119eec0e30a7a1029cb/pillow-12.0.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c85de1136429c524e55cfa4e033b4a7940ac5c8ee4d9401cc2d1bf48154bbc7b", size = 6266564, upload-time = "2025-10-15T18:22:53.215Z" }, - { url = "https://files.pythonhosted.org/packages/98/59/dfb38f2a41240d2408096e1a76c671d0a105a4a8471b1871c6902719450c/pillow-12.0.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:38df9b4bfd3db902c9c2bd369bcacaf9d935b2fff73709429d95cc41554f7b3d", size = 8069260, upload-time = "2025-10-15T18:22:54.933Z" }, - { url = 
"https://files.pythonhosted.org/packages/dc/3d/378dbea5cd1874b94c312425ca77b0f47776c78e0df2df751b820c8c1d6c/pillow-12.0.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7d87ef5795da03d742bf49439f9ca4d027cde49c82c5371ba52464aee266699a", size = 6379248, upload-time = "2025-10-15T18:22:56.605Z" }, - { url = "https://files.pythonhosted.org/packages/84/b0/d525ef47d71590f1621510327acec75ae58c721dc071b17d8d652ca494d8/pillow-12.0.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aff9e4d82d082ff9513bdd6acd4f5bd359f5b2c870907d2b0a9c5e10d40c88fe", size = 7066043, upload-time = "2025-10-15T18:22:58.53Z" }, - { url = "https://files.pythonhosted.org/packages/61/2c/aced60e9cf9d0cde341d54bf7932c9ffc33ddb4a1595798b3a5150c7ec4e/pillow-12.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8d8ca2b210ada074d57fcee40c30446c9562e542fc46aedc19baf758a93532ee", size = 6490915, upload-time = "2025-10-15T18:23:00.582Z" }, - { url = "https://files.pythonhosted.org/packages/ef/26/69dcb9b91f4e59f8f34b2332a4a0a951b44f547c4ed39d3e4dcfcff48f89/pillow-12.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:99a7f72fb6249302aa62245680754862a44179b545ded638cf1fef59befb57ef", size = 7157998, upload-time = "2025-10-15T18:23:02.627Z" }, - { url = "https://files.pythonhosted.org/packages/61/2b/726235842220ca95fa441ddf55dd2382b52ab5b8d9c0596fe6b3f23dafe8/pillow-12.0.0-cp313-cp313t-win32.whl", hash = "sha256:4078242472387600b2ce8d93ade8899c12bf33fa89e55ec89fe126e9d6d5d9e9", size = 6306201, upload-time = "2025-10-15T18:23:04.709Z" }, - { url = "https://files.pythonhosted.org/packages/c0/3d/2afaf4e840b2df71344ababf2f8edd75a705ce500e5dc1e7227808312ae1/pillow-12.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2c54c1a783d6d60595d3514f0efe9b37c8808746a66920315bfd34a938d7994b", size = 7013165, upload-time = "2025-10-15T18:23:06.46Z" }, - { url = 
"https://files.pythonhosted.org/packages/6f/75/3fa09aa5cf6ed04bee3fa575798ddf1ce0bace8edb47249c798077a81f7f/pillow-12.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:26d9f7d2b604cd23aba3e9faf795787456ac25634d82cd060556998e39c6fa47", size = 2437834, upload-time = "2025-10-15T18:23:08.194Z" }, - { url = "https://files.pythonhosted.org/packages/54/2a/9a8c6ba2c2c07b71bec92cf63e03370ca5e5f5c5b119b742bcc0cde3f9c5/pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:beeae3f27f62308f1ddbcfb0690bf44b10732f2ef43758f169d5e9303165d3f9", size = 4045531, upload-time = "2025-10-15T18:23:10.121Z" }, - { url = "https://files.pythonhosted.org/packages/84/54/836fdbf1bfb3d66a59f0189ff0b9f5f666cee09c6188309300df04ad71fa/pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:d4827615da15cd59784ce39d3388275ec093ae3ee8d7f0c089b76fa87af756c2", size = 4120554, upload-time = "2025-10-15T18:23:12.14Z" }, - { url = "https://files.pythonhosted.org/packages/0d/cd/16aec9f0da4793e98e6b54778a5fbce4f375c6646fe662e80600b8797379/pillow-12.0.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:3e42edad50b6909089750e65c91aa09aaf1e0a71310d383f11321b27c224ed8a", size = 3576812, upload-time = "2025-10-15T18:23:13.962Z" }, - { url = "https://files.pythonhosted.org/packages/f6/b7/13957fda356dc46339298b351cae0d327704986337c3c69bb54628c88155/pillow-12.0.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e5d8efac84c9afcb40914ab49ba063d94f5dbdf5066db4482c66a992f47a3a3b", size = 5252689, upload-time = "2025-10-15T18:23:15.562Z" }, - { url = "https://files.pythonhosted.org/packages/fc/f5/eae31a306341d8f331f43edb2e9122c7661b975433de5e447939ae61c5da/pillow-12.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:266cd5f2b63ff316d5a1bba46268e603c9caf5606d44f38c2873c380950576ad", size = 4650186, upload-time = "2025-10-15T18:23:17.379Z" }, - { url = 
"https://files.pythonhosted.org/packages/86/62/2a88339aa40c4c77e79108facbd307d6091e2c0eb5b8d3cf4977cfca2fe6/pillow-12.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:58eea5ebe51504057dd95c5b77d21700b77615ab0243d8152793dc00eb4faf01", size = 6230308, upload-time = "2025-10-15T18:23:18.971Z" }, - { url = "https://files.pythonhosted.org/packages/c7/33/5425a8992bcb32d1cb9fa3dd39a89e613d09a22f2c8083b7bf43c455f760/pillow-12.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f13711b1a5ba512d647a0e4ba79280d3a9a045aaf7e0cc6fbe96b91d4cdf6b0c", size = 8039222, upload-time = "2025-10-15T18:23:20.909Z" }, - { url = "https://files.pythonhosted.org/packages/d8/61/3f5d3b35c5728f37953d3eec5b5f3e77111949523bd2dd7f31a851e50690/pillow-12.0.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6846bd2d116ff42cba6b646edf5bf61d37e5cbd256425fa089fee4ff5c07a99e", size = 6346657, upload-time = "2025-10-15T18:23:23.077Z" }, - { url = "https://files.pythonhosted.org/packages/3a/be/ee90a3d79271227e0f0a33c453531efd6ed14b2e708596ba5dd9be948da3/pillow-12.0.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c98fa880d695de164b4135a52fd2e9cd7b7c90a9d8ac5e9e443a24a95ef9248e", size = 7038482, upload-time = "2025-10-15T18:23:25.005Z" }, - { url = "https://files.pythonhosted.org/packages/44/34/a16b6a4d1ad727de390e9bd9f19f5f669e079e5826ec0f329010ddea492f/pillow-12.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fa3ed2a29a9e9d2d488b4da81dcb54720ac3104a20bf0bd273f1e4648aff5af9", size = 6461416, upload-time = "2025-10-15T18:23:27.009Z" }, - { url = "https://files.pythonhosted.org/packages/b6/39/1aa5850d2ade7d7ba9f54e4e4c17077244ff7a2d9e25998c38a29749eb3f/pillow-12.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d034140032870024e6b9892c692fe2968493790dd57208b2c37e3fb35f6df3ab", size = 7131584, upload-time = "2025-10-15T18:23:29.752Z" }, - { url = 
"https://files.pythonhosted.org/packages/bf/db/4fae862f8fad0167073a7733973bfa955f47e2cac3dc3e3e6257d10fab4a/pillow-12.0.0-cp314-cp314-win32.whl", hash = "sha256:1b1b133e6e16105f524a8dec491e0586d072948ce15c9b914e41cdadd209052b", size = 6400621, upload-time = "2025-10-15T18:23:32.06Z" }, - { url = "https://files.pythonhosted.org/packages/2b/24/b350c31543fb0107ab2599464d7e28e6f856027aadda995022e695313d94/pillow-12.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:8dc232e39d409036af549c86f24aed8273a40ffa459981146829a324e0848b4b", size = 7142916, upload-time = "2025-10-15T18:23:34.71Z" }, - { url = "https://files.pythonhosted.org/packages/0f/9b/0ba5a6fd9351793996ef7487c4fdbde8d3f5f75dbedc093bb598648fddf0/pillow-12.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:d52610d51e265a51518692045e372a4c363056130d922a7351429ac9f27e70b0", size = 2523836, upload-time = "2025-10-15T18:23:36.967Z" }, - { url = "https://files.pythonhosted.org/packages/f5/7a/ceee0840aebc579af529b523d530840338ecf63992395842e54edc805987/pillow-12.0.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1979f4566bb96c1e50a62d9831e2ea2d1211761e5662afc545fa766f996632f6", size = 5255092, upload-time = "2025-10-15T18:23:38.573Z" }, - { url = "https://files.pythonhosted.org/packages/44/76/20776057b4bfd1aef4eeca992ebde0f53a4dce874f3ae693d0ec90a4f79b/pillow-12.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b2e4b27a6e15b04832fe9bf292b94b5ca156016bbc1ea9c2c20098a0320d6cf6", size = 4653158, upload-time = "2025-10-15T18:23:40.238Z" }, - { url = "https://files.pythonhosted.org/packages/82/3f/d9ff92ace07be8836b4e7e87e6a4c7a8318d47c2f1463ffcf121fc57d9cb/pillow-12.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fb3096c30df99fd01c7bf8e544f392103d0795b9f98ba71a8054bcbf56b255f1", size = 6267882, upload-time = "2025-10-15T18:23:42.434Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/7a/4f7ff87f00d3ad33ba21af78bfcd2f032107710baf8280e3722ceec28cda/pillow-12.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7438839e9e053ef79f7112c881cef684013855016f928b168b81ed5835f3e75e", size = 8071001, upload-time = "2025-10-15T18:23:44.29Z" }, - { url = "https://files.pythonhosted.org/packages/75/87/fcea108944a52dad8cca0715ae6247e271eb80459364a98518f1e4f480c1/pillow-12.0.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d5c411a8eaa2299322b647cd932586b1427367fd3184ffbb8f7a219ea2041ca", size = 6380146, upload-time = "2025-10-15T18:23:46.065Z" }, - { url = "https://files.pythonhosted.org/packages/91/52/0d31b5e571ef5fd111d2978b84603fce26aba1b6092f28e941cb46570745/pillow-12.0.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7e091d464ac59d2c7ad8e7e08105eaf9dafbc3883fd7265ffccc2baad6ac925", size = 7067344, upload-time = "2025-10-15T18:23:47.898Z" }, - { url = "https://files.pythonhosted.org/packages/7b/f4/2dd3d721f875f928d48e83bb30a434dee75a2531bca839bb996bb0aa5a91/pillow-12.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:792a2c0be4dcc18af9d4a2dfd8a11a17d5e25274a1062b0ec1c2d79c76f3e7f8", size = 6491864, upload-time = "2025-10-15T18:23:49.607Z" }, - { url = "https://files.pythonhosted.org/packages/30/4b/667dfcf3d61fc309ba5a15b141845cece5915e39b99c1ceab0f34bf1d124/pillow-12.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:afbefa430092f71a9593a99ab6a4e7538bc9eabbf7bf94f91510d3503943edc4", size = 7158911, upload-time = "2025-10-15T18:23:51.351Z" }, - { url = "https://files.pythonhosted.org/packages/a2/2f/16cabcc6426c32218ace36bf0d55955e813f2958afddbf1d391849fee9d1/pillow-12.0.0-cp314-cp314t-win32.whl", hash = "sha256:3830c769decf88f1289680a59d4f4c46c72573446352e2befec9a8512104fa52", size = 6408045, upload-time = "2025-10-15T18:23:53.177Z" }, - { url = 
"https://files.pythonhosted.org/packages/35/73/e29aa0c9c666cf787628d3f0dcf379f4791fba79f4936d02f8b37165bdf8/pillow-12.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:905b0365b210c73afb0ebe9101a32572152dfd1c144c7e28968a331b9217b94a", size = 7148282, upload-time = "2025-10-15T18:23:55.316Z" }, - { url = "https://files.pythonhosted.org/packages/c1/70/6b41bdcddf541b437bbb9f47f94d2db5d9ddef6c37ccab8c9107743748a4/pillow-12.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:99353a06902c2e43b43e8ff74ee65a7d90307d82370604746738a1e0661ccca7", size = 2525630, upload-time = "2025-10-15T18:23:57.149Z" }, - { url = "https://files.pythonhosted.org/packages/1d/b3/582327e6c9f86d037b63beebe981425d6811104cb443e8193824ef1a2f27/pillow-12.0.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b22bd8c974942477156be55a768f7aa37c46904c175be4e158b6a86e3a6b7ca8", size = 5215068, upload-time = "2025-10-15T18:23:59.594Z" }, - { url = "https://files.pythonhosted.org/packages/fd/d6/67748211d119f3b6540baf90f92fae73ae51d5217b171b0e8b5f7e5d558f/pillow-12.0.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:805ebf596939e48dbb2e4922a1d3852cfc25c38160751ce02da93058b48d252a", size = 4614994, upload-time = "2025-10-15T18:24:01.669Z" }, - { url = "https://files.pythonhosted.org/packages/2d/e1/f8281e5d844c41872b273b9f2c34a4bf64ca08905668c8ae730eedc7c9fa/pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cae81479f77420d217def5f54b5b9d279804d17e982e0f2fa19b1d1e14ab5197", size = 5246639, upload-time = "2025-10-15T18:24:03.403Z" }, - { url = "https://files.pythonhosted.org/packages/94/5a/0d8ab8ffe8a102ff5df60d0de5af309015163bf710c7bb3e8311dd3b3ad0/pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:aeaefa96c768fc66818730b952a862235d68825c178f1b3ffd4efd7ad2edcb7c", size = 6986839, upload-time = "2025-10-15T18:24:05.344Z" }, - { url = 
"https://files.pythonhosted.org/packages/20/2e/3434380e8110b76cd9eb00a363c484b050f949b4bbe84ba770bb8508a02c/pillow-12.0.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09f2d0abef9e4e2f349305a4f8cc784a8a6c2f58a8c4892eea13b10a943bd26e", size = 5313505, upload-time = "2025-10-15T18:24:07.137Z" }, - { url = "https://files.pythonhosted.org/packages/57/ca/5a9d38900d9d74785141d6580950fe705de68af735ff6e727cb911b64740/pillow-12.0.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bdee52571a343d721fb2eb3b090a82d959ff37fc631e3f70422e0c2e029f3e76", size = 5963654, upload-time = "2025-10-15T18:24:09.579Z" }, - { url = "https://files.pythonhosted.org/packages/95/7e/f896623c3c635a90537ac093c6a618ebe1a90d87206e42309cb5d98a1b9e/pillow-12.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b290fd8aa38422444d4b50d579de197557f182ef1068b75f5aa8558638b8d0a5", size = 6997850, upload-time = "2025-10-15T18:24:11.495Z" }, -] - -[[package]] -name = "playwright" -version = "1.56.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "greenlet" }, - { name = "pyee" }, -] +sdist = { url = "https://files.pythonhosted.org/packages/90/77/2b2430c9b017d50dc1b4bad2c394cb862d4e504dfd5868de5634ec2129df/kernel-0.38.0.tar.gz", hash = "sha256:6eb8bf6abc35c43c96a69ef6efe4235e2007393dd12dbb95f084595bef234453", size = 193498, upload-time = "2026-02-25T18:54:51.895Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/31/a5362cee43f844509f1f10d8a27c9cc0e2f7bdce5353d304d93b2151c1b1/playwright-1.56.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33eb89c516cbc6723f2e3523bada4a4eb0984a9c411325c02d7016a5d625e9c", size = 40611424, upload-time = "2025-11-11T18:39:10.175Z" }, - { url = "https://files.pythonhosted.org/packages/ef/95/347eef596d8778fb53590dc326c344d427fa19ba3d42b646fce2a4572eb3/playwright-1.56.0-py3-none-macosx_11_0_arm64.whl", hash = 
"sha256:b228b3395212b9472a4ee5f1afe40d376eef9568eb039fcb3e563de8f4f4657b", size = 39400228, upload-time = "2025-11-11T18:39:13.915Z" }, - { url = "https://files.pythonhosted.org/packages/b9/54/6ad97b08b2ca1dfcb4fbde4536c4f45c0d9d8b1857a2d20e7bbfdf43bf15/playwright-1.56.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:0ef7e6fd653267798a8a968ff7aa2dcac14398b7dd7440ef57524e01e0fbbd65", size = 40611424, upload-time = "2025-11-11T18:39:17.093Z" }, - { url = "https://files.pythonhosted.org/packages/e4/76/6d409e37e82cdd5dda3df1ab958130ae32b46e42458bd4fc93d7eb8749cb/playwright-1.56.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:404be089b49d94bc4c1fe0dfb07664bda5ffe87789034a03bffb884489bdfb5c", size = 46263122, upload-time = "2025-11-11T18:39:20.619Z" }, - { url = "https://files.pythonhosted.org/packages/4f/84/fb292cc5d45f3252e255ea39066cd1d2385c61c6c1596548dfbf59c88605/playwright-1.56.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64cda7cf4e51c0d35dab55190841bfcdfb5871685ec22cb722cd0ad2df183e34", size = 46110645, upload-time = "2025-11-11T18:39:24.005Z" }, - { url = "https://files.pythonhosted.org/packages/61/bd/8c02c3388ae14edc374ac9f22cbe4e14826c6a51b2d8eaf86e89fabee264/playwright-1.56.0-py3-none-win32.whl", hash = "sha256:d87b79bcb082092d916a332c27ec9732e0418c319755d235d93cc6be13bdd721", size = 35639837, upload-time = "2025-11-11T18:39:27.174Z" }, - { url = "https://files.pythonhosted.org/packages/64/27/f13b538fbc6b7a00152f4379054a49f6abc0bf55ac86f677ae54bc49fb82/playwright-1.56.0-py3-none-win_amd64.whl", hash = "sha256:3c7fc49bb9e673489bf2622855f9486d41c5101bbed964638552b864c4591f94", size = 35639843, upload-time = "2025-11-11T18:39:30.851Z" }, - { url = "https://files.pythonhosted.org/packages/f2/c7/3ee8b556107995846576b4fe42a08ed49b8677619421f2afacf6ee421138/playwright-1.56.0-py3-none-win_arm64.whl", hash = "sha256:2745490ae8dd58d27e5ea4d9aa28402e8e2991eb84fb4b2fd5fbde2106716f6f", size = 31248959, upload-time = 
"2025-11-11T18:39:33.998Z" }, + { url = "https://files.pythonhosted.org/packages/4c/4d/c7b95eeac08fed24d15f11fee11c4807e154fbec7ad5cc99c7943e4a9e06/kernel-0.38.0-py3-none-any.whl", hash = "sha256:8548d34980034a1e9300a5bec51730a38729115355d86a7cd3e2680095f15bd6", size = 225184, upload-time = "2026-02-25T18:54:50.454Z" }, ] [[package]] @@ -444,27 +291,21 @@ wheels = [ ] [[package]] -name = "pyee" -version = "13.0.0" +name = "python-dotenv" +version = "1.2.1" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/95/03/1fd98d5841cd7964a27d729ccf2199602fe05eb7a405c1462eb7277945ed/pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37", size = 31250, upload-time = "2025-03-17T18:53:15.955Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/4d/b9add7c84060d4c1906abe9a7e5359f2a60f7a9a4f67268b2766673427d8/pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498", size = 15730, upload-time = "2025-03-17T18:53:14.532Z" }, + { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, ] [[package]] -name = "python-cua" +name = "python-openai-cua" version = "0.1.0" source = { virtual = "." 
} dependencies = [ { name = "httpx" }, { name = "kernel" }, - { name = "pillow" }, - { name = "playwright" }, - { name = "pydantic" }, { name = "python-dotenv" }, { name = "requests" }, ] @@ -472,23 +313,11 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "httpx", specifier = ">=0.28.1" }, - { name = "kernel", specifier = ">=0.23.0" }, - { name = "pillow", specifier = ">=12.0.0" }, - { name = "playwright", specifier = ">=1.56.0" }, - { name = "pydantic", specifier = ">=2.12.5" }, + { name = "kernel", specifier = ">=0.38.0" }, { name = "python-dotenv", specifier = ">=1.2.1" }, { name = "requests", specifier = ">=2.32.5" }, ] -[[package]] -name = "python-dotenv" -version = "1.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, -] - [[package]] name = "requests" version = "2.32.5" diff --git a/pkg/templates/typescript/openai-computer-use/.env.example b/pkg/templates/typescript/openai-computer-use/.env.example index b74e0a29..3ff84207 100644 --- a/pkg/templates/typescript/openai-computer-use/.env.example +++ b/pkg/templates/typescript/openai-computer-use/.env.example @@ -1,2 +1,3 @@ -# Copy this file to .env and fill in your API key +# Copy this file to .env and fill in your API keys OPENAI_API_KEY=your_openai_api_key_here +KERNEL_API_KEY=your_kernel_api_key_here diff --git a/pkg/templates/typescript/openai-computer-use/README.md 
b/pkg/templates/typescript/openai-computer-use/README.md index 6ac98411..36f408a9 100644 --- a/pkg/templates/typescript/openai-computer-use/README.md +++ b/pkg/templates/typescript/openai-computer-use/README.md @@ -1,8 +1,25 @@ # Kernel TypeScript Sample App - OpenAI Computer Use -This is a Kernel application that demonstrates using the Computer Use Agent (CUA) from OpenAI. +This is a Kernel application that demonstrates using the Computer Use Agent (CUA) from OpenAI with Kernel's native browser control API. -It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation. -Also makes use of the latest OpenAI SDK format, and has local equivalent to Kernel methods for local testing before deploying on Kernel. +It uses Kernel's computer control endpoints (screenshot, click, type, scroll, batch, etc.) instead of Playwright, and includes a `batch_computer_actions` tool that executes multiple actions in a single API call for lower latency. -See the [docs](https://www.kernel.sh/docs/quickstart) for information. +## Local testing + +You can test against a remote Kernel browser without deploying: + +```bash +cp .env.example .env +# Fill in OPENAI_API_KEY and KERNEL_API_KEY in .env +pnpm install +pnpm run test:local +``` + +## Deploy to Kernel + +```bash +kernel deploy index.ts --env-file .env +kernel invoke ts-openai-cua cua-task -p '{"task":"Go to https://news.ycombinator.com and get the top 5 articles"}' +``` + +See the [docs](https://www.kernel.sh/docs/quickstart) for more information. 
diff --git a/pkg/templates/typescript/openai-computer-use/index.ts b/pkg/templates/typescript/openai-computer-use/index.ts index 30c26477..014105fc 100644 --- a/pkg/templates/typescript/openai-computer-use/index.ts +++ b/pkg/templates/typescript/openai-computer-use/index.ts @@ -2,7 +2,7 @@ import { Kernel, type KernelContext } from '@onkernel/sdk'; import 'dotenv/config'; import type { ResponseItem, ResponseOutputMessage } from 'openai/resources/responses/responses'; import { Agent } from './lib/agent'; -import computers from './lib/computers'; +import { KernelComputer } from './lib/kernel-computer'; interface CuaInput { task: string; @@ -42,10 +42,9 @@ app.action( const kb = await kernel.browsers.create({ invocation_id: ctx.invocation_id }); console.log('> Kernel browser live view url:', kb.browser_live_view_url); - try { - const { computer } = await computers.create({ type: 'kernel', cdp_ws_url: kb.cdp_ws_url }); + const computer = new KernelComputer(kernel, kb.session_id); - // Navigate to DuckDuckGo as starting page (less likely to trigger captchas than Google) + try { await computer.goto('https://duckduckgo.com'); const agent = new Agent({ @@ -58,7 +57,6 @@ app.action( }, }); - // run agent and get response const logs = await agent.runFullTurn({ messages: [ { @@ -81,7 +79,6 @@ app.action( const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); - // filter only LLM messages const messages = logs.filter( (item): item is ResponseOutputMessage => item.type === 'message' && @@ -93,18 +90,11 @@ app.action( const lastContent = lastContentIndex >= 0 ? assistant?.content?.[lastContentIndex] : null; const answer = lastContent && 'text' in lastContent ? 
lastContent.text : null; - return { - // logs, // optionally, get the full agent run messages logs - elapsed, - answer, - }; + return { elapsed, answer }; } catch (error) { const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); console.error('Error in cua-task:', error); - return { - elapsed, - answer: null, - }; + return { elapsed, answer: null }; } finally { await kernel.browsers.deleteByID(kb.session_id); } diff --git a/pkg/templates/typescript/openai-computer-use/lib/agent.ts b/pkg/templates/typescript/openai-computer-use/lib/agent.ts index 97441654..0ff0dbc4 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/agent.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/agent.ts @@ -7,22 +7,19 @@ import { type ResponseComputerToolCall, type ResponseComputerToolCallOutputItem, type ComputerTool, + type Tool, } from 'openai/resources/responses/responses'; import * as utils from './utils'; -import toolset from './toolset'; -import type { BasePlaywrightComputer } from './playwright/base'; -import type { LocalPlaywrightComputer } from './playwright/local'; -import type { KernelPlaywrightComputer } from './playwright/kernel'; +import { batchInstructions, batchComputerTool, navigationTools } from './toolset'; +import type { KernelComputer } from './kernel-computer'; + +const BATCH_FUNC_NAME = 'batch_computer_actions'; export class Agent { private model: string; - private computer: - | BasePlaywrightComputer - | LocalPlaywrightComputer - | KernelPlaywrightComputer - | undefined; - private tools: ComputerTool[]; + private computer: KernelComputer; + private tools: Tool[]; private print_steps = true; private debug = false; private show_images = false; @@ -30,28 +27,26 @@ export class Agent { constructor(opts: { model?: string; - computer?: - | BasePlaywrightComputer - | LocalPlaywrightComputer - | KernelPlaywrightComputer - | undefined; - tools?: ComputerTool[]; + computer: KernelComputer; + tools?: Tool[]; acknowledge_safety_check_callback?: 
(msg: string) => boolean; }) { this.model = opts.model ?? 'computer-use-preview'; this.computer = opts.computer; - this.tools = [...toolset.shared, ...(opts.tools ?? [])] as ComputerTool[]; this.ackCb = opts.acknowledge_safety_check_callback ?? ((): boolean => true); - if (this.computer) { - const [w, h] = this.computer.getDimensions(); - this.tools.push({ + const [w, h] = this.computer.getDimensions(); + this.tools = [ + ...navigationTools, + batchComputerTool, + ...(opts.tools ?? []), + { type: 'computer_use_preview', display_width: w, display_height: h, environment: this.computer.getEnvironment(), - }); - } + } as ComputerTool, + ]; } private debugPrint(...args: unknown[]): void { @@ -80,10 +75,18 @@ export class Agent { const fc = item as ResponseFunctionToolCallItem; const argsObj = JSON.parse(fc.arguments) as Record; if (this.print_steps) console.log(`${fc.name}(${JSON.stringify(argsObj)})`); - if (this.computer) { - const fn = (this.computer as unknown as Record)[fc.name]; - if (typeof fn === 'function') - await (fn as (...a: unknown[]) => unknown)(...Object.values(argsObj)); + + if (fc.name === BATCH_FUNC_NAME) { + return this.handleBatchCall(fc.call_id, argsObj); + } + + // Navigation tools (goto, back, forward) + const navFn = (this.computer as unknown as Record)[fc.name]; + if (typeof navFn === 'function') { + await (navFn as (...a: unknown[]) => unknown).call( + this.computer, + ...Object.values(argsObj), + ); } return [ { @@ -98,34 +101,99 @@ export class Agent { const cc = item as ResponseComputerToolCall; const { type: actionType, ...actionArgs } = cc.action; if (this.print_steps) console.log(`${actionType}(${JSON.stringify(actionArgs)})`); - if (this.computer) { - const fn = (this.computer as unknown as Record)[actionType as string]; - if (typeof fn === 'function') { - await (fn as (...a: unknown[]) => unknown)(...Object.values(actionArgs)); - const screenshot = await this.computer.screenshot(); - const pending = cc.pending_safety_checks ?? 
[]; - for (const { message } of pending) - if (!this.ackCb(message)) throw new Error(`Safety check failed: ${message}`); - const out: Omit = { - type: 'computer_call_output', - call_id: cc.call_id, - // id: "?", // <---- omitting to work - need to determine id source, != call_id - acknowledged_safety_checks: pending, - output: { - type: 'computer_screenshot', - image_url: `data:image/webp;base64,${screenshot}`, - }, - }; - if (this.computer.getEnvironment() === 'browser') - utils.checkBlocklistedUrl(this.computer.getCurrentUrl()); - return [out as ResponseItem]; - } + + await this.executeComputerAction(actionType as string, cc.action as unknown as Record); + const screenshot = await this.computer.screenshot(); + + const pending = cc.pending_safety_checks ?? []; + for (const check of pending) { + const msg = check.message ?? ''; + if (!this.ackCb(msg)) throw new Error(`Safety check failed: ${msg}`); } + + const currentUrl = await this.computer.getCurrentUrl(); + utils.checkBlocklistedUrl(currentUrl); + + const out: Omit = { + type: 'computer_call_output', + call_id: cc.call_id, + acknowledged_safety_checks: pending, + output: { + type: 'computer_screenshot', + image_url: `data:image/png;base64,${screenshot}`, + }, + }; + return [out as ResponseItem]; } return []; } + private async executeComputerAction( + actionType: string, + action: Record, + ): Promise { + switch (actionType) { + case 'click': + await this.computer.click( + action.x as number, + action.y as number, + (action.button as string) ?? 'left', + ); + break; + case 'double_click': + await this.computer.doubleClick(action.x as number, action.y as number); + break; + case 'type': + await this.computer.type(action.text as string); + break; + case 'keypress': + await this.computer.keypress(action.keys as string[]); + break; + case 'scroll': + await this.computer.scroll( + action.x as number, + action.y as number, + (action.scroll_x as number) ?? 0, + (action.scroll_y as number) ?? 
0, + ); + break; + case 'move': + await this.computer.move(action.x as number, action.y as number); + break; + case 'drag': + await this.computer.drag(action.path as Array<{ x: number; y: number }>); + break; + case 'wait': + await this.computer.wait((action.ms as number) ?? 1000); + break; + case 'screenshot': + break; + default: + console.warn(`Unknown computer action: ${actionType}`); + } + } + + private async handleBatchCall( + callId: string, + argsObj: Record, + ): Promise { + const actions = argsObj.actions as unknown as Parameters[0]; + await this.computer.batchActions(actions); + + const screenshot = await this.computer.screenshot(); + return [ + { + type: 'function_call_output', + call_id: callId, + output: JSON.stringify([ + { type: 'text', text: 'Actions executed successfully.' }, + { type: 'image_url', image_url: `data:image/png;base64,${screenshot}` }, + ]), + } as unknown as ResponseFunctionToolCallOutputItem, + ]; + } + async runFullTurn(opts: { messages: ResponseInputItem[]; print_steps?: boolean; @@ -141,49 +209,16 @@ export class Agent { newItems.length === 0 || (newItems[newItems.length - 1] as ResponseItem & { role?: string }).role !== 'assistant' ) { - // Add current URL to system message if in browser environment const inputMessages = [...opts.messages]; - if (this.computer?.getEnvironment() === 'browser') { - const current_url = this.computer.getCurrentUrl(); - // Find system message by checking if it has a role property with value 'system' - const sysIndex = inputMessages.findIndex((msg) => 'role' in msg && msg.role === 'system'); - - if (sysIndex >= 0) { - const msg = inputMessages[sysIndex]; - const urlInfo = `\n- Current URL: ${current_url}`; - - // Create a properly typed message based on the original - if (msg && 'content' in msg) { - if (typeof msg.content === 'string') { - // Create a new message with the updated content - const updatedMsg = { - ...msg, - content: msg.content + urlInfo, - }; - // Type assertion to ensure 
compatibility - inputMessages[sysIndex] = updatedMsg as typeof msg; - } else if (Array.isArray(msg.content) && msg.content.length > 0) { - // Handle array content case - const updatedContent = [...msg.content]; - - // Check if first item has text property - if (updatedContent[0] && 'text' in updatedContent[0]) { - updatedContent[0] = { - ...updatedContent[0], - text: updatedContent[0].text + urlInfo, - }; - } - - // Create updated message with new content - const updatedMsg = { - ...msg, - content: updatedContent, - }; - // Type assertion to ensure compatibility - inputMessages[sysIndex] = updatedMsg as typeof msg; - } - } + // Append current URL context to system message + const currentUrl = await this.computer.getCurrentUrl(); + const sysIndex = inputMessages.findIndex((msg) => 'role' in msg && msg.role === 'system'); + if (sysIndex >= 0) { + const msg = inputMessages[sysIndex]; + const urlInfo = `\n- Current URL: ${currentUrl}`; + if (msg && 'content' in msg && typeof msg.content === 'string') { + inputMessages[sysIndex] = { ...msg, content: msg.content + urlInfo } as typeof msg; } } @@ -193,6 +228,7 @@ export class Agent { input: [...inputMessages, ...newItems], tools: this.tools, truncation: 'auto', + instructions: batchInstructions, }); if (!response.output) throw new Error('No output from model'); for (const msg of response.output as ResponseItem[]) { @@ -200,7 +236,6 @@ export class Agent { } } - // Return sanitized messages if show_images is false return !this.show_images ? 
newItems.map((msg) => utils.sanitizeMessage(msg) as ResponseItem) : newItems; diff --git a/pkg/templates/typescript/openai-computer-use/lib/computers.ts b/pkg/templates/typescript/openai-computer-use/lib/computers.ts deleted file mode 100644 index 5828fc8e..00000000 --- a/pkg/templates/typescript/openai-computer-use/lib/computers.ts +++ /dev/null @@ -1,28 +0,0 @@ -import { KernelPlaywrightComputer } from './playwright/kernel'; -import { LocalPlaywrightComputer } from './playwright/local'; - -interface KernelConfig { - type: 'kernel'; - cdp_ws_url: string; -} -interface LocalConfig { - type: 'local'; - headless?: boolean; -} -type ComputerConfig = KernelConfig | LocalConfig; - -export default { - async create( - cfg: ComputerConfig, - ): Promise<{ computer: KernelPlaywrightComputer | LocalPlaywrightComputer }> { - if (cfg.type === 'kernel') { - const computer = new KernelPlaywrightComputer(cfg.cdp_ws_url); - await computer.enter(); - return { computer }; - } else { - const computer = new LocalPlaywrightComputer(cfg.headless ?? 
false); - await computer.enter(); - return { computer }; - } - }, -}; diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts new file mode 100644 index 00000000..c2f32264 --- /dev/null +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -0,0 +1,243 @@ +import { Kernel } from '@onkernel/sdk'; + +// CUA model key names -> X11 keysym names for the Kernel computer API +const KEYSYM_MAP: Record = { + ENTER: 'Return', + Enter: 'Return', + RETURN: 'Return', + BACKSPACE: 'BackSpace', + Backspace: 'BackSpace', + DELETE: 'Delete', + TAB: 'Tab', + ESCAPE: 'Escape', + Escape: 'Escape', + ESC: 'Escape', + SPACE: 'space', + Space: 'space', + UP: 'Up', + DOWN: 'Down', + LEFT: 'Left', + RIGHT: 'Right', + HOME: 'Home', + END: 'End', + PAGEUP: 'Prior', + PAGE_UP: 'Prior', + PageUp: 'Prior', + PAGEDOWN: 'Next', + PAGE_DOWN: 'Next', + PageDown: 'Next', + CAPS_LOCK: 'Caps_Lock', + CapsLock: 'Caps_Lock', + CTRL: 'Control_L', + Ctrl: 'Control_L', + CONTROL: 'Control_L', + Control: 'Control_L', + ALT: 'Alt_L', + Alt: 'Alt_L', + SHIFT: 'Shift_L', + Shift: 'Shift_L', + META: 'Super_L', + Meta: 'Super_L', + SUPER: 'Super_L', + Super: 'Super_L', + CMD: 'Super_L', + COMMAND: 'Super_L', + F1: 'F1', + F2: 'F2', + F3: 'F3', + F4: 'F4', + F5: 'F5', + F6: 'F6', + F7: 'F7', + F8: 'F8', + F9: 'F9', + F10: 'F10', + F11: 'F11', + F12: 'F12', + INSERT: 'Insert', + Insert: 'Insert', + PRINT: 'Print', + SCROLLLOCK: 'Scroll_Lock', + PAUSE: 'Pause', + NUMLOCK: 'Num_Lock', +}; + +function translateKeys(keys: string[]): string[] { + return keys.map((k) => KEYSYM_MAP[k] ?? 
k); +} + +interface CuaAction { + type: string; + x?: number; + y?: number; + text?: string; + keys?: string[]; + button?: string | number; + scroll_x?: number; + scroll_y?: number; + ms?: number; + path?: Array<{ x: number; y: number }>; +} + +type BatchAction = { + type: 'click_mouse' | 'move_mouse' | 'type_text' | 'press_key' | 'scroll' | 'drag_mouse' | 'sleep'; + click_mouse?: { x: number; y: number; button?: string; num_clicks?: number }; + move_mouse?: { x: number; y: number }; + type_text?: { text: string }; + press_key?: { keys: string[] }; + scroll?: { x: number; y: number; delta_x?: number; delta_y?: number }; + drag_mouse?: { path: number[][] }; + sleep?: { duration_ms: number }; +}; + +function normalizeButton(button?: string | number): string { + if (button === undefined || button === null) return 'left'; + if (typeof button === 'number') { + switch (button) { + case 1: return 'left'; + case 2: return 'middle'; + case 3: return 'right'; + default: return 'left'; + } + } + return button; +} + +function translateCuaAction(action: CuaAction): BatchAction { + switch (action.type) { + case 'click': + return { + type: 'click_mouse', + click_mouse: { x: action.x ?? 0, y: action.y ?? 0, button: normalizeButton(action.button) }, + }; + case 'double_click': + return { + type: 'click_mouse', + click_mouse: { x: action.x ?? 0, y: action.y ?? 0, num_clicks: 2 }, + }; + case 'type': + return { type: 'type_text', type_text: { text: action.text ?? '' } }; + case 'keypress': + return { type: 'press_key', press_key: { keys: translateKeys(action.keys ?? []) } }; + case 'scroll': + return { + type: 'scroll', + scroll: { + x: action.x ?? 0, + y: action.y ?? 0, + delta_x: action.scroll_x ?? 0, + delta_y: action.scroll_y ?? 0, + }, + }; + case 'move': + return { type: 'move_mouse', move_mouse: { x: action.x ?? 0, y: action.y ?? 0 } }; + case 'drag': { + const path = (action.path ?? 
[]).map((p) => [p.x, p.y]); + return { type: 'drag_mouse', drag_mouse: { path } }; + } + case 'wait': + return { type: 'sleep', sleep: { duration_ms: action.ms ?? 1000 } }; + default: + throw new Error(`Unknown CUA action type: ${action.type}`); + } +} + +export class KernelComputer { + private client: Kernel; + private sessionId: string; + private width = 1024; + private height = 768; + + constructor(client: Kernel, sessionId: string) { + this.client = client; + this.sessionId = sessionId; + } + + getEnvironment(): 'browser' { + return 'browser'; + } + + getDimensions(): [number, number] { + return [this.width, this.height]; + } + + async screenshot(): Promise { + const resp = await this.client.browsers.computer.captureScreenshot(this.sessionId); + const buf = Buffer.from(await resp.arrayBuffer()); + return buf.toString('base64'); + } + + async click(x: number, y: number, button: string | number = 'left'): Promise { + await this.client.browsers.computer.clickMouse(this.sessionId, { + x, + y, + button: normalizeButton(button) as 'left' | 'right' | 'middle', + }); + } + + async doubleClick(x: number, y: number): Promise { + await this.client.browsers.computer.clickMouse(this.sessionId, { x, y, num_clicks: 2 }); + } + + async type(text: string): Promise { + await this.client.browsers.computer.typeText(this.sessionId, { text }); + } + + async keypress(keys: string[]): Promise { + await this.client.browsers.computer.pressKey(this.sessionId, { keys: translateKeys(keys) }); + } + + async scroll(x: number, y: number, scrollX: number, scrollY: number): Promise { + await this.client.browsers.computer.scroll(this.sessionId, { + x, + y, + delta_x: scrollX, + delta_y: scrollY, + }); + } + + async move(x: number, y: number): Promise { + await this.client.browsers.computer.moveMouse(this.sessionId, { x, y }); + } + + async drag(path: Array<{ x: number; y: number }>): Promise { + const p = path.map((pt) => [pt.x, pt.y]); + await 
this.client.browsers.computer.dragMouse(this.sessionId, { path: p }); + } + + async wait(ms = 1000): Promise { + await new Promise((resolve) => setTimeout(resolve, ms)); + } + + async batchActions(actions: CuaAction[]): Promise { + const translated = actions.map(translateCuaAction); + await this.client.browsers.computer.batch(this.sessionId, { + actions: translated as Parameters[1]['actions'], + }); + } + + async goto(url: string): Promise { + await this.client.browsers.playwright.execute(this.sessionId, { + code: `await page.goto(${JSON.stringify(url)})`, + }); + } + + async back(): Promise { + await this.client.browsers.playwright.execute(this.sessionId, { + code: 'await page.goBack()', + }); + } + + async forward(): Promise { + await this.client.browsers.playwright.execute(this.sessionId, { + code: 'await page.goForward()', + }); + } + + async getCurrentUrl(): Promise { + const result = await this.client.browsers.playwright.execute(this.sessionId, { + code: 'return page.url()', + }); + return (result.result as string) ?? 
''; + } +} diff --git a/pkg/templates/typescript/openai-computer-use/lib/playwright/base.ts b/pkg/templates/typescript/openai-computer-use/lib/playwright/base.ts deleted file mode 100644 index b43a7d2d..00000000 --- a/pkg/templates/typescript/openai-computer-use/lib/playwright/base.ts +++ /dev/null @@ -1,242 +0,0 @@ -import type { Browser, Page, Request, Response, Route } from 'playwright-core'; -import sharp from 'sharp'; -import utils from '../utils'; - -// CUA key -> Playwright key mapping -const KEY_MAP: Record = { - '/': '/', - '\\': '\\', - alt: 'Alt', - arrowdown: 'ArrowDown', - arrowleft: 'ArrowLeft', - arrowright: 'ArrowRight', - arrowup: 'ArrowUp', - backspace: 'Backspace', - capslock: 'CapsLock', - cmd: 'Meta', - ctrl: 'Control', - delete: 'Delete', - end: 'End', - enter: 'Enter', - esc: 'Escape', - home: 'Home', - insert: 'Insert', - option: 'Alt', - pagedown: 'PageDown', - pageup: 'PageUp', - shift: 'Shift', - space: ' ', - super: 'Meta', - tab: 'Tab', - win: 'Meta', -}; - -interface Point { - x: number; - y: number; -} - -export class BasePlaywrightComputer { - protected _browser: Browser | null = null; - protected _page: Page | null = null; - - constructor() { - this._browser = null; - this._page = null; - } - - /** - * Type guard to assert that this._page is present and is a Playwright Page. - * Throws an error if not present. - */ - protected _assertPage(): asserts this is { _page: Page } { - if (!this._page) { - throw new Error('Playwright Page is not initialized. Did you forget to call enter()?'); - } - } - - protected _handleNewPage = (page: Page): void => { - /** Handle the creation of a new page. */ - console.log('New page created'); - this._page = page; - page.on('close', this._handlePageClose.bind(this)); - }; - - protected _handlePageClose = (page: Page): void => { - /** Handle the closure of a page. 
*/ - console.log('Page closed'); - try { - this._assertPage(); - } catch { - return; - } - if (this._page !== page) return; - - const browser = this._browser; - if (!browser || typeof browser.contexts !== 'function') { - console.log('Warning: Browser or context not available.'); - this._page = undefined as unknown as Page; - return; - } - - const contexts = browser.contexts(); - if (!contexts.length) { - console.log('Warning: No browser contexts available.'); - this._page = undefined as unknown as Page; - return; - } - - const context = contexts[0]; - if (!context || typeof context.pages !== 'function') { - console.log('Warning: Context pages not available.'); - this._page = undefined as unknown as Page; - return; - } - - const pages = context.pages(); - if (pages.length) { - this._page = pages[pages.length - 1] as Page; - } else { - console.log('Warning: All pages have been closed.'); - this._page = undefined as unknown as Page; - } - }; - - // Subclass hook - protected _getBrowserAndPage = async (): Promise<[Browser, Page]> => { - // Subclasses must implement, returning [Browser, Page] - throw new Error('Subclasses must implement _getBrowserAndPage()'); - }; - - getEnvironment = (): 'windows' | 'mac' | 'linux' | 'ubuntu' | 'browser' => { - return 'browser'; - }; - - getDimensions = (): [number, number] => { - return [1024, 768]; - }; - - enter = async (): Promise => { - // Call the subclass hook for getting browser/page - [this._browser, this._page] = await this._getBrowserAndPage(); - - // Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS - const handleRoute = (route: Route, request: Request): void => { - const url = request.url(); - if (utils.checkBlocklistedUrl(url)) { - console.log(`Flagging blocked domain: ${url}`); - route.abort(); - } else { - route.continue(); - } - }; - - this._assertPage(); - await this._page.route('**/*', handleRoute); - return this; - }; - - exit = async (): Promise => { - if (this._browser) await 
this._browser.close(); - }; - - getCurrentUrl = (): string => { - this._assertPage(); - return this._page.url(); - }; - - screenshot = async (): Promise => { - this._assertPage(); - const buf = await this._page.screenshot({ fullPage: false }); - const webp = await sharp(buf).webp().toBuffer(); - return webp.toString('base64'); - }; - - click = async ( - button: 'left' | 'right' | 'back' | 'forward' | 'wheel', - x: number, - y: number, - ): Promise => { - this._assertPage(); - switch (button) { - case 'back': - await this.back(); - return; - case 'forward': - await this.forward(); - return; - case 'wheel': - await this._page.mouse.wheel(x, y); - return; - default: { - const btn = button === 'right' ? 'right' : 'left'; - await this._page.mouse.click(x, y, { button: btn }); - return; - } - } - }; - - doubleClick = async (x: number, y: number): Promise => { - this._assertPage(); - await this._page.mouse.dblclick(x, y); - }; - - scroll = async (x: number, y: number, scrollX: number, scrollY: number): Promise => { - this._assertPage(); - await this._page.mouse.move(x, y); - await this._page.evaluate( - (params: { dx: number; dy: number }) => window.scrollBy(params.dx, params.dy), - { dx: scrollX, dy: scrollY }, - ); - }; - - type = async (text: string): Promise => { - this._assertPage(); - await this._page.keyboard.type(text); - }; - - keypress = async (keys: string[]): Promise => { - this._assertPage(); - const mapped = keys.map((k) => KEY_MAP[k.toLowerCase()] ?? 
k); - for (const k of mapped) await this._page.keyboard.down(k); - for (const k of [...mapped].reverse()) await this._page.keyboard.up(k); - }; - - wait = async (ms = 1000): Promise => { - await new Promise((resolve) => setTimeout(resolve, ms)); - }; - - move = async (x: number, y: number): Promise => { - this._assertPage(); - await this._page.mouse.move(x, y); - }; - - drag = async (path: Point[]): Promise => { - this._assertPage(); - const first = path[0]; - if (!first) return; - await this._page.mouse.move(first.x, first.y); - await this._page.mouse.down(); - for (const pt of path.slice(1)) await this._page.mouse.move(pt.x, pt.y); - await this._page.mouse.up(); - }; - - goto = async (url: string): Promise => { - this._assertPage(); - try { - return await this._page.goto(url); - } catch { - return null; - } - }; - - back = async (): Promise => { - this._assertPage(); - return (await this._page.goBack()) || null; - }; - - forward = async (): Promise => { - this._assertPage(); - return (await this._page.goForward()) || null; - }; -} diff --git a/pkg/templates/typescript/openai-computer-use/lib/playwright/kernel.ts b/pkg/templates/typescript/openai-computer-use/lib/playwright/kernel.ts deleted file mode 100644 index 4dd0c869..00000000 --- a/pkg/templates/typescript/openai-computer-use/lib/playwright/kernel.ts +++ /dev/null @@ -1,43 +0,0 @@ -import { chromium, type Browser, type Page } from 'playwright-core'; -import { BasePlaywrightComputer } from './base'; - -/** - * KernelPlaywrightComputer connects to a remote browser instance via CDP WebSocket URL. - * Similar to LocalPlaywrightComputer but uses an existing browser instance instead of launching one. 
- */ -export class KernelPlaywrightComputer extends BasePlaywrightComputer { - private cdp_ws_url: string; - - constructor(cdp_ws_url: string) { - super(); - this.cdp_ws_url = cdp_ws_url; - } - - _getBrowserAndPage = async (): Promise<[Browser, Page]> => { - const [width, height] = this.getDimensions(); - - // Connect to existing browser instance via CDP - const browser = await chromium.connectOverCDP(this.cdp_ws_url); - - // Get existing context or create new one - let context = browser.contexts()[0]; - if (!context) { - context = await browser.newContext(); - } - - // Add event listeners for page creation and closure - context.on('page', this._handleNewPage.bind(this)); - - // Get existing page or create new one - let page = context.pages()[0]; - if (!page) { - page = await context.newPage(); - } - - // Set viewport size - await page.setViewportSize({ width, height }); - page.on('close', this._handlePageClose.bind(this)); - - return [browser, page]; - }; -} diff --git a/pkg/templates/typescript/openai-computer-use/lib/playwright/local.ts b/pkg/templates/typescript/openai-computer-use/lib/playwright/local.ts deleted file mode 100644 index d0437801..00000000 --- a/pkg/templates/typescript/openai-computer-use/lib/playwright/local.ts +++ /dev/null @@ -1,43 +0,0 @@ -import { chromium, type Browser, type Page } from 'playwright-core'; -import { BasePlaywrightComputer } from './base'; - -/** - * Launches a local Chromium instance using Playwright. 
- */ -export class LocalPlaywrightComputer extends BasePlaywrightComputer { - private headless: boolean; - - constructor(headless = false) { - super(); - this.headless = headless; - } - - _getBrowserAndPage = async (): Promise<[Browser, Page]> => { - const [width, height] = this.getDimensions(); - const launchArgs = [ - `--window-size=${width},${height}`, - '--disable-extensions', - '--disable-file-system', - ]; - - const browser = await chromium.launch({ - headless: this.headless, - args: launchArgs, - env: { DISPLAY: ':0' }, - }); - - const context = await browser.newContext(); - - // Add event listeners for page creation and closure - context.on('page', this._handleNewPage.bind(this)); - - const page = await context.newPage(); - await page.setViewportSize({ width, height }); - page.on('close', this._handlePageClose.bind(this)); - - await page.goto('https://duckduckgo.com'); - - // console.dir({debug_getBrowserAndPage: [browser, page]}); - return [browser, page]; - }; -} diff --git a/pkg/templates/typescript/openai-computer-use/lib/toolset.ts b/pkg/templates/typescript/openai-computer-use/lib/toolset.ts index 2999d0bd..4cd39321 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/toolset.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/toolset.ts @@ -1,6 +1,57 @@ -const shared = [ +export const batchInstructions = `You have two ways to perform actions: +1. The standard computer tool — use for single actions when you need screenshot feedback after each step. +2. batch_computer_actions — use to execute multiple actions at once when you can predict the outcome. 
+ +ALWAYS prefer batch_computer_actions when performing predictable sequences like: +- Clicking a text field, typing text, and pressing Enter +- Typing a URL and pressing Enter +- Any sequence where you don't need to see intermediate results`; + +export const batchComputerTool = { + type: 'function' as const, + name: 'batch_computer_actions', + description: + 'Execute multiple computer actions in sequence without waiting for ' + + 'screenshots between them. Use this when you can predict the outcome of a ' + + 'sequence of actions without needing intermediate visual feedback. After all ' + + 'actions execute, a single screenshot is taken and returned.\n\n' + + 'PREFER this over individual computer actions when:\n' + + '- Typing text followed by pressing Enter\n' + + '- Clicking a field and then typing into it\n' + + '- Any sequence where intermediate screenshots are not needed', + parameters: { + type: 'object', + properties: { + actions: { + type: 'array', + description: 'Ordered list of actions to execute', + items: { + type: 'object', + properties: { + type: { + type: 'string', + enum: ['click', 'double_click', 'type', 'keypress', 'scroll', 'move', 'drag', 'wait'], + }, + x: { type: 'number' }, + y: { type: 'number' }, + text: { type: 'string' }, + keys: { type: 'array', items: { type: 'string' } }, + button: { type: 'string' }, + scroll_x: { type: 'number' }, + scroll_y: { type: 'number' }, + }, + required: ['type'], + }, + }, + }, + required: ['actions'], + }, + strict: false, +}; + +export const navigationTools = [ { - type: 'function', + type: 'function' as const, name: 'goto', description: 'Go to a specific URL.', parameters: { @@ -14,9 +65,10 @@ const shared = [ additionalProperties: false, required: ['url'], }, + strict: false, }, { - type: 'function', + type: 'function' as const, name: 'back', description: 'Navigate back in the browser history.', parameters: { @@ -24,9 +76,10 @@ const shared = [ properties: {}, additionalProperties: false, }, + strict: 
false, }, { - type: 'function', + type: 'function' as const, name: 'forward', description: 'Navigate forward in the browser history.', parameters: { @@ -34,7 +87,6 @@ const shared = [ properties: {}, additionalProperties: false, }, + strict: false, }, ]; - -export default { shared }; diff --git a/pkg/templates/typescript/openai-computer-use/lib/utils.ts b/pkg/templates/typescript/openai-computer-use/lib/utils.ts index f2dc0fd5..da503cd8 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/utils.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/utils.ts @@ -1,5 +1,4 @@ import 'dotenv/config'; -import sharp from 'sharp'; import OpenAI from 'openai'; import { type ResponseItem } from 'openai/resources/responses/responses'; const openai = new OpenAI(); @@ -13,13 +12,6 @@ const BLOCKED_DOMAINS: readonly string[] = [ 'ilanbigio.com', ] as const; -export async function calculateImageDimensions( - base64Image: string, -): Promise<{ width: number; height: number }> { - const buf = Buffer.from(base64Image, 'base64'); - const meta = await sharp(buf).metadata(); - return { width: meta.width ?? 0, height: meta.height ?? 
0 }; -} export function sanitizeMessage(msg: ResponseItem): ResponseItem { const sanitizedMsg = { ...msg } as ResponseItem; if ( @@ -49,12 +41,15 @@ export async function createResponse( } export function checkBlocklistedUrl(url: string): boolean { - const host = new URL(url).hostname; - return BLOCKED_DOMAINS.some((d) => host === d || host.endsWith(`.${d}`)); + try { + const host = new URL(url).hostname; + return BLOCKED_DOMAINS.some((d) => host === d || host.endsWith(`.${d}`)); + } catch { + return false; + } } export default { - calculateImageDimensions, sanitizeMessage, createResponse, checkBlocklistedUrl, diff --git a/pkg/templates/typescript/openai-computer-use/package.json b/pkg/templates/typescript/openai-computer-use/package.json index bdfa99dc..7fdc55b4 100644 --- a/pkg/templates/typescript/openai-computer-use/package.json +++ b/pkg/templates/typescript/openai-computer-use/package.json @@ -2,17 +2,17 @@ "type": "module", "private": true, "scripts": { - "build": "tsc" + "build": "tsc", + "test:local": "npx tsx test.local.ts" }, "dependencies": { - "@onkernel/sdk": "^0.23.0", + "@onkernel/sdk": "^0.38.0", "dotenv": "^17.2.3", - "openai": "^6.13.0", - "playwright-core": "^1.57.0", - "sharp": "^0.34.5" + "openai": "^6.13.0" }, "devDependencies": { "@types/node": "^22.15.17", + "tsx": "^4.19.0", "typescript": "^5.9.3" } } diff --git a/pkg/templates/typescript/openai-computer-use/pnpm-lock.yaml b/pkg/templates/typescript/openai-computer-use/pnpm-lock.yaml index c3737350..39dc64d1 100644 --- a/pkg/templates/typescript/openai-computer-use/pnpm-lock.yaml +++ b/pkg/templates/typescript/openai-computer-use/pnpm-lock.yaml @@ -9,186 +9,208 @@ importers: .: dependencies: '@onkernel/sdk': - specifier: ^0.23.0 - version: 0.23.0 + specifier: ^0.38.0 + version: 0.38.0 dotenv: specifier: ^17.2.3 - version: 17.2.3 + version: 17.3.1 openai: specifier: ^6.13.0 - version: 6.13.0 - playwright-core: - specifier: ^1.57.0 - version: 1.57.0 - sharp: - specifier: ^0.34.5 - version: 
0.34.5 + version: 6.25.0 devDependencies: '@types/node': specifier: ^22.15.17 - version: 22.19.3 + version: 22.19.11 + tsx: + specifier: ^4.19.0 + version: 4.21.0 typescript: specifier: ^5.9.3 version: 5.9.3 packages: - '@emnapi/runtime@1.7.1': - resolution: {integrity: sha512-PVtJr5CmLwYAU9PZDMITZoR5iAOShYREoR45EyyLrbntV50mdePTgUn4AmOw90Ifcj+x2kRjdzr1HP3RrNiHGA==} + '@esbuild/aix-ppc64@0.27.3': + resolution: {integrity: sha512-9fJMTNFTWZMh5qwrBItuziu834eOCUcEqymSH7pY+zoMVEZg3gcPuBNxH1EvfVYe9h0x/Ptw8KBzv7qxb7l8dg==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [aix] + + '@esbuild/android-arm64@0.27.3': + resolution: {integrity: sha512-YdghPYUmj/FX2SYKJ0OZxf+iaKgMsKHVPF1MAq/P8WirnSpCStzKJFjOjzsW0QQ7oIAiccHdcqjbHmJxRb/dmg==} + engines: {node: '>=18'} + cpu: [arm64] + os: [android] - '@img/colour@1.0.0': - resolution: {integrity: sha512-A5P/LfWGFSl6nsckYtjw9da+19jB8hkJ6ACTGcDfEJ0aE+l2n2El7dsVM7UVHZQ9s2lmYMWlrS21YLy2IR1LUw==} + '@esbuild/android-arm@0.27.3': + resolution: {integrity: sha512-i5D1hPY7GIQmXlXhs2w8AWHhenb00+GxjxRncS2ZM7YNVGNfaMxgzSGuO8o8SJzRc/oZwU2bcScvVERk03QhzA==} engines: {node: '>=18'} + cpu: [arm] + os: [android] - '@img/sharp-darwin-arm64@0.34.5': - resolution: {integrity: sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} + '@esbuild/android-x64@0.27.3': + resolution: {integrity: sha512-IN/0BNTkHtk8lkOM8JWAYFg4ORxBkZQf9zXiEOfERX/CzxW3Vg1ewAhU7QSWQpVIzTW+b8Xy+lGzdYXV6UZObQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [android] + + '@esbuild/darwin-arm64@0.27.3': + resolution: {integrity: sha512-Re491k7ByTVRy0t3EKWajdLIr0gz2kKKfzafkth4Q8A5n1xTHrkqZgLLjFEHVD+AXdUGgQMq+Godfq45mGpCKg==} + engines: {node: '>=18'} cpu: [arm64] os: [darwin] - '@img/sharp-darwin-x64@0.34.5': - resolution: {integrity: sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} + 
'@esbuild/darwin-x64@0.27.3': + resolution: {integrity: sha512-vHk/hA7/1AckjGzRqi6wbo+jaShzRowYip6rt6q7VYEDX4LEy1pZfDpdxCBnGtl+A5zq8iXDcyuxwtv3hNtHFg==} + engines: {node: '>=18'} cpu: [x64] os: [darwin] - '@img/sharp-libvips-darwin-arm64@1.2.4': - resolution: {integrity: sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==} + '@esbuild/freebsd-arm64@0.27.3': + resolution: {integrity: sha512-ipTYM2fjt3kQAYOvo6vcxJx3nBYAzPjgTCk7QEgZG8AUO3ydUhvelmhrbOheMnGOlaSFUoHXB6un+A7q4ygY9w==} + engines: {node: '>=18'} cpu: [arm64] - os: [darwin] + os: [freebsd] - '@img/sharp-libvips-darwin-x64@1.2.4': - resolution: {integrity: sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==} + '@esbuild/freebsd-x64@0.27.3': + resolution: {integrity: sha512-dDk0X87T7mI6U3K9VjWtHOXqwAMJBNN2r7bejDsc+j03SEjtD9HrOl8gVFByeM0aJksoUuUVU9TBaZa2rgj0oA==} + engines: {node: '>=18'} cpu: [x64] - os: [darwin] + os: [freebsd] - '@img/sharp-libvips-linux-arm64@1.2.4': - resolution: {integrity: sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==} + '@esbuild/linux-arm64@0.27.3': + resolution: {integrity: sha512-sZOuFz/xWnZ4KH3YfFrKCf1WyPZHakVzTiqji3WDc0BCl2kBwiJLCXpzLzUBLgmp4veFZdvN5ChW4Eq/8Fc2Fg==} + engines: {node: '>=18'} cpu: [arm64] os: [linux] - '@img/sharp-libvips-linux-arm@1.2.4': - resolution: {integrity: sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==} + '@esbuild/linux-arm@0.27.3': + resolution: {integrity: sha512-s6nPv2QkSupJwLYyfS+gwdirm0ukyTFNl3KTgZEAiJDd+iHZcbTPPcWCcRYH+WlNbwChgH2QkE9NSlNrMT8Gfw==} + engines: {node: '>=18'} cpu: [arm] os: [linux] - '@img/sharp-libvips-linux-ppc64@1.2.4': - resolution: {integrity: sha512-FMuvGijLDYG6lW+b/UvyilUWu5Ayu+3r2d1S8notiGCIyYU/76eig1UfMmkZ7vwgOrzKzlQbFSuQfgm7GYUPpA==} + '@esbuild/linux-ia32@0.27.3': + resolution: {integrity: 
sha512-yGlQYjdxtLdh0a3jHjuwOrxQjOZYD/C9PfdbgJJF3TIZWnm/tMd/RcNiLngiu4iwcBAOezdnSLAwQDPqTmtTYg==} + engines: {node: '>=18'} + cpu: [ia32] + os: [linux] + + '@esbuild/linux-loong64@0.27.3': + resolution: {integrity: sha512-WO60Sn8ly3gtzhyjATDgieJNet/KqsDlX5nRC5Y3oTFcS1l0KWba+SEa9Ja1GfDqSF1z6hif/SkpQJbL63cgOA==} + engines: {node: '>=18'} + cpu: [loong64] + os: [linux] + + '@esbuild/linux-mips64el@0.27.3': + resolution: {integrity: sha512-APsymYA6sGcZ4pD6k+UxbDjOFSvPWyZhjaiPyl/f79xKxwTnrn5QUnXR5prvetuaSMsb4jgeHewIDCIWljrSxw==} + engines: {node: '>=18'} + cpu: [mips64el] + os: [linux] + + '@esbuild/linux-ppc64@0.27.3': + resolution: {integrity: sha512-eizBnTeBefojtDb9nSh4vvVQ3V9Qf9Df01PfawPcRzJH4gFSgrObw+LveUyDoKU3kxi5+9RJTCWlj4FjYXVPEA==} + engines: {node: '>=18'} cpu: [ppc64] os: [linux] - '@img/sharp-libvips-linux-riscv64@1.2.4': - resolution: {integrity: sha512-oVDbcR4zUC0ce82teubSm+x6ETixtKZBh/qbREIOcI3cULzDyb18Sr/Wcyx7NRQeQzOiHTNbZFF1UwPS2scyGA==} + '@esbuild/linux-riscv64@0.27.3': + resolution: {integrity: sha512-3Emwh0r5wmfm3ssTWRQSyVhbOHvqegUDRd0WhmXKX2mkHJe1SFCMJhagUleMq+Uci34wLSipf8Lagt4LlpRFWQ==} + engines: {node: '>=18'} cpu: [riscv64] os: [linux] - '@img/sharp-libvips-linux-s390x@1.2.4': - resolution: {integrity: sha512-qmp9VrzgPgMoGZyPvrQHqk02uyjA0/QrTO26Tqk6l4ZV0MPWIW6LTkqOIov+J1yEu7MbFQaDpwdwJKhbJvuRxQ==} + '@esbuild/linux-s390x@0.27.3': + resolution: {integrity: sha512-pBHUx9LzXWBc7MFIEEL0yD/ZVtNgLytvx60gES28GcWMqil8ElCYR4kvbV2BDqsHOvVDRrOxGySBM9Fcv744hw==} + engines: {node: '>=18'} cpu: [s390x] os: [linux] - '@img/sharp-libvips-linux-x64@1.2.4': - resolution: {integrity: sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==} + '@esbuild/linux-x64@0.27.3': + resolution: {integrity: sha512-Czi8yzXUWIQYAtL/2y6vogER8pvcsOsk5cpwL4Gk5nJqH5UZiVByIY8Eorm5R13gq+DQKYg0+JyQoytLQas4dA==} + engines: {node: '>=18'} cpu: [x64] os: [linux] - '@img/sharp-libvips-linuxmusl-arm64@1.2.4': - resolution: {integrity: 
sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==} + '@esbuild/netbsd-arm64@0.27.3': + resolution: {integrity: sha512-sDpk0RgmTCR/5HguIZa9n9u+HVKf40fbEUt+iTzSnCaGvY9kFP0YKBWZtJaraonFnqef5SlJ8/TiPAxzyS+UoA==} + engines: {node: '>=18'} cpu: [arm64] - os: [linux] + os: [netbsd] - '@img/sharp-libvips-linuxmusl-x64@1.2.4': - resolution: {integrity: sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==} + '@esbuild/netbsd-x64@0.27.3': + resolution: {integrity: sha512-P14lFKJl/DdaE00LItAukUdZO5iqNH7+PjoBm+fLQjtxfcfFE20Xf5CrLsmZdq5LFFZzb5JMZ9grUwvtVYzjiA==} + engines: {node: '>=18'} cpu: [x64] - os: [linux] + os: [netbsd] - '@img/sharp-linux-arm64@0.34.5': - resolution: {integrity: sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} + '@esbuild/openbsd-arm64@0.27.3': + resolution: {integrity: sha512-AIcMP77AvirGbRl/UZFTq5hjXK+2wC7qFRGoHSDrZ5v5b8DK/GYpXW3CPRL53NkvDqb9D+alBiC/dV0Fb7eJcw==} + engines: {node: '>=18'} cpu: [arm64] - os: [linux] + os: [openbsd] - '@img/sharp-linux-arm@0.34.5': - resolution: {integrity: sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} - cpu: [arm] - os: [linux] - - '@img/sharp-linux-ppc64@0.34.5': - resolution: {integrity: sha512-7zznwNaqW6YtsfrGGDA6BRkISKAAE1Jo0QdpNYXNMHu2+0dTrPflTLNkpc8l7MUP5M16ZJcUvysVWWrMefZquA==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} - cpu: [ppc64] - os: [linux] - - '@img/sharp-linux-riscv64@0.34.5': - resolution: {integrity: sha512-51gJuLPTKa7piYPaVs8GmByo7/U7/7TZOq+cnXJIHZKavIRHAP77e3N2HEl3dgiqdD/w0yUfiJnII77PuDDFdw==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} - cpu: [riscv64] - os: [linux] - - '@img/sharp-linux-s390x@0.34.5': - resolution: {integrity: 
sha512-nQtCk0PdKfho3eC5MrbQoigJ2gd1CgddUMkabUj+rBevs8tZ2cULOx46E7oyX+04WGfABgIwmMC0VqieTiR4jg==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} - cpu: [s390x] - os: [linux] - - '@img/sharp-linux-x64@0.34.5': - resolution: {integrity: sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} + '@esbuild/openbsd-x64@0.27.3': + resolution: {integrity: sha512-DnW2sRrBzA+YnE70LKqnM3P+z8vehfJWHXECbwBmH/CU51z6FiqTQTHFenPlHmo3a8UgpLyH3PT+87OViOh1AQ==} + engines: {node: '>=18'} cpu: [x64] - os: [linux] + os: [openbsd] - '@img/sharp-linuxmusl-arm64@0.34.5': - resolution: {integrity: sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} + '@esbuild/openharmony-arm64@0.27.3': + resolution: {integrity: sha512-NinAEgr/etERPTsZJ7aEZQvvg/A6IsZG/LgZy+81wON2huV7SrK3e63dU0XhyZP4RKGyTm7aOgmQk0bGp0fy2g==} + engines: {node: '>=18'} cpu: [arm64] - os: [linux] + os: [openharmony] - '@img/sharp-linuxmusl-x64@0.34.5': - resolution: {integrity: sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} + '@esbuild/sunos-x64@0.27.3': + resolution: {integrity: sha512-PanZ+nEz+eWoBJ8/f8HKxTTD172SKwdXebZ0ndd953gt1HRBbhMsaNqjTyYLGLPdoWHy4zLU7bDVJztF5f3BHA==} + engines: {node: '>=18'} cpu: [x64] - os: [linux] + os: [sunos] - '@img/sharp-wasm32@0.34.5': - resolution: {integrity: sha512-OdWTEiVkY2PHwqkbBI8frFxQQFekHaSSkUIJkwzclWZe64O1X4UlUjqqqLaPbUpMOQk6FBu/HtlGXNblIs0huw==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} - cpu: [wasm32] - - '@img/sharp-win32-arm64@0.34.5': - resolution: {integrity: sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} + '@esbuild/win32-arm64@0.27.3': + resolution: {integrity: 
sha512-B2t59lWWYrbRDw/tjiWOuzSsFh1Y/E95ofKz7rIVYSQkUYBjfSgf6oeYPNWHToFRr2zx52JKApIcAS/D5TUBnA==} + engines: {node: '>=18'} cpu: [arm64] os: [win32] - '@img/sharp-win32-ia32@0.34.5': - resolution: {integrity: sha512-FV9m/7NmeCmSHDD5j4+4pNI8Cp3aW+JvLoXcTUo0IqyjSfAZJ8dIUmijx1qaJsIiU+Hosw6xM5KijAWRJCSgNg==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} + '@esbuild/win32-ia32@0.27.3': + resolution: {integrity: sha512-QLKSFeXNS8+tHW7tZpMtjlNb7HKau0QDpwm49u0vUp9y1WOF+PEzkU84y9GqYaAVW8aH8f3GcBck26jh54cX4Q==} + engines: {node: '>=18'} cpu: [ia32] os: [win32] - '@img/sharp-win32-x64@0.34.5': - resolution: {integrity: sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} + '@esbuild/win32-x64@0.27.3': + resolution: {integrity: sha512-4uJGhsxuptu3OcpVAzli+/gWusVGwZZHTlS63hh++ehExkVT8SgiEf7/uC/PclrPPkLhZqGgCTjd0VWLo6xMqA==} + engines: {node: '>=18'} cpu: [x64] os: [win32] - '@onkernel/sdk@0.23.0': - resolution: {integrity: sha512-P/ez6HU8sO2QvqWATkvC+Wdv+fgto4KfBCHLl2T6EUpoU3LhgOZ/sJP2ZRf/vh5Vh7QR2Vf05RgMaFcIGBGD9Q==} - - '@types/node@22.19.3': - resolution: {integrity: sha512-1N9SBnWYOJTrNZCdh/yJE+t910Y128BoyY+zBLWhL3r0TYzlTmFdXrPwHL9DyFZmlEXNQQolTZh3KHV31QDhyA==} + '@onkernel/sdk@0.38.0': + resolution: {integrity: sha512-BwbC3OkUg9xhdTshyyUi7+vqwC6gjsHpfpFsDAlVe/rzzledBsL3Usf5rrYfk1Bpk72P+OfF2NtUt5HLaVrjvQ==} - detect-libc@2.1.2: - resolution: {integrity: sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==} - engines: {node: '>=8'} + '@types/node@22.19.11': + resolution: {integrity: sha512-BH7YwL6rA93ReqeQS1c4bsPpcfOmJasG+Fkr6Y59q83f9M1WcBRHR2vM+P9eOisYRcN3ujQoiZY8uk5W+1WL8w==} - dotenv@17.2.3: - resolution: {integrity: sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==} + dotenv@17.3.1: + resolution: {integrity: 
sha512-IO8C/dzEb6O3F9/twg6ZLXz164a2fhTnEWb95H23Dm4OuN+92NmEAlTrupP9VW6Jm3sO26tQlqyvyi4CsnY9GA==} engines: {node: '>=12'} - openai@6.13.0: - resolution: {integrity: sha512-yHbMo+EpNGPG3sRrXvmo0LhUPFN4bAURJw3G17bE+ax1G4tcTFCa9ZjvCWh3cvni0aHY0uWlk2IxcsPH4NR9Ow==} + esbuild@0.27.3: + resolution: {integrity: sha512-8VwMnyGCONIs6cWue2IdpHxHnAjzxnw2Zr7MkVxB2vjmQ2ivqGFb4LEG3SMnv0Gb2F/G/2yA8zUaiL1gywDCCg==} + engines: {node: '>=18'} + hasBin: true + + fsevents@2.3.3: + resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==} + engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} + os: [darwin] + + get-tsconfig@4.13.6: + resolution: {integrity: sha512-shZT/QMiSHc/YBLxxOkMtgSid5HFoauqCE3/exfsEcwg1WkeqjG+V40yBbBrsD+jW2HDXcs28xOfcbm2jI8Ddw==} + + openai@6.25.0: + resolution: {integrity: sha512-mEh6VZ2ds2AGGokWARo18aPISI1OhlgdEIC1ewhkZr8pSIT31dec0ecr9Nhxx0JlybyOgoAT1sWeKtwPZzJyww==} hasBin: true peerDependencies: ws: ^8.18.0 @@ -199,23 +221,14 @@ packages: zod: optional: true - playwright-core@1.57.0: - resolution: {integrity: sha512-agTcKlMw/mjBWOnD6kFZttAAGHgi/Nw0CZ2o6JqWSbMlI219lAFLZZCyqByTsvVAJq5XA5H8cA6PrvBRpBWEuQ==} - engines: {node: '>=18'} - hasBin: true + resolve-pkg-maps@1.0.0: + resolution: {integrity: sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==} - semver@7.7.3: - resolution: {integrity: sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==} - engines: {node: '>=10'} + tsx@4.21.0: + resolution: {integrity: sha512-5C1sg4USs1lfG0GFb2RLXsdpXqBSEhAaA/0kPL01wxzpMqLILNxIxIOKiILz+cdg/pLnOUxFYOR5yhHU666wbw==} + engines: {node: '>=18.0.0'} hasBin: true - sharp@0.34.5: - resolution: {integrity: sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg==} - engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} - - tslib@2.8.1: - resolution: {integrity: 
sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==} - typescript@5.9.3: resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==} engines: {node: '>=14.17'} @@ -226,156 +239,138 @@ packages: snapshots: - '@emnapi/runtime@1.7.1': - dependencies: - tslib: 2.8.1 + '@esbuild/aix-ppc64@0.27.3': optional: true - '@img/colour@1.0.0': {} + '@esbuild/android-arm64@0.27.3': + optional: true - '@img/sharp-darwin-arm64@0.34.5': - optionalDependencies: - '@img/sharp-libvips-darwin-arm64': 1.2.4 + '@esbuild/android-arm@0.27.3': optional: true - '@img/sharp-darwin-x64@0.34.5': - optionalDependencies: - '@img/sharp-libvips-darwin-x64': 1.2.4 + '@esbuild/android-x64@0.27.3': optional: true - '@img/sharp-libvips-darwin-arm64@1.2.4': + '@esbuild/darwin-arm64@0.27.3': optional: true - '@img/sharp-libvips-darwin-x64@1.2.4': + '@esbuild/darwin-x64@0.27.3': optional: true - '@img/sharp-libvips-linux-arm64@1.2.4': + '@esbuild/freebsd-arm64@0.27.3': optional: true - '@img/sharp-libvips-linux-arm@1.2.4': + '@esbuild/freebsd-x64@0.27.3': optional: true - '@img/sharp-libvips-linux-ppc64@1.2.4': + '@esbuild/linux-arm64@0.27.3': optional: true - '@img/sharp-libvips-linux-riscv64@1.2.4': + '@esbuild/linux-arm@0.27.3': optional: true - '@img/sharp-libvips-linux-s390x@1.2.4': + '@esbuild/linux-ia32@0.27.3': optional: true - '@img/sharp-libvips-linux-x64@1.2.4': + '@esbuild/linux-loong64@0.27.3': optional: true - '@img/sharp-libvips-linuxmusl-arm64@1.2.4': + '@esbuild/linux-mips64el@0.27.3': optional: true - '@img/sharp-libvips-linuxmusl-x64@1.2.4': + '@esbuild/linux-ppc64@0.27.3': optional: true - '@img/sharp-linux-arm64@0.34.5': - optionalDependencies: - '@img/sharp-libvips-linux-arm64': 1.2.4 + '@esbuild/linux-riscv64@0.27.3': optional: true - '@img/sharp-linux-arm@0.34.5': - optionalDependencies: - '@img/sharp-libvips-linux-arm': 1.2.4 + '@esbuild/linux-s390x@0.27.3': optional: true - 
'@img/sharp-linux-ppc64@0.34.5': - optionalDependencies: - '@img/sharp-libvips-linux-ppc64': 1.2.4 + '@esbuild/linux-x64@0.27.3': optional: true - '@img/sharp-linux-riscv64@0.34.5': - optionalDependencies: - '@img/sharp-libvips-linux-riscv64': 1.2.4 + '@esbuild/netbsd-arm64@0.27.3': optional: true - '@img/sharp-linux-s390x@0.34.5': - optionalDependencies: - '@img/sharp-libvips-linux-s390x': 1.2.4 + '@esbuild/netbsd-x64@0.27.3': optional: true - '@img/sharp-linux-x64@0.34.5': - optionalDependencies: - '@img/sharp-libvips-linux-x64': 1.2.4 + '@esbuild/openbsd-arm64@0.27.3': optional: true - '@img/sharp-linuxmusl-arm64@0.34.5': - optionalDependencies: - '@img/sharp-libvips-linuxmusl-arm64': 1.2.4 + '@esbuild/openbsd-x64@0.27.3': optional: true - '@img/sharp-linuxmusl-x64@0.34.5': - optionalDependencies: - '@img/sharp-libvips-linuxmusl-x64': 1.2.4 + '@esbuild/openharmony-arm64@0.27.3': optional: true - '@img/sharp-wasm32@0.34.5': - dependencies: - '@emnapi/runtime': 1.7.1 + '@esbuild/sunos-x64@0.27.3': optional: true - '@img/sharp-win32-arm64@0.34.5': + '@esbuild/win32-arm64@0.27.3': optional: true - '@img/sharp-win32-ia32@0.34.5': + '@esbuild/win32-ia32@0.27.3': optional: true - '@img/sharp-win32-x64@0.34.5': + '@esbuild/win32-x64@0.27.3': optional: true - '@onkernel/sdk@0.23.0': {} + '@onkernel/sdk@0.38.0': {} - '@types/node@22.19.3': + '@types/node@22.19.11': dependencies: undici-types: 6.21.0 - detect-libc@2.1.2: {} + dotenv@17.3.1: {} - dotenv@17.2.3: {} + esbuild@0.27.3: + optionalDependencies: + '@esbuild/aix-ppc64': 0.27.3 + '@esbuild/android-arm': 0.27.3 + '@esbuild/android-arm64': 0.27.3 + '@esbuild/android-x64': 0.27.3 + '@esbuild/darwin-arm64': 0.27.3 + '@esbuild/darwin-x64': 0.27.3 + '@esbuild/freebsd-arm64': 0.27.3 + '@esbuild/freebsd-x64': 0.27.3 + '@esbuild/linux-arm': 0.27.3 + '@esbuild/linux-arm64': 0.27.3 + '@esbuild/linux-ia32': 0.27.3 + '@esbuild/linux-loong64': 0.27.3 + '@esbuild/linux-mips64el': 0.27.3 + '@esbuild/linux-ppc64': 0.27.3 + 
'@esbuild/linux-riscv64': 0.27.3 + '@esbuild/linux-s390x': 0.27.3 + '@esbuild/linux-x64': 0.27.3 + '@esbuild/netbsd-arm64': 0.27.3 + '@esbuild/netbsd-x64': 0.27.3 + '@esbuild/openbsd-arm64': 0.27.3 + '@esbuild/openbsd-x64': 0.27.3 + '@esbuild/openharmony-arm64': 0.27.3 + '@esbuild/sunos-x64': 0.27.3 + '@esbuild/win32-arm64': 0.27.3 + '@esbuild/win32-ia32': 0.27.3 + '@esbuild/win32-x64': 0.27.3 + + fsevents@2.3.3: + optional: true - openai@6.13.0: {} + get-tsconfig@4.13.6: + dependencies: + resolve-pkg-maps: 1.0.0 - playwright-core@1.57.0: {} + openai@6.25.0: {} - semver@7.7.3: {} + resolve-pkg-maps@1.0.0: {} - sharp@0.34.5: + tsx@4.21.0: dependencies: - '@img/colour': 1.0.0 - detect-libc: 2.1.2 - semver: 7.7.3 + esbuild: 0.27.3 + get-tsconfig: 4.13.6 optionalDependencies: - '@img/sharp-darwin-arm64': 0.34.5 - '@img/sharp-darwin-x64': 0.34.5 - '@img/sharp-libvips-darwin-arm64': 1.2.4 - '@img/sharp-libvips-darwin-x64': 1.2.4 - '@img/sharp-libvips-linux-arm': 1.2.4 - '@img/sharp-libvips-linux-arm64': 1.2.4 - '@img/sharp-libvips-linux-ppc64': 1.2.4 - '@img/sharp-libvips-linux-riscv64': 1.2.4 - '@img/sharp-libvips-linux-s390x': 1.2.4 - '@img/sharp-libvips-linux-x64': 1.2.4 - '@img/sharp-libvips-linuxmusl-arm64': 1.2.4 - '@img/sharp-libvips-linuxmusl-x64': 1.2.4 - '@img/sharp-linux-arm': 0.34.5 - '@img/sharp-linux-arm64': 0.34.5 - '@img/sharp-linux-ppc64': 0.34.5 - '@img/sharp-linux-riscv64': 0.34.5 - '@img/sharp-linux-s390x': 0.34.5 - '@img/sharp-linux-x64': 0.34.5 - '@img/sharp-linuxmusl-arm64': 0.34.5 - '@img/sharp-linuxmusl-x64': 0.34.5 - '@img/sharp-wasm32': 0.34.5 - '@img/sharp-win32-arm64': 0.34.5 - '@img/sharp-win32-ia32': 0.34.5 - '@img/sharp-win32-x64': 0.34.5 - - tslib@2.8.1: - optional: true + fsevents: 2.3.3 typescript@5.9.3: {} diff --git a/pkg/templates/typescript/openai-computer-use/test.local.ts b/pkg/templates/typescript/openai-computer-use/test.local.ts index 23f9a5cc..90375999 100644 --- a/pkg/templates/typescript/openai-computer-use/test.local.ts +++ 
b/pkg/templates/typescript/openai-computer-use/test.local.ts @@ -1,49 +1,69 @@ import 'dotenv/config'; +import { Kernel } from '@onkernel/sdk'; import { Agent } from './lib/agent'; -import computers from './lib/computers'; +import { KernelComputer } from './lib/kernel-computer'; -/* - to run a local browser test before deploying to kernel -*/ +/** + * Local test script that creates a remote Kernel browser and runs the CUA agent. + * No Kernel app deployment needed. + * + * Usage: + * KERNEL_API_KEY=... OPENAI_API_KEY=... npx tsx test.local.ts + */ async function test(): Promise { - const { computer } = await computers.create({ type: 'local' }); - const agent = new Agent({ - model: 'computer-use-preview', - computer, - tools: [], - acknowledge_safety_check_callback: (m: string): boolean => { - console.log(`> safety check: ${m}`); - return true; - }, - }); - - // run agent and get response - const logs = await agent.runFullTurn({ - messages: [ - { - role: 'system', - content: `- Current date and time: ${new Date().toISOString()} (${new Date().toLocaleDateString( - 'en-US', - { weekday: 'long' }, - )})`, - }, - { - type: 'message', - role: 'user', - content: [ - { - type: 'input_text', - text: 'go to ebay.com and look up oberheim ob-x prices and give me a report', - }, - ], + if (!process.env.KERNEL_API_KEY) throw new Error('KERNEL_API_KEY is not set'); + if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set'); + + const client = new Kernel({ apiKey: process.env.KERNEL_API_KEY }); + const browser = await client.browsers.create({ timeout_seconds: 300 }); + console.log('> Browser session:', browser.session_id); + console.log('> Live view:', browser.browser_live_view_url); + + const computer = new KernelComputer(client, browser.session_id); + + try { + await computer.goto('https://duckduckgo.com'); + + const agent = new Agent({ + model: 'computer-use-preview', + computer, + tools: [], + acknowledge_safety_check_callback: (m: string): boolean => { + 
console.log(`> safety check: ${m}`); + return true; }, - ], - print_steps: true, - debug: true, - show_images: false, - }); - console.dir(logs, { depth: null }); + }); + + const logs = await agent.runFullTurn({ + messages: [ + { + role: 'system', + content: `- Current date and time: ${new Date().toISOString()} (${new Date().toLocaleDateString( + 'en-US', + { weekday: 'long' }, + )})`, + }, + { + type: 'message', + role: 'user', + content: [ + { + type: 'input_text', + text: 'go to ebay.com and look up oberheim ob-x prices and give me a report', + }, + ], + }, + ], + print_steps: true, + debug: true, + show_images: false, + }); + console.dir(logs, { depth: null }); + } finally { + await client.browsers.deleteByID(browser.session_id); + console.log('> Browser session deleted'); + } } test(); From dcb16c73bf11aa32cf632bd36ba52f6b03eb9aad Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Tue, 3 Mar 2026 12:00:00 -0500 Subject: [PATCH 02/17] Improve OpenAI CUA template logging ergonomics and local runner parity. This adds CUA-style backend/action event rendering (with JSONL mode support), aligns dotenv/local-run behavior across TypeScript and Python templates, and renames local entry scripts to run_local for clearer usage. 
Made-with: Cursor --- .../python/openai-computer-use/README.md | 8 +- .../python/openai-computer-use/agent/agent.py | 203 ++++++++++++-- .../openai-computer-use/agent/logging.py | 265 ++++++++++++++++++ .../computers/kernel_computer.py | 195 +++++++++++-- .../python/openai-computer-use/main.py | 66 ++++- .../python/openai-computer-use/run_local.py | 129 +++++++++ .../python/openai-computer-use/test_local.py | 70 ----- .../typescript/openai-computer-use/README.md | 10 + .../typescript/openai-computer-use/index.ts | 60 +++- .../openai-computer-use/lib/agent.ts | 159 +++++++++-- .../lib/kernel-computer.ts | 134 +++++++-- .../openai-computer-use/lib/log-events.ts | 73 +++++ .../openai-computer-use/lib/logging.ts | 218 ++++++++++++++ .../openai-computer-use/lib/utils.ts | 5 +- .../openai-computer-use/package.json | 2 +- .../openai-computer-use/run_local.ts | 123 ++++++++ .../openai-computer-use/test.local.ts | 69 ----- 17 files changed, 1532 insertions(+), 257 deletions(-) create mode 100644 pkg/templates/python/openai-computer-use/agent/logging.py create mode 100644 pkg/templates/python/openai-computer-use/run_local.py delete mode 100644 pkg/templates/python/openai-computer-use/test_local.py create mode 100644 pkg/templates/typescript/openai-computer-use/lib/log-events.ts create mode 100644 pkg/templates/typescript/openai-computer-use/lib/logging.ts create mode 100644 pkg/templates/typescript/openai-computer-use/run_local.ts delete mode 100644 pkg/templates/typescript/openai-computer-use/test.local.ts diff --git a/pkg/templates/python/openai-computer-use/README.md b/pkg/templates/python/openai-computer-use/README.md index f0227f8f..6c4d6f1a 100644 --- a/pkg/templates/python/openai-computer-use/README.md +++ b/pkg/templates/python/openai-computer-use/README.md @@ -11,14 +11,20 @@ You can test against a remote Kernel browser without deploying: ```bash cp .env.example .env # Fill in OPENAI_API_KEY and KERNEL_API_KEY in .env -uv run test_local.py +uv run run_local.py +# 
JSONL event output +uv run run_local.py --output jsonl ``` +The local runner defaults to concise CUA-style logs (`text`), including `kernel>` backend SDK call lines with elapsed timing and `agent>` model output lines. Use `--output jsonl` for one structured event per line (including backend events). Add `--debug` to include verbose in-flight events. + ## Deploy to Kernel ```bash kernel deploy main.py --env-file .env kernel invoke python-openai-cua cua-task -p '{"task":"go to https://news.ycombinator.com and list top 5 articles"}' +# JSONL logs for invocation +kernel invoke python-openai-cua cua-task -p '{"task":"go to https://news.ycombinator.com and list top 5 articles","output":"jsonl"}' ``` See the [docs](https://www.kernel.sh/docs/quickstart) for more information. diff --git a/pkg/templates/python/openai-computer-use/agent/agent.py b/pkg/templates/python/openai-computer-use/agent/agent.py index 4a6dc5d1..b6d71ef8 100644 --- a/pkg/templates/python/openai-computer-use/agent/agent.py +++ b/pkg/templates/python/openai-computer-use/agent/agent.py @@ -1,5 +1,6 @@ import json -from typing import Callable +import time +from typing import Any, Callable from computers.kernel_computer import KernelComputer from utils import ( create_response, @@ -80,13 +81,15 @@ def __init__( self.print_steps = True self.debug = False self.show_images = False + self.on_event: Callable[[dict], None] | None = None + self._model_request_started_at: float | None = None self.acknowledge_safety_check_callback = acknowledge_safety_check_callback if computer: dimensions = computer.get_dimensions() self.tools += [ { - "type": "computer-preview", + "type": "computer_use_preview", "display_width": dimensions[0], "display_height": dimensions[1], "environment": computer.get_environment(), @@ -126,6 +129,86 @@ def debug_print(self, *args): if self.debug: pp(*args) + def _emit_event(self, event: str, data: dict | None = None) -> None: + if self.print_steps and self.on_event: + self.on_event({"event": 
event, "data": data or {}}) + + def _current_model_elapsed_ms(self) -> int | None: + if self._model_request_started_at is None: + return None + return int((time.time() - self._model_request_started_at) * 1000) + + def _extract_reasoning_text(self, item: dict[str, Any]) -> str: + summary = item.get("summary") + if not isinstance(summary, list): + return "" + pieces: list[str] = [] + for part in summary: + if not isinstance(part, dict): + continue + text = part.get("text") + if isinstance(text, str) and text: + pieces.append(text) + return " ".join(pieces).strip() + + def _extract_prompt_text(self, item: dict[str, Any]) -> str | None: + if item.get("role") != "user": + return None + content = item.get("content") + if isinstance(content, str): + return content + if not isinstance(content, list): + return None + parts: list[str] = [] + for entry in content: + if not isinstance(entry, dict): + continue + text = entry.get("text") + if isinstance(text, str) and text: + parts.append(text) + return " ".join(parts) if parts else None + + def _describe_action(self, action_type: str, action_args: dict[str, Any]) -> str: + if action_type == "click": + x = int(action_args.get("x", 0)) + y = int(action_args.get("y", 0)) + button = action_args.get("button", "left") + if button in ("", "left"): + return f"click({x}, {y})" + return f"click({x}, {y}, {button})" + if action_type == "double_click": + return f"double_click({int(action_args.get('x', 0))}, {int(action_args.get('y', 0))})" + if action_type == "type": + text = str(action_args.get("text", "")) + if len(text) > 60: + text = f"{text[:57]}..." 
+ return f"type({text!r})" + if action_type == "keypress": + keys = action_args.get("keys", []) + return f"keypress({keys})" + if action_type == "scroll": + return ( + f"scroll({int(action_args.get('x', 0))}, {int(action_args.get('y', 0))}, " + f"dx={int(action_args.get('scroll_x', 0))}, dy={int(action_args.get('scroll_y', 0))})" + ) + if action_type == "move": + return f"move({int(action_args.get('x', 0))}, {int(action_args.get('y', 0))})" + if action_type == "drag": + return "drag(...)" + if action_type == "wait": + return f"wait({int(action_args.get('ms', 1000))}ms)" + if action_type == "screenshot": + return "screenshot()" + return action_type + + def _describe_batch_actions(self, actions: list[dict[str, Any]]) -> str: + pieces: list[str] = [] + for action in actions: + action_type = str(action.get("type", "unknown")) + action_args = {k: v for k, v in action.items() if k != "type"} + pieces.append(self._describe_action(action_type, action_args)) + return "batch[" + " -> ".join(pieces) + "]" + def _execute_computer_action(self, action_type, action_args): if action_type == "click": self.computer.click(**action_args) @@ -150,14 +233,50 @@ def _execute_computer_action(self, action_type, action_args): def handle_item(self, item): """Handle each item; may cause a computer action + screenshot.""" + if item["type"] == "reasoning": + text = self._extract_reasoning_text(item) + if text: + self._emit_event("reasoning_delta", {"text": text}) + if item["type"] == "message": - if self.print_steps: - print(item["content"][0]["text"]) + if item.get("role") == "assistant": + content = item.get("content", []) + if isinstance(content, list): + for part in content: + if isinstance(part, dict) and isinstance(part.get("text"), str): + self._emit_event("text_delta", {"text": part["text"]}) + self._emit_event("text_done", {}) if item["type"] == "function_call": name, args = item["name"], json.loads(item["arguments"]) - if self.print_steps: - print(f"{name}({args})") + elapsed_ms = 
self._current_model_elapsed_ms() + if name == BATCH_FUNC_NAME: + actions = args.get("actions", []) + if isinstance(actions, list): + typed_actions = [a for a in actions if isinstance(a, dict)] + payload = { + "action_type": "batch", + "description": self._describe_batch_actions(typed_actions), + "action": {"type": "batch", "actions": typed_actions}, + } + if elapsed_ms is not None: + payload["elapsed_ms"] = elapsed_ms + self._emit_event( + "action", + payload, + ) + else: + payload = { + "action_type": name, + "description": f"{name}({json.dumps(args)})", + "action": args, + } + if elapsed_ms is not None: + payload["elapsed_ms"] = elapsed_ms + self._emit_event( + "action", + payload, + ) if name == BATCH_FUNC_NAME: return self._handle_batch_call(item["call_id"], args) @@ -177,12 +296,26 @@ def handle_item(self, item): action = item["action"] action_type = action["type"] action_args = {k: v for k, v in action.items() if k != "type"} - if self.print_steps: - print(f"{action_type}({action_args})") + elapsed_ms = self._current_model_elapsed_ms() + payload = { + "action_type": action_type, + "description": self._describe_action(action_type, action_args), + "action": action, + } + if elapsed_ms is not None: + payload["elapsed_ms"] = elapsed_ms + self._emit_event( + "action", + payload, + ) self._execute_computer_action(action_type, action_args) screenshot_base64 = self.computer.screenshot() + self._emit_event( + "screenshot", + {"captured": True, "bytes_base64": len(screenshot_base64)}, + ) if self.show_images: show_image(screenshot_base64) @@ -228,31 +361,55 @@ def _handle_batch_call(self, call_id, args): ] def run_full_turn( - self, input_items, print_steps=True, debug=False, show_images=False + self, + input_items, + print_steps=True, + debug=False, + show_images=False, + on_event: Callable[[dict], None] | None = None, ): self.print_steps = print_steps self.debug = debug self.show_images = show_images + self.on_event = on_event new_items = [] + turns = 0 - while 
new_items[-1].get("role") != "assistant" if new_items else True: - self.debug_print([sanitize_message(msg) for msg in input_items + new_items]) + for message in input_items: + if isinstance(message, dict): + prompt = self._extract_prompt_text(message) + if prompt: + self._emit_event("prompt", {"text": prompt}) - response = create_response( - model=self.model, - input=input_items + new_items, - tools=self.tools, - truncation="auto", - instructions=BATCH_INSTRUCTIONS, - ) - self.debug_print(response) + try: + while new_items[-1].get("role") != "assistant" if new_items else True: + turns += 1 + self.debug_print([sanitize_message(msg) for msg in input_items + new_items]) + + self._model_request_started_at = time.time() + response = create_response( + model=self.model, + input=input_items + new_items, + tools=self.tools, + truncation="auto", + instructions=BATCH_INSTRUCTIONS, + ) + self.debug_print(response) + + if "output" not in response: + if self.debug: + print(response) + raise ValueError("No output from model") - if "output" not in response and self.debug: - print(response) - raise ValueError("No output from model") - else: new_items += response["output"] for item in response["output"]: new_items += self.handle_item(item) + self._model_request_started_at = None + self._emit_event("turn_done", {"turn": turns}) + except Exception as exc: + self._model_request_started_at = None + self._emit_event("error", {"message": str(exc)}) + raise + self._emit_event("run_complete", {"turns": turns}) return new_items diff --git a/pkg/templates/python/openai-computer-use/agent/logging.py b/pkg/templates/python/openai-computer-use/agent/logging.py new file mode 100644 index 00000000..31edc122 --- /dev/null +++ b/pkg/templates/python/openai-computer-use/agent/logging.py @@ -0,0 +1,265 @@ +import json +import sys +import threading +import time +from datetime import datetime +from typing import Callable, Literal + +OutputMode = Literal["text", "jsonl"] +MAX_LINE_WIDTH = 120 + + +def 
_timestamp() -> str: + return datetime.now().strftime("%H:%M:%S.%f")[:-3] + + +def _truncate_one_line(text: str, max_len: int = 90) -> str: + one_line = " ".join(text.split()) + if len(one_line) <= max_len: + return one_line + return f"{one_line[: max_len - 3]}..." + + +class _ThinkingSpinner: + def __init__(self, enabled: bool): + self.enabled = enabled + self.active = False + self.frame = 0 + self.start_at = 0.0 + self.start_ts = "" + self.reasoning = "" + self._thread: threading.Thread | None = None + self._stop_event = threading.Event() + self._lock = threading.Lock() + + def start(self) -> None: + if not self.enabled: + return + with self._lock: + if self.active: + return + self.active = True + self.frame = 0 + self.reasoning = "" + self.start_at = time.time() + self.start_ts = _timestamp() + self._stop_event.clear() + self._thread = threading.Thread(target=self._run, daemon=True) + self._thread.start() + + def add_reasoning(self, text: str) -> None: + with self._lock: + if not self.active: + return + self.reasoning += text + + def stop(self, action: str | None = None, elapsed_seconds: float | None = None) -> None: + with self._lock: + if not self.active: + if action: + elapsed_prefix = ( + f"[{elapsed_seconds:.3f}s] " + if isinstance(elapsed_seconds, (int, float)) + else "" + ) + sys.stdout.write(f"{_timestamp()} agent> {elapsed_prefix}{action}\n") + sys.stdout.flush() + return + self.active = False + self._stop_event.set() + elapsed = ( + float(elapsed_seconds) + if isinstance(elapsed_seconds, (int, float)) + else (time.time() - self.start_at) + ) + elapsed_text = f"{elapsed:.3f}s" + if self.reasoning.strip(): + reasoning = _truncate_one_line(self.reasoning, 70) + suffix = f" -> {action}" if action else "" + sys.stdout.write( + f"\r\033[2K{self.start_ts} agent> [{elapsed_text}] {reasoning}{suffix}\n" + ) + elif action: + sys.stdout.write( + f"\r\033[2K{self.start_ts} agent> [{elapsed_text}] {action}\n" + ) + else: + sys.stdout.write( + 
f"\r\033[2K{self.start_ts} agent> [{elapsed_text}] thinking...\n" + ) + sys.stdout.flush() + + def _run(self) -> None: + while not self._stop_event.wait(0.1): + with self._lock: + if not self.active: + return + self.frame += 1 + elapsed = time.time() - self.start_at + elapsed_text = f"{elapsed:.3f}s" + if self.reasoning.strip(): + prefix = f"{self.start_ts} agent> [{elapsed_text}] " + max_text = max(20, MAX_LINE_WIDTH - len(prefix)) + reasoning = _truncate_one_line(self.reasoning, max_text) + sys.stdout.write(f"\r\033[2K{prefix}{reasoning}") + else: + dots = "." * ((self.frame % 3) + 1) + dots = f"{dots:<3}" + sys.stdout.write( + f"\r\033[2K{self.start_ts} agent> [{elapsed_text}] thinking{dots}" + ) + sys.stdout.flush() + + +def create_event_logger( + output: OutputMode = "text", verbose: bool = False +) -> Callable[[dict], None]: + if output == "jsonl": + def render_jsonl(event: dict) -> None: + payload = {"event": event.get("event"), "data": event.get("data", {})} + sys.stdout.write(json.dumps(payload, default=str) + "\n") + sys.stdout.flush() + + return render_jsonl + + spinner = _ThinkingSpinner(sys.stdout.isatty()) + in_text = False + + def render_text(event: dict) -> None: + nonlocal in_text + + event_name = event.get("event", "") + data = event.get("data", {}) + if not isinstance(data, dict): + data = {} + + if event_name == "session_state": + live_url = data.get("live_view_url") + if isinstance(live_url, str) and live_url: + sys.stdout.write(f"{_timestamp()} kernel> live view: {live_url}\n") + sys.stdout.flush() + return + + if event_name == "backend": + op = data.get("op") + if not isinstance(op, str) or not op: + return + + if in_text: + sys.stdout.write("\n") + sys.stdout.flush() + in_text = False + + if op == "live_url": + detail = data.get("detail") + if isinstance(detail, str) and detail: + sys.stdout.write(f"{_timestamp()} kernel> live view: {detail}\n") + sys.stdout.flush() + return + + if op.endswith(".done"): + base_op = op[: -len(".done")] + if 
base_op.startswith("get_current_url") and not verbose: + return + detail = data.get("detail") + detail_text = detail if isinstance(detail, str) else "" + elapsed_ms = data.get("elapsed_ms") + elapsed_prefix = "" + if isinstance(elapsed_ms, (int, float)) and not isinstance(elapsed_ms, bool): + elapsed_prefix = f"[{float(elapsed_ms) / 1000:.3f}s] " + suffix = f" {detail_text}" if detail_text else "" + sys.stdout.write( + f"{_timestamp()} kernel> {elapsed_prefix}{base_op}{suffix}\n" + ) + sys.stdout.flush() + return + + if verbose: + sys.stdout.write(f"{_timestamp()} kernel> {op}\n") + sys.stdout.flush() + return + + if event_name == "prompt": + text = data.get("text") + if isinstance(text, str) and text: + sys.stdout.write(f"{_timestamp()} user> {text}\n") + sys.stdout.flush() + return + + if event_name == "reasoning_delta": + text = data.get("text") + if not isinstance(text, str): + return + if sys.stdout.isatty(): + spinner.start() + spinner.add_reasoning(text) + elif verbose and text: + sys.stdout.write( + f"{_timestamp()} agent> thinking: {_truncate_one_line(text)}\n" + ) + sys.stdout.flush() + return + + if event_name == "text_delta": + spinner.stop() + text = data.get("text") + if not isinstance(text, str) or not text: + return + if not in_text: + sys.stdout.write(f"{_timestamp()} agent> ") + in_text = True + sys.stdout.write(text) + sys.stdout.flush() + return + + if event_name == "text_done": + if in_text: + sys.stdout.write("\n") + sys.stdout.flush() + in_text = False + return + + if event_name == "action": + action_type = data.get("action_type") + description = data.get("description") + if not isinstance(description, str) or not description: + description = action_type if isinstance(action_type, str) else "action" + elapsed_ms = data.get("elapsed_ms") + elapsed_seconds = ( + float(elapsed_ms) / 1000 + if isinstance(elapsed_ms, (int, float)) and not isinstance(elapsed_ms, bool) + else None + ) + if in_text: + sys.stdout.write("\n") + in_text = False + 
spinner.stop(description, elapsed_seconds=elapsed_seconds) + return + + if event_name == "screenshot": + if verbose: + sys.stdout.write(f"{_timestamp()} debug> screenshot captured\n") + sys.stdout.flush() + return + + if event_name in ("turn_done", "run_complete"): + spinner.stop() + if in_text: + sys.stdout.write("\n") + sys.stdout.flush() + in_text = False + return + + if event_name == "error": + spinner.stop() + if in_text: + sys.stdout.write("\n") + sys.stdout.flush() + in_text = False + message = data.get("message") + if not isinstance(message, str) or not message: + message = "unknown error" + sys.stderr.write(f"{_timestamp()} error> {message}\n") + sys.stderr.flush() + + return render_text diff --git a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py index 1c2e1936..623e37fa 100644 --- a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py +++ b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py @@ -1,7 +1,7 @@ import base64 import json import time -from typing import List, Dict, Any +from typing import List, Dict, Any, Callable from kernel import Kernel @@ -116,12 +116,62 @@ def _translate_cua_action(action: Dict[str, Any]) -> Dict[str, Any]: raise ValueError(f"Unknown CUA action type: {action_type}") +def _truncate(text: str, max_len: int = 60) -> str: + if len(text) <= max_len: + return text + return f"{text[: max_len - 3]}..." 
+ + +def _describe_action(action_type: str, action_args: Dict[str, Any]) -> str: + if action_type == "click": + x = int(action_args.get("x", 0)) + y = int(action_args.get("y", 0)) + button = str(action_args.get("button", "left")) + if button in ("", "left"): + return f"click({x}, {y})" + return f"click({x}, {y}, {button})" + if action_type == "double_click": + return f"double_click({int(action_args.get('x', 0))}, {int(action_args.get('y', 0))})" + if action_type == "type": + text = _truncate(str(action_args.get("text", ""))) + return f"type({text!r})" + if action_type == "keypress": + return f"keypress({action_args.get('keys', [])})" + if action_type == "scroll": + return ( + f"scroll({int(action_args.get('x', 0))}, {int(action_args.get('y', 0))}, " + f"dx={int(action_args.get('scroll_x', 0))}, dy={int(action_args.get('scroll_y', 0))})" + ) + if action_type == "move": + return f"move({int(action_args.get('x', 0))}, {int(action_args.get('y', 0))})" + if action_type == "drag": + return "drag(...)" + if action_type == "wait": + return f"wait({int(action_args.get('ms', 1000))}ms)" + return action_type + + +def _describe_batch_actions(actions: List[Dict[str, Any]]) -> str: + pieces = [] + for action in actions: + action_type = str(action.get("type", "unknown")) + action_args = {k: v for k, v in action.items() if k != "type"} + pieces.append(_describe_action(action_type, action_args)) + return "batch[" + " -> ".join(pieces) + "]" + + class KernelComputer: """Wraps Kernel's native computer control API for browser automation.""" - def __init__(self, client: Kernel, session_id: str): + def __init__( + self, + client: Kernel, + session_id: str, + on_event: Callable[[dict], None] | None = None, + ): self.client = client self.session_id = session_id + self.on_event = on_event def get_environment(self): return "browser" @@ -129,50 +179,155 @@ def get_environment(self): def get_dimensions(self): return (1024, 768) + def _emit_backend( + self, op: str, detail: str | None = None, 
elapsed_ms: int | None = None + ) -> None: + if not self.on_event: + return + data: Dict[str, Any] = {"op": op} + if detail: + data["detail"] = detail + if elapsed_ms is not None: + data["elapsed_ms"] = elapsed_ms + self.on_event({"event": "backend", "data": data}) + + def _trace_backend( + self, + op: str, + fn: Callable[[], Any], + detail: str | Callable[[Any], str | None] | None = None, + ) -> Any: + self._emit_backend(op) + started_at = time.time() + completed = False + result = None + try: + result = fn() + completed = True + return result + finally: + resolved_detail = None + if completed: + if callable(detail): + try: + resolved_detail = detail(result) + except Exception: + resolved_detail = None + elif isinstance(detail, str): + resolved_detail = detail + elapsed_ms = int((time.time() - started_at) * 1000) + self._emit_backend(f"{op}.done", resolved_detail, elapsed_ms) + def screenshot(self) -> str: - resp = self.client.browsers.computer.capture_screenshot(self.session_id) - return base64.b64encode(resp.read()).decode("utf-8") + def _do() -> str: + resp = self.client.browsers.computer.capture_screenshot(self.session_id) + return base64.b64encode(resp.read()).decode("utf-8") + + return self._trace_backend("screenshot", _do) def click(self, x: int, y: int, button="left") -> None: - self.client.browsers.computer.click_mouse(self.session_id, x=x, y=y, button=_normalize_button(button)) + normalized_button = _normalize_button(button) + op = _describe_action("click", {"x": x, "y": y, "button": normalized_button}) + self._trace_backend( + op, + lambda: self.client.browsers.computer.click_mouse( + self.session_id, x=x, y=y, button=normalized_button + ), + ) def double_click(self, x: int, y: int) -> None: - self.client.browsers.computer.click_mouse(self.session_id, x=x, y=y, num_clicks=2) + op = _describe_action("double_click", {"x": x, "y": y}) + self._trace_backend( + op, + lambda: self.client.browsers.computer.click_mouse( + self.session_id, x=x, y=y, num_clicks=2 
+ ), + ) def type(self, text: str) -> None: - self.client.browsers.computer.type_text(self.session_id, text=text) + op = _describe_action("type", {"text": text}) + self._trace_backend( + op, lambda: self.client.browsers.computer.type_text(self.session_id, text=text) + ) def keypress(self, keys: List[str]) -> None: - self.client.browsers.computer.press_key(self.session_id, keys=_translate_keys(keys)) + translated_keys = _translate_keys(keys) + op = _describe_action("keypress", {"keys": translated_keys}) + self._trace_backend( + op, + lambda: self.client.browsers.computer.press_key( + self.session_id, keys=translated_keys + ), + ) def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: - self.client.browsers.computer.scroll(self.session_id, x=x, y=y, delta_x=scroll_x, delta_y=scroll_y) + op = _describe_action( + "scroll", {"x": x, "y": y, "scroll_x": scroll_x, "scroll_y": scroll_y} + ) + self._trace_backend( + op, + lambda: self.client.browsers.computer.scroll( + self.session_id, x=x, y=y, delta_x=scroll_x, delta_y=scroll_y + ), + ) def move(self, x: int, y: int) -> None: - self.client.browsers.computer.move_mouse(self.session_id, x=x, y=y) + op = _describe_action("move", {"x": x, "y": y}) + self._trace_backend( + op, lambda: self.client.browsers.computer.move_mouse(self.session_id, x=x, y=y) + ) def drag(self, path: List[Dict[str, int]]) -> None: - p = [[pt["x"], pt["y"]] for pt in path] - self.client.browsers.computer.drag_mouse(self.session_id, path=p) + op = _describe_action("drag", {"path": path}) + + def _do() -> None: + p = [[pt["x"], pt["y"]] for pt in path] + self.client.browsers.computer.drag_mouse(self.session_id, path=p) + + self._trace_backend(op, _do) def wait(self, ms: int = 1000) -> None: time.sleep(ms / 1000) def batch_actions(self, actions: List[Dict[str, Any]]) -> None: - translated = [_translate_cua_action(a) for a in actions] - self.client.browsers.computer.batch(self.session_id, actions=translated) + op = 
_describe_batch_actions(actions) + + def _do() -> None: + translated = [_translate_cua_action(a) for a in actions] + self.client.browsers.computer.batch(self.session_id, actions=translated) + + self._trace_backend(op, _do) def goto(self, url: str) -> None: - self.client.browsers.playwright.execute( - self.session_id, code=f"await page.goto({json.dumps(url)})" + op = f"goto({json.dumps(url)})" + self._trace_backend( + op, + lambda: self.client.browsers.playwright.execute( + self.session_id, code=f"await page.goto({json.dumps(url)})" + ), ) def back(self) -> None: - self.client.browsers.playwright.execute(self.session_id, code="await page.goBack()") + self._trace_backend( + "back()", + lambda: self.client.browsers.playwright.execute( + self.session_id, code="await page.goBack()" + ), + ) def forward(self) -> None: - self.client.browsers.playwright.execute(self.session_id, code="await page.goForward()") + self._trace_backend( + "forward()", + lambda: self.client.browsers.playwright.execute( + self.session_id, code="await page.goForward()" + ), + ) def get_current_url(self) -> str: - result = self.client.browsers.playwright.execute(self.session_id, code="return page.url()") - return result.result if result.result else "" + def _do() -> str: + result = self.client.browsers.playwright.execute( + self.session_id, code="return page.url()" + ) + return result.result if result.result else "" + + return self._trace_backend("get_current_url()", _do) diff --git a/pkg/templates/python/openai-computer-use/main.py b/pkg/templates/python/openai-computer-use/main.py index 77c6964b..675139b4 100644 --- a/pkg/templates/python/openai-computer-use/main.py +++ b/pkg/templates/python/openai-computer-use/main.py @@ -1,10 +1,13 @@ import asyncio import datetime import os -from typing import TypedDict +import subprocess +import sys +from typing import NotRequired, TypedDict import kernel from agent import Agent +from agent.logging import create_event_logger from computers.kernel_computer 
import KernelComputer from kernel import Kernel @@ -24,6 +27,7 @@ class CuaInput(TypedDict): task: str + output: NotRequired[str] class CuaOutput(TypedDict): @@ -46,13 +50,41 @@ async def cua_task( if not payload or not payload.get("task"): raise ValueError("task is required") + output_mode = payload.get("output", "text") + if output_mode not in ("text", "jsonl"): + output_mode = "text" + on_event = create_event_logger(output=output_mode) + + browser_create_started_at = datetime.datetime.now() + on_event({"event": "backend", "data": {"op": "browsers.new"}}) kernel_browser = await asyncio.to_thread( client.browsers.create, invocation_id=ctx.invocation_id, stealth=True ) - print("Kernel browser live view url: ", kernel_browser.browser_live_view_url) + on_event( + { + "event": "backend", + "data": { + "op": "browsers.new.done", + "detail": kernel_browser.browser_live_view_url or "", + "elapsed_ms": int( + (datetime.datetime.now() - browser_create_started_at).total_seconds() + * 1000 + ), + }, + } + ) + on_event( + { + "event": "session_state", + "data": { + "session_id": kernel_browser.session_id, + "live_view_url": kernel_browser.browser_live_view_url or "", + }, + } + ) def run_agent(): - computer = KernelComputer(client, kernel_browser.session_id) + computer = KernelComputer(client, kernel_browser.session_id, on_event=on_event) computer.goto("https://duckduckgo.com") items = [ @@ -74,8 +106,9 @@ def run_agent(): response_items = agent.run_full_turn( items, - debug=True, + debug=False, show_images=False, + on_event=on_event, ) if not response_items or "content" not in response_items[-1]: @@ -97,4 +130,27 @@ def run_agent(): try: return await asyncio.to_thread(run_agent) finally: - await asyncio.to_thread(client.browsers.delete_by_id, kernel_browser.session_id) + browser_delete_started_at = datetime.datetime.now() + on_event({"event": "backend", "data": {"op": "browsers.delete"}}) + try: + await asyncio.to_thread(client.browsers.delete_by_id, 
def parse_args(argv=None):
    """Parse the local runner's command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) parses
            ``sys.argv[1:]``.

    Returns:
        Namespace with ``output`` (``"text"`` or ``"jsonl"``) and ``debug``.
    """
    parser = argparse.ArgumentParser(description="Run OpenAI CUA local test")
    parser.add_argument(
        "--output",
        choices=["text", "jsonl"],
        default="text",
        help="Log output mode",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable verbose debug payload logging",
    )
    return parser.parse_args(argv)


def main():
    """Create a remote Kernel browser, run one agent turn, then delete it.

    Raises:
        ValueError: If ``KERNEL_API_KEY`` or ``OPENAI_API_KEY`` is unset, or
            the agent returns no response items.
    """
    # Local import: this module's import block does not bring in `time`.
    import time

    def elapsed_ms_since(started_at: float) -> int:
        """Milliseconds elapsed since a ``time.monotonic()`` reading."""
        return int((time.monotonic() - started_at) * 1000)

    args = parse_args()
    if not os.getenv("KERNEL_API_KEY"):
        raise ValueError("KERNEL_API_KEY is not set")
    if not os.getenv("OPENAI_API_KEY"):
        raise ValueError("OPENAI_API_KEY is not set")

    client = Kernel(api_key=os.getenv("KERNEL_API_KEY"))
    on_event = create_event_logger(output=args.output, verbose=args.debug)

    browser_create_started_at = time.monotonic()
    on_event({"event": "backend", "data": {"op": "browsers.new"}})
    browser = client.browsers.create(timeout_seconds=300)
    on_event(
        {
            "event": "backend",
            "data": {
                "op": "browsers.new.done",
                "detail": browser.browser_live_view_url or "",
                "elapsed_ms": elapsed_ms_since(browser_create_started_at),
            },
        }
    )
    on_event(
        {
            "event": "session_state",
            "data": {
                "session_id": browser.session_id,
                "live_view_url": browser.browser_live_view_url or "",
            },
        }
    )

    try:
        # Built inside the try so the browser is deleted even if setup fails.
        computer = KernelComputer(client, browser.session_id, on_event=on_event)
        computer.goto("https://duckduckgo.com")

        # One reading for both fields, so date and weekday cannot disagree
        # across midnight. tzinfo is dropped to keep the naive ISO format
        # that the deprecated utcnow() produced.
        now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
        items = [
            {
                "role": "system",
                "content": f"- Current date and time: {now_utc.isoformat()} ({now_utc.strftime('%A')})",
            },
            {
                "role": "user",
                "content": "go to ebay.com and look up oberheim ob-x prices and give me a report",
            },
        ]

        agent = Agent(
            computer=computer,
            tools=[],
            # print() returns None, so `or True` acknowledges every check
            # after logging it.
            acknowledge_safety_check_callback=lambda message: (
                print(f"> safety check: {message}") or True
            ),
        )

        response_items = agent.run_full_turn(
            items,
            debug=args.debug,
            show_images=False,
            on_event=on_event,
        )
        if not response_items:
            raise ValueError("No response from agent")
    finally:
        browser_delete_started_at = time.monotonic()
        on_event({"event": "backend", "data": {"op": "browsers.delete"}})
        try:
            client.browsers.delete_by_id(browser.session_id)
        finally:
            on_event(
                {
                    "event": "backend",
                    "data": {
                        "op": "browsers.delete.done",
                        "elapsed_ms": elapsed_ms_since(browser_delete_started_at),
                    },
                }
            )
        print("> Browser session deleted")


if __name__ == "__main__":
    main()
a/pkg/templates/python/openai-computer-use/test_local.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Local test script that creates a remote Kernel browser and runs the CUA agent. -No Kernel app deployment needed. - -Usage: - KERNEL_API_KEY=... OPENAI_API_KEY=... uv run test_local.py -""" - -import datetime -import os -import json - -from dotenv import load_dotenv - -load_dotenv(override=True) - -from kernel import Kernel -from agent import Agent -from computers.kernel_computer import KernelComputer - - -def main(): - if not os.getenv("KERNEL_API_KEY"): - raise ValueError("KERNEL_API_KEY is not set") - if not os.getenv("OPENAI_API_KEY"): - raise ValueError("OPENAI_API_KEY is not set") - - client = Kernel(api_key=os.getenv("KERNEL_API_KEY")) - browser = client.browsers.create(timeout_seconds=300) - print(f"> Browser session: {browser.session_id}") - print(f"> Live view: {browser.browser_live_view_url}") - - computer = KernelComputer(client, browser.session_id) - - try: - computer.goto("https://duckduckgo.com") - - items = [ - { - "role": "system", - "content": f"- Current date and time: {datetime.datetime.utcnow().isoformat()} ({datetime.datetime.utcnow().strftime('%A')})", - }, - { - "role": "user", - "content": "go to ebay.com and look up oberheim ob-x prices and give me a report", - }, - ] - - agent = Agent( - computer=computer, - tools=[], - acknowledge_safety_check_callback=lambda message: ( - print(f"> safety check: {message}") or True - ), - ) - - response_items = agent.run_full_turn( - items, - debug=True, - show_images=False, - ) - - print(json.dumps(response_items, indent=2, default=str)) - finally: - client.browsers.delete_by_id(browser.session_id) - print("> Browser session deleted") - - -if __name__ == "__main__": - main() diff --git a/pkg/templates/typescript/openai-computer-use/README.md b/pkg/templates/typescript/openai-computer-use/README.md index 36f408a9..996a8002 100644 --- a/pkg/templates/typescript/openai-computer-use/README.md +++ 
b/pkg/templates/typescript/openai-computer-use/README.md @@ -13,13 +13,23 @@ cp .env.example .env # Fill in OPENAI_API_KEY and KERNEL_API_KEY in .env pnpm install pnpm run test:local +# Equivalent direct run from the app entrypoint: +pnpm exec tsx index.ts +# Direct run of the local runner file: +pnpm exec tsx run_local.ts +# JSONL event output +pnpm run test:local -- --output=jsonl ``` +The local runner defaults to concise CUA-style logs (`text`), including `kernel>` backend SDK call lines with elapsed timing and `agent>` model output lines. Use `--output=jsonl` for one structured event per line (including backend events). Add `--debug` to include verbose in-flight events. + ## Deploy to Kernel ```bash kernel deploy index.ts --env-file .env kernel invoke ts-openai-cua cua-task -p '{"task":"Go to https://news.ycombinator.com and get the top 5 articles"}' +# JSONL logs for invocation +kernel invoke ts-openai-cua cua-task -p '{"task":"Go to https://news.ycombinator.com and get the top 5 articles","output":"jsonl"}' ``` See the [docs](https://www.kernel.sh/docs/quickstart) for more information. 
diff --git a/pkg/templates/typescript/openai-computer-use/index.ts b/pkg/templates/typescript/openai-computer-use/index.ts index 014105fc..8e98d573 100644 --- a/pkg/templates/typescript/openai-computer-use/index.ts +++ b/pkg/templates/typescript/openai-computer-use/index.ts @@ -1,11 +1,18 @@ import { Kernel, type KernelContext } from '@onkernel/sdk'; -import 'dotenv/config'; +import * as dotenv from 'dotenv'; +import { resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; import type { ResponseItem, ResponseOutputMessage } from 'openai/resources/responses/responses'; import { Agent } from './lib/agent'; import { KernelComputer } from './lib/kernel-computer'; +import { createEventLogger } from './lib/logging'; +import type { OutputMode } from './lib/log-events'; + +dotenv.config({ override: true, quiet: true }); interface CuaInput { task: string; + output?: OutputMode; } interface CuaOutput { elapsed: number; @@ -38,11 +45,26 @@ app.action( async (ctx: KernelContext, payload?: CuaInput): Promise => { const start = Date.now(); if (!payload?.task) throw new Error('task is required'); + const outputMode: OutputMode = payload.output === 'jsonl' ? 'jsonl' : 'text'; + const onEvent = createEventLogger({ output: outputMode }); + onEvent({ event: 'backend', data: { op: 'browsers.new' } }); + const browserCreateStartedAt = Date.now(); const kb = await kernel.browsers.create({ invocation_id: ctx.invocation_id }); - console.log('> Kernel browser live view url:', kb.browser_live_view_url); + onEvent({ + event: 'backend', + data: { + op: 'browsers.new.done', + detail: kb.browser_live_view_url ?? '', + elapsed_ms: Date.now() - browserCreateStartedAt, + }, + }); + onEvent({ + event: 'session_state', + data: { session_id: kb.session_id, live_view_url: kb.browser_live_view_url ?? 
'' }, + }); - const computer = new KernelComputer(kernel, kb.session_id); + const computer = new KernelComputer(kernel, kb.session_id, onEvent); try { await computer.goto('https://duckduckgo.com'); @@ -73,8 +95,9 @@ app.action( }, ], print_steps: true, - debug: true, + debug: false, show_images: false, + onEvent, }); const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); @@ -96,7 +119,34 @@ app.action( console.error('Error in cua-task:', error); return { elapsed, answer: null }; } finally { - await kernel.browsers.deleteByID(kb.session_id); + onEvent({ event: 'backend', data: { op: 'browsers.delete' } }); + const browserDeleteStartedAt = Date.now(); + try { + await kernel.browsers.deleteByID(kb.session_id); + } finally { + onEvent({ + event: 'backend', + data: { + op: 'browsers.delete.done', + elapsed_ms: Date.now() - browserDeleteStartedAt, + }, + }); + } } }, ); + +function isDirectRun(): boolean { + const entry = process.argv[1]; + if (!entry) return false; + return resolve(entry) === resolve(fileURLToPath(import.meta.url)); +} + +if (isDirectRun()) { + void import('./run_local') + .then(({ runLocalTest }) => runLocalTest(process.argv.slice(2))) + .catch((error: unknown) => { + console.error(error); + process.exit(1); + }); +} diff --git a/pkg/templates/typescript/openai-computer-use/lib/agent.ts b/pkg/templates/typescript/openai-computer-use/lib/agent.ts index 0ff0dbc4..75deafc4 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/agent.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/agent.ts @@ -11,6 +11,8 @@ import { } from 'openai/resources/responses/responses'; import * as utils from './utils'; +import type { AgentEvent } from './log-events'; +import { describeAction, describeBatchActions } from './log-events'; import { batchInstructions, batchComputerTool, navigationTools } from './toolset'; import type { KernelComputer } from './kernel-computer'; @@ -24,6 +26,8 @@ export class Agent { private debug = false; private 
show_images = false; private ackCb: (msg: string) => boolean; + private onEvent: ((event: AgentEvent) => void) | null = null; + private modelRequestStartedAt: number | null = null; constructor(opts: { model?: string; @@ -51,7 +55,6 @@ export class Agent { private debugPrint(...args: unknown[]): void { if (this.debug) { - console.warn('--- debug:agent:debugPrint'); try { console.dir( args.map((msg) => utils.sanitizeMessage(msg as ResponseItem)), @@ -63,18 +66,86 @@ export class Agent { } } + private emit(event: AgentEvent['event'], data: Record): void { + if (this.print_steps) this.onEvent?.({ event, data }); + } + + private currentModelElapsedMs(): number | null { + return this.modelRequestStartedAt === null ? null : Date.now() - this.modelRequestStartedAt; + } + + private extractReasoningText(item: Record): string { + const summary = item.summary; + if (!Array.isArray(summary)) return ''; + const chunks = summary + .map((part) => { + if (!part || typeof part !== 'object') return ''; + const text = (part as { text?: unknown }).text; + return typeof text === 'string' ? text : ''; + }) + .filter(Boolean); + return chunks.join(' ').trim(); + } + + private extractUserPrompt(item: ResponseInputItem): string | null { + const message = item as unknown as { role?: unknown; content?: unknown }; + if (message.role !== 'user') return null; + if (typeof message.content === 'string') return message.content; + if (!Array.isArray(message.content)) return null; + const pieces = message.content + .map((entry) => { + if (!entry || typeof entry !== 'object') return ''; + const text = (entry as { text?: unknown }).text; + return typeof text === 'string' ? text : ''; + }) + .filter(Boolean); + return pieces.length > 0 ? 
pieces.join(' ') : null; + } + private async handleItem(item: ResponseItem): Promise { - if (item.type === 'message' && this.print_steps) { + const itemType = (item as { type?: string }).type; + if (itemType === 'reasoning') { + const text = this.extractReasoningText(item as unknown as Record); + if (text) this.emit('reasoning_delta', { text }); + } + + if (item.type === 'message') { const msg = item as ResponseOutputMessage; const c = msg.content; - if (Array.isArray(c) && c[0] && 'text' in c[0] && typeof c[0].text === 'string') - console.log(c[0].text); + if (msg.role === 'assistant' && Array.isArray(c)) { + for (const part of c) { + if (part && typeof part === 'object' && 'text' in part && typeof part.text === 'string') { + this.emit('text_delta', { text: part.text }); + } + } + this.emit('text_done', {}); + } } if (item.type === 'function_call') { const fc = item as ResponseFunctionToolCallItem; const argsObj = JSON.parse(fc.arguments) as Record; - if (this.print_steps) console.log(`${fc.name}(${JSON.stringify(argsObj)})`); + if (fc.name === BATCH_FUNC_NAME && Array.isArray(argsObj.actions)) { + const actions = argsObj.actions.filter( + (action): action is Record => + typeof action === 'object' && action !== null, + ); + const elapsedMs = this.currentModelElapsedMs(); + this.emit('action', { + action_type: 'batch', + description: describeBatchActions(actions), + action: { type: 'batch', actions }, + ...(elapsedMs === null ? {} : { elapsed_ms: elapsedMs }), + }); + } else { + const elapsedMs = this.currentModelElapsedMs(); + this.emit('action', { + action_type: fc.name, + description: `${fc.name}(${JSON.stringify(argsObj)})`, + action: argsObj, + ...(elapsedMs === null ? 
{} : { elapsed_ms: elapsedMs }), + }); + } if (fc.name === BATCH_FUNC_NAME) { return this.handleBatchCall(fc.call_id, argsObj); @@ -100,10 +171,17 @@ export class Agent { if (item.type === 'computer_call') { const cc = item as ResponseComputerToolCall; const { type: actionType, ...actionArgs } = cc.action; - if (this.print_steps) console.log(`${actionType}(${JSON.stringify(actionArgs)})`); + const elapsedMs = this.currentModelElapsedMs(); + this.emit('action', { + action_type: actionType, + description: describeAction(actionType as string, actionArgs), + action: cc.action as unknown as Record, + ...(elapsedMs === null ? {} : { elapsed_ms: elapsedMs }), + }); await this.executeComputerAction(actionType as string, cc.action as unknown as Record); const screenshot = await this.computer.screenshot(); + this.emit('screenshot', { captured: true, bytes_base64: screenshot.length }); const pending = cc.pending_safety_checks ?? []; for (const check of pending) { @@ -199,42 +277,61 @@ export class Agent { print_steps?: boolean; debug?: boolean; show_images?: boolean; + onEvent?: (event: AgentEvent) => void; }): Promise { this.print_steps = opts.print_steps ?? true; this.debug = opts.debug ?? false; this.show_images = opts.show_images ?? false; + this.onEvent = opts.onEvent ?? 
null; const newItems: ResponseItem[] = []; + let turns = 0; + + for (const message of opts.messages) { + const prompt = this.extractUserPrompt(message); + if (prompt) this.emit('prompt', { text: prompt }); + } - while ( - newItems.length === 0 || - (newItems[newItems.length - 1] as ResponseItem & { role?: string }).role !== 'assistant' - ) { - const inputMessages = [...opts.messages]; + try { + while ( + newItems.length === 0 || + (newItems[newItems.length - 1] as ResponseItem & { role?: string }).role !== 'assistant' + ) { + turns += 1; + const inputMessages = [...opts.messages]; - // Append current URL context to system message - const currentUrl = await this.computer.getCurrentUrl(); - const sysIndex = inputMessages.findIndex((msg) => 'role' in msg && msg.role === 'system'); - if (sysIndex >= 0) { - const msg = inputMessages[sysIndex]; - const urlInfo = `\n- Current URL: ${currentUrl}`; - if (msg && 'content' in msg && typeof msg.content === 'string') { - inputMessages[sysIndex] = { ...msg, content: msg.content + urlInfo } as typeof msg; + // Append current URL context to system message + const currentUrl = await this.computer.getCurrentUrl(); + const sysIndex = inputMessages.findIndex((msg) => 'role' in msg && msg.role === 'system'); + if (sysIndex >= 0) { + const msg = inputMessages[sysIndex]; + const urlInfo = `\n- Current URL: ${currentUrl}`; + if (msg && 'content' in msg && typeof msg.content === 'string') { + inputMessages[sysIndex] = { ...msg, content: msg.content + urlInfo } as typeof msg; + } } - } - this.debugPrint(...inputMessages, ...newItems); - const response = await utils.createResponse({ - model: this.model, - input: [...inputMessages, ...newItems], - tools: this.tools, - truncation: 'auto', - instructions: batchInstructions, - }); - if (!response.output) throw new Error('No output from model'); - for (const msg of response.output as ResponseItem[]) { - newItems.push(msg, ...(await this.handleItem(msg))); + this.debugPrint(...inputMessages, 
...newItems); + this.modelRequestStartedAt = Date.now(); + const response = await utils.createResponse({ + model: this.model, + input: [...inputMessages, ...newItems], + tools: this.tools, + truncation: 'auto', + instructions: batchInstructions, + }); + if (!response.output) throw new Error('No output from model'); + for (const msg of response.output as ResponseItem[]) { + newItems.push(msg, ...(await this.handleItem(msg))); + } + this.modelRequestStartedAt = null; + this.emit('turn_done', { turn: turns }); } + } catch (error) { + this.modelRequestStartedAt = null; + this.emit('error', { message: error instanceof Error ? error.message : String(error) }); + throw error; } + this.emit('run_complete', { turns }); return !this.show_images ? newItems.map((msg) => utils.sanitizeMessage(msg) as ResponseItem) diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts index c2f32264..b9ab03d9 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -1,4 +1,5 @@ import { Kernel } from '@onkernel/sdk'; +import { describeAction, describeBatchActions, type AgentEvent } from './log-events'; // CUA model key names -> X11 keysym names for the Kernel computer API const KEYSYM_MAP: Record = { @@ -147,10 +148,12 @@ export class KernelComputer { private sessionId: string; private width = 1024; private height = 768; + private onEvent: ((event: AgentEvent) => void) | null; - constructor(client: Kernel, sessionId: string) { + constructor(client: Kernel, sessionId: string, onEvent?: (event: AgentEvent) => void) { this.client = client; this.sessionId = sessionId; + this.onEvent = onEvent ?? 
null; } getEnvironment(): 'browser' { @@ -161,48 +164,104 @@ export class KernelComputer { return [this.width, this.height]; } + private emitBackend(op: string, detail?: string, elapsedMs?: number): void { + const data: Record = { op }; + if (detail) data.detail = detail; + if (typeof elapsedMs === 'number') data.elapsed_ms = elapsedMs; + this.onEvent?.({ event: 'backend', data }); + } + + private async traceCall( + op: string, + fn: () => Promise, + detail?: string | ((result: T) => string | undefined), + ): Promise { + this.emitBackend(op); + const started = Date.now(); + let result!: T; + let completed = false; + try { + result = await fn(); + completed = true; + return result; + } finally { + const elapsedMs = Date.now() - started; + let resolvedDetail: string | undefined; + if (completed) { + resolvedDetail = + typeof detail === 'function' ? detail(result) : detail; + } + this.emitBackend(`${op}.done`, resolvedDetail, elapsedMs); + } + } + async screenshot(): Promise { - const resp = await this.client.browsers.computer.captureScreenshot(this.sessionId); - const buf = Buffer.from(await resp.arrayBuffer()); - return buf.toString('base64'); + return this.traceCall('screenshot', async () => { + const resp = await this.client.browsers.computer.captureScreenshot(this.sessionId); + const buf = Buffer.from(await resp.arrayBuffer()); + return buf.toString('base64'); + }); } async click(x: number, y: number, button: string | number = 'left'): Promise { - await this.client.browsers.computer.clickMouse(this.sessionId, { - x, - y, - button: normalizeButton(button) as 'left' | 'right' | 'middle', + const normalizedButton = normalizeButton(button) as 'left' | 'right' | 'middle'; + const op = describeAction('click', { x, y, button: normalizedButton }); + await this.traceCall(op, async () => { + await this.client.browsers.computer.clickMouse(this.sessionId, { + x, + y, + button: normalizedButton, + }); }); } async doubleClick(x: number, y: number): Promise { - await 
this.client.browsers.computer.clickMouse(this.sessionId, { x, y, num_clicks: 2 }); + const op = describeAction('double_click', { x, y }); + await this.traceCall(op, async () => { + await this.client.browsers.computer.clickMouse(this.sessionId, { x, y, num_clicks: 2 }); + }); } async type(text: string): Promise { - await this.client.browsers.computer.typeText(this.sessionId, { text }); + const op = describeAction('type', { text }); + await this.traceCall(op, async () => { + await this.client.browsers.computer.typeText(this.sessionId, { text }); + }); } async keypress(keys: string[]): Promise { - await this.client.browsers.computer.pressKey(this.sessionId, { keys: translateKeys(keys) }); + const translatedKeys = translateKeys(keys); + const op = describeAction('keypress', { keys: translatedKeys }); + await this.traceCall(op, async () => { + await this.client.browsers.computer.pressKey(this.sessionId, { keys: translatedKeys }); + }); } async scroll(x: number, y: number, scrollX: number, scrollY: number): Promise { - await this.client.browsers.computer.scroll(this.sessionId, { - x, - y, - delta_x: scrollX, - delta_y: scrollY, + const op = describeAction('scroll', { x, y, scroll_x: scrollX, scroll_y: scrollY }); + await this.traceCall(op, async () => { + await this.client.browsers.computer.scroll(this.sessionId, { + x, + y, + delta_x: scrollX, + delta_y: scrollY, + }); }); } async move(x: number, y: number): Promise { - await this.client.browsers.computer.moveMouse(this.sessionId, { x, y }); + const op = describeAction('move', { x, y }); + await this.traceCall(op, async () => { + await this.client.browsers.computer.moveMouse(this.sessionId, { x, y }); + }); } async drag(path: Array<{ x: number; y: number }>): Promise { - const p = path.map((pt) => [pt.x, pt.y]); - await this.client.browsers.computer.dragMouse(this.sessionId, { path: p }); + const op = describeAction('drag', { path }); + await this.traceCall(op, async () => { + const p = path.map((pt) => [pt.x, pt.y]); + 
await this.client.browsers.computer.dragMouse(this.sessionId, { path: p }); + }); } async wait(ms = 1000): Promise { @@ -210,34 +269,47 @@ export class KernelComputer { } async batchActions(actions: CuaAction[]): Promise { - const translated = actions.map(translateCuaAction); - await this.client.browsers.computer.batch(this.sessionId, { - actions: translated as Parameters[1]['actions'], + const actionRecords = actions.map((action) => ({ ...action })) as Array>; + const op = describeBatchActions(actionRecords); + await this.traceCall(op, async () => { + const translated = actions.map(translateCuaAction); + await this.client.browsers.computer.batch(this.sessionId, { + actions: translated as Parameters[1]['actions'], + }); }); } async goto(url: string): Promise { - await this.client.browsers.playwright.execute(this.sessionId, { - code: `await page.goto(${JSON.stringify(url)})`, + const op = `goto(${JSON.stringify(url)})`; + await this.traceCall(op, async () => { + await this.client.browsers.playwright.execute(this.sessionId, { + code: `await page.goto(${JSON.stringify(url)})`, + }); }); } async back(): Promise { - await this.client.browsers.playwright.execute(this.sessionId, { - code: 'await page.goBack()', + await this.traceCall('back()', async () => { + await this.client.browsers.playwright.execute(this.sessionId, { + code: 'await page.goBack()', + }); }); } async forward(): Promise { - await this.client.browsers.playwright.execute(this.sessionId, { - code: 'await page.goForward()', + await this.traceCall('forward()', async () => { + await this.client.browsers.playwright.execute(this.sessionId, { + code: 'await page.goForward()', + }); }); } async getCurrentUrl(): Promise { - const result = await this.client.browsers.playwright.execute(this.sessionId, { - code: 'return page.url()', + return this.traceCall('get_current_url()', async () => { + const result = await this.client.browsers.playwright.execute(this.sessionId, { + code: 'return page.url()', + }); + return 
(result.result as string) ?? ''; }); - return (result.result as string) ?? ''; } } diff --git a/pkg/templates/typescript/openai-computer-use/lib/log-events.ts b/pkg/templates/typescript/openai-computer-use/lib/log-events.ts new file mode 100644 index 00000000..943ded3b --- /dev/null +++ b/pkg/templates/typescript/openai-computer-use/lib/log-events.ts @@ -0,0 +1,73 @@ +export type OutputMode = 'text' | 'jsonl'; + +export type AgentEventName = + | 'session_state' + | 'backend' + | 'prompt' + | 'reasoning_delta' + | 'text_delta' + | 'text_done' + | 'action' + | 'screenshot' + | 'turn_done' + | 'run_complete' + | 'error'; + +export interface AgentEvent { + event: AgentEventName; + data: Record; +} + +function toInt(value: unknown): number { + if (typeof value === 'number' && Number.isFinite(value)) return Math.trunc(value); + return 0; +} + +function truncate(text: string, max = 60): string { + return text.length > max ? `${text.slice(0, max - 3)}...` : text; +} + +export function describeAction(actionType: string, actionArgs: Record): string { + switch (actionType) { + case 'click': { + const x = toInt(actionArgs.x); + const y = toInt(actionArgs.y); + const button = typeof actionArgs.button === 'string' ? actionArgs.button : 'left'; + return button === 'left' ? `click(${x}, ${y})` : `click(${x}, ${y}, ${button})`; + } + case 'double_click': + return `double_click(${toInt(actionArgs.x)}, ${toInt(actionArgs.y)})`; + case 'type': { + const text = typeof actionArgs.text === 'string' ? actionArgs.text : ''; + return `type(${JSON.stringify(truncate(text))})`; + } + case 'keypress': { + const keys = Array.isArray(actionArgs.keys) ? 
actionArgs.keys : []; + const serializedKeys = keys.filter((k): k is string => typeof k === 'string'); + return `keypress(${JSON.stringify(serializedKeys)})`; + } + case 'scroll': + return `scroll(${toInt(actionArgs.x)}, ${toInt(actionArgs.y)}, dx=${toInt(actionArgs.scroll_x)}, dy=${toInt(actionArgs.scroll_y)})`; + case 'move': + return `move(${toInt(actionArgs.x)}, ${toInt(actionArgs.y)})`; + case 'drag': + return 'drag(...)'; + case 'wait': { + const ms = typeof actionArgs.ms === 'number' ? Math.trunc(actionArgs.ms) : 1000; + return `wait(${ms}ms)`; + } + case 'screenshot': + return 'screenshot()'; + default: + return actionType; + } +} + +export function describeBatchActions(actions: Array>): string { + const pieces = actions.map((action) => { + const actionType = typeof action.type === 'string' ? action.type : 'unknown'; + const { type: _ignored, ...actionArgs } = action; + return describeAction(actionType, actionArgs); + }); + return `batch[${pieces.join(' -> ')}]`; +} diff --git a/pkg/templates/typescript/openai-computer-use/lib/logging.ts b/pkg/templates/typescript/openai-computer-use/lib/logging.ts new file mode 100644 index 00000000..22980e72 --- /dev/null +++ b/pkg/templates/typescript/openai-computer-use/lib/logging.ts @@ -0,0 +1,218 @@ +import type { AgentEvent, OutputMode } from './log-events'; + +const MAX_LINE_WIDTH = 120; + +function timestamp(): string { + return new Date().toISOString().slice(11, 23); +} + +function asString(value: unknown): string { + return typeof value === 'string' ? value : ''; +} + +function asNumber(value: unknown): number | null { + return typeof value === 'number' && Number.isFinite(value) ? value : null; +} + +function truncateOneLine(text: string, max = 90): string { + const singleLine = text.replace(/\s+/g, ' ').trim(); + return singleLine.length > max ? 
`${singleLine.slice(0, max - 3)}...` : singleLine; +} + +class ThinkingSpinner { + private active = false; + private timer: NodeJS.Timeout | null = null; + private frame = 0; + private startAt = 0; + private startTS = ''; + private reasoning = ''; + + constructor(private readonly enabled: boolean) {} + + start(): void { + if (!this.enabled || this.active) return; + this.active = true; + this.frame = 0; + this.reasoning = ''; + this.startAt = Date.now(); + this.startTS = timestamp(); + this.timer = setInterval(() => this.redraw(), 100); + } + + addReasoning(text: string): void { + if (!this.active) return; + this.reasoning += text; + } + + stop(action?: string, opts?: { elapsedSeconds?: number }): void { + const elapsedSeconds = opts?.elapsedSeconds; + if (!this.active) { + if (action) { + const elapsedPrefix = + typeof elapsedSeconds === 'number' ? `[${elapsedSeconds.toFixed(3)}s] ` : ''; + process.stdout.write(`${timestamp()} agent> ${elapsedPrefix}${action}\n`); + } + return; + } + this.active = false; + if (this.timer) clearInterval(this.timer); + this.timer = null; + + const elapsed = + typeof elapsedSeconds === 'number' + ? elapsedSeconds.toFixed(3) + : ((Date.now() - this.startAt) / 1000).toFixed(3); + if (this.reasoning.trim()) { + const thinkingText = truncateOneLine(this.reasoning, 70); + const suffix = action ? 
` -> ${action}` : ''; + process.stdout.write(`\r\x1b[2K${this.startTS} agent> [${elapsed}s] ${thinkingText}${suffix}\n`); + } else if (action) { + process.stdout.write(`\r\x1b[2K${this.startTS} agent> [${elapsed}s] ${action}\n`); + } else { + process.stdout.write(`\r\x1b[2K${this.startTS} agent> [${elapsed}s] thinking...\n`); + } + } + + private redraw(): void { + if (!this.active) return; + this.frame += 1; + const elapsed = ((Date.now() - this.startAt) / 1000).toFixed(3); + if (this.reasoning.trim()) { + const prefix = `${this.startTS} agent> [${elapsed}s] `; + const maxReasoningLen = Math.max(20, MAX_LINE_WIDTH - prefix.length); + const text = truncateOneLine(this.reasoning, maxReasoningLen); + process.stdout.write(`\r\x1b[2K${prefix}${text}`); + return; + } + const dots = '.'.repeat((this.frame % 3) + 1).padEnd(3, ' '); + process.stdout.write(`\r\x1b[2K${this.startTS} agent> [${elapsed}s] thinking${dots}`); + } +} + +export function createEventLogger(opts?: { + output?: OutputMode; + verbose?: boolean; +}): (event: AgentEvent) => void { + const output = opts?.output ?? 'text'; + const verbose = opts?.verbose ?? 
false; + + if (output === 'jsonl') { + return (event: AgentEvent): void => { + process.stdout.write(`${JSON.stringify({ event: event.event, data: event.data })}\n`); + }; + } + + let inText = false; + const spinner = new ThinkingSpinner(process.stdout.isTTY); + + return (event: AgentEvent): void => { + const data = event.data; + switch (event.event) { + case 'session_state': { + const liveUrl = asString(data.live_view_url); + if (liveUrl) process.stdout.write(`${timestamp()} kernel> live view: ${liveUrl}\n`); + break; + } + case 'backend': { + const op = asString(data.op); + if (!op) break; + + if (inText) { + process.stdout.write('\n'); + inText = false; + } + + if (op === 'live_url') { + const detail = asString(data.detail); + if (detail) process.stdout.write(`${timestamp()} kernel> live view: ${detail}\n`); + break; + } + + if (op.endsWith('.done')) { + const baseOp = op.slice(0, -'.done'.length); + if (baseOp.startsWith('get_current_url') && !verbose) break; + const detail = asString(data.detail); + const elapsedMs = asNumber(data.elapsed_ms); + const elapsed = elapsedMs === null ? '' : `[${(elapsedMs / 1000).toFixed(3)}s] `; + process.stdout.write( + `${timestamp()} kernel> ${elapsed}${baseOp}${detail ? 
` ${detail}` : ''}\n`, + ); + break; + } + + if (verbose) process.stdout.write(`${timestamp()} kernel> ${op}\n`); + break; + } + case 'prompt': { + const text = asString(data.text); + if (text) process.stdout.write(`${timestamp()} user> ${text}\n`); + break; + } + case 'reasoning_delta': { + const text = asString(data.text); + if (process.stdout.isTTY) { + spinner.start(); + spinner.addReasoning(text); + } else if (verbose && text) { + process.stdout.write(`${timestamp()} agent> thinking: ${truncateOneLine(text)}\n`); + } + break; + } + case 'text_delta': { + spinner.stop(); + const text = asString(data.text); + if (!text) break; + if (!inText) { + process.stdout.write(`${timestamp()} agent> `); + inText = true; + } + process.stdout.write(text); + break; + } + case 'text_done': { + if (inText) { + process.stdout.write('\n'); + inText = false; + } + break; + } + case 'action': { + const actionType = asString(data.action_type); + const description = asString(data.description) || actionType; + const elapsedMs = asNumber(data.elapsed_ms); + const elapsedSeconds = elapsedMs === null ? 
undefined : elapsedMs / 1000; + if (inText) { + process.stdout.write('\n'); + inText = false; + } + spinner.stop(description, { elapsedSeconds }); + break; + } + case 'screenshot': { + if (verbose) process.stdout.write(`${timestamp()} debug> screenshot captured\n`); + break; + } + case 'turn_done': + case 'run_complete': { + spinner.stop(); + if (inText) { + process.stdout.write('\n'); + inText = false; + } + break; + } + case 'error': { + const message = asString(data.message) || 'unknown error'; + spinner.stop(); + if (inText) { + process.stdout.write('\n'); + inText = false; + } + process.stderr.write(`${timestamp()} error> ${message}\n`); + break; + } + default: + break; + } + }; +} diff --git a/pkg/templates/typescript/openai-computer-use/lib/utils.ts b/pkg/templates/typescript/openai-computer-use/lib/utils.ts index da503cd8..f1f21a92 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/utils.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/utils.ts @@ -1,6 +1,9 @@ -import 'dotenv/config'; +import * as dotenv from 'dotenv'; import OpenAI from 'openai'; import { type ResponseItem } from 'openai/resources/responses/responses'; + +dotenv.config({ override: true, quiet: true }); + const openai = new OpenAI(); const BLOCKED_DOMAINS: readonly string[] = [ diff --git a/pkg/templates/typescript/openai-computer-use/package.json b/pkg/templates/typescript/openai-computer-use/package.json index 7fdc55b4..6ccba641 100644 --- a/pkg/templates/typescript/openai-computer-use/package.json +++ b/pkg/templates/typescript/openai-computer-use/package.json @@ -3,7 +3,7 @@ "private": true, "scripts": { "build": "tsc", - "test:local": "npx tsx test.local.ts" + "test:local": "npx tsx run_local.ts" }, "dependencies": { "@onkernel/sdk": "^0.38.0", diff --git a/pkg/templates/typescript/openai-computer-use/run_local.ts b/pkg/templates/typescript/openai-computer-use/run_local.ts new file mode 100644 index 00000000..8d1fe3f6 --- /dev/null +++ 
b/pkg/templates/typescript/openai-computer-use/run_local.ts @@ -0,0 +1,123 @@ +import * as dotenv from 'dotenv'; +import { Kernel } from '@onkernel/sdk'; +import { resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { Agent } from './lib/agent'; +import { KernelComputer } from './lib/kernel-computer'; +import { createEventLogger } from './lib/logging'; +import type { OutputMode } from './lib/log-events'; + +dotenv.config({ override: true, quiet: true }); + +/** + * Local test script that creates a remote Kernel browser and runs the CUA agent. + * No Kernel app deployment needed. + * + * Usage: + * KERNEL_API_KEY=... OPENAI_API_KEY=... npx tsx run_local.ts + */ + +export async function runLocalTest(args: string[] = process.argv.slice(2)): Promise { + if (!process.env.KERNEL_API_KEY) throw new Error('KERNEL_API_KEY is not set'); + if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set'); + + const client = new Kernel({ apiKey: process.env.KERNEL_API_KEY }); + const outputMode = parseOutputMode(args); + const debug = args.includes('--debug'); + const onEvent = createEventLogger({ output: outputMode, verbose: debug }); + + onEvent({ event: 'backend', data: { op: 'browsers.new' } }); + const browserCreateStartedAt = Date.now(); + const browser = await client.browsers.create({ timeout_seconds: 300 }); + onEvent({ + event: 'backend', + data: { + op: 'browsers.new.done', + detail: browser.browser_live_view_url ?? '', + elapsed_ms: Date.now() - browserCreateStartedAt, + }, + }); + onEvent({ + event: 'session_state', + data: { session_id: browser.session_id, live_view_url: browser.browser_live_view_url ?? 
'' }, + }); + + const computer = new KernelComputer(client, browser.session_id, onEvent); + + try { + await computer.goto('https://duckduckgo.com'); + + const agent = new Agent({ + model: 'computer-use-preview', + computer, + tools: [], + acknowledge_safety_check_callback: (m: string): boolean => { + console.log(`> safety check: ${m}`); + return true; + }, + }); + + await agent.runFullTurn({ + messages: [ + { + role: 'system', + content: `- Current date and time: ${new Date().toISOString()} (${new Date().toLocaleDateString( + 'en-US', + { weekday: 'long' }, + )})`, + }, + { + type: 'message', + role: 'user', + content: [ + { + type: 'input_text', + text: 'go to ebay.com and look up oberheim ob-x prices and give me a report', + }, + ], + }, + ], + print_steps: true, + debug, + show_images: false, + onEvent, + }); + } finally { + onEvent({ event: 'backend', data: { op: 'browsers.delete' } }); + const browserDeleteStartedAt = Date.now(); + try { + await client.browsers.deleteByID(browser.session_id); + } finally { + onEvent({ + event: 'backend', + data: { + op: 'browsers.delete.done', + elapsed_ms: Date.now() - browserDeleteStartedAt, + }, + }); + } + console.log('> Browser session deleted'); + } +} + +function parseOutputMode(args: string[]): OutputMode { + const outputArg = args.find((arg) => arg.startsWith('--output=')); + const outputFromEquals = outputArg?.split('=')[1]; + const outputFlagIndex = args.findIndex((arg) => arg === '--output'); + const outputFromNext = outputFlagIndex >= 0 ? args[outputFlagIndex + 1] : undefined; + const output = outputFromEquals ?? outputFromNext; + return output === 'jsonl' ? 
'jsonl' : 'text'; +} + +function isDirectRun(): boolean { + const entry = process.argv[1]; + if (!entry) return false; + return resolve(entry) === resolve(fileURLToPath(import.meta.url)); +} + +if (isDirectRun()) { + runLocalTest().catch((error) => { + console.error(error); + process.exit(1); + }); +} diff --git a/pkg/templates/typescript/openai-computer-use/test.local.ts b/pkg/templates/typescript/openai-computer-use/test.local.ts deleted file mode 100644 index 90375999..00000000 --- a/pkg/templates/typescript/openai-computer-use/test.local.ts +++ /dev/null @@ -1,69 +0,0 @@ -import 'dotenv/config'; -import { Kernel } from '@onkernel/sdk'; -import { Agent } from './lib/agent'; -import { KernelComputer } from './lib/kernel-computer'; - -/** - * Local test script that creates a remote Kernel browser and runs the CUA agent. - * No Kernel app deployment needed. - * - * Usage: - * KERNEL_API_KEY=... OPENAI_API_KEY=... npx tsx test.local.ts - */ - -async function test(): Promise { - if (!process.env.KERNEL_API_KEY) throw new Error('KERNEL_API_KEY is not set'); - if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set'); - - const client = new Kernel({ apiKey: process.env.KERNEL_API_KEY }); - const browser = await client.browsers.create({ timeout_seconds: 300 }); - console.log('> Browser session:', browser.session_id); - console.log('> Live view:', browser.browser_live_view_url); - - const computer = new KernelComputer(client, browser.session_id); - - try { - await computer.goto('https://duckduckgo.com'); - - const agent = new Agent({ - model: 'computer-use-preview', - computer, - tools: [], - acknowledge_safety_check_callback: (m: string): boolean => { - console.log(`> safety check: ${m}`); - return true; - }, - }); - - const logs = await agent.runFullTurn({ - messages: [ - { - role: 'system', - content: `- Current date and time: ${new Date().toISOString()} (${new Date().toLocaleDateString( - 'en-US', - { weekday: 'long' }, - )})`, - }, - { - type: 
'message', - role: 'user', - content: [ - { - type: 'input_text', - text: 'go to ebay.com and look up oberheim ob-x prices and give me a report', - }, - ], - }, - ], - print_steps: true, - debug: true, - show_images: false, - }); - console.dir(logs, { depth: null }); - } finally { - await client.browsers.deleteByID(browser.session_id); - console.log('> Browser session deleted'); - } -} - -test(); From 6f673770d5c9fd39e06243a5c4333ac3dd98ea3e Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 11 Mar 2026 12:16:57 -0400 Subject: [PATCH 03/17] updates --- .gitignore | 3 + .../python/openai-computer-use/agent/agent.py | 236 +++++++++++------ .../openai-computer-use/agent/logging.py | 32 ++- .../openai-computer-use/computers/computer.py | 2 +- .../computers/kernel_computer.py | 248 ++++++++++++++++-- .../python/openai-computer-use/main.py | 4 +- .../python/openai-computer-use/run_local.py | 4 +- .../python/openai-computer-use/utils.py | 41 ++- .../typescript/openai-computer-use/index.ts | 2 +- .../openai-computer-use/lib/agent.ts | 189 +++++++------ .../lib/kernel-computer.ts | 231 +++++++++++++--- .../openai-computer-use/lib/log-events.ts | 13 + .../openai-computer-use/lib/logging.ts | 24 +- .../openai-computer-use/lib/toolset.ts | 92 ++++--- .../openai-computer-use/lib/utils.ts | 61 ++++- .../openai-computer-use/run_local.ts | 2 +- 16 files changed, 896 insertions(+), 288 deletions(-) diff --git a/.gitignore b/.gitignore index 010dc12a..90ca5864 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,6 @@ kernel # QA testing directories qa-* + + +__pycache__ diff --git a/pkg/templates/python/openai-computer-use/agent/agent.py b/pkg/templates/python/openai-computer-use/agent/agent.py index b6d71ef8..d7526a7d 100644 --- a/pkg/templates/python/openai-computer-use/agent/agent.py +++ b/pkg/templates/python/openai-computer-use/agent/agent.py @@ -11,15 +11,25 @@ ) BATCH_FUNC_NAME = "batch_computer_actions" +EXTRA_FUNC_NAME = "computer_use_extra" -BATCH_INSTRUCTIONS = 
"""You have two ways to perform actions: +BATCH_INSTRUCTIONS = """You have three ways to perform actions: 1. The standard computer tool — use for single actions when you need screenshot feedback after each step. 2. batch_computer_actions — use to execute multiple actions at once when you can predict the outcome. +3. computer_use_extra — use high-level browser actions: goto, back, and url. ALWAYS prefer batch_computer_actions when performing predictable sequences like: - Clicking a text field, typing text, and pressing Enter -- Typing a URL and pressing Enter -- Any sequence where you don't need to see intermediate results""" +- Any sequence where you don't need to see intermediate results + +Use computer_use_extra for: +- action="goto" only when changing the page URL +- action="back" to go back in history +- action="url" to read the exact current URL + +When interacting with page content (search boxes, forms, chat inputs): +- Click the target input first, then type. +- Do not use URL-navigation actions for in-page text entry.""" BATCH_TOOL = { "type": "function", @@ -32,7 +42,9 @@ "PREFER this over individual computer actions when:\n" "- Typing text followed by pressing Enter\n" "- Clicking a field and then typing into it\n" - "- Any sequence where intermediate screenshots are not needed" + "- Any sequence where intermediate screenshots aren't needed\n\n" + "Constraint: return-value actions (url, screenshot) can appear at most once " + "and only as the final action in the batch." 
), "parameters": { "type": "object", @@ -45,12 +57,27 @@ "properties": { "type": { "type": "string", - "enum": ["click", "double_click", "type", "keypress", "scroll", "move", "drag", "wait"], + "enum": [ + "click", + "double_click", + "type", + "keypress", + "scroll", + "move", + "drag", + "wait", + "goto", + "back", + "url", + "screenshot", + ], }, "x": {"type": "number"}, "y": {"type": "number"}, "text": {"type": "string"}, + "url": {"type": "string"}, "keys": {"type": "array", "items": {"type": "string"}}, + "hold_keys": {"type": "array", "items": {"type": "string"}}, "button": {"type": "string"}, "scroll_x": {"type": "number"}, "scroll_y": {"type": "number"}, @@ -64,13 +91,35 @@ "strict": False, } +EXTRA_TOOL = { + "type": "function", + "name": EXTRA_FUNC_NAME, + "description": "High-level browser actions for navigation and URL retrieval.", + "parameters": { + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": ["goto", "back", "url"], + "description": "Action to perform: goto, back, or url.", + }, + "url": { + "type": "string", + "description": "Required when action is goto. 
Fully qualified URL to navigate to.", + }, + }, + "required": ["action"], + }, + "strict": False, +} + class Agent: """An agent that uses OpenAI CUA with Kernel's native computer control API.""" def __init__( self, - model="computer-use-preview", + model="gpt-5.4", computer: KernelComputer = None, tools: list[dict] = [], acknowledge_safety_check_callback: Callable = lambda message: False, @@ -86,43 +135,12 @@ def __init__( self.acknowledge_safety_check_callback = acknowledge_safety_check_callback if computer: - dimensions = computer.get_dimensions() self.tools += [ { - "type": "computer_use_preview", - "display_width": dimensions[0], - "display_height": dimensions[1], - "environment": computer.get_environment(), + "type": "computer", }, BATCH_TOOL, - { - "type": "function", - "name": "back", - "description": "Go back to the previous page.", - "parameters": {}, - }, - { - "type": "function", - "name": "goto", - "description": "Go to a specific URL.", - "parameters": { - "type": "object", - "properties": { - "url": { - "type": "string", - "description": "Fully qualified URL to navigate to.", - }, - }, - "additionalProperties": False, - "required": ["url"], - }, - }, - { - "type": "function", - "name": "forward", - "description": "Go forward to the next page.", - "parameters": {}, - }, + EXTRA_TOOL, ] def debug_print(self, *args): @@ -185,6 +203,9 @@ def _describe_action(self, action_type: str, action_args: dict[str, Any]) -> str return f"type({text!r})" if action_type == "keypress": keys = action_args.get("keys", []) + hold_keys = action_args.get("hold_keys", []) + if hold_keys: + return f"keypress(hold={hold_keys}, keys={keys})" return f"keypress({keys})" if action_type == "scroll": return ( @@ -197,6 +218,12 @@ def _describe_action(self, action_type: str, action_args: dict[str, Any]) -> str return "drag(...)" if action_type == "wait": return f"wait({int(action_args.get('ms', 1000))}ms)" + if action_type == "goto": + return f"goto({action_args.get('url', '')!r})" + 
if action_type == "back": + return "back()" + if action_type == "url": + return "url()" if action_type == "screenshot": return "screenshot()" return action_type @@ -209,27 +236,13 @@ def _describe_batch_actions(self, actions: list[dict[str, Any]]) -> str: pieces.append(self._describe_action(action_type, action_args)) return "batch[" + " -> ".join(pieces) + "]" - def _execute_computer_action(self, action_type, action_args): - if action_type == "click": - self.computer.click(**action_args) - elif action_type == "double_click": - self.computer.double_click(**action_args) - elif action_type == "type": - self.computer.type(**action_args) - elif action_type == "keypress": - self.computer.keypress(**action_args) - elif action_type == "scroll": - self.computer.scroll(**action_args) - elif action_type == "move": - self.computer.move(**action_args) - elif action_type == "drag": - self.computer.drag(**action_args) - elif action_type == "wait": - self.computer.wait(**action_args) - elif action_type == "screenshot": - pass - else: - print(f"Warning: unknown action type: {action_type}") + def _batch_terminal_read_action(self, actions: list[dict[str, Any]]) -> str: + if not actions: + return "" + action_type = str(actions[-1].get("type", "")) + if action_type in ("url", "screenshot"): + return action_type + return "" def handle_item(self, item): """Handle each item; may cause a computer action + screenshot.""" @@ -280,36 +293,46 @@ def handle_item(self, item): if name == BATCH_FUNC_NAME: return self._handle_batch_call(item["call_id"], args) + if name == EXTRA_FUNC_NAME: + return self._handle_extra_call(item["call_id"], args) - if hasattr(self.computer, name): - method = getattr(self.computer, name) - method(**args) return [ { "type": "function_call_output", "call_id": item["call_id"], - "output": "success", + "output": f"Unsupported function call: {name}", } ] if item["type"] == "computer_call": - action = item["action"] - action_type = action["type"] - action_args = {k: v for k, 
v in action.items() if k != "type"} elapsed_ms = self._current_model_elapsed_ms() + actions = item.get("actions") + if not isinstance(actions, list): + single = item.get("action") + actions = [single] if isinstance(single, dict) else [] + typed_actions = [a for a in actions if isinstance(a, dict)] + + if len(typed_actions) == 1: + action_type = str(typed_actions[0].get("type", "unknown")) + action_payload: dict[str, Any] = typed_actions[0] + description = self._describe_action( + action_type, + {k: v for k, v in typed_actions[0].items() if k != "type"}, + ) + else: + action_type = "batch" + action_payload = {"type": "batch", "actions": typed_actions} + description = self._describe_batch_actions(typed_actions) + payload = { "action_type": action_type, - "description": self._describe_action(action_type, action_args), - "action": action, + "description": description, + "action": action_payload, } if elapsed_ms is not None: payload["elapsed_ms"] = elapsed_ms - self._emit_event( - "action", - payload, - ) - - self._execute_computer_action(action_type, action_args) + self._emit_event("action", payload) + self.computer.batch_actions(typed_actions) screenshot_base64 = self.computer.screenshot() self._emit_event( @@ -332,7 +355,7 @@ def handle_item(self, item): "call_id": item["call_id"], "acknowledged_safety_checks": pending_checks, "output": { - "type": "input_image", + "type": "computer_screenshot", "image_url": f"data:image/png;base64,{screenshot_base64}", }, } @@ -340,7 +363,6 @@ def handle_item(self, item): if self.computer.get_environment() == "browser": current_url = self.computer.get_current_url() check_blocklisted_url(current_url) - call_output["output"]["current_url"] = current_url return [call_output] return [] @@ -348,15 +370,61 @@ def handle_item(self, item): def _handle_batch_call(self, call_id, args): actions = args.get("actions", []) self.computer.batch_actions(actions) - screenshot_base64 = self.computer.screenshot() + status_text = "Actions executed 
successfully." + terminal_action = self._batch_terminal_read_action(actions if isinstance(actions, list) else []) + if terminal_action == "url": + try: + current_url = self.computer.get_current_url() + status_text = f"Actions executed successfully. Current URL: {current_url}" + except Exception as exc: + status_text = f"Actions executed, but url() failed: {exc}" + output_items: list[dict[str, Any]] = [{"type": "text", "text": status_text}] + if terminal_action != "url": + screenshot_base64 = self.computer.screenshot() + output_items.append( + { + "type": "image_url", + "image_url": f"data:image/png;base64,{screenshot_base64}", + "detail": "original", + } + ) + return [ + { + "type": "function_call_output", + "call_id": call_id, + "output": json.dumps(output_items), + } + ] + + def _handle_extra_call(self, call_id, args): + action = args.get("action", "") + url = args.get("url", "") + if action == "goto": + self.computer.batch_actions([{"type": "goto", "url": url}]) + status_text = "goto executed successfully." + elif action == "back": + self.computer.batch_actions([{"type": "back"}]) + status_text = "back executed successfully." 
+ elif action == "url": + status_text = f"Current URL: {self.computer.get_current_url()}" + else: + status_text = f"unknown {EXTRA_FUNC_NAME} action: {action}" + + output_items: list[dict[str, Any]] = [{"type": "text", "text": status_text}] + if action != "url": + screenshot_base64 = self.computer.screenshot() + output_items.append( + { + "type": "image_url", + "image_url": f"data:image/png;base64,{screenshot_base64}", + "detail": "original", + } + ) return [ { "type": "function_call_output", "call_id": call_id, - "output": json.dumps([ - {"type": "text", "text": "Actions executed successfully."}, - {"type": "image_url", "image_url": f"data:image/png;base64,{screenshot_base64}"}, - ]), + "output": json.dumps(output_items), } ] @@ -392,6 +460,10 @@ def run_full_turn( input=input_items + new_items, tools=self.tools, truncation="auto", + reasoning={ + "effort": "low", + "summary": "concise", + }, instructions=BATCH_INSTRUCTIONS, ) self.debug_print(response) diff --git a/pkg/templates/python/openai-computer-use/agent/logging.py b/pkg/templates/python/openai-computer-use/agent/logging.py index 31edc122..93edddf7 100644 --- a/pkg/templates/python/openai-computer-use/agent/logging.py +++ b/pkg/templates/python/openai-computer-use/agent/logging.py @@ -20,6 +20,14 @@ def _truncate_one_line(text: str, max_len: int = 90) -> str: return f"{one_line[: max_len - 3]}..." 
+def _format_kernel_op(op: str) -> str: + if not op: + return op + if "(" in op or "[" in op: + return op + return f"{op}()" + + class _ThinkingSpinner: def __init__(self, enabled: bool): self.enabled = enabled @@ -124,9 +132,10 @@ def render_jsonl(event: dict) -> None: spinner = _ThinkingSpinner(sys.stdout.isatty()) in_text = False + last_live_view_url = "" def render_text(event: dict) -> None: - nonlocal in_text + nonlocal in_text, last_live_view_url event_name = event.get("event", "") data = event.get("data", {}) @@ -135,9 +144,14 @@ def render_text(event: dict) -> None: if event_name == "session_state": live_url = data.get("live_view_url") - if isinstance(live_url, str) and live_url: + if ( + isinstance(live_url, str) + and live_url + and live_url != last_live_view_url + ): sys.stdout.write(f"{_timestamp()} kernel> live view: {live_url}\n") sys.stdout.flush() + last_live_view_url = live_url return if event_name == "backend": @@ -152,15 +166,19 @@ def render_text(event: dict) -> None: if op == "live_url": detail = data.get("detail") - if isinstance(detail, str) and detail: + if ( + isinstance(detail, str) + and detail + and detail != last_live_view_url + ): sys.stdout.write(f"{_timestamp()} kernel> live view: {detail}\n") sys.stdout.flush() + last_live_view_url = detail return if op.endswith(".done"): base_op = op[: -len(".done")] - if base_op.startswith("get_current_url") and not verbose: - return + display_op = _format_kernel_op(base_op) detail = data.get("detail") detail_text = detail if isinstance(detail, str) else "" elapsed_ms = data.get("elapsed_ms") @@ -169,9 +187,11 @@ def render_text(event: dict) -> None: elapsed_prefix = f"[{float(elapsed_ms) / 1000:.3f}s] " suffix = f" {detail_text}" if detail_text else "" sys.stdout.write( - f"{_timestamp()} kernel> {elapsed_prefix}{base_op}{suffix}\n" + f"{_timestamp()} kernel> {elapsed_prefix}{display_op}{suffix}\n" ) sys.stdout.flush() + if base_op == "browsers.new" and detail_text: + last_live_view_url = 
detail_text return if verbose: diff --git a/pkg/templates/python/openai-computer-use/computers/computer.py b/pkg/templates/python/openai-computer-use/computers/computer.py index 8b389459..cc35eddd 100644 --- a/pkg/templates/python/openai-computer-use/computers/computer.py +++ b/pkg/templates/python/openai-computer-use/computers/computer.py @@ -22,7 +22,7 @@ def wait(self, ms: int = 1000) -> None: ... def move(self, x: int, y: int) -> None: ... - def keypress(self, keys: List[str]) -> None: ... + def keypress(self, keys: List[str], hold_keys: List[str] | None = None) -> None: ... def drag(self, path: List[Dict[str, int]]) -> None: ... diff --git a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py index 623e37fa..4bf9c97a 100644 --- a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py +++ b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py @@ -1,5 +1,4 @@ import base64 -import json import time from typing import List, Dict, Any, Callable @@ -57,12 +56,63 @@ "PAUSE": "Pause", "NUMLOCK": "Num_Lock", } +MODIFIER_KEYSYMS = { + "Control_L", + "Control_R", + "Alt_L", + "Alt_R", + "Shift_L", + "Shift_R", + "Super_L", + "Super_R", + "Meta_L", + "Meta_R", +} +GOTO_CHORD_DELAY_MS = 200 def _translate_keys(keys: List[str]) -> List[str]: return [KEYSYM_MAP.get(k, k) for k in keys] +def _expand_combo_keys(keys: List[str]) -> List[str]: + out: List[str] = [] + for raw in keys: + if not isinstance(raw, str): + continue + parts = raw.split("+") if "+" in raw else [raw] + for part in parts: + token = part.strip() + if token: + out.append(token) + return out + + +def _normalize_keypress_payload( + keys: List[str] | None = None, hold_keys: List[str] | None = None +) -> Dict[str, List[str]]: + translated_hold = _translate_keys(_expand_combo_keys(hold_keys or [])) + translated_keys = _translate_keys(_expand_combo_keys(keys or [])) + + hold_from_keys: 
List[str] = [] + primary_keys: List[str] = [] + for key in translated_keys: + if key in MODIFIER_KEYSYMS: + hold_from_keys.append(key) + else: + primary_keys.append(key) + + if not primary_keys: + return {"keys": translated_keys, "hold_keys": translated_hold} + + merged_hold = translated_hold + hold_from_keys + deduped_hold: List[str] = [] + for key in merged_hold: + if key not in deduped_hold: + deduped_hold.append(key) + return {"keys": primary_keys, "hold_keys": deduped_hold} + + def _normalize_button(button) -> str: if button is None: return "left" @@ -94,15 +144,21 @@ def _translate_cua_action(action: Dict[str, Any]) -> Dict[str, Any]: elif action_type == "type": return {"type": "type_text", "type_text": {"text": action.get("text", "")}} elif action_type == "keypress": - return {"type": "press_key", "press_key": {"keys": _translate_keys(action.get("keys", []))}} + normalized = _normalize_keypress_payload( + action.get("keys", []), action.get("hold_keys", []) + ) + payload: Dict[str, Any] = {"keys": normalized["keys"]} + if normalized["hold_keys"]: + payload["hold_keys"] = normalized["hold_keys"] + return {"type": "press_key", "press_key": payload} elif action_type == "scroll": return { "type": "scroll", "scroll": { "x": action.get("x", 0), "y": action.get("y", 0), - "delta_x": action.get("scroll_x", 0), - "delta_y": action.get("scroll_y", 0), + "delta_x": int(action.get("scroll_x", 0)), + "delta_y": int(action.get("scroll_y", 0)), }, } elif action_type == "move": @@ -116,6 +172,135 @@ def _translate_cua_action(action: Dict[str, Any]) -> Dict[str, Any]: raise ValueError(f"Unknown CUA action type: {action_type}") +def _is_batch_computer_action_type(action_type: str) -> bool: + return action_type in { + "click", + "double_click", + "type", + "keypress", + "scroll", + "move", + "drag", + "wait", + } + + +def _goto_batch_actions(url: str) -> List[Dict[str, Any]]: + return [ + { + "type": "press_key", + "press_key": {"hold_keys": ["Ctrl"], "keys": ["l"]}, + }, + { + 
"type": "sleep", + "sleep": {"duration_ms": GOTO_CHORD_DELAY_MS}, + }, + { + "type": "press_key", + "press_key": {"hold_keys": ["Ctrl"], "keys": ["a"]}, + }, + { + "type": "type_text", + "type_text": {"text": url}, + }, + { + "type": "press_key", + "press_key": {"keys": ["Return"]}, + }, + ] + + +def _back_batch_actions() -> List[Dict[str, Any]]: + return [ + { + "type": "press_key", + "press_key": {"hold_keys": ["Alt"], "keys": ["Left"]}, + } + ] + + +def _validate_batch_terminal_read_actions(actions: List[Dict[str, Any]]) -> None: + read_idx = -1 + read_type = "" + for idx, action in enumerate(actions): + action_type = str(action.get("type", "")) + if action_type not in ("url", "screenshot"): + continue + if read_idx >= 0: + raise ValueError( + f"batch can include at most one return-value action ({read_type} or {action_type}); " + f"found {read_type} at index {read_idx} and {action_type} at index {idx}" + ) + if idx != len(actions) - 1: + raise ValueError(f'return-value action "{action_type}" must be last in batch') + read_idx = idx + read_type = action_type + + +def _build_pending_batch(actions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + pending: List[Dict[str, Any]] = [] + for action in actions: + action_type = str(action.get("type", "")) + if _is_batch_computer_action_type(action_type): + pending.append(_translate_cua_action(action)) + continue + if action_type == "goto": + pending.extend(_goto_batch_actions(str(action.get("url", "")))) + continue + if action_type == "back": + pending.extend(_back_batch_actions()) + continue + if action_type in ("url", "screenshot"): + continue + raise ValueError(f"Unknown CUA action type: {action_type}") + return pending + + +def _describe_translated_batch(actions: List[Dict[str, Any]]) -> str: + parts: List[str] = [] + for action in actions: + action_type = str(action.get("type", "")) + if action_type == "click_mouse": + click = action.get("click_mouse", {}) + if not isinstance(click, dict): + parts.append(action_type) 
+ continue + if int(click.get("num_clicks", 0)) > 1: + parts.append(f"double_click({int(click.get('x', 0))},{int(click.get('y', 0))})") + else: + parts.append(f"click({int(click.get('x', 0))},{int(click.get('y', 0))})") + continue + if action_type == "type_text": + type_text = action.get("type_text", {}) + text = str(type_text.get("text", "")) if isinstance(type_text, dict) else "" + parts.append(f"type({_truncate(text, 30)!r})") + continue + if action_type == "press_key": + press_key = action.get("press_key", {}) + keys = press_key.get("keys", []) if isinstance(press_key, dict) else [] + hold_keys = ( + press_key.get("hold_keys", []) if isinstance(press_key, dict) else [] + ) + parts.append(f"key(hold={hold_keys}, keys={keys})") + continue + if action_type == "scroll": + parts.append("scroll") + continue + if action_type == "move_mouse": + parts.append("move") + continue + if action_type == "drag_mouse": + parts.append("drag") + continue + if action_type == "sleep": + sleep = action.get("sleep", {}) + duration = int(sleep.get("duration_ms", 0)) if isinstance(sleep, dict) else 0 + parts.append(f"sleep({duration}ms)") + continue + parts.append(action_type) + return "batch[" + " -> ".join(parts) + "]" + + def _truncate(text: str, max_len: int = 60) -> str: if len(text) <= max_len: return text @@ -136,7 +321,11 @@ def _describe_action(action_type: str, action_args: Dict[str, Any]) -> str: text = _truncate(str(action_args.get("text", ""))) return f"type({text!r})" if action_type == "keypress": - return f"keypress({action_args.get('keys', [])})" + hold_keys = action_args.get("hold_keys", []) + keys = action_args.get("keys", []) + if hold_keys: + return f"keypress(hold={hold_keys}, keys={keys})" + return f"keypress({keys})" if action_type == "scroll": return ( f"scroll({int(action_args.get('x', 0))}, {int(action_args.get('y', 0))}, " @@ -148,6 +337,14 @@ def _describe_action(action_type: str, action_args: Dict[str, Any]) -> str: return "drag(...)" if action_type == 
"wait": return f"wait({int(action_args.get('ms', 1000))}ms)" + if action_type == "goto": + return f"goto({action_args.get('url', '')!r})" + if action_type == "back": + return "back()" + if action_type == "url": + return "url()" + if action_type == "screenshot": + return "screenshot()" return action_type @@ -177,7 +374,7 @@ def get_environment(self): return "browser" def get_dimensions(self): - return (1024, 768) + return (1920, 1080) def _emit_backend( self, op: str, detail: str | None = None, elapsed_ms: int | None = None @@ -250,13 +447,21 @@ def type(self, text: str) -> None: op, lambda: self.client.browsers.computer.type_text(self.session_id, text=text) ) - def keypress(self, keys: List[str]) -> None: - translated_keys = _translate_keys(keys) - op = _describe_action("keypress", {"keys": translated_keys}) + def keypress(self, keys: List[str], hold_keys: List[str] | None = None) -> None: + normalized = _normalize_keypress_payload(keys, hold_keys or []) + op = _describe_action( + "keypress", + { + "keys": normalized["keys"], + **({"hold_keys": normalized["hold_keys"]} if normalized["hold_keys"] else {}), + }, + ) self._trace_backend( op, lambda: self.client.browsers.computer.press_key( - self.session_id, keys=translated_keys + self.session_id, + keys=normalized["keys"], + **({"hold_keys": normalized["hold_keys"]} if normalized["hold_keys"] else {}), ), ) @@ -290,30 +495,21 @@ def wait(self, ms: int = 1000) -> None: time.sleep(ms / 1000) def batch_actions(self, actions: List[Dict[str, Any]]) -> None: - op = _describe_batch_actions(actions) + _validate_batch_terminal_read_actions(actions) + pending = _build_pending_batch(actions) + op = _describe_translated_batch(pending) def _do() -> None: - translated = [_translate_cua_action(a) for a in actions] - self.client.browsers.computer.batch(self.session_id, actions=translated) + if pending: + self.client.browsers.computer.batch(self.session_id, actions=pending) self._trace_backend(op, _do) def goto(self, url: str) -> 
None: - op = f"goto({json.dumps(url)})" - self._trace_backend( - op, - lambda: self.client.browsers.playwright.execute( - self.session_id, code=f"await page.goto({json.dumps(url)})" - ), - ) + self.batch_actions([{"type": "goto", "url": url}]) def back(self) -> None: - self._trace_backend( - "back()", - lambda: self.client.browsers.playwright.execute( - self.session_id, code="await page.goBack()" - ), - ) + self.batch_actions([{"type": "back"}]) def forward(self) -> None: self._trace_backend( diff --git a/pkg/templates/python/openai-computer-use/main.py b/pkg/templates/python/openai-computer-use/main.py index 675139b4..d8a42ca6 100644 --- a/pkg/templates/python/openai-computer-use/main.py +++ b/pkg/templates/python/openai-computer-use/main.py @@ -87,15 +87,17 @@ def run_agent(): computer = KernelComputer(client, kernel_browser.session_id, on_event=on_event) computer.goto("https://duckduckgo.com") + now_utc = datetime.datetime.now(datetime.UTC) items = [ { "role": "system", - "content": f"- Current date and time: {datetime.datetime.utcnow().isoformat()} ({datetime.datetime.utcnow().strftime('%A')})", + "content": f"- Current date and time: {now_utc.isoformat()} ({now_utc.strftime('%A')})", }, {"role": "user", "content": payload["task"]}, ] agent = Agent( + model="gpt-5.4", computer=computer, tools=[], acknowledge_safety_check_callback=lambda message: ( diff --git a/pkg/templates/python/openai-computer-use/run_local.py b/pkg/templates/python/openai-computer-use/run_local.py index bda8aef2..36f3b52b 100644 --- a/pkg/templates/python/openai-computer-use/run_local.py +++ b/pkg/templates/python/openai-computer-use/run_local.py @@ -77,10 +77,11 @@ def main(): try: computer.goto("https://duckduckgo.com") + now_utc = datetime.datetime.now(datetime.UTC) items = [ { "role": "system", - "content": f"- Current date and time: {datetime.datetime.utcnow().isoformat()} ({datetime.datetime.utcnow().strftime('%A')})", + "content": f"- Current date and time: {now_utc.isoformat()} 
({now_utc.strftime('%A')})", }, { "role": "user", @@ -89,6 +90,7 @@ def main(): ] agent = Agent( + model="gpt-5.4", computer=computer, tools=[], acknowledge_safety_check_callback=lambda message: ( diff --git a/pkg/templates/python/openai-computer-use/utils.py b/pkg/templates/python/openai-computer-use/utils.py index fe795ad2..0204dc3d 100644 --- a/pkg/templates/python/openai-computer-use/utils.py +++ b/pkg/templates/python/openai-computer-use/utils.py @@ -2,6 +2,7 @@ import requests from dotenv import load_dotenv import json +import time from urllib.parse import urlparse load_dotenv(override=True) @@ -54,12 +55,40 @@ def create_response(**kwargs): if openai_org: headers["Openai-Organization"] = openai_org - response = requests.post(url, headers=headers, json=kwargs) - - if response.status_code != 200: - print(f"Error: {response.status_code} {response.text}") - - return response.json() + max_attempts = int(os.getenv("OPENAI_RETRY_MAX_ATTEMPTS", "4")) + base_delay_seconds = float(os.getenv("OPENAI_RETRY_BASE_DELAY_SECONDS", "0.5")) + timeout_seconds = float(os.getenv("OPENAI_REQUEST_TIMEOUT_SECONDS", "120")) + + for attempt in range(1, max_attempts + 1): + try: + response = requests.post(url, headers=headers, json=kwargs, timeout=timeout_seconds) + except requests.RequestException as exc: + if attempt < max_attempts: + delay = base_delay_seconds * (2 ** (attempt - 1)) + print( + f"Warning: request failed ({exc}); retrying in {delay:.1f}s " + f"({attempt}/{max_attempts})" + ) + time.sleep(delay) + continue + raise RuntimeError(f"OpenAI request failed after {max_attempts} attempts: {exc}") from exc + + if response.status_code == 200: + return response.json() + + # Retry transient OpenAI server errors (5xx). 
+ if 500 <= response.status_code < 600 and attempt < max_attempts: + delay = base_delay_seconds * (2 ** (attempt - 1)) + print( + f"Warning: OpenAI server error {response.status_code}; retrying in " + f"{delay:.1f}s ({attempt}/{max_attempts})" + ) + time.sleep(delay) + continue + + raise RuntimeError(f"OpenAI API error {response.status_code}: {response.text}") + + raise RuntimeError("OpenAI request failed unexpectedly") def check_blocklisted_url(url: str) -> None: diff --git a/pkg/templates/typescript/openai-computer-use/index.ts b/pkg/templates/typescript/openai-computer-use/index.ts index 8e98d573..494369e9 100644 --- a/pkg/templates/typescript/openai-computer-use/index.ts +++ b/pkg/templates/typescript/openai-computer-use/index.ts @@ -70,7 +70,7 @@ app.action( await computer.goto('https://duckduckgo.com'); const agent = new Agent({ - model: 'computer-use-preview', + model: 'gpt-5.4', computer, tools: [], acknowledge_safety_check_callback: (m: string): boolean => { diff --git a/pkg/templates/typescript/openai-computer-use/lib/agent.ts b/pkg/templates/typescript/openai-computer-use/lib/agent.ts index 75deafc4..123d83d8 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/agent.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/agent.ts @@ -6,17 +6,17 @@ import { type ResponseFunctionToolCallOutputItem, type ResponseComputerToolCall, type ResponseComputerToolCallOutputItem, - type ComputerTool, type Tool, } from 'openai/resources/responses/responses'; import * as utils from './utils'; import type { AgentEvent } from './log-events'; import { describeAction, describeBatchActions } from './log-events'; -import { batchInstructions, batchComputerTool, navigationTools } from './toolset'; -import type { KernelComputer } from './kernel-computer'; +import { batchInstructions, batchComputerTool, computerUseExtraTool } from './toolset'; +import type { CuaAction, KernelComputer } from './kernel-computer'; const BATCH_FUNC_NAME = 'batch_computer_actions'; +const 
EXTRA_FUNC_NAME = 'computer_use_extra'; export class Agent { private model: string; @@ -35,21 +35,17 @@ export class Agent { tools?: Tool[]; acknowledge_safety_check_callback?: (msg: string) => boolean; }) { - this.model = opts.model ?? 'computer-use-preview'; + this.model = opts.model ?? 'gpt-5.4'; this.computer = opts.computer; this.ackCb = opts.acknowledge_safety_check_callback ?? ((): boolean => true); - const [w, h] = this.computer.getDimensions(); this.tools = [ - ...navigationTools, + { + type: 'computer', + } as unknown as Tool, batchComputerTool, + computerUseExtraTool, ...(opts.tools ?? []), - { - type: 'computer_use_preview', - display_width: w, - display_height: h, - environment: this.computer.getEnvironment(), - } as ComputerTool, ]; } @@ -150,36 +146,47 @@ export class Agent { if (fc.name === BATCH_FUNC_NAME) { return this.handleBatchCall(fc.call_id, argsObj); } - - // Navigation tools (goto, back, forward) - const navFn = (this.computer as unknown as Record)[fc.name]; - if (typeof navFn === 'function') { - await (navFn as (...a: unknown[]) => unknown).call( - this.computer, - ...Object.values(argsObj), - ); + if (fc.name === EXTRA_FUNC_NAME) { + return this.handleExtraCall(fc.call_id, argsObj); } + return [ { type: 'function_call_output', call_id: fc.call_id, - output: 'success', + output: `Unsupported function call: ${fc.name}`, } as unknown as ResponseFunctionToolCallOutputItem, ]; } if (item.type === 'computer_call') { - const cc = item as ResponseComputerToolCall; - const { type: actionType, ...actionArgs } = cc.action; + const cc = item as ResponseComputerToolCall & { + action?: Record; + actions?: Array>; + }; + const actionList = Array.isArray(cc.actions) + ? cc.actions + : cc.action + ? [cc.action] + : []; + const elapsedMs = this.currentModelElapsedMs(); + const actionType = + actionList.length === 1 ? String(actionList[0]?.type ?? 'unknown') : 'batch'; + const description = + actionList.length === 1 + ? 
describeAction(actionType, actionList[0] ?? {}) + : describeBatchActions(actionList); + const actionPayload = + actionList.length === 1 ? (actionList[0] ?? {}) : { type: 'batch', actions: actionList }; this.emit('action', { action_type: actionType, - description: describeAction(actionType as string, actionArgs), - action: cc.action as unknown as Record, + description, + action: actionPayload, ...(elapsedMs === null ? {} : { elapsed_ms: elapsedMs }), }); + await this.computer.batchActions(actionList as CuaAction[]); - await this.executeComputerAction(actionType as string, cc.action as unknown as Record); const screenshot = await this.computer.screenshot(); this.emit('screenshot', { captured: true, bytes_base64: screenshot.length }); @@ -192,14 +199,16 @@ export class Agent { const currentUrl = await this.computer.getCurrentUrl(); utils.checkBlocklistedUrl(currentUrl); + const screenshotOutput = { + type: 'computer_screenshot', + image_url: `data:image/png;base64,${screenshot}`, + } as unknown as ResponseComputerToolCallOutputItem['output']; + const out: Omit = { type: 'computer_call_output', call_id: cc.call_id, acknowledged_safety_checks: pending, - output: { - type: 'computer_screenshot', - image_url: `data:image/png;base64,${screenshot}`, - }, + output: screenshotOutput, }; return [out as ResponseItem]; } @@ -207,71 +216,87 @@ export class Agent { return []; } - private async executeComputerAction( - actionType: string, - action: Record, - ): Promise { - switch (actionType) { - case 'click': - await this.computer.click( - action.x as number, - action.y as number, - (action.button as string) ?? 
'left', - ); - break; - case 'double_click': - await this.computer.doubleClick(action.x as number, action.y as number); - break; - case 'type': - await this.computer.type(action.text as string); - break; - case 'keypress': - await this.computer.keypress(action.keys as string[]); - break; - case 'scroll': - await this.computer.scroll( - action.x as number, - action.y as number, - (action.scroll_x as number) ?? 0, - (action.scroll_y as number) ?? 0, - ); - break; - case 'move': - await this.computer.move(action.x as number, action.y as number); - break; - case 'drag': - await this.computer.drag(action.path as Array<{ x: number; y: number }>); - break; - case 'wait': - await this.computer.wait((action.ms as number) ?? 1000); - break; - case 'screenshot': - break; - default: - console.warn(`Unknown computer action: ${actionType}`); + private async handleBatchCall( + callId: string, + argsObj: Record, + ): Promise { + const actions = argsObj.actions as unknown as CuaAction[]; + await this.computer.batchActions(actions); + + let statusText = 'Actions executed successfully.'; + const terminalReadAction = this.batchTerminalReadAction(actions); + if (terminalReadAction === 'url') { + try { + const currentUrl = await this.computer.getCurrentUrl(); + statusText = `Actions executed successfully. Current URL: ${currentUrl}`; + } catch (error) { + statusText = `Actions executed, but url() failed: ${error instanceof Error ? 
error.message : String(error)}`; + } } + + const outputItems: Array> = [{ type: 'text', text: statusText }]; + if (terminalReadAction !== 'url') { + const screenshot = await this.computer.screenshot(); + outputItems.push({ + type: 'image_url', + image_url: `data:image/png;base64,${screenshot}`, + detail: 'original', + }); + } + return [ + { + type: 'function_call_output', + call_id: callId, + output: JSON.stringify(outputItems), + } as unknown as ResponseFunctionToolCallOutputItem, + ]; } - private async handleBatchCall( + private async handleExtraCall( callId: string, argsObj: Record, ): Promise { - const actions = argsObj.actions as unknown as Parameters[0]; - await this.computer.batchActions(actions); + const action = typeof argsObj.action === 'string' ? argsObj.action : ''; + const url = typeof argsObj.url === 'string' ? argsObj.url : ''; + let statusText = ''; + if (action === 'goto') { + await this.computer.batchActions([{ type: 'goto', url }]); + statusText = 'goto executed successfully.'; + } else if (action === 'back') { + await this.computer.batchActions([{ type: 'back' }]); + statusText = 'back executed successfully.'; + } else if (action === 'url') { + const currentUrl = await this.computer.getCurrentUrl(); + statusText = `Current URL: ${currentUrl}`; + } else { + statusText = `unknown ${EXTRA_FUNC_NAME} action: ${action}`; + } - const screenshot = await this.computer.screenshot(); + const outputItems: Array> = [{ type: 'text', text: statusText }]; + if (action !== 'url') { + const screenshot = await this.computer.screenshot(); + outputItems.push({ + type: 'image_url', + image_url: `data:image/png;base64,${screenshot}`, + detail: 'original', + }); + } return [ { type: 'function_call_output', call_id: callId, - output: JSON.stringify([ - { type: 'text', text: 'Actions executed successfully.' 
}, - { type: 'image_url', image_url: `data:image/png;base64,${screenshot}` }, - ]), + output: JSON.stringify(outputItems), } as unknown as ResponseFunctionToolCallOutputItem, ]; } + private batchTerminalReadAction(actions: CuaAction[]): '' | 'url' | 'screenshot' { + if (actions.length === 0) return ''; + const lastType = actions[actions.length - 1]?.type; + if (lastType === 'url' || lastType === 'screenshot') return lastType; + return ''; + } + async runFullTurn(opts: { messages: ResponseInputItem[]; print_steps?: boolean; @@ -317,6 +342,10 @@ export class Agent { input: [...inputMessages, ...newItems], tools: this.tools, truncation: 'auto', + reasoning: { + effort: 'low', + summary: 'concise', + }, instructions: batchInstructions, }); if (!response.output) throw new Error('No output from model'); diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts index b9ab03d9..bcfa59fd 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -1,5 +1,5 @@ import { Kernel } from '@onkernel/sdk'; -import { describeAction, describeBatchActions, type AgentEvent } from './log-events'; +import { describeAction, type AgentEvent } from './log-events'; // CUA model key names -> X11 keysym names for the Kernel computer API const KEYSYM_MAP: Record = { @@ -63,21 +63,82 @@ const KEYSYM_MAP: Record = { NUMLOCK: 'Num_Lock', }; +const MODIFIER_KEYSYMS = new Set([ + 'Control_L', + 'Control_R', + 'Alt_L', + 'Alt_R', + 'Shift_L', + 'Shift_R', + 'Super_L', + 'Super_R', + 'Meta_L', + 'Meta_R', +]); +const GOTO_CHORD_DELAY_MS = 200; + function translateKeys(keys: string[]): string[] { return keys.map((k) => KEYSYM_MAP[k] ?? 
k); } -interface CuaAction { +function expandComboKeys(keys: string[]): string[] { + const out: string[] = []; + for (const raw of keys) { + if (typeof raw !== 'string') continue; + const parts = raw.includes('+') ? raw.split('+') : [raw]; + for (const part of parts) { + const trimmed = part.trim(); + if (trimmed) out.push(trimmed); + } + } + return out; +} + +function normalizeKeypressPayload( + keys: string[] = [], + holdKeys: string[] = [], +): { keys: string[]; holdKeys: string[] } { + const translatedHoldKeys = translateKeys(expandComboKeys(holdKeys)); + const translatedKeyEntries = translateKeys(expandComboKeys(keys)); + + const holdFromKeys: string[] = []; + const primaryKeys: string[] = []; + for (const key of translatedKeyEntries) { + if (MODIFIER_KEYSYMS.has(key)) holdFromKeys.push(key); + else primaryKeys.push(key); + } + + if (primaryKeys.length === 0) { + return { keys: translatedKeyEntries, holdKeys: translatedHoldKeys }; + } + + const holdMerged = [...translatedHoldKeys, ...holdFromKeys]; + const dedupedHold: string[] = []; + for (const key of holdMerged) { + if (!dedupedHold.includes(key)) dedupedHold.push(key); + } + return { keys: primaryKeys, holdKeys: dedupedHold }; +} + +function pixelsToScrollTicks(delta: number | undefined): number { + const value = typeof delta === 'number' && Number.isFinite(delta) ? 
delta : 0; + return Math.trunc(value); +} + +export interface CuaAction { type: string; x?: number; y?: number; text?: string; + url?: string; keys?: string[]; + hold_keys?: string[]; button?: string | number; scroll_x?: number; scroll_y?: number; ms?: number; path?: Array<{ x: number; y: number }>; + [key: string]: unknown; } type BatchAction = { @@ -85,7 +146,7 @@ type BatchAction = { click_mouse?: { x: number; y: number; button?: string; num_clicks?: number }; move_mouse?: { x: number; y: number }; type_text?: { text: string }; - press_key?: { keys: string[] }; + press_key?: { keys: string[]; hold_keys?: string[] }; scroll?: { x: number; y: number; delta_x?: number; delta_y?: number }; drag_mouse?: { path: number[][] }; sleep?: { duration_ms: number }; @@ -118,16 +179,24 @@ function translateCuaAction(action: CuaAction): BatchAction { }; case 'type': return { type: 'type_text', type_text: { text: action.text ?? '' } }; - case 'keypress': - return { type: 'press_key', press_key: { keys: translateKeys(action.keys ?? []) } }; + case 'keypress': { + const normalized = normalizeKeypressPayload(action.keys ?? [], action.hold_keys ?? []); + return { + type: 'press_key', + press_key: { + keys: normalized.keys, + ...(normalized.holdKeys.length > 0 ? { hold_keys: normalized.holdKeys } : {}), + }, + }; + } case 'scroll': return { type: 'scroll', scroll: { x: action.x ?? 0, y: action.y ?? 0, - delta_x: action.scroll_x ?? 0, - delta_y: action.scroll_y ?? 
0, + delta_x: pixelsToScrollTicks(action.scroll_x), + delta_y: pixelsToScrollTicks(action.scroll_y), }, }; case 'move': @@ -143,11 +212,110 @@ function translateCuaAction(action: CuaAction): BatchAction { } } +function isBatchComputerActionType(actionType: string): boolean { + return ['click', 'double_click', 'type', 'keypress', 'scroll', 'move', 'drag', 'wait'].includes( + actionType, + ); +} + +function gotoBatchActions(url: string): BatchAction[] { + return [ + { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['l'] } }, + { type: 'sleep', sleep: { duration_ms: GOTO_CHORD_DELAY_MS } }, + { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['a'] } }, + { type: 'type_text', type_text: { text: url } }, + { type: 'press_key', press_key: { keys: ['Return'] } }, + ]; +} + +function backBatchActions(): BatchAction[] { + return [ + { type: 'press_key', press_key: { hold_keys: ['Alt'], keys: ['Left'] } }, + ]; +} + +function validateBatchTerminalReadActions(actions: CuaAction[]): void { + let readIdx = -1; + let readType = ''; + actions.forEach((action, idx) => { + if (action.type !== 'url' && action.type !== 'screenshot') return; + if (readIdx >= 0) { + throw new Error( + `batch can include at most one return-value action (${readType} or ${action.type}); found ${readType} at index ${readIdx} and ${action.type} at index ${idx}`, + ); + } + if (idx !== actions.length - 1) { + throw new Error(`return-value action "${action.type}" must be last in batch`); + } + readIdx = idx; + readType = action.type; + }); +} + +function buildPendingBatch(actions: CuaAction[]): BatchAction[] { + const pending: BatchAction[] = []; + for (const action of actions) { + const actionType = action.type; + if (isBatchComputerActionType(actionType)) { + pending.push(translateCuaAction(action)); + continue; + } + if (actionType === 'goto') { + pending.push(...gotoBatchActions(action.url ?? 
'')); + continue; + } + if (actionType === 'back') { + pending.push(...backBatchActions()); + continue; + } + if (actionType === 'url' || actionType === 'screenshot') { + continue; + } + throw new Error(`Unknown CUA action type: ${actionType}`); + } + return pending; +} + +function truncateText(text: string, max = 30): string { + if (text.length <= max) return text; + return `${text.slice(0, max - 3)}...`; +} + +function describeTranslatedBatch(actions: BatchAction[]): string { + const parts = actions.map((action) => { + switch (action.type) { + case 'click_mouse': { + const click = action.click_mouse; + if (!click) return action.type; + if ((click.num_clicks ?? 0) > 1) return `double_click(${click.x},${click.y})`; + return `click(${click.x},${click.y})`; + } + case 'type_text': { + const text = action.type_text?.text ?? ''; + return `type(${JSON.stringify(truncateText(text))})`; + } + case 'press_key': + return `key(hold=${JSON.stringify(action.press_key?.hold_keys ?? [])}, keys=${JSON.stringify(action.press_key?.keys ?? [])})`; + case 'scroll': + return 'scroll'; + case 'move_mouse': + return 'move'; + case 'drag_mouse': + return 'drag'; + case 'sleep': + return `sleep(${action.sleep?.duration_ms ?? 
0}ms)`; + default: + return action.type; + } + }); + return `batch[${parts.join(' -> ')}]`; +} + export class KernelComputer { private client: Kernel; private sessionId: string; - private width = 1024; - private height = 768; + private width = 1920; + private height = 1080; private onEvent: ((event: AgentEvent) => void) | null; constructor(client: Kernel, sessionId: string, onEvent?: (event: AgentEvent) => void) { @@ -229,22 +397,33 @@ export class KernelComputer { }); } - async keypress(keys: string[]): Promise { - const translatedKeys = translateKeys(keys); - const op = describeAction('keypress', { keys: translatedKeys }); + async keypress(keys: string[], holdKeys: string[] = []): Promise { + const normalized = normalizeKeypressPayload(keys, holdKeys); + const op = describeAction('keypress', { + keys: normalized.keys, + ...(normalized.holdKeys.length > 0 ? { hold_keys: normalized.holdKeys } : {}), + }); await this.traceCall(op, async () => { - await this.client.browsers.computer.pressKey(this.sessionId, { keys: translatedKeys }); + await this.client.browsers.computer.pressKey( + this.sessionId, + { + keys: normalized.keys, + ...(normalized.holdKeys.length > 0 ? 
{ hold_keys: normalized.holdKeys } : {}), + } as Parameters[1], + ); }); } async scroll(x: number, y: number, scrollX: number, scrollY: number): Promise { const op = describeAction('scroll', { x, y, scroll_x: scrollX, scroll_y: scrollY }); + const tickX = pixelsToScrollTicks(scrollX); + const tickY = pixelsToScrollTicks(scrollY); await this.traceCall(op, async () => { await this.client.browsers.computer.scroll(this.sessionId, { x, y, - delta_x: scrollX, - delta_y: scrollY, + delta_x: tickX, + delta_y: tickY, }); }); } @@ -269,31 +448,23 @@ export class KernelComputer { } async batchActions(actions: CuaAction[]): Promise { - const actionRecords = actions.map((action) => ({ ...action })) as Array>; - const op = describeBatchActions(actionRecords); + validateBatchTerminalReadActions(actions); + const pending = buildPendingBatch(actions); + const op = describeTranslatedBatch(pending); await this.traceCall(op, async () => { - const translated = actions.map(translateCuaAction); + if (pending.length === 0) return; await this.client.browsers.computer.batch(this.sessionId, { - actions: translated as Parameters[1]['actions'], + actions: pending as Parameters[1]['actions'], }); }); } async goto(url: string): Promise { - const op = `goto(${JSON.stringify(url)})`; - await this.traceCall(op, async () => { - await this.client.browsers.playwright.execute(this.sessionId, { - code: `await page.goto(${JSON.stringify(url)})`, - }); - }); + await this.batchActions([{ type: 'goto', url }]); } async back(): Promise { - await this.traceCall('back()', async () => { - await this.client.browsers.playwright.execute(this.sessionId, { - code: 'await page.goBack()', - }); - }); + await this.batchActions([{ type: 'back' }]); } async forward(): Promise { diff --git a/pkg/templates/typescript/openai-computer-use/lib/log-events.ts b/pkg/templates/typescript/openai-computer-use/lib/log-events.ts index 943ded3b..d22852e4 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/log-events.ts +++ 
b/pkg/templates/typescript/openai-computer-use/lib/log-events.ts @@ -43,7 +43,12 @@ export function describeAction(actionType: string, actionArgs: Record typeof k === 'string'); + const serializedHoldKeys = holdKeys.filter((k): k is string => typeof k === 'string'); + if (serializedHoldKeys.length > 0) { + return `keypress(hold=${JSON.stringify(serializedHoldKeys)}, keys=${JSON.stringify(serializedKeys)})`; + } return `keypress(${JSON.stringify(serializedKeys)})`; } case 'scroll': @@ -56,6 +61,14 @@ export function describeAction(actionType: string, actionArgs: Record max ? `${singleLine.slice(0, max - 3)}...` : singleLine; } +function formatKernelOp(op: string): string { + if (!op) return op; + if (op.includes('(') || op.includes('[')) return op; + return `${op}()`; +} + class ThinkingSpinner { private active = false; private timer: NodeJS.Timeout | null = null; @@ -103,6 +109,7 @@ export function createEventLogger(opts?: { } let inText = false; + let lastLiveViewUrl = ''; const spinner = new ThinkingSpinner(process.stdout.isTTY); return (event: AgentEvent): void => { @@ -110,7 +117,10 @@ export function createEventLogger(opts?: { switch (event.event) { case 'session_state': { const liveUrl = asString(data.live_view_url); - if (liveUrl) process.stdout.write(`${timestamp()} kernel> live view: ${liveUrl}\n`); + if (liveUrl && liveUrl !== lastLiveViewUrl) { + process.stdout.write(`${timestamp()} kernel> live view: ${liveUrl}\n`); + lastLiveViewUrl = liveUrl; + } break; } case 'backend': { @@ -124,19 +134,25 @@ export function createEventLogger(opts?: { if (op === 'live_url') { const detail = asString(data.detail); - if (detail) process.stdout.write(`${timestamp()} kernel> live view: ${detail}\n`); + if (detail && detail !== lastLiveViewUrl) { + process.stdout.write(`${timestamp()} kernel> live view: ${detail}\n`); + lastLiveViewUrl = detail; + } break; } if (op.endsWith('.done')) { const baseOp = op.slice(0, -'.done'.length); - if 
(baseOp.startsWith('get_current_url') && !verbose) break; + const displayOp = formatKernelOp(baseOp); const detail = asString(data.detail); const elapsedMs = asNumber(data.elapsed_ms); const elapsed = elapsedMs === null ? '' : `[${(elapsedMs / 1000).toFixed(3)}s] `; process.stdout.write( - `${timestamp()} kernel> ${elapsed}${baseOp}${detail ? ` ${detail}` : ''}\n`, + `${timestamp()} kernel> ${elapsed}${displayOp}${detail ? ` ${detail}` : ''}\n`, ); + if (baseOp === 'browsers.new' && detail) { + lastLiveViewUrl = detail; + } break; } diff --git a/pkg/templates/typescript/openai-computer-use/lib/toolset.ts b/pkg/templates/typescript/openai-computer-use/lib/toolset.ts index 4cd39321..815e2b69 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/toolset.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/toolset.ts @@ -1,11 +1,20 @@ -export const batchInstructions = `You have two ways to perform actions: +export const batchInstructions = `You have three ways to perform actions: 1. The standard computer tool — use for single actions when you need screenshot feedback after each step. 2. batch_computer_actions — use to execute multiple actions at once when you can predict the outcome. +3. computer_use_extra — use high-level browser actions: goto, back, and url. ALWAYS prefer batch_computer_actions when performing predictable sequences like: - Clicking a text field, typing text, and pressing Enter -- Typing a URL and pressing Enter -- Any sequence where you don't need to see intermediate results`; +- Any sequence where you don't need to see intermediate results + +Use computer_use_extra for: +- action="goto" only when changing the page URL +- action="back" to go back in history +- action="url" to read the exact current URL + +When interacting with page content (search boxes, forms, chat inputs): +- Click the target input first, then type. 
+- Do not use URL-navigation actions for in-page text entry.`; export const batchComputerTool = { type: 'function' as const, @@ -18,7 +27,9 @@ export const batchComputerTool = { 'PREFER this over individual computer actions when:\n' + '- Typing text followed by pressing Enter\n' + '- Clicking a field and then typing into it\n' + - '- Any sequence where intermediate screenshots are not needed', + "- Any sequence where intermediate screenshots aren't needed\n\n" + + 'Constraint: return-value actions (url, screenshot) can appear at most once ' + + 'and only as the final action in the batch.', parameters: { type: 'object', properties: { @@ -30,12 +41,27 @@ export const batchComputerTool = { properties: { type: { type: 'string', - enum: ['click', 'double_click', 'type', 'keypress', 'scroll', 'move', 'drag', 'wait'], + enum: [ + 'click', + 'double_click', + 'type', + 'keypress', + 'scroll', + 'move', + 'drag', + 'wait', + 'goto', + 'back', + 'url', + 'screenshot', + ], }, x: { type: 'number' }, y: { type: 'number' }, text: { type: 'string' }, + url: { type: 'string' }, keys: { type: 'array', items: { type: 'string' } }, + hold_keys: { type: 'array', items: { type: 'string' } }, button: { type: 'string' }, scroll_x: { type: 'number' }, scroll_y: { type: 'number' }, @@ -49,44 +75,24 @@ export const batchComputerTool = { strict: false, }; -export const navigationTools = [ - { - type: 'function' as const, - name: 'goto', - description: 'Go to a specific URL.', - parameters: { - type: 'object', - properties: { - url: { - type: 'string', - description: 'Fully qualified URL to navigate to.', - }, +export const computerUseExtraTool = { + type: 'function' as const, + name: 'computer_use_extra', + description: 'High-level browser actions for navigation and URL retrieval.', + parameters: { + type: 'object', + properties: { + action: { + type: 'string', + enum: ['goto', 'back', 'url'], + description: 'Action to perform: goto, back, or url.', + }, + url: { + type: 'string', + 
description: 'Required when action is goto. Fully qualified URL to navigate to.', }, - additionalProperties: false, - required: ['url'], - }, - strict: false, - }, - { - type: 'function' as const, - name: 'back', - description: 'Navigate back in the browser history.', - parameters: { - type: 'object', - properties: {}, - additionalProperties: false, - }, - strict: false, - }, - { - type: 'function' as const, - name: 'forward', - description: 'Navigate forward in the browser history.', - parameters: { - type: 'object', - properties: {}, - additionalProperties: false, }, - strict: false, + required: ['action'], }, -]; + strict: false, +}; diff --git a/pkg/templates/typescript/openai-computer-use/lib/utils.ts b/pkg/templates/typescript/openai-computer-use/lib/utils.ts index f1f21a92..c97c47e4 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/utils.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/utils.ts @@ -34,13 +34,62 @@ export function sanitizeMessage(msg: ResponseItem): ResponseItem { export async function createResponse( params: OpenAI.Responses.ResponseCreateParams, ): Promise<{ output?: OpenAI.Responses.ResponseOutputItem[] }> { - try { - const response = await openai.responses.create(params); - return 'output' in response ? response : { output: undefined }; - } catch (err: unknown) { - console.error((err as Error).message); - throw err; + const maxAttempts = Number(process.env.OPENAI_RETRY_MAX_ATTEMPTS ?? '4'); + const baseDelaySeconds = Number(process.env.OPENAI_RETRY_BASE_DELAY_SECONDS ?? '0.5'); + + for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { + try { + const response = await openai.responses.create(params); + return 'output' in response ? 
response : { output: undefined }; + } catch (err: unknown) { + const status = getErrorStatus(err); + const retryable = isRetryableError(err); + const message = getErrorMessage(err); + + if (!retryable || attempt >= maxAttempts) { + console.error(message); + throw err; + } + + const delayMs = baseDelaySeconds * 1000 * 2 ** (attempt - 1); + const label = status === null ? 'OpenAI request failed' : `OpenAI server error ${status}`; + console.warn( + `Warning: ${label}; retrying in ${(delayMs / 1000).toFixed(1)}s (${attempt}/${maxAttempts})`, + ); + await sleep(delayMs); + } } + throw new Error('OpenAI request failed unexpectedly'); +} + +function getErrorStatus(err: unknown): number | null { + if (typeof err !== 'object' || err === null) return null; + if (!('status' in err)) return null; + const status = (err as { status?: unknown }).status; + return typeof status === 'number' ? status : null; +} + +function getErrorMessage(err: unknown): string { + if (err instanceof Error && err.message) return err.message; + return String(err); +} + +function isRetryableError(err: unknown): boolean { + const status = getErrorStatus(err); + if (status !== null) return status >= 500; + + const msg = getErrorMessage(err).toLowerCase(); + return ( + msg.includes('fetch failed') || + msg.includes('network') || + msg.includes('econnreset') || + msg.includes('etimedout') || + msg.includes('timeout') + ); +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); } export function checkBlocklistedUrl(url: string): boolean { diff --git a/pkg/templates/typescript/openai-computer-use/run_local.ts b/pkg/templates/typescript/openai-computer-use/run_local.ts index 8d1fe3f6..fd538c94 100644 --- a/pkg/templates/typescript/openai-computer-use/run_local.ts +++ b/pkg/templates/typescript/openai-computer-use/run_local.ts @@ -48,7 +48,7 @@ export async function runLocalTest(args: string[] = process.argv.slice(2)): Prom await 
computer.goto('https://duckduckgo.com'); const agent = new Agent({ - model: 'computer-use-preview', + model: 'gpt-5.4', computer, tools: [], acknowledge_safety_check_callback: (m: string): boolean => { From effb56c67369727e463a8c7a28329cbadbe0657c Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 11 Mar 2026 13:59:32 -0400 Subject: [PATCH 04/17] Update OpenAI computer-use templates for clipboard URL reads and latest Kernel SDK. This aligns URL retrieval with keyboard-and-clipboard computer actions (instead of Playwright execution), removes leftover Playwright references, and refreshes TS/Python Kernel SDK dependencies to current versions. Made-with: Cursor --- .../python/openai-computer-use/README.md | 4 +- .../computers/kernel_computer.py | 49 ++++++++++++++++--- .../python/openai-computer-use/pyproject.toml | 2 +- .../python/openai-computer-use/uv.lock | 8 +-- .../typescript/openai-computer-use/README.md | 4 +- .../lib/kernel-computer.ts | 35 ++++++++++--- .../openai-computer-use/package.json | 2 +- .../openai-computer-use/pnpm-lock.yaml | 10 ++-- 8 files changed, 87 insertions(+), 27 deletions(-) diff --git a/pkg/templates/python/openai-computer-use/README.md b/pkg/templates/python/openai-computer-use/README.md index 6c4d6f1a..1276eca8 100644 --- a/pkg/templates/python/openai-computer-use/README.md +++ b/pkg/templates/python/openai-computer-use/README.md @@ -2,7 +2,7 @@ This is a Kernel application that demonstrates using the Computer Use Agent (CUA) from OpenAI with Kernel's native browser control API. -It uses Kernel's computer control endpoints (screenshot, click, type, scroll, batch, etc.) instead of Playwright, and includes a `batch_computer_actions` tool that executes multiple actions in a single API call for lower latency. +It uses Kernel's computer control endpoints (screenshot, click, type, scroll, batch, etc.) and includes a `batch_computer_actions` tool that executes multiple actions in a single API call for lower latency. 
## Local testing @@ -16,7 +16,7 @@ uv run run_local.py uv run run_local.py --output jsonl ``` -The local runner defaults to concise CUA-style logs (`text`), including `kernel>` backend SDK call lines with elapsed timing and `agent>` model output lines. Use `--output jsonl` for one structured event per line (including backend events). Add `--debug` to include verbose in-flight events. +The local runner defaults to human-readable logs (`text`). Use `--output jsonl` for one structured event per line (including backend events). Add `--debug` to include verbose in-flight events. ## Deploy to Kernel diff --git a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py index 4bf9c97a..8b9c9084 100644 --- a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py +++ b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py @@ -219,6 +219,32 @@ def _back_batch_actions() -> List[Dict[str, Any]]: ] +def _forward_batch_actions() -> List[Dict[str, Any]]: + return [ + { + "type": "press_key", + "press_key": {"hold_keys": ["Alt"], "keys": ["Right"]}, + } + ] + + +def _current_url_batch_actions() -> List[Dict[str, Any]]: + return [ + { + "type": "press_key", + "press_key": {"hold_keys": ["Ctrl"], "keys": ["l"]}, + }, + { + "type": "press_key", + "press_key": {"hold_keys": ["Ctrl"], "keys": ["a"]}, + }, + { + "type": "press_key", + "press_key": {"hold_keys": ["Ctrl"], "keys": ["c"]}, + }, + ] + + def _validate_batch_terminal_read_actions(actions: List[Dict[str, Any]]) -> None: read_idx = -1 read_type = "" @@ -512,18 +538,29 @@ def back(self) -> None: self.batch_actions([{"type": "back"}]) def forward(self) -> None: + actions = _forward_batch_actions() + op = _describe_translated_batch(actions) self._trace_backend( - "forward()", - lambda: self.client.browsers.playwright.execute( - self.session_id, code="await page.goForward()" + op, + lambda: 
self.client.browsers.computer.batch( + self.session_id, actions=actions ), ) def get_current_url(self) -> str: def _do() -> str: - result = self.client.browsers.playwright.execute( - self.session_id, code="return page.url()" + copy_actions = _current_url_batch_actions() + copy_op = _describe_translated_batch(copy_actions) + self._trace_backend( + copy_op, + lambda: self.client.browsers.computer.batch( + self.session_id, actions=copy_actions + ), ) - return result.result if result.result else "" + result = self.client.browsers.computer.read_clipboard(self.session_id) + current_url = (result.text or "").strip() + if not current_url: + raise ValueError("clipboard URL was empty") + return current_url return self._trace_backend("get_current_url()", _do) diff --git a/pkg/templates/python/openai-computer-use/pyproject.toml b/pkg/templates/python/openai-computer-use/pyproject.toml index 47e45577..e0f50500 100644 --- a/pkg/templates/python/openai-computer-use/pyproject.toml +++ b/pkg/templates/python/openai-computer-use/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "httpx>=0.28.1", - "kernel>=0.38.0", + "kernel>=0.43.0", "python-dotenv>=1.2.1", "requests>=2.32.5", ] diff --git a/pkg/templates/python/openai-computer-use/uv.lock b/pkg/templates/python/openai-computer-use/uv.lock index 42620637..c4fbfed6 100644 --- a/pkg/templates/python/openai-computer-use/uv.lock +++ b/pkg/templates/python/openai-computer-use/uv.lock @@ -163,7 +163,7 @@ wheels = [ [[package]] name = "kernel" -version = "0.38.0" +version = "0.43.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -173,9 +173,9 @@ dependencies = [ { name = "sniffio" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/90/77/2b2430c9b017d50dc1b4bad2c394cb862d4e504dfd5868de5634ec2129df/kernel-0.38.0.tar.gz", hash = "sha256:6eb8bf6abc35c43c96a69ef6efe4235e2007393dd12dbb95f084595bef234453", size = 
193498, upload-time = "2026-02-25T18:54:51.895Z" } +sdist = { url = "https://files.pythonhosted.org/packages/29/99/639401caa99d752ce430e85d2aacbf1e0da3e748d0b7cff8758b4e49f62f/kernel-0.43.0.tar.gz", hash = "sha256:f3a4c8959eb26e783ece943507871f12ae5b884c841dc81d640a2f46f22b6ed2", size = 196586, upload-time = "2026-03-10T17:30:39.461Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4c/4d/c7b95eeac08fed24d15f11fee11c4807e154fbec7ad5cc99c7943e4a9e06/kernel-0.38.0-py3-none-any.whl", hash = "sha256:8548d34980034a1e9300a5bec51730a38729115355d86a7cd3e2680095f15bd6", size = 225184, upload-time = "2026-02-25T18:54:50.454Z" }, + { url = "https://files.pythonhosted.org/packages/94/fb/519de9d31f1eb5b0c5bb374e31584af5a8191e25dfa05ca4014e7bd38dba/kernel-0.43.0-py3-none-any.whl", hash = "sha256:c5a1b311e318d04ec7f1bd5b7400fc38fefe72ca1d248f48ebf921a49ee3f608", size = 229540, upload-time = "2026-03-10T17:30:37.775Z" }, ] [[package]] @@ -313,7 +313,7 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "httpx", specifier = ">=0.28.1" }, - { name = "kernel", specifier = ">=0.38.0" }, + { name = "kernel", specifier = ">=0.43.0" }, { name = "python-dotenv", specifier = ">=1.2.1" }, { name = "requests", specifier = ">=2.32.5" }, ] diff --git a/pkg/templates/typescript/openai-computer-use/README.md b/pkg/templates/typescript/openai-computer-use/README.md index 996a8002..e652eedf 100644 --- a/pkg/templates/typescript/openai-computer-use/README.md +++ b/pkg/templates/typescript/openai-computer-use/README.md @@ -2,7 +2,7 @@ This is a Kernel application that demonstrates using the Computer Use Agent (CUA) from OpenAI with Kernel's native browser control API. -It uses Kernel's computer control endpoints (screenshot, click, type, scroll, batch, etc.) instead of Playwright, and includes a `batch_computer_actions` tool that executes multiple actions in a single API call for lower latency. 
+It uses Kernel's computer control endpoints (screenshot, click, type, scroll, batch, etc.) and includes a `batch_computer_actions` tool that executes multiple actions in a single API call for lower latency. ## Local testing @@ -21,7 +21,7 @@ pnpm exec tsx run_local.ts pnpm run test:local -- --output=jsonl ``` -The local runner defaults to concise CUA-style logs (`text`), including `kernel>` backend SDK call lines with elapsed timing and `agent>` model output lines. Use `--output=jsonl` for one structured event per line (including backend events). Add `--debug` to include verbose in-flight events. +The local runner defaults to human-readable logs (`text`). Use `--output=jsonl` for one structured event per line (including backend events). Add `--debug` to include verbose in-flight events. ## Deploy to Kernel diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts index bcfa59fd..c8ecd239 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -234,6 +234,20 @@ function backBatchActions(): BatchAction[] { ]; } +function forwardBatchActions(): BatchAction[] { + return [ + { type: 'press_key', press_key: { hold_keys: ['Alt'], keys: ['Right'] } }, + ]; +} + +function currentUrlBatchActions(): BatchAction[] { + return [ + { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['l'] } }, + { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['a'] } }, + { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['c'] } }, + ]; +} + function validateBatchTerminalReadActions(actions: CuaAction[]): void { let readIdx = -1; let readType = ''; @@ -468,19 +482,28 @@ export class KernelComputer { } async forward(): Promise { - await this.traceCall('forward()', async () => { - await this.client.browsers.playwright.execute(this.sessionId, { - code: 'await 
page.goForward()', + const forwardActions = forwardBatchActions(); + await this.traceCall(describeTranslatedBatch(forwardActions), async () => { + await this.client.browsers.computer.batch(this.sessionId, { + actions: forwardActions as Parameters[1]['actions'], }); }); } async getCurrentUrl(): Promise { return this.traceCall('get_current_url()', async () => { - const result = await this.client.browsers.playwright.execute(this.sessionId, { - code: 'return page.url()', + const copyActions = currentUrlBatchActions(); + await this.traceCall(describeTranslatedBatch(copyActions), async () => { + await this.client.browsers.computer.batch(this.sessionId, { + actions: copyActions as Parameters[1]['actions'], + }); }); - return (result.result as string) ?? ''; + const result = await this.client.browsers.computer.readClipboard(this.sessionId); + const currentUrl = (result.text ?? '').trim(); + if (!currentUrl) { + throw new Error('clipboard URL was empty'); + } + return currentUrl; }); } } diff --git a/pkg/templates/typescript/openai-computer-use/package.json b/pkg/templates/typescript/openai-computer-use/package.json index 6ccba641..b9371e19 100644 --- a/pkg/templates/typescript/openai-computer-use/package.json +++ b/pkg/templates/typescript/openai-computer-use/package.json @@ -6,7 +6,7 @@ "test:local": "npx tsx run_local.ts" }, "dependencies": { - "@onkernel/sdk": "^0.38.0", + "@onkernel/sdk": "^0.43.0", "dotenv": "^17.2.3", "openai": "^6.13.0" }, diff --git a/pkg/templates/typescript/openai-computer-use/pnpm-lock.yaml b/pkg/templates/typescript/openai-computer-use/pnpm-lock.yaml index 39dc64d1..28304dd1 100644 --- a/pkg/templates/typescript/openai-computer-use/pnpm-lock.yaml +++ b/pkg/templates/typescript/openai-computer-use/pnpm-lock.yaml @@ -9,8 +9,8 @@ importers: .: dependencies: '@onkernel/sdk': - specifier: ^0.38.0 - version: 0.38.0 + specifier: ^0.43.0 + version: 0.43.0 dotenv: specifier: ^17.2.3 version: 17.3.1 @@ -186,8 +186,8 @@ packages: cpu: [x64] os: [win32] - 
'@onkernel/sdk@0.38.0': - resolution: {integrity: sha512-BwbC3OkUg9xhdTshyyUi7+vqwC6gjsHpfpFsDAlVe/rzzledBsL3Usf5rrYfk1Bpk72P+OfF2NtUt5HLaVrjvQ==} + '@onkernel/sdk@0.43.0': + resolution: {integrity: sha512-pvveMdVCzjtVqNeLI+yk+VBTMaIvRe/jevvKJqnHl2svlDxvT7Z0mNFeiAWsDLeh1TQL92aWEKZoyEVxRniO9w==} '@types/node@22.19.11': resolution: {integrity: sha512-BH7YwL6rA93ReqeQS1c4bsPpcfOmJasG+Fkr6Y59q83f9M1WcBRHR2vM+P9eOisYRcN3ujQoiZY8uk5W+1WL8w==} @@ -317,7 +317,7 @@ snapshots: '@esbuild/win32-x64@0.27.3': optional: true - '@onkernel/sdk@0.38.0': {} + '@onkernel/sdk@0.43.0': {} '@types/node@22.19.11': dependencies: From a611a9f5632a2d49930e10e18a374ada3743c525 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 11 Mar 2026 14:13:29 -0400 Subject: [PATCH 05/17] Fix remaining PR review issues in CUA templates. Enforce TypeScript URL blocklist failures, preserve current_url in computer_call_output, handle special click button variants, and remove duplicated/dead Python template code flagged in PR feedback. 
Made-with: Cursor --- .../python/openai-computer-use/agent/agent.py | 63 +++---------------- .../openai-computer-use/computers/config.py | 5 -- .../computers/kernel_computer.py | 32 +++++++++- .../openai-computer-use/lib/agent.ts | 1 + .../lib/kernel-computer.ts | 32 +++++++++- .../openai-computer-use/lib/utils.ts | 12 ++-- 6 files changed, 80 insertions(+), 65 deletions(-) delete mode 100644 pkg/templates/python/openai-computer-use/computers/config.py diff --git a/pkg/templates/python/openai-computer-use/agent/agent.py b/pkg/templates/python/openai-computer-use/agent/agent.py index d7526a7d..7e8bcbad 100644 --- a/pkg/templates/python/openai-computer-use/agent/agent.py +++ b/pkg/templates/python/openai-computer-use/agent/agent.py @@ -1,7 +1,11 @@ import json import time from typing import Any, Callable -from computers.kernel_computer import KernelComputer +from computers.kernel_computer import ( + KernelComputer, + _describe_action, + _describe_batch_actions, +) from utils import ( create_response, show_image, @@ -186,56 +190,6 @@ def _extract_prompt_text(self, item: dict[str, Any]) -> str | None: parts.append(text) return " ".join(parts) if parts else None - def _describe_action(self, action_type: str, action_args: dict[str, Any]) -> str: - if action_type == "click": - x = int(action_args.get("x", 0)) - y = int(action_args.get("y", 0)) - button = action_args.get("button", "left") - if button in ("", "left"): - return f"click({x}, {y})" - return f"click({x}, {y}, {button})" - if action_type == "double_click": - return f"double_click({int(action_args.get('x', 0))}, {int(action_args.get('y', 0))})" - if action_type == "type": - text = str(action_args.get("text", "")) - if len(text) > 60: - text = f"{text[:57]}..." 
- return f"type({text!r})" - if action_type == "keypress": - keys = action_args.get("keys", []) - hold_keys = action_args.get("hold_keys", []) - if hold_keys: - return f"keypress(hold={hold_keys}, keys={keys})" - return f"keypress({keys})" - if action_type == "scroll": - return ( - f"scroll({int(action_args.get('x', 0))}, {int(action_args.get('y', 0))}, " - f"dx={int(action_args.get('scroll_x', 0))}, dy={int(action_args.get('scroll_y', 0))})" - ) - if action_type == "move": - return f"move({int(action_args.get('x', 0))}, {int(action_args.get('y', 0))})" - if action_type == "drag": - return "drag(...)" - if action_type == "wait": - return f"wait({int(action_args.get('ms', 1000))}ms)" - if action_type == "goto": - return f"goto({action_args.get('url', '')!r})" - if action_type == "back": - return "back()" - if action_type == "url": - return "url()" - if action_type == "screenshot": - return "screenshot()" - return action_type - - def _describe_batch_actions(self, actions: list[dict[str, Any]]) -> str: - pieces: list[str] = [] - for action in actions: - action_type = str(action.get("type", "unknown")) - action_args = {k: v for k, v in action.items() if k != "type"} - pieces.append(self._describe_action(action_type, action_args)) - return "batch[" + " -> ".join(pieces) + "]" - def _batch_terminal_read_action(self, actions: list[dict[str, Any]]) -> str: if not actions: return "" @@ -269,7 +223,7 @@ def handle_item(self, item): typed_actions = [a for a in actions if isinstance(a, dict)] payload = { "action_type": "batch", - "description": self._describe_batch_actions(typed_actions), + "description": _describe_batch_actions(typed_actions), "action": {"type": "batch", "actions": typed_actions}, } if elapsed_ms is not None: @@ -315,14 +269,14 @@ def handle_item(self, item): if len(typed_actions) == 1: action_type = str(typed_actions[0].get("type", "unknown")) action_payload: dict[str, Any] = typed_actions[0] - description = self._describe_action( + description = 
_describe_action( action_type, {k: v for k, v in typed_actions[0].items() if k != "type"}, ) else: action_type = "batch" action_payload = {"type": "batch", "actions": typed_actions} - description = self._describe_batch_actions(typed_actions) + description = _describe_batch_actions(typed_actions) payload = { "action_type": action_type, @@ -363,6 +317,7 @@ def handle_item(self, item): if self.computer.get_environment() == "browser": current_url = self.computer.get_current_url() check_blocklisted_url(current_url) + call_output["output"]["current_url"] = current_url return [call_output] return [] diff --git a/pkg/templates/python/openai-computer-use/computers/config.py b/pkg/templates/python/openai-computer-use/computers/config.py deleted file mode 100644 index 28a9b7ee..00000000 --- a/pkg/templates/python/openai-computer-use/computers/config.py +++ /dev/null @@ -1,5 +0,0 @@ -from .kernel_computer import KernelComputer - -computers_config = { - "kernel": KernelComputer, -} diff --git a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py index 8b9c9084..1c1f681e 100644 --- a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py +++ b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py @@ -124,12 +124,33 @@ def _normalize_button(button) -> str: def _translate_cua_action(action: Dict[str, Any]) -> Dict[str, Any]: action_type = action.get("type", "") if action_type == "click": + button = action.get("button") + if button == "back": + return { + "type": "press_key", + "press_key": {"hold_keys": ["Alt"], "keys": ["Left"]}, + } + if button == "forward": + return { + "type": "press_key", + "press_key": {"hold_keys": ["Alt"], "keys": ["Right"]}, + } + if button == "wheel": + return { + "type": "scroll", + "scroll": { + "x": action.get("x", 0), + "y": action.get("y", 0), + "delta_x": int(action.get("scroll_x", 0)), + "delta_y": int(action.get("scroll_y", 
0)), + }, + } return { "type": "click_mouse", "click_mouse": { "x": action.get("x", 0), "y": action.get("y", 0), - "button": _normalize_button(action.get("button")), + "button": _normalize_button(button), }, } elif action_type == "double_click": @@ -449,6 +470,15 @@ def _do() -> str: return self._trace_backend("screenshot", _do) def click(self, x: int, y: int, button="left") -> None: + if button == "back": + self.back() + return + if button == "forward": + self.forward() + return + if button == "wheel": + self.scroll(x, y, 0, 0) + return normalized_button = _normalize_button(button) op = _describe_action("click", {"x": x, "y": y, "button": normalized_button}) self._trace_backend( diff --git a/pkg/templates/typescript/openai-computer-use/lib/agent.ts b/pkg/templates/typescript/openai-computer-use/lib/agent.ts index 123d83d8..6b896828 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/agent.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/agent.ts @@ -203,6 +203,7 @@ export class Agent { type: 'computer_screenshot', image_url: `data:image/png;base64,${screenshot}`, } as unknown as ResponseComputerToolCallOutputItem['output']; + (screenshotOutput as { current_url?: string }).current_url = currentUrl; const out: Omit = { type: 'computer_call_output', diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts index c8ecd239..1eb54698 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -167,11 +167,29 @@ function normalizeButton(button?: string | number): string { function translateCuaAction(action: CuaAction): BatchAction { switch (action.type) { - case 'click': + case 'click': { + if (action.button === 'back') { + return { type: 'press_key', press_key: { hold_keys: ['Alt'], keys: ['Left'] } }; + } + if (action.button === 'forward') { + return { type: 
'press_key', press_key: { hold_keys: ['Alt'], keys: ['Right'] } }; + } + if (action.button === 'wheel') { + return { + type: 'scroll', + scroll: { + x: action.x ?? 0, + y: action.y ?? 0, + delta_x: pixelsToScrollTicks(action.scroll_x), + delta_y: pixelsToScrollTicks(action.scroll_y), + }, + }; + } return { type: 'click_mouse', click_mouse: { x: action.x ?? 0, y: action.y ?? 0, button: normalizeButton(action.button) }, }; + } case 'double_click': return { type: 'click_mouse', @@ -386,6 +404,18 @@ export class KernelComputer { } async click(x: number, y: number, button: string | number = 'left'): Promise { + if (button === 'back') { + await this.back(); + return; + } + if (button === 'forward') { + await this.forward(); + return; + } + if (button === 'wheel') { + await this.scroll(x, y, 0, 0); + return; + } const normalizedButton = normalizeButton(button) as 'left' | 'right' | 'middle'; const op = describeAction('click', { x, y, button: normalizedButton }); await this.traceCall(op, async () => { diff --git a/pkg/templates/typescript/openai-computer-use/lib/utils.ts b/pkg/templates/typescript/openai-computer-use/lib/utils.ts index c97c47e4..0241ddf0 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/utils.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/utils.ts @@ -92,12 +92,16 @@ function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)); } -export function checkBlocklistedUrl(url: string): boolean { +export function checkBlocklistedUrl(url: string): void { try { const host = new URL(url).hostname; - return BLOCKED_DOMAINS.some((d) => host === d || host.endsWith(`.${d}`)); - } catch { - return false; + if (BLOCKED_DOMAINS.some((d) => host === d || host.endsWith(`.${d}`))) { + throw new Error(`Blocked URL: ${url}`); + } + } catch (error) { + if (error instanceof Error && error.message.startsWith('Blocked URL:')) { + throw error; + } } } From a7919d2fd2d23e22ccd04b8d87076fa38a79731f Mon Sep 17 00:00:00 2001 From: 
Rafael Garcia Date: Wed, 11 Mar 2026 14:24:59 -0400 Subject: [PATCH 06/17] Fix URL-read side effects in computer-use templates. Avoid destructive per-turn URL reads in the TypeScript loop and restore page focus after clipboard URL capture by sending Escape in both TypeScript and Python helpers. Made-with: Cursor --- .../openai-computer-use/computers/kernel_computer.py | 4 ++++ .../typescript/openai-computer-use/lib/agent.ts | 11 ----------- .../openai-computer-use/lib/kernel-computer.ts | 1 + 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py index 1c1f681e..5bddfc52 100644 --- a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py +++ b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py @@ -263,6 +263,10 @@ def _current_url_batch_actions() -> List[Dict[str, Any]]: "type": "press_key", "press_key": {"hold_keys": ["Ctrl"], "keys": ["c"]}, }, + { + "type": "press_key", + "press_key": {"keys": ["Escape"]}, + }, ] diff --git a/pkg/templates/typescript/openai-computer-use/lib/agent.ts b/pkg/templates/typescript/openai-computer-use/lib/agent.ts index 6b896828..9027a70b 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/agent.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/agent.ts @@ -325,17 +325,6 @@ export class Agent { turns += 1; const inputMessages = [...opts.messages]; - // Append current URL context to system message - const currentUrl = await this.computer.getCurrentUrl(); - const sysIndex = inputMessages.findIndex((msg) => 'role' in msg && msg.role === 'system'); - if (sysIndex >= 0) { - const msg = inputMessages[sysIndex]; - const urlInfo = `\n- Current URL: ${currentUrl}`; - if (msg && 'content' in msg && typeof msg.content === 'string') { - inputMessages[sysIndex] = { ...msg, content: msg.content + urlInfo } as typeof msg; - } - } - 
this.debugPrint(...inputMessages, ...newItems); this.modelRequestStartedAt = Date.now(); const response = await utils.createResponse({ diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts index 1eb54698..005d4b0e 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -263,6 +263,7 @@ function currentUrlBatchActions(): BatchAction[] { { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['l'] } }, { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['a'] } }, { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['c'] } }, + { type: 'press_key', press_key: { keys: ['Escape'] } }, ]; } From 3f99e7f42d900ae0352de64e736a08ec4677a680 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 11 Mar 2026 22:38:32 -0400 Subject: [PATCH 07/17] Separate deploy entrypoints from local runners and share lifecycle logging helpers. Remove direct-run fallbacks from deployable app entrypoints and centralize duplicated browser lifecycle backend/session event emission in reusable Python and TypeScript logging helpers. 
Made-with: Cursor --- .../openai-computer-use/agent/logging.py | 51 ++++++++++++++++ .../python/openai-computer-use/main.py | 61 ++++++------------- .../python/openai-computer-use/run_local.py | 50 +++++---------- .../typescript/openai-computer-use/index.ts | 52 ++++------------ .../openai-computer-use/lib/logging.ts | 47 ++++++++++++++ .../openai-computer-use/run_local.ts | 35 ++++------- 6 files changed, 155 insertions(+), 141 deletions(-) diff --git a/pkg/templates/python/openai-computer-use/agent/logging.py b/pkg/templates/python/openai-computer-use/agent/logging.py index 93edddf7..f6f5ccfc 100644 --- a/pkg/templates/python/openai-computer-use/agent/logging.py +++ b/pkg/templates/python/openai-computer-use/agent/logging.py @@ -283,3 +283,54 @@ def render_text(event: dict) -> None: sys.stderr.flush() return render_text + + +def emit_browser_new_started(on_event: Callable[[dict], None]) -> None: + on_event({"event": "backend", "data": {"op": "browsers.new"}}) + + +def emit_browser_new_done( + on_event: Callable[[dict], None], started_at: datetime, live_view_url: str | None +) -> None: + on_event( + { + "event": "backend", + "data": { + "op": "browsers.new.done", + "detail": live_view_url or "", + "elapsed_ms": int((datetime.now() - started_at).total_seconds() * 1000), + }, + } + ) + + +def emit_session_state( + on_event: Callable[[dict], None], session_id: str, live_view_url: str | None +) -> None: + on_event( + { + "event": "session_state", + "data": { + "session_id": session_id, + "live_view_url": live_view_url or "", + }, + } + ) + + +def emit_browser_delete_started(on_event: Callable[[dict], None]) -> None: + on_event({"event": "backend", "data": {"op": "browsers.delete"}}) + + +def emit_browser_delete_done( + on_event: Callable[[dict], None], started_at: datetime +) -> None: + on_event( + { + "event": "backend", + "data": { + "op": "browsers.delete.done", + "elapsed_ms": int((datetime.now() - started_at).total_seconds() * 1000), + }, + } + ) diff --git 
a/pkg/templates/python/openai-computer-use/main.py b/pkg/templates/python/openai-computer-use/main.py index d8a42ca6..db238212 100644 --- a/pkg/templates/python/openai-computer-use/main.py +++ b/pkg/templates/python/openai-computer-use/main.py @@ -1,13 +1,18 @@ import asyncio import datetime import os -import subprocess -import sys from typing import NotRequired, TypedDict import kernel from agent import Agent -from agent.logging import create_event_logger +from agent.logging import ( + create_event_logger, + emit_browser_delete_done, + emit_browser_delete_started, + emit_browser_new_done, + emit_browser_new_started, + emit_session_state, +) from computers.kernel_computer import KernelComputer from kernel import Kernel @@ -56,31 +61,15 @@ async def cua_task( on_event = create_event_logger(output=output_mode) browser_create_started_at = datetime.datetime.now() - on_event({"event": "backend", "data": {"op": "browsers.new"}}) + emit_browser_new_started(on_event) kernel_browser = await asyncio.to_thread( client.browsers.create, invocation_id=ctx.invocation_id, stealth=True ) - on_event( - { - "event": "backend", - "data": { - "op": "browsers.new.done", - "detail": kernel_browser.browser_live_view_url or "", - "elapsed_ms": int( - (datetime.datetime.now() - browser_create_started_at).total_seconds() - * 1000 - ), - }, - } + emit_browser_new_done( + on_event, browser_create_started_at, kernel_browser.browser_live_view_url ) - on_event( - { - "event": "session_state", - "data": { - "session_id": kernel_browser.session_id, - "live_view_url": kernel_browser.browser_live_view_url or "", - }, - } + emit_session_state( + on_event, kernel_browser.session_id, kernel_browser.browser_live_view_url ) def run_agent(): @@ -133,26 +122,10 @@ def run_agent(): return await asyncio.to_thread(run_agent) finally: browser_delete_started_at = datetime.datetime.now() - on_event({"event": "backend", "data": {"op": "browsers.delete"}}) + emit_browser_delete_started(on_event) try: await 
asyncio.to_thread(client.browsers.delete_by_id, kernel_browser.session_id) finally: - on_event( - { - "event": "backend", - "data": { - "op": "browsers.delete.done", - "elapsed_ms": int( - (datetime.datetime.now() - browser_delete_started_at).total_seconds() - * 1000 - ), - }, - } - ) - - -if __name__ == "__main__": - # `main.py` is the deployable Kernel app entrypoint. - # For local execution, forward to the existing local harness. - command = [sys.executable, "run_local.py", *sys.argv[1:]] - raise SystemExit(subprocess.call(command)) + emit_browser_delete_done(on_event, browser_delete_started_at) + + diff --git a/pkg/templates/python/openai-computer-use/run_local.py b/pkg/templates/python/openai-computer-use/run_local.py index 36f3b52b..ef46d491 100644 --- a/pkg/templates/python/openai-computer-use/run_local.py +++ b/pkg/templates/python/openai-computer-use/run_local.py @@ -16,7 +16,14 @@ from kernel import Kernel from agent import Agent -from agent.logging import create_event_logger +from agent.logging import ( + create_event_logger, + emit_browser_delete_done, + emit_browser_delete_started, + emit_browser_new_done, + emit_browser_new_started, + emit_session_state, +) from computers.kernel_computer import KernelComputer @@ -47,30 +54,12 @@ def main(): on_event = create_event_logger(output=args.output, verbose=args.debug) browser_create_started_at = datetime.datetime.now() - on_event({"event": "backend", "data": {"op": "browsers.new"}}) + emit_browser_new_started(on_event) browser = client.browsers.create(timeout_seconds=300) - on_event( - { - "event": "backend", - "data": { - "op": "browsers.new.done", - "detail": browser.browser_live_view_url or "", - "elapsed_ms": int( - (datetime.datetime.now() - browser_create_started_at).total_seconds() - * 1000 - ), - }, - } - ) - on_event( - { - "event": "session_state", - "data": { - "session_id": browser.session_id, - "live_view_url": browser.browser_live_view_url or "", - }, - } + emit_browser_new_done( + on_event, 
browser_create_started_at, browser.browser_live_view_url ) + emit_session_state(on_event, browser.session_id, browser.browser_live_view_url) computer = KernelComputer(client, browser.session_id, on_event=on_event) @@ -108,22 +97,11 @@ def main(): raise ValueError("No response from agent") finally: browser_delete_started_at = datetime.datetime.now() - on_event({"event": "backend", "data": {"op": "browsers.delete"}}) + emit_browser_delete_started(on_event) try: client.browsers.delete_by_id(browser.session_id) finally: - on_event( - { - "event": "backend", - "data": { - "op": "browsers.delete.done", - "elapsed_ms": int( - (datetime.datetime.now() - browser_delete_started_at).total_seconds() - * 1000 - ), - }, - } - ) + emit_browser_delete_done(on_event, browser_delete_started_at) print("> Browser session deleted") diff --git a/pkg/templates/typescript/openai-computer-use/index.ts b/pkg/templates/typescript/openai-computer-use/index.ts index 494369e9..806cbddd 100644 --- a/pkg/templates/typescript/openai-computer-use/index.ts +++ b/pkg/templates/typescript/openai-computer-use/index.ts @@ -1,11 +1,16 @@ import { Kernel, type KernelContext } from '@onkernel/sdk'; import * as dotenv from 'dotenv'; -import { resolve } from 'node:path'; -import { fileURLToPath } from 'node:url'; import type { ResponseItem, ResponseOutputMessage } from 'openai/resources/responses/responses'; import { Agent } from './lib/agent'; import { KernelComputer } from './lib/kernel-computer'; -import { createEventLogger } from './lib/logging'; +import { + createEventLogger, + emitBrowserDeleteDone, + emitBrowserDeleteStarted, + emitBrowserNewDone, + emitBrowserNewStarted, + emitSessionState, +} from './lib/logging'; import type { OutputMode } from './lib/log-events'; dotenv.config({ override: true, quiet: true }); @@ -48,21 +53,11 @@ app.action( const outputMode: OutputMode = payload.output === 'jsonl' ? 
'jsonl' : 'text'; const onEvent = createEventLogger({ output: outputMode }); - onEvent({ event: 'backend', data: { op: 'browsers.new' } }); + emitBrowserNewStarted(onEvent); const browserCreateStartedAt = Date.now(); const kb = await kernel.browsers.create({ invocation_id: ctx.invocation_id }); - onEvent({ - event: 'backend', - data: { - op: 'browsers.new.done', - detail: kb.browser_live_view_url ?? '', - elapsed_ms: Date.now() - browserCreateStartedAt, - }, - }); - onEvent({ - event: 'session_state', - data: { session_id: kb.session_id, live_view_url: kb.browser_live_view_url ?? '' }, - }); + emitBrowserNewDone(onEvent, browserCreateStartedAt, kb.browser_live_view_url); + emitSessionState(onEvent, kb.session_id, kb.browser_live_view_url); const computer = new KernelComputer(kernel, kb.session_id, onEvent); @@ -119,34 +114,13 @@ app.action( console.error('Error in cua-task:', error); return { elapsed, answer: null }; } finally { - onEvent({ event: 'backend', data: { op: 'browsers.delete' } }); + emitBrowserDeleteStarted(onEvent); const browserDeleteStartedAt = Date.now(); try { await kernel.browsers.deleteByID(kb.session_id); } finally { - onEvent({ - event: 'backend', - data: { - op: 'browsers.delete.done', - elapsed_ms: Date.now() - browserDeleteStartedAt, - }, - }); + emitBrowserDeleteDone(onEvent, browserDeleteStartedAt); } } }, ); - -function isDirectRun(): boolean { - const entry = process.argv[1]; - if (!entry) return false; - return resolve(entry) === resolve(fileURLToPath(import.meta.url)); -} - -if (isDirectRun()) { - void import('./run_local') - .then(({ runLocalTest }) => runLocalTest(process.argv.slice(2))) - .catch((error: unknown) => { - console.error(error); - process.exit(1); - }); -} diff --git a/pkg/templates/typescript/openai-computer-use/lib/logging.ts b/pkg/templates/typescript/openai-computer-use/lib/logging.ts index 34791749..2abfbc91 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/logging.ts +++ 
b/pkg/templates/typescript/openai-computer-use/lib/logging.ts @@ -25,6 +25,53 @@ function formatKernelOp(op: string): string { return `${op}()`; } +export function emitBrowserNewStarted(onEvent: (event: AgentEvent) => void): void { + onEvent({ event: 'backend', data: { op: 'browsers.new' } }); +} + +export function emitBrowserNewDone( + onEvent: (event: AgentEvent) => void, + startedAtMs: number, + liveViewUrl?: string | null, +): void { + onEvent({ + event: 'backend', + data: { + op: 'browsers.new.done', + detail: liveViewUrl ?? '', + elapsed_ms: Date.now() - startedAtMs, + }, + }); +} + +export function emitSessionState( + onEvent: (event: AgentEvent) => void, + sessionId: string, + liveViewUrl?: string | null, +): void { + onEvent({ + event: 'session_state', + data: { session_id: sessionId, live_view_url: liveViewUrl ?? '' }, + }); +} + +export function emitBrowserDeleteStarted(onEvent: (event: AgentEvent) => void): void { + onEvent({ event: 'backend', data: { op: 'browsers.delete' } }); +} + +export function emitBrowserDeleteDone( + onEvent: (event: AgentEvent) => void, + startedAtMs: number, +): void { + onEvent({ + event: 'backend', + data: { + op: 'browsers.delete.done', + elapsed_ms: Date.now() - startedAtMs, + }, + }); +} + class ThinkingSpinner { private active = false; private timer: NodeJS.Timeout | null = null; diff --git a/pkg/templates/typescript/openai-computer-use/run_local.ts b/pkg/templates/typescript/openai-computer-use/run_local.ts index fd538c94..84c52848 100644 --- a/pkg/templates/typescript/openai-computer-use/run_local.ts +++ b/pkg/templates/typescript/openai-computer-use/run_local.ts @@ -4,7 +4,14 @@ import { resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; import { Agent } from './lib/agent'; import { KernelComputer } from './lib/kernel-computer'; -import { createEventLogger } from './lib/logging'; +import { + createEventLogger, + emitBrowserDeleteDone, + emitBrowserDeleteStarted, + emitBrowserNewDone, + 
emitBrowserNewStarted, + emitSessionState, +} from './lib/logging'; import type { OutputMode } from './lib/log-events'; dotenv.config({ override: true, quiet: true }); @@ -26,21 +33,11 @@ export async function runLocalTest(args: string[] = process.argv.slice(2)): Prom const debug = args.includes('--debug'); const onEvent = createEventLogger({ output: outputMode, verbose: debug }); - onEvent({ event: 'backend', data: { op: 'browsers.new' } }); + emitBrowserNewStarted(onEvent); const browserCreateStartedAt = Date.now(); const browser = await client.browsers.create({ timeout_seconds: 300 }); - onEvent({ - event: 'backend', - data: { - op: 'browsers.new.done', - detail: browser.browser_live_view_url ?? '', - elapsed_ms: Date.now() - browserCreateStartedAt, - }, - }); - onEvent({ - event: 'session_state', - data: { session_id: browser.session_id, live_view_url: browser.browser_live_view_url ?? '' }, - }); + emitBrowserNewDone(onEvent, browserCreateStartedAt, browser.browser_live_view_url); + emitSessionState(onEvent, browser.session_id, browser.browser_live_view_url); const computer = new KernelComputer(client, browser.session_id, onEvent); @@ -83,18 +80,12 @@ export async function runLocalTest(args: string[] = process.argv.slice(2)): Prom onEvent, }); } finally { - onEvent({ event: 'backend', data: { op: 'browsers.delete' } }); + emitBrowserDeleteStarted(onEvent); const browserDeleteStartedAt = Date.now(); try { await client.browsers.deleteByID(browser.session_id); } finally { - onEvent({ - event: 'backend', - data: { - op: 'browsers.delete.done', - elapsed_ms: Date.now() - browserDeleteStartedAt, - }, - }); + emitBrowserDeleteDone(onEvent, browserDeleteStartedAt); } console.log('> Browser session deleted'); } From 0e81e1834dd6e3b18a9f623e6816e573e5abcab9 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Wed, 11 Mar 2026 23:00:33 -0400 Subject: [PATCH 08/17] Handle short drag paths safely in CUA templates. 
Normalize drag paths before calling Kernel drag APIs so zero/one-point model outputs do not 400 entire batches, padding single-point paths and no-oping empty drags in both TypeScript and Python. Made-with: Cursor --- .../computers/kernel_computer.py | 38 +++++++++++++++++-- .../lib/kernel-computer.ts | 33 ++++++++++++++-- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py index 5bddfc52..33230446 100644 --- a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py +++ b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py @@ -121,6 +121,33 @@ def _normalize_button(button) -> str: return str(button) +def _normalize_drag_path(path: Any) -> List[List[int]]: + points: List[List[int]] = [] + if isinstance(path, list): + for point in path: + if not isinstance(point, dict): + continue + x = point.get("x") + y = point.get("y") + if ( + isinstance(x, (int, float)) + and not isinstance(x, bool) + and isinstance(y, (int, float)) + and not isinstance(y, bool) + ): + points.append([int(x), int(y)]) + if not points: + return [] + if len(points) == 1: + x, y = points[0] + return [[x, y], [x + 1, y]] + return points + + +def _drag_noop_action() -> Dict[str, Any]: + return {"type": "sleep", "sleep": {"duration_ms": 1}} + + def _translate_cua_action(action: Dict[str, Any]) -> Dict[str, Any]: action_type = action.get("type", "") if action_type == "click": @@ -185,7 +212,9 @@ def _translate_cua_action(action: Dict[str, Any]) -> Dict[str, Any]: elif action_type == "move": return {"type": "move_mouse", "move_mouse": {"x": action.get("x", 0), "y": action.get("y", 0)}} elif action_type == "drag": - path = [[p["x"], p["y"]] for p in action.get("path", [])] + path = _normalize_drag_path(action.get("path", [])) + if len(path) < 2: + return _drag_noop_action() return {"type": "drag_mouse", "drag_mouse": 
{"path": path}} elif action_type == "wait": return {"type": "sleep", "sleep": {"duration_ms": action.get("ms", 1000)}} @@ -546,8 +575,11 @@ def drag(self, path: List[Dict[str, int]]) -> None: op = _describe_action("drag", {"path": path}) def _do() -> None: - p = [[pt["x"], pt["y"]] for pt in path] - self.client.browsers.computer.drag_mouse(self.session_id, path=p) + normalized_path = _normalize_drag_path(path) + if len(normalized_path) < 2: + time.sleep(0.001) + return + self.client.browsers.computer.drag_mouse(self.session_id, path=normalized_path) self._trace_backend(op, _do) diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts index 005d4b0e..65409de1 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -165,6 +165,28 @@ function normalizeButton(button?: string | number): string { return button; } +function normalizeDragPath(path: Array<{ x: number; y: number }> | undefined): number[][] { + const points: Array<[number, number]> = (path ?? []) + .filter( + (point): point is { x: number; y: number } => + typeof point?.x === 'number' && + Number.isFinite(point.x) && + typeof point?.y === 'number' && + Number.isFinite(point.y), + ) + .map((point): [number, number] => [Math.trunc(point.x), Math.trunc(point.y)]); + if (points.length === 0) return []; + if (points.length === 1) { + const [x, y] = points[0]!; + return [[x, y], [x + 1, y]]; + } + return points; +} + +function dragNoopAction(): BatchAction { + return { type: 'sleep', sleep: { duration_ms: 1 } }; +} + function translateCuaAction(action: CuaAction): BatchAction { switch (action.type) { case 'click': { @@ -220,7 +242,8 @@ function translateCuaAction(action: CuaAction): BatchAction { case 'move': return { type: 'move_mouse', move_mouse: { x: action.x ?? 0, y: action.y ?? 
0 } }; case 'drag': { - const path = (action.path ?? []).map((p) => [p.x, p.y]); + const path = normalizeDragPath(action.path); + if (path.length < 2) return dragNoopAction(); return { type: 'drag_mouse', drag_mouse: { path } }; } case 'wait': @@ -483,8 +506,12 @@ export class KernelComputer { async drag(path: Array<{ x: number; y: number }>): Promise { const op = describeAction('drag', { path }); await this.traceCall(op, async () => { - const p = path.map((pt) => [pt.x, pt.y]); - await this.client.browsers.computer.dragMouse(this.sessionId, { path: p }); + const normalizedPath = normalizeDragPath(path); + if (normalizedPath.length < 2) { + await new Promise((resolve) => setTimeout(resolve, 1)); + return; + } + await this.client.browsers.computer.dragMouse(this.sessionId, { path: normalizedPath }); }); } From ff8d0c1fd94913e709308777f5be4e534fe48f8c Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Thu, 12 Mar 2026 10:43:04 -0400 Subject: [PATCH 09/17] Fix key translation bypass in batch actions and add type validation for batch args. Route goto/back/forward/currentUrl batch helpers through normalizeKeypressPayload so CUA-style key names are translated to X11 keysyms. Validate that batch actions argument is a list/array before passing to batchActions. 
Co-Authored-By: Claude Opus 4.6 --- .../python/openai-computer-use/agent/agent.py | 2 + .../computers/kernel_computer.py | 56 ++++++------------- .../openai-computer-use/lib/agent.ts | 2 +- .../lib/kernel-computer.ts | 26 ++++----- 4 files changed, 32 insertions(+), 54 deletions(-) diff --git a/pkg/templates/python/openai-computer-use/agent/agent.py b/pkg/templates/python/openai-computer-use/agent/agent.py index 7e8bcbad..c5d830b7 100644 --- a/pkg/templates/python/openai-computer-use/agent/agent.py +++ b/pkg/templates/python/openai-computer-use/agent/agent.py @@ -324,6 +324,8 @@ def handle_item(self, item): def _handle_batch_call(self, call_id, args): actions = args.get("actions", []) + if not isinstance(actions, list): + actions = [] self.computer.batch_actions(actions) status_text = "Actions executed successfully." terminal_action = self._batch_terminal_read_action(actions if isinstance(actions, list) else []) diff --git a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py index 33230446..993f1f17 100644 --- a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py +++ b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py @@ -235,67 +235,43 @@ def _is_batch_computer_action_type(action_type: str) -> bool: } +def _press_key_action( + keys: List[str], hold_keys: List[str] | None = None +) -> Dict[str, Any]: + payload = _normalize_keypress_payload(keys=keys, hold_keys=hold_keys) + return {"type": "press_key", "press_key": payload} + + def _goto_batch_actions(url: str) -> List[Dict[str, Any]]: return [ - { - "type": "press_key", - "press_key": {"hold_keys": ["Ctrl"], "keys": ["l"]}, - }, + _press_key_action(["l"], hold_keys=["Ctrl"]), { "type": "sleep", "sleep": {"duration_ms": GOTO_CHORD_DELAY_MS}, }, - { - "type": "press_key", - "press_key": {"hold_keys": ["Ctrl"], "keys": ["a"]}, - }, + _press_key_action(["a"], hold_keys=["Ctrl"]), { "type": 
"type_text", "type_text": {"text": url}, }, - { - "type": "press_key", - "press_key": {"keys": ["Return"]}, - }, + _press_key_action(["Return"]), ] def _back_batch_actions() -> List[Dict[str, Any]]: - return [ - { - "type": "press_key", - "press_key": {"hold_keys": ["Alt"], "keys": ["Left"]}, - } - ] + return [_press_key_action(["Left"], hold_keys=["Alt"])] def _forward_batch_actions() -> List[Dict[str, Any]]: - return [ - { - "type": "press_key", - "press_key": {"hold_keys": ["Alt"], "keys": ["Right"]}, - } - ] + return [_press_key_action(["Right"], hold_keys=["Alt"])] def _current_url_batch_actions() -> List[Dict[str, Any]]: return [ - { - "type": "press_key", - "press_key": {"hold_keys": ["Ctrl"], "keys": ["l"]}, - }, - { - "type": "press_key", - "press_key": {"hold_keys": ["Ctrl"], "keys": ["a"]}, - }, - { - "type": "press_key", - "press_key": {"hold_keys": ["Ctrl"], "keys": ["c"]}, - }, - { - "type": "press_key", - "press_key": {"keys": ["Escape"]}, - }, + _press_key_action(["l"], hold_keys=["Ctrl"]), + _press_key_action(["a"], hold_keys=["Ctrl"]), + _press_key_action(["c"], hold_keys=["Ctrl"]), + _press_key_action(["Escape"]), ] diff --git a/pkg/templates/typescript/openai-computer-use/lib/agent.ts b/pkg/templates/typescript/openai-computer-use/lib/agent.ts index 9027a70b..73f321ef 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/agent.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/agent.ts @@ -221,7 +221,7 @@ export class Agent { callId: string, argsObj: Record, ): Promise { - const actions = argsObj.actions as unknown as CuaAction[]; + const actions = Array.isArray(argsObj.actions) ? 
(argsObj.actions as CuaAction[]) : []; await this.computer.batchActions(actions); let statusText = 'Actions executed successfully.'; diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts index 65409de1..389b09cd 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -259,34 +259,34 @@ function isBatchComputerActionType(actionType: string): boolean { ); } +function pressKeyAction(keys: string[], holdKeys?: string[]): BatchAction { + return { type: 'press_key', press_key: normalizeKeypressPayload(keys, holdKeys) }; +} + function gotoBatchActions(url: string): BatchAction[] { return [ - { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['l'] } }, + pressKeyAction(['l'], ['Ctrl']), { type: 'sleep', sleep: { duration_ms: GOTO_CHORD_DELAY_MS } }, - { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['a'] } }, + pressKeyAction(['a'], ['Ctrl']), { type: 'type_text', type_text: { text: url } }, - { type: 'press_key', press_key: { keys: ['Return'] } }, + pressKeyAction(['Return']), ]; } function backBatchActions(): BatchAction[] { - return [ - { type: 'press_key', press_key: { hold_keys: ['Alt'], keys: ['Left'] } }, - ]; + return [pressKeyAction(['Left'], ['Alt'])]; } function forwardBatchActions(): BatchAction[] { - return [ - { type: 'press_key', press_key: { hold_keys: ['Alt'], keys: ['Right'] } }, - ]; + return [pressKeyAction(['Right'], ['Alt'])]; } function currentUrlBatchActions(): BatchAction[] { return [ - { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['l'] } }, - { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['a'] } }, - { type: 'press_key', press_key: { hold_keys: ['Ctrl'], keys: ['c'] } }, - { type: 'press_key', press_key: { keys: ['Escape'] } }, + pressKeyAction(['l'], ['Ctrl']), + pressKeyAction(['a'], ['Ctrl']), + 
pressKeyAction(['c'], ['Ctrl']), + pressKeyAction(['Escape']), ]; } From 5c705935f67d8381162e1e032a59c51fd4d03c2b Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Thu, 12 Mar 2026 11:25:44 -0400 Subject: [PATCH 10/17] Parameterize local CUA template tasks and centralize the computer tool shape. This makes local smoke tests easier to target while keeping the OpenAI computer tool definition consistent across the Python and TypeScript templates. Made-with: Cursor --- .../python/openai-computer-use/agent/agent.py | 7 ++++--- .../python/openai-computer-use/run_local.py | 11 +++++++++-- .../typescript/openai-computer-use/lib/agent.ts | 6 +++--- .../typescript/openai-computer-use/run_local.ts | 16 ++++++++++++++-- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/pkg/templates/python/openai-computer-use/agent/agent.py b/pkg/templates/python/openai-computer-use/agent/agent.py index c5d830b7..cd08d2ae 100644 --- a/pkg/templates/python/openai-computer-use/agent/agent.py +++ b/pkg/templates/python/openai-computer-use/agent/agent.py @@ -117,6 +117,9 @@ "strict": False, } +# Keep this shape aligned with CUA and current OpenAI Responses API. +OPENAI_COMPUTER_TOOL = {"type": "computer"} + class Agent: """An agent that uses OpenAI CUA with Kernel's native computer control API.""" @@ -140,9 +143,7 @@ def __init__( if computer: self.tools += [ - { - "type": "computer", - }, + dict(OPENAI_COMPUTER_TOOL), BATCH_TOOL, EXTRA_TOOL, ] diff --git a/pkg/templates/python/openai-computer-use/run_local.py b/pkg/templates/python/openai-computer-use/run_local.py index ef46d491..05f9dc51 100644 --- a/pkg/templates/python/openai-computer-use/run_local.py +++ b/pkg/templates/python/openai-computer-use/run_local.py @@ -3,7 +3,7 @@ No Kernel app deployment needed. Usage: - KERNEL_API_KEY=... OPENAI_API_KEY=... uv run run_local.py --output text + KERNEL_API_KEY=... OPENAI_API_KEY=... 
uv run run_local.py --task "go to example.com and summarize it" """ import argparse @@ -26,6 +26,8 @@ ) from computers.kernel_computer import KernelComputer +DEFAULT_TASK = "go to example.com and summarize what the page says" + def parse_args(): parser = argparse.ArgumentParser(description="Run OpenAI CUA local test") @@ -40,6 +42,11 @@ def parse_args(): action="store_true", help="Enable verbose debug payload logging", ) + parser.add_argument( + "--task", + default=DEFAULT_TASK, + help="User task prompt to run in the browser session", + ) return parser.parse_args() @@ -74,7 +81,7 @@ def main(): }, { "role": "user", - "content": "go to ebay.com and look up oberheim ob-x prices and give me a report", + "content": args.task, }, ] diff --git a/pkg/templates/typescript/openai-computer-use/lib/agent.ts b/pkg/templates/typescript/openai-computer-use/lib/agent.ts index 73f321ef..f86a7e84 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/agent.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/agent.ts @@ -17,6 +17,8 @@ import type { CuaAction, KernelComputer } from './kernel-computer'; const BATCH_FUNC_NAME = 'batch_computer_actions'; const EXTRA_FUNC_NAME = 'computer_use_extra'; +// Keep this shape aligned with CUA and current OpenAI Responses API. +const OPENAI_COMPUTER_TOOL = { type: 'computer' } as unknown as Tool; export class Agent { private model: string; @@ -40,9 +42,7 @@ export class Agent { this.ackCb = opts.acknowledge_safety_check_callback ?? ((): boolean => true); this.tools = [ - { - type: 'computer', - } as unknown as Tool, + OPENAI_COMPUTER_TOOL, batchComputerTool, computerUseExtraTool, ...(opts.tools ?? 
[]), diff --git a/pkg/templates/typescript/openai-computer-use/run_local.ts b/pkg/templates/typescript/openai-computer-use/run_local.ts index 84c52848..211a7538 100644 --- a/pkg/templates/typescript/openai-computer-use/run_local.ts +++ b/pkg/templates/typescript/openai-computer-use/run_local.ts @@ -21,15 +21,18 @@ dotenv.config({ override: true, quiet: true }); * No Kernel app deployment needed. * * Usage: - * KERNEL_API_KEY=... OPENAI_API_KEY=... npx tsx run_local.ts + * KERNEL_API_KEY=... OPENAI_API_KEY=... npx tsx run_local.ts --task "go to example.com and summarize it" */ +const DEFAULT_TASK = 'go to example.com and summarize what the page says'; + export async function runLocalTest(args: string[] = process.argv.slice(2)): Promise { if (!process.env.KERNEL_API_KEY) throw new Error('KERNEL_API_KEY is not set'); if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set'); const client = new Kernel({ apiKey: process.env.KERNEL_API_KEY }); const outputMode = parseOutputMode(args); + const task = parseTask(args); const debug = args.includes('--debug'); const onEvent = createEventLogger({ output: outputMode, verbose: debug }); @@ -69,7 +72,7 @@ export async function runLocalTest(args: string[] = process.argv.slice(2)): Prom content: [ { type: 'input_text', - text: 'go to ebay.com and look up oberheim ob-x prices and give me a report', + text: task, }, ], }, @@ -100,6 +103,15 @@ function parseOutputMode(args: string[]): OutputMode { return output === 'jsonl' ? 'jsonl' : 'text'; } +function parseTask(args: string[]): string { + const taskFromEquals = args.find((arg) => arg.startsWith('--task='))?.slice('--task='.length).trim(); + const taskFlagIndex = args.findIndex((arg) => arg === '--task'); + const nextArg = taskFlagIndex >= 0 ? args[taskFlagIndex + 1] : undefined; + const taskFromNext = nextArg && !nextArg.startsWith('--') ? nextArg.trim() : undefined; + const task = taskFromEquals || taskFromNext; + return task && task.length > 0 ? 
task : DEFAULT_TASK; +} + function isDirectRun(): boolean { const entry = process.argv[1]; if (!entry) return false; From e3b2dffc647481605898030578aeab44619cd1ed Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Thu, 12 Mar 2026 11:27:37 -0400 Subject: [PATCH 11/17] Fix TypeScript batch key payload modifiers. This remaps normalized modifier keys to the Kernel API's `hold_keys` field so browser navigation helpers keep sending Ctrl/Alt chords correctly. Made-with: Cursor --- .../openai-computer-use/lib/kernel-computer.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts index 389b09cd..0740bec1 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -260,7 +260,14 @@ function isBatchComputerActionType(actionType: string): boolean { } function pressKeyAction(keys: string[], holdKeys?: string[]): BatchAction { - return { type: 'press_key', press_key: normalizeKeypressPayload(keys, holdKeys) }; + const normalized = normalizeKeypressPayload(keys, holdKeys); + return { + type: 'press_key', + press_key: { + keys: normalized.keys, + ...(normalized.holdKeys.length > 0 ? { hold_keys: normalized.holdKeys } : {}), + }, + }; } function gotoBatchActions(url: string): BatchAction[] { From 44e1533ea8dbb58540cb0bf1f4f29e53781167b9 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Thu, 12 Mar 2026 13:40:55 -0400 Subject: [PATCH 12/17] Align the TypeScript CUA template with the current Responses API. Return multimodal tool outputs directly, validate drag paths instead of silently no-oping them, and simplify the local runner so generated apps behave consistently in debug and deployed flows. 
Made-with: Cursor --- .../typescript/openai-computer-use/README.md | 11 +--- .../typescript/openai-computer-use/index.ts | 5 +- .../openai-computer-use/lib/agent.ts | 45 ++++++++-------- .../lib/kernel-computer.ts | 53 +++++++++++-------- .../openai-computer-use/lib/log-events.ts | 2 - .../openai-computer-use/lib/logging.ts | 14 +---- .../openai-computer-use/lib/toolset.ts | 19 ++++++- .../openai-computer-use/lib/utils.ts | 12 +++++ .../openai-computer-use/run_local.ts | 15 +----- 9 files changed, 92 insertions(+), 84 deletions(-) diff --git a/pkg/templates/typescript/openai-computer-use/README.md b/pkg/templates/typescript/openai-computer-use/README.md index e652eedf..b8bab4aa 100644 --- a/pkg/templates/typescript/openai-computer-use/README.md +++ b/pkg/templates/typescript/openai-computer-use/README.md @@ -12,24 +12,17 @@ You can test against a remote Kernel browser without deploying: cp .env.example .env # Fill in OPENAI_API_KEY and KERNEL_API_KEY in .env pnpm install -pnpm run test:local -# Equivalent direct run from the app entrypoint: -pnpm exec tsx index.ts -# Direct run of the local runner file: pnpm exec tsx run_local.ts -# JSONL event output -pnpm run test:local -- --output=jsonl +pnpm exec tsx run_local.ts --task "go to https://news.ycombinator.com and get the top 5 articles" ``` -The local runner defaults to human-readable logs (`text`). Use `--output=jsonl` for one structured event per line (including backend events). Add `--debug` to include verbose in-flight events. +The local runner defaults to a built-in sample task. Pass `--task "..."` to run a custom prompt locally, and add `--debug` to include verbose in-flight events. 
## Deploy to Kernel ```bash kernel deploy index.ts --env-file .env kernel invoke ts-openai-cua cua-task -p '{"task":"Go to https://news.ycombinator.com and get the top 5 articles"}' -# JSONL logs for invocation -kernel invoke ts-openai-cua cua-task -p '{"task":"Go to https://news.ycombinator.com and get the top 5 articles","output":"jsonl"}' ``` See the [docs](https://www.kernel.sh/docs/quickstart) for more information. diff --git a/pkg/templates/typescript/openai-computer-use/index.ts b/pkg/templates/typescript/openai-computer-use/index.ts index 806cbddd..fc70b829 100644 --- a/pkg/templates/typescript/openai-computer-use/index.ts +++ b/pkg/templates/typescript/openai-computer-use/index.ts @@ -11,13 +11,11 @@ import { emitBrowserNewStarted, emitSessionState, } from './lib/logging'; -import type { OutputMode } from './lib/log-events'; dotenv.config({ override: true, quiet: true }); interface CuaInput { task: string; - output?: OutputMode; } interface CuaOutput { elapsed: number; @@ -50,8 +48,7 @@ app.action( async (ctx: KernelContext, payload?: CuaInput): Promise => { const start = Date.now(); if (!payload?.task) throw new Error('task is required'); - const outputMode: OutputMode = payload.output === 'jsonl' ? 
'jsonl' : 'text'; - const onEvent = createEventLogger({ output: outputMode }); + const onEvent = createEventLogger(); emitBrowserNewStarted(onEvent); const browserCreateStartedAt = Date.now(); diff --git a/pkg/templates/typescript/openai-computer-use/lib/agent.ts b/pkg/templates/typescript/openai-computer-use/lib/agent.ts index f86a7e84..b7234d21 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/agent.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/agent.ts @@ -17,6 +17,7 @@ import type { CuaAction, KernelComputer } from './kernel-computer'; const BATCH_FUNC_NAME = 'batch_computer_actions'; const EXTRA_FUNC_NAME = 'computer_use_extra'; +const POST_ACTION_SETTLE_MS = 300; // Keep this shape aligned with CUA and current OpenAI Responses API. const OPENAI_COMPUTER_TOOL = { type: 'computer' } as unknown as Tool; @@ -70,6 +71,11 @@ export class Agent { return this.modelRequestStartedAt === null ? null : Date.now() - this.modelRequestStartedAt; } + private async capturePostActionScreenshot(): Promise { + await new Promise((resolve) => setTimeout(resolve, POST_ACTION_SETTLE_MS)); + return this.computer.screenshot(); + } + private extractReasoningText(item: Record): string { const summary = item.summary; if (!Array.isArray(summary)) return ''; @@ -187,7 +193,7 @@ export class Agent { }); await this.computer.batchActions(actionList as CuaAction[]); - const screenshot = await this.computer.screenshot(); + const screenshot = await this.capturePostActionScreenshot(); this.emit('screenshot', { captured: true, bytes_base64: screenshot.length }); const pending = cc.pending_safety_checks ?? 
[]; @@ -203,7 +209,6 @@ export class Agent { type: 'computer_screenshot', image_url: `data:image/png;base64,${screenshot}`, } as unknown as ResponseComputerToolCallOutputItem['output']; - (screenshotOutput as { current_url?: string }).current_url = currentUrl; const out: Omit = { type: 'computer_call_output', @@ -235,20 +240,18 @@ export class Agent { } } - const outputItems: Array> = [{ type: 'text', text: statusText }]; - if (terminalReadAction !== 'url') { - const screenshot = await this.computer.screenshot(); - outputItems.push({ - type: 'image_url', - image_url: `data:image/png;base64,${screenshot}`, - detail: 'original', - }); - } + const screenshot = await this.capturePostActionScreenshot(); + const outputItems: Array> = [{ type: 'input_text', text: statusText }]; + outputItems.push({ + type: 'input_image', + image_url: `data:image/png;base64,${screenshot}`, + detail: 'original', + }); return [ { type: 'function_call_output', call_id: callId, - output: JSON.stringify(outputItems), + output: outputItems, } as unknown as ResponseFunctionToolCallOutputItem, ]; } @@ -273,20 +276,18 @@ export class Agent { statusText = `unknown ${EXTRA_FUNC_NAME} action: ${action}`; } - const outputItems: Array> = [{ type: 'text', text: statusText }]; - if (action !== 'url') { - const screenshot = await this.computer.screenshot(); - outputItems.push({ - type: 'image_url', - image_url: `data:image/png;base64,${screenshot}`, - detail: 'original', - }); - } + const screenshot = await this.capturePostActionScreenshot(); + const outputItems: Array> = [{ type: 'input_text', text: statusText }]; + outputItems.push({ + type: 'input_image', + image_url: `data:image/png;base64,${screenshot}`, + detail: 'original', + }); return [ { type: 'function_call_output', call_id: callId, - output: JSON.stringify(outputItems), + output: outputItems, } as unknown as ResponseFunctionToolCallOutputItem, ]; } diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts 
b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts index 0740bec1..0e2e96c1 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -165,26 +165,40 @@ function normalizeButton(button?: string | number): string { return button; } -function normalizeDragPath(path: Array<{ x: number; y: number }> | undefined): number[][] { - const points: Array<[number, number]> = (path ?? []) - .filter( - (point): point is { x: number; y: number } => - typeof point?.x === 'number' && - Number.isFinite(point.x) && - typeof point?.y === 'number' && - Number.isFinite(point.y), - ) - .map((point): [number, number] => [Math.trunc(point.x), Math.trunc(point.y)]); - if (points.length === 0) return []; - if (points.length === 1) { - const [x, y] = points[0]!; - return [[x, y], [x + 1, y]]; +function normalizeDragPath(path: unknown): number[][] { + if (!Array.isArray(path)) return []; + + const points: Array<[number, number]> = []; + for (const point of path) { + if (Array.isArray(point) && point.length >= 2) { + const [x, y] = point; + if (typeof x === 'number' && Number.isFinite(x) && typeof y === 'number' && Number.isFinite(y)) { + points.push([Math.trunc(x), Math.trunc(y)]); + } + continue; + } + + if ( + point && + typeof point === 'object' && + typeof (point as { x?: unknown }).x === 'number' && + Number.isFinite((point as { x: number }).x) && + typeof (point as { y?: unknown }).y === 'number' && + Number.isFinite((point as { y: number }).y) + ) { + points.push([ + Math.trunc((point as { x: number }).x), + Math.trunc((point as { y: number }).y), + ]); + } } + return points; } -function dragNoopAction(): BatchAction { - return { type: 'sleep', sleep: { duration_ms: 1 } }; +function validateDragPath(path: number[][]): void { + if (path.length >= 2) return; + throw new Error(`drag action requires path with at least two points; got ${JSON.stringify(path)}`); } function 
translateCuaAction(action: CuaAction): BatchAction { @@ -243,7 +257,7 @@ function translateCuaAction(action: CuaAction): BatchAction { return { type: 'move_mouse', move_mouse: { x: action.x ?? 0, y: action.y ?? 0 } }; case 'drag': { const path = normalizeDragPath(action.path); - if (path.length < 2) return dragNoopAction(); + validateDragPath(path); return { type: 'drag_mouse', drag_mouse: { path } }; } case 'wait': @@ -514,10 +528,7 @@ export class KernelComputer { const op = describeAction('drag', { path }); await this.traceCall(op, async () => { const normalizedPath = normalizeDragPath(path); - if (normalizedPath.length < 2) { - await new Promise((resolve) => setTimeout(resolve, 1)); - return; - } + validateDragPath(normalizedPath); await this.client.browsers.computer.dragMouse(this.sessionId, { path: normalizedPath }); }); } diff --git a/pkg/templates/typescript/openai-computer-use/lib/log-events.ts b/pkg/templates/typescript/openai-computer-use/lib/log-events.ts index d22852e4..87643472 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/log-events.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/log-events.ts @@ -1,5 +1,3 @@ -export type OutputMode = 'text' | 'jsonl'; - export type AgentEventName = | 'session_state' | 'backend' diff --git a/pkg/templates/typescript/openai-computer-use/lib/logging.ts b/pkg/templates/typescript/openai-computer-use/lib/logging.ts index 2abfbc91..77eaf3c6 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/logging.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/logging.ts @@ -1,4 +1,4 @@ -import type { AgentEvent, OutputMode } from './log-events'; +import type { AgentEvent } from './log-events'; const MAX_LINE_WIDTH = 120; @@ -142,19 +142,9 @@ class ThinkingSpinner { } } -export function createEventLogger(opts?: { - output?: OutputMode; - verbose?: boolean; -}): (event: AgentEvent) => void { - const output = opts?.output ?? 
'text'; +export function createEventLogger(opts?: { verbose?: boolean }): (event: AgentEvent) => void { const verbose = opts?.verbose ?? false; - if (output === 'jsonl') { - return (event: AgentEvent): void => { - process.stdout.write(`${JSON.stringify({ event: event.event, data: event.data })}\n`); - }; - } - let inText = false; let lastLiveViewUrl = ''; const spinner = new ThinkingSpinner(process.stdout.isTTY); diff --git a/pkg/templates/typescript/openai-computer-use/lib/toolset.ts b/pkg/templates/typescript/openai-computer-use/lib/toolset.ts index 815e2b69..441563be 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/toolset.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/toolset.ts @@ -14,7 +14,12 @@ Use computer_use_extra for: When interacting with page content (search boxes, forms, chat inputs): - Click the target input first, then type. -- Do not use URL-navigation actions for in-page text entry.`; +- Do not use URL-navigation actions for in-page text entry. + +For drag actions in batch_computer_actions: +- Always include a path field. +- path must be an array of at least two points. +- Each point must be an object like {"x": 123, "y": 456}.`; export const batchComputerTool = { type: 'function' as const, @@ -65,6 +70,18 @@ export const batchComputerTool = { button: { type: 'string' }, scroll_x: { type: 'number' }, scroll_y: { type: 'number' }, + path: { + type: 'array', + description: 'Required for drag actions. 
Provide at least two points as objects with x/y coordinates.', + items: { + type: 'object', + properties: { + x: { type: 'number' }, + y: { type: 'number' }, + }, + required: ['x', 'y'], + }, + }, }, required: ['type'], }, diff --git a/pkg/templates/typescript/openai-computer-use/lib/utils.ts b/pkg/templates/typescript/openai-computer-use/lib/utils.ts index 0241ddf0..9a3134bf 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/utils.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/utils.ts @@ -28,6 +28,18 @@ export function sanitizeMessage(msg: ResponseItem): ResponseItem { output.image_url = '[omitted]'; } } + if ( + sanitizedMsg.type === 'function_call_output' && + Array.isArray((sanitizedMsg as { output?: unknown }).output) + ) { + const outputItems = (sanitizedMsg as unknown as { output: Array<{ type?: unknown; image_url?: unknown }> }).output; + sanitizedMsg.output = outputItems.map((item) => { + if (item.type === 'input_image' && typeof item.image_url === 'string') { + return { ...item, image_url: '[omitted]' }; + } + return item; + }) as typeof sanitizedMsg.output; + } return sanitizedMsg; } diff --git a/pkg/templates/typescript/openai-computer-use/run_local.ts b/pkg/templates/typescript/openai-computer-use/run_local.ts index 211a7538..66cf30d2 100644 --- a/pkg/templates/typescript/openai-computer-use/run_local.ts +++ b/pkg/templates/typescript/openai-computer-use/run_local.ts @@ -12,7 +12,6 @@ import { emitBrowserNewStarted, emitSessionState, } from './lib/logging'; -import type { OutputMode } from './lib/log-events'; dotenv.config({ override: true, quiet: true }); @@ -31,10 +30,9 @@ export async function runLocalTest(args: string[] = process.argv.slice(2)): Prom if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set'); const client = new Kernel({ apiKey: process.env.KERNEL_API_KEY }); - const outputMode = parseOutputMode(args); const task = parseTask(args); const debug = args.includes('--debug'); - const onEvent = 
createEventLogger({ output: outputMode, verbose: debug }); + const onEvent = createEventLogger({ verbose: debug }); emitBrowserNewStarted(onEvent); const browserCreateStartedAt = Date.now(); @@ -72,7 +70,7 @@ export async function runLocalTest(args: string[] = process.argv.slice(2)): Prom content: [ { type: 'input_text', - text: task, + text: task, }, ], }, @@ -94,15 +92,6 @@ export async function runLocalTest(args: string[] = process.argv.slice(2)): Prom } } -function parseOutputMode(args: string[]): OutputMode { - const outputArg = args.find((arg) => arg.startsWith('--output=')); - const outputFromEquals = outputArg?.split('=')[1]; - const outputFlagIndex = args.findIndex((arg) => arg === '--output'); - const outputFromNext = outputFlagIndex >= 0 ? args[outputFlagIndex + 1] : undefined; - const output = outputFromEquals ?? outputFromNext; - return output === 'jsonl' ? 'jsonl' : 'text'; -} - function parseTask(args: string[]): string { const taskFromEquals = args.find((arg) => arg.startsWith('--task='))?.slice('--task='.length).trim(); const taskFlagIndex = args.findIndex((arg) => arg === '--task'); From ab1934acee3d5395ced77dd60e3e40a14ebda307 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Thu, 12 Mar 2026 13:41:00 -0400 Subject: [PATCH 13/17] Align the Python CUA template with the current Responses API. Return multimodal tool outputs directly, document and validate drag paths, and simplify local runner logging so the generated app fails loudly on bad drag batches instead of degrading into no-op sleeps. 
Made-with: Cursor --- .../python/openai-computer-use/README.md | 7 +- .../python/openai-computer-use/agent/agent.py | 69 ++++++++++++------- .../openai-computer-use/agent/logging.py | 16 +---- .../computers/kernel_computer.py | 28 ++++---- .../python/openai-computer-use/main.py | 8 +-- .../python/openai-computer-use/run_local.py | 8 +-- .../python/openai-computer-use/utils.py | 19 +++++ 7 files changed, 86 insertions(+), 69 deletions(-) diff --git a/pkg/templates/python/openai-computer-use/README.md b/pkg/templates/python/openai-computer-use/README.md index 1276eca8..684b36d3 100644 --- a/pkg/templates/python/openai-computer-use/README.md +++ b/pkg/templates/python/openai-computer-use/README.md @@ -12,19 +12,16 @@ You can test against a remote Kernel browser without deploying: cp .env.example .env # Fill in OPENAI_API_KEY and KERNEL_API_KEY in .env uv run run_local.py -# JSONL event output -uv run run_local.py --output jsonl +uv run run_local.py --task "go to https://news.ycombinator.com and get the top 5 articles" ``` -The local runner defaults to human-readable logs (`text`). Use `--output jsonl` for one structured event per line (including backend events). Add `--debug` to include verbose in-flight events. +The local runner defaults to a built-in sample task. Pass `--task "..."` to run a custom prompt locally, and add `--debug` to include verbose in-flight events. ## Deploy to Kernel ```bash kernel deploy main.py --env-file .env kernel invoke python-openai-cua cua-task -p '{"task":"go to https://news.ycombinator.com and list top 5 articles"}' -# JSONL logs for invocation -kernel invoke python-openai-cua cua-task -p '{"task":"go to https://news.ycombinator.com and list top 5 articles","output":"jsonl"}' ``` See the [docs](https://www.kernel.sh/docs/quickstart) for more information. 
diff --git a/pkg/templates/python/openai-computer-use/agent/agent.py b/pkg/templates/python/openai-computer-use/agent/agent.py index cd08d2ae..c1f0c8a1 100644 --- a/pkg/templates/python/openai-computer-use/agent/agent.py +++ b/pkg/templates/python/openai-computer-use/agent/agent.py @@ -16,6 +16,7 @@ BATCH_FUNC_NAME = "batch_computer_actions" EXTRA_FUNC_NAME = "computer_use_extra" +POST_ACTION_SETTLE_SECONDS = 0.3 BATCH_INSTRUCTIONS = """You have three ways to perform actions: 1. The standard computer tool — use for single actions when you need screenshot feedback after each step. @@ -33,7 +34,12 @@ When interacting with page content (search boxes, forms, chat inputs): - Click the target input first, then type. -- Do not use URL-navigation actions for in-page text entry.""" +- Do not use URL-navigation actions for in-page text entry. + +For drag actions in batch_computer_actions: +- Always include a path field. +- path must be an array of at least two points. +- Each point must be an object like {"x": 123, "y": 456}.""" BATCH_TOOL = { "type": "function", @@ -85,6 +91,18 @@ "button": {"type": "string"}, "scroll_x": {"type": "number"}, "scroll_y": {"type": "number"}, + "path": { + "type": "array", + "description": "Required for drag actions. 
Provide at least two points as objects with x/y coordinates.", + "items": { + "type": "object", + "properties": { + "x": {"type": "number"}, + "y": {"type": "number"}, + }, + "required": ["x", "y"], + }, + }, }, "required": ["type"], }, @@ -161,6 +179,10 @@ def _current_model_elapsed_ms(self) -> int | None: return None return int((time.time() - self._model_request_started_at) * 1000) + def _capture_post_action_screenshot(self) -> str: + time.sleep(POST_ACTION_SETTLE_SECONDS) + return self.computer.screenshot() + def _extract_reasoning_text(self, item: dict[str, Any]) -> str: summary = item.get("summary") if not isinstance(summary, list): @@ -289,7 +311,7 @@ def handle_item(self, item): self._emit_event("action", payload) self.computer.batch_actions(typed_actions) - screenshot_base64 = self.computer.screenshot() + screenshot_base64 = self._capture_post_action_screenshot() self._emit_event( "screenshot", {"captured": True, "bytes_base64": len(screenshot_base64)}, @@ -318,7 +340,6 @@ def handle_item(self, item): if self.computer.get_environment() == "browser": current_url = self.computer.get_current_url() check_blocklisted_url(current_url) - call_output["output"]["current_url"] = current_url return [call_output] return [] @@ -336,21 +357,20 @@ def _handle_batch_call(self, call_id, args): status_text = f"Actions executed successfully. 
Current URL: {current_url}" except Exception as exc: status_text = f"Actions executed, but url() failed: {exc}" - output_items: list[dict[str, Any]] = [{"type": "text", "text": status_text}] - if terminal_action != "url": - screenshot_base64 = self.computer.screenshot() - output_items.append( - { - "type": "image_url", - "image_url": f"data:image/png;base64,{screenshot_base64}", - "detail": "original", - } - ) + screenshot_base64 = self._capture_post_action_screenshot() + output_items: list[dict[str, Any]] = [{"type": "input_text", "text": status_text}] + output_items.append( + { + "type": "input_image", + "image_url": f"data:image/png;base64,{screenshot_base64}", + "detail": "original", + } + ) return [ { "type": "function_call_output", "call_id": call_id, - "output": json.dumps(output_items), + "output": output_items, } ] @@ -368,21 +388,20 @@ def _handle_extra_call(self, call_id, args): else: status_text = f"unknown {EXTRA_FUNC_NAME} action: {action}" - output_items: list[dict[str, Any]] = [{"type": "text", "text": status_text}] - if action != "url": - screenshot_base64 = self.computer.screenshot() - output_items.append( - { - "type": "image_url", - "image_url": f"data:image/png;base64,{screenshot_base64}", - "detail": "original", - } - ) + screenshot_base64 = self._capture_post_action_screenshot() + output_items: list[dict[str, Any]] = [{"type": "input_text", "text": status_text}] + output_items.append( + { + "type": "input_image", + "image_url": f"data:image/png;base64,{screenshot_base64}", + "detail": "original", + } + ) return [ { "type": "function_call_output", "call_id": call_id, - "output": json.dumps(output_items), + "output": output_items, } ] diff --git a/pkg/templates/python/openai-computer-use/agent/logging.py b/pkg/templates/python/openai-computer-use/agent/logging.py index f6f5ccfc..3895b9ee 100644 --- a/pkg/templates/python/openai-computer-use/agent/logging.py +++ b/pkg/templates/python/openai-computer-use/agent/logging.py @@ -1,11 +1,9 @@ -import 
json import sys import threading import time from datetime import datetime -from typing import Callable, Literal +from typing import Callable -OutputMode = Literal["text", "jsonl"] MAX_LINE_WIDTH = 120 @@ -119,17 +117,7 @@ def _run(self) -> None: sys.stdout.flush() -def create_event_logger( - output: OutputMode = "text", verbose: bool = False -) -> Callable[[dict], None]: - if output == "jsonl": - def render_jsonl(event: dict) -> None: - payload = {"event": event.get("event"), "data": event.get("data", {})} - sys.stdout.write(json.dumps(payload, default=str) + "\n") - sys.stdout.flush() - - return render_jsonl - +def create_event_logger(verbose: bool = False) -> Callable[[dict], None]: spinner = _ThinkingSpinner(sys.stdout.isatty()) in_text = False last_live_view_url = "" diff --git a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py index 993f1f17..c70dc419 100644 --- a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py +++ b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py @@ -125,6 +125,16 @@ def _normalize_drag_path(path: Any) -> List[List[int]]: points: List[List[int]] = [] if isinstance(path, list): for point in path: + if isinstance(point, (list, tuple)) and len(point) >= 2: + x, y = point[0], point[1] + if ( + isinstance(x, (int, float)) + and not isinstance(x, bool) + and isinstance(y, (int, float)) + and not isinstance(y, bool) + ): + points.append([int(x), int(y)]) + continue if not isinstance(point, dict): continue x = point.get("x") @@ -136,16 +146,13 @@ def _normalize_drag_path(path: Any) -> List[List[int]]: and not isinstance(y, bool) ): points.append([int(x), int(y)]) - if not points: - return [] - if len(points) == 1: - x, y = points[0] - return [[x, y], [x + 1, y]] return points -def _drag_noop_action() -> Dict[str, Any]: - return {"type": "sleep", "sleep": {"duration_ms": 1}} +def _validate_drag_path(path: 
List[List[int]]) -> None: + if len(path) >= 2: + return + raise ValueError(f"drag action requires path with at least two points; got {path!r}") def _translate_cua_action(action: Dict[str, Any]) -> Dict[str, Any]: @@ -213,8 +220,7 @@ def _translate_cua_action(action: Dict[str, Any]) -> Dict[str, Any]: return {"type": "move_mouse", "move_mouse": {"x": action.get("x", 0), "y": action.get("y", 0)}} elif action_type == "drag": path = _normalize_drag_path(action.get("path", [])) - if len(path) < 2: - return _drag_noop_action() + _validate_drag_path(path) return {"type": "drag_mouse", "drag_mouse": {"path": path}} elif action_type == "wait": return {"type": "sleep", "sleep": {"duration_ms": action.get("ms", 1000)}} @@ -552,9 +558,7 @@ def drag(self, path: List[Dict[str, int]]) -> None: def _do() -> None: normalized_path = _normalize_drag_path(path) - if len(normalized_path) < 2: - time.sleep(0.001) - return + _validate_drag_path(normalized_path) self.client.browsers.computer.drag_mouse(self.session_id, path=normalized_path) self._trace_backend(op, _do) diff --git a/pkg/templates/python/openai-computer-use/main.py b/pkg/templates/python/openai-computer-use/main.py index db238212..e881e0fd 100644 --- a/pkg/templates/python/openai-computer-use/main.py +++ b/pkg/templates/python/openai-computer-use/main.py @@ -1,7 +1,7 @@ import asyncio import datetime import os -from typing import NotRequired, TypedDict +from typing import TypedDict import kernel from agent import Agent @@ -32,7 +32,6 @@ class CuaInput(TypedDict): task: str - output: NotRequired[str] class CuaOutput(TypedDict): @@ -55,10 +54,7 @@ async def cua_task( if not payload or not payload.get("task"): raise ValueError("task is required") - output_mode = payload.get("output", "text") - if output_mode not in ("text", "jsonl"): - output_mode = "text" - on_event = create_event_logger(output=output_mode) + on_event = create_event_logger() browser_create_started_at = datetime.datetime.now() 
emit_browser_new_started(on_event) diff --git a/pkg/templates/python/openai-computer-use/run_local.py b/pkg/templates/python/openai-computer-use/run_local.py index 05f9dc51..ae7de771 100644 --- a/pkg/templates/python/openai-computer-use/run_local.py +++ b/pkg/templates/python/openai-computer-use/run_local.py @@ -31,12 +31,6 @@ def parse_args(): parser = argparse.ArgumentParser(description="Run OpenAI CUA local test") - parser.add_argument( - "--output", - choices=["text", "jsonl"], - default="text", - help="Log output mode", - ) parser.add_argument( "--debug", action="store_true", @@ -58,7 +52,7 @@ def main(): raise ValueError("OPENAI_API_KEY is not set") client = Kernel(api_key=os.getenv("KERNEL_API_KEY")) - on_event = create_event_logger(output=args.output, verbose=args.debug) + on_event = create_event_logger(verbose=args.debug) browser_create_started_at = datetime.datetime.now() emit_browser_new_started(on_event) diff --git a/pkg/templates/python/openai-computer-use/utils.py b/pkg/templates/python/openai-computer-use/utils.py index 0204dc3d..cc3510c1 100644 --- a/pkg/templates/python/openai-computer-use/utils.py +++ b/pkg/templates/python/openai-computer-use/utils.py @@ -41,6 +41,25 @@ def sanitize_message(msg: dict) -> dict: sanitized = msg.copy() sanitized["output"] = {**output, "image_url": "[omitted]"} return sanitized + if msg.get("type") == "function_call_output": + output = msg.get("output") + if isinstance(output, list): + sanitized_items = [] + changed = False + for item in output: + if ( + isinstance(item, dict) + and item.get("type") == "input_image" + and isinstance(item.get("image_url"), str) + ): + sanitized_items.append({**item, "image_url": "[omitted]"}) + changed = True + else: + sanitized_items.append(item) + if changed: + sanitized = msg.copy() + sanitized["output"] = sanitized_items + return sanitized return msg From 11595f9d5918421dd061d0dd718dbd5c97ccc0a7 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Thu, 12 Mar 2026 13:41:05 -0400 
Subject: [PATCH 14/17] Ignore dmux workspace state. Keep local dmux artifacts out of the repo so template verification and review commits stay clean. Made-with: Cursor --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 90ca5864..092b5596 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,4 @@ qa-* __pycache__ +.dmux/ From eb5d6a1700ba1f9614b6b4360c897e9f040630bf Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Thu, 12 Mar 2026 14:12:46 -0400 Subject: [PATCH 15/17] Refine CUA drag batching guidance and quiet Python transport logs. Steer the templates away from batching layout-changing drags and suppress noisy httpx request logs so local runs stay readable without hiding the template's own events. Made-with: Cursor --- pkg/templates/python/openai-computer-use/agent/agent.py | 2 ++ pkg/templates/python/openai-computer-use/agent/logging.py | 8 ++++++++ pkg/templates/python/openai-computer-use/main.py | 2 ++ pkg/templates/python/openai-computer-use/run_local.py | 2 ++ .../typescript/openai-computer-use/lib/toolset.ts | 2 ++ 5 files changed, 16 insertions(+) diff --git a/pkg/templates/python/openai-computer-use/agent/agent.py b/pkg/templates/python/openai-computer-use/agent/agent.py index c1f0c8a1..400ce12b 100644 --- a/pkg/templates/python/openai-computer-use/agent/agent.py +++ b/pkg/templates/python/openai-computer-use/agent/agent.py @@ -39,6 +39,8 @@ For drag actions in batch_computer_actions: - Always include a path field. - path must be an array of at least two points. +- If one drag is likely to change the position, order, or layout of other targets, do not batch multiple drags together. +- In those cases, prefer one drag at a time and inspect the updated screenshot before planning the next drag. 
- Each point must be an object like {"x": 123, "y": 456}.""" BATCH_TOOL = { diff --git a/pkg/templates/python/openai-computer-use/agent/logging.py b/pkg/templates/python/openai-computer-use/agent/logging.py index 3895b9ee..2241aed2 100644 --- a/pkg/templates/python/openai-computer-use/agent/logging.py +++ b/pkg/templates/python/openai-computer-use/agent/logging.py @@ -1,3 +1,4 @@ +import logging import sys import threading import time @@ -7,6 +8,13 @@ MAX_LINE_WIDTH = 120 +def quiet_http_transport_logs() -> None: + # The Kernel Python SDK uses httpx underneath, and those request logs can + # become noisy when the surrounding process configures root logging at INFO. + logging.getLogger("httpx").setLevel(logging.WARNING) + logging.getLogger("httpcore").setLevel(logging.WARNING) + + def _timestamp() -> str: return datetime.now().strftime("%H:%M:%S.%f")[:-3] diff --git a/pkg/templates/python/openai-computer-use/main.py b/pkg/templates/python/openai-computer-use/main.py index e881e0fd..c7bcfa09 100644 --- a/pkg/templates/python/openai-computer-use/main.py +++ b/pkg/templates/python/openai-computer-use/main.py @@ -12,6 +12,7 @@ emit_browser_new_done, emit_browser_new_started, emit_session_state, + quiet_http_transport_logs, ) from computers.kernel_computer import KernelComputer from kernel import Kernel @@ -42,6 +43,7 @@ class CuaOutput(TypedDict): if not api_key: raise ValueError("OPENAI_API_KEY is not set") +quiet_http_transport_logs() client = Kernel() app = kernel.App("python-openai-cua") diff --git a/pkg/templates/python/openai-computer-use/run_local.py b/pkg/templates/python/openai-computer-use/run_local.py index ae7de771..97e79145 100644 --- a/pkg/templates/python/openai-computer-use/run_local.py +++ b/pkg/templates/python/openai-computer-use/run_local.py @@ -23,6 +23,7 @@ emit_browser_new_done, emit_browser_new_started, emit_session_state, + quiet_http_transport_logs, ) from computers.kernel_computer import KernelComputer @@ -51,6 +52,7 @@ def main(): if not 
os.getenv("OPENAI_API_KEY"): raise ValueError("OPENAI_API_KEY is not set") + quiet_http_transport_logs() client = Kernel(api_key=os.getenv("KERNEL_API_KEY")) on_event = create_event_logger(verbose=args.debug) diff --git a/pkg/templates/typescript/openai-computer-use/lib/toolset.ts b/pkg/templates/typescript/openai-computer-use/lib/toolset.ts index 441563be..aa43b9f1 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/toolset.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/toolset.ts @@ -19,6 +19,8 @@ When interacting with page content (search boxes, forms, chat inputs): For drag actions in batch_computer_actions: - Always include a path field. - path must be an array of at least two points. +- If one drag is likely to change the position, order, or layout of other targets, do not batch multiple drags together. +- In those cases, prefer one drag at a time and inspect the updated screenshot before planning the next drag. - Each point must be an object like {"x": 123, "y": 456}.`; export const batchComputerTool = { From 16536a03ee9469a2bd440b7fa4f36d66a3bb4d20 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Thu, 12 Mar 2026 14:33:16 -0400 Subject: [PATCH 16/17] Add optional replay recording to the OpenAI CUA templates. Allow both deployed actions and local runners to start and stop Kernel browser replays on demand so replay capture can be enabled per run without changing the default flow. 
Made-with: Cursor --- .../python/openai-computer-use/main.py | 26 +++- .../python/openai-computer-use/replay.py | 114 ++++++++++++++++++ .../python/openai-computer-use/run_local.py | 10 ++ .../typescript/openai-computer-use/index.ts | 20 ++- .../openai-computer-use/lib/replay.ts | 114 ++++++++++++++++++ .../openai-computer-use/run_local.ts | 18 +++ 6 files changed, 295 insertions(+), 7 deletions(-) create mode 100644 pkg/templates/python/openai-computer-use/replay.py create mode 100644 pkg/templates/typescript/openai-computer-use/lib/replay.ts diff --git a/pkg/templates/python/openai-computer-use/main.py b/pkg/templates/python/openai-computer-use/main.py index c7bcfa09..a20c3598 100644 --- a/pkg/templates/python/openai-computer-use/main.py +++ b/pkg/templates/python/openai-computer-use/main.py @@ -1,7 +1,7 @@ import asyncio import datetime import os -from typing import TypedDict +from typing import NotRequired, TypedDict import kernel from agent import Agent @@ -16,6 +16,7 @@ ) from computers.kernel_computer import KernelComputer from kernel import Kernel +from replay import maybe_start_replay, maybe_stop_replay """ Example app that runs an agent using openai CUA @@ -33,10 +34,12 @@ class CuaInput(TypedDict): task: str + replay: NotRequired[bool] class CuaOutput(TypedDict): result: str + replay_url: NotRequired[str] api_key = os.getenv("OPENAI_API_KEY") @@ -69,6 +72,14 @@ async def cua_task( emit_session_state( on_event, kernel_browser.session_id, kernel_browser.browser_live_view_url ) + replay = await asyncio.to_thread( + maybe_start_replay, + client, + kernel_browser.session_id, + bool(payload.get("replay", False)), + on_event, + ) + replay_url: str | None = None def run_agent(): computer = KernelComputer(client, kernel_browser.session_id, on_event=on_event) @@ -117,13 +128,24 @@ def run_agent(): return {"result": result} try: - return await asyncio.to_thread(run_agent) + result = await asyncio.to_thread(run_agent) finally: browser_delete_started_at = 
datetime.datetime.now() emit_browser_delete_started(on_event) try: + replay_url = await asyncio.to_thread( + maybe_stop_replay, + client, + kernel_browser.session_id, + replay, + on_event, + ) await asyncio.to_thread(client.browsers.delete_by_id, kernel_browser.session_id) finally: emit_browser_delete_done(on_event, browser_delete_started_at) + if replay_url: + result["replay_url"] = replay_url + return result + diff --git a/pkg/templates/python/openai-computer-use/replay.py b/pkg/templates/python/openai-computer-use/replay.py new file mode 100644 index 00000000..c98316ff --- /dev/null +++ b/pkg/templates/python/openai-computer-use/replay.py @@ -0,0 +1,114 @@ +import datetime +import time +from dataclasses import dataclass +from typing import Callable + +from kernel import Kernel + +DEFAULT_REPLAY_GRACE_SECONDS = 5.0 +REPLAY_PROCESSING_DELAY_SECONDS = 2.0 +REPLAY_POLL_TIMEOUT_SECONDS = 60.0 +REPLAY_POLL_INTERVAL_SECONDS = 1.0 + + +@dataclass +class ReplayState: + enabled: bool + replay_id: str | None = None + replay_view_url: str | None = None + + +def maybe_start_replay( + client: Kernel, + session_id: str, + enabled: bool = False, + on_event: Callable[[dict], None] | None = None, +) -> ReplayState: + state = ReplayState(enabled=enabled) + if not enabled: + return state + + started_at = datetime.datetime.now() + if on_event: + on_event({"event": "backend", "data": {"op": "browsers.replays.start"}}) + + try: + replay = client.browsers.replays.start(session_id) + state.replay_id = replay.replay_id + if on_event: + on_event( + { + "event": "backend", + "data": { + "op": "browsers.replays.start.done", + "detail": state.replay_id or "", + "elapsed_ms": int( + (datetime.datetime.now() - started_at).total_seconds() * 1000 + ), + }, + } + ) + except Exception as exc: + print(f"Warning: failed to start replay recording: {exc}") + print("Continuing without replay recording.") + state.enabled = False + + return state + + +def maybe_stop_replay( + client: Kernel, + 
session_id: str, + replay: ReplayState, + on_event: Callable[[dict], None] | None = None, + grace_period_seconds: float = DEFAULT_REPLAY_GRACE_SECONDS, +) -> str | None: + if not replay.enabled or not replay.replay_id: + return replay.replay_view_url + + if grace_period_seconds > 0: + time.sleep(grace_period_seconds) + + started_at = datetime.datetime.now() + if on_event: + on_event({"event": "backend", "data": {"op": "browsers.replays.stop"}}) + + try: + client.browsers.replays.stop(replay_id=replay.replay_id, id=session_id) + time.sleep(REPLAY_PROCESSING_DELAY_SECONDS) + + deadline = time.time() + REPLAY_POLL_TIMEOUT_SECONDS + while time.time() < deadline: + try: + replays = client.browsers.replays.list(session_id) + for replay_item in replays: + if replay_item.replay_id == replay.replay_id: + replay.replay_view_url = replay_item.replay_view_url + break + if replay.replay_view_url: + break + except Exception: + pass + + time.sleep(REPLAY_POLL_INTERVAL_SECONDS) + + if on_event: + on_event( + { + "event": "backend", + "data": { + "op": "browsers.replays.stop.done", + "detail": replay.replay_view_url or replay.replay_id or "", + "elapsed_ms": int( + (datetime.datetime.now() - started_at).total_seconds() * 1000 + ), + }, + } + ) + + if not replay.replay_view_url: + print("Warning: replay may still be processing") + except Exception as exc: + print(f"Warning: failed to stop replay recording cleanly: {exc}") + + return replay.replay_view_url diff --git a/pkg/templates/python/openai-computer-use/run_local.py b/pkg/templates/python/openai-computer-use/run_local.py index 97e79145..baa68a28 100644 --- a/pkg/templates/python/openai-computer-use/run_local.py +++ b/pkg/templates/python/openai-computer-use/run_local.py @@ -26,6 +26,7 @@ quiet_http_transport_logs, ) from computers.kernel_computer import KernelComputer +from replay import maybe_start_replay, maybe_stop_replay DEFAULT_TASK = "go to example.com and summarize what the page says" @@ -42,6 +43,11 @@ def parse_args(): 
default=DEFAULT_TASK, help="User task prompt to run in the browser session", ) + parser.add_argument( + "--replay", + action="store_true", + help="Record a Kernel browser replay for this local run", + ) return parser.parse_args() @@ -63,6 +69,7 @@ def main(): on_event, browser_create_started_at, browser.browser_live_view_url ) emit_session_state(on_event, browser.session_id, browser.browser_live_view_url) + replay = maybe_start_replay(client, browser.session_id, args.replay, on_event) computer = KernelComputer(client, browser.session_id, on_event=on_event) @@ -102,6 +109,9 @@ def main(): browser_delete_started_at = datetime.datetime.now() emit_browser_delete_started(on_event) try: + replay_url = maybe_stop_replay(client, browser.session_id, replay, on_event) + if replay_url: + print(f"> Replay URL: {replay_url}") client.browsers.delete_by_id(browser.session_id) finally: emit_browser_delete_done(on_event, browser_delete_started_at) diff --git a/pkg/templates/typescript/openai-computer-use/index.ts b/pkg/templates/typescript/openai-computer-use/index.ts index fc70b829..e6a9a343 100644 --- a/pkg/templates/typescript/openai-computer-use/index.ts +++ b/pkg/templates/typescript/openai-computer-use/index.ts @@ -3,6 +3,7 @@ import * as dotenv from 'dotenv'; import type { ResponseItem, ResponseOutputMessage } from 'openai/resources/responses/responses'; import { Agent } from './lib/agent'; import { KernelComputer } from './lib/kernel-computer'; +import { maybeStartReplay, maybeStopReplay } from './lib/replay'; import { createEventLogger, emitBrowserDeleteDone, @@ -16,10 +17,12 @@ dotenv.config({ override: true, quiet: true }); interface CuaInput { task: string; + replay?: boolean; } interface CuaOutput { elapsed: number; answer: string | null; + replay_url?: string; logs?: ResponseItem[]; } @@ -57,6 +60,12 @@ app.action( emitSessionState(onEvent, kb.session_id, kb.browser_live_view_url); const computer = new KernelComputer(kernel, kb.session_id, onEvent); + const replay = 
await maybeStartReplay(kernel, kb.session_id, { + enabled: payload.replay === true, + onEvent, + }); + let answer: string | null = null; + let replayUrl: string | null = null; try { await computer.goto('https://duckduckgo.com'); @@ -103,21 +112,22 @@ app.action( const assistant = messages.find((m) => m.role === 'assistant'); const lastContentIndex = assistant?.content?.length ? assistant.content.length - 1 : -1; const lastContent = lastContentIndex >= 0 ? assistant?.content?.[lastContentIndex] : null; - const answer = lastContent && 'text' in lastContent ? lastContent.text : null; - - return { elapsed, answer }; + answer = lastContent && 'text' in lastContent ? lastContent.text : null; } catch (error) { - const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); console.error('Error in cua-task:', error); - return { elapsed, answer: null }; + answer = null; } finally { emitBrowserDeleteStarted(onEvent); const browserDeleteStartedAt = Date.now(); try { + replayUrl = await maybeStopReplay(kernel, kb.session_id, replay, { onEvent }); await kernel.browsers.deleteByID(kb.session_id); } finally { emitBrowserDeleteDone(onEvent, browserDeleteStartedAt); } } + + const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); + return replayUrl ? 
{ elapsed, answer, replay_url: replayUrl } : { elapsed, answer }; }, ); diff --git a/pkg/templates/typescript/openai-computer-use/lib/replay.ts b/pkg/templates/typescript/openai-computer-use/lib/replay.ts new file mode 100644 index 00000000..6858d9b7 --- /dev/null +++ b/pkg/templates/typescript/openai-computer-use/lib/replay.ts @@ -0,0 +1,114 @@ +import type { Kernel } from '@onkernel/sdk'; +import type { AgentEvent } from './log-events'; + +const DEFAULT_REPLAY_GRACE_MS = 5000; +const REPLAY_PROCESSING_DELAY_MS = 2000; +const REPLAY_POLL_TIMEOUT_MS = 60000; +const REPLAY_POLL_INTERVAL_MS = 1000; + +type EventLogger = (event: AgentEvent) => void; + +export interface ReplayState { + enabled: boolean; + replayId: string | null; + replayViewUrl: string | null; +} + +export async function maybeStartReplay( + kernel: Kernel, + sessionId: string, + opts?: { + enabled?: boolean; + onEvent?: EventLogger; + }, +): Promise { + const enabled = opts?.enabled ?? false; + const state: ReplayState = { + enabled, + replayId: null, + replayViewUrl: null, + }; + + if (!enabled) return state; + + const startedAtMs = Date.now(); + opts?.onEvent?.({ event: 'backend', data: { op: 'browsers.replays.start' } }); + try { + const replay = await kernel.browsers.replays.start(sessionId); + state.replayId = replay.replay_id ?? null; + opts?.onEvent?.({ + event: 'backend', + data: { + op: 'browsers.replays.start.done', + detail: state.replayId ?? 
'', + elapsed_ms: Date.now() - startedAtMs, + }, + }); + } catch (error) { + console.warn(`Warning: failed to start replay recording: ${String(error)}`); + console.warn('Continuing without replay recording.'); + state.enabled = false; + } + + return state; +} + +export async function maybeStopReplay( + kernel: Kernel, + sessionId: string, + replay: ReplayState, + opts?: { + onEvent?: EventLogger; + gracePeriodMs?: number; + }, +): Promise { + if (!replay.enabled || !replay.replayId) return replay.replayViewUrl; + + const gracePeriodMs = opts?.gracePeriodMs ?? DEFAULT_REPLAY_GRACE_MS; + if (gracePeriodMs > 0) { + await sleep(gracePeriodMs); + } + + const startedAtMs = Date.now(); + opts?.onEvent?.({ event: 'backend', data: { op: 'browsers.replays.stop' } }); + try { + await kernel.browsers.replays.stop(replay.replayId, { id: sessionId }); + await sleep(REPLAY_PROCESSING_DELAY_MS); + + const pollStartedAt = Date.now(); + while (Date.now() - pollStartedAt < REPLAY_POLL_TIMEOUT_MS) { + try { + const replays = await kernel.browsers.replays.list(sessionId); + const matchingReplay = replays.find((item) => item.replay_id === replay.replayId); + if (matchingReplay) { + replay.replayViewUrl = matchingReplay.replay_view_url ?? null; + break; + } + } catch { + // Ignore transient polling errors while the replay finishes processing. + } + await sleep(REPLAY_POLL_INTERVAL_MS); + } + + opts?.onEvent?.({ + event: 'backend', + data: { + op: 'browsers.replays.stop.done', + detail: replay.replayViewUrl ?? replay.replayId ?? 
'', + elapsed_ms: Date.now() - startedAtMs, + }, + }); + + if (!replay.replayViewUrl) { + console.warn('Warning: replay may still be processing.'); + } + } catch (error) { + console.warn(`Warning: failed to stop replay recording cleanly: ${String(error)}`); + } + + return replay.replayViewUrl; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/pkg/templates/typescript/openai-computer-use/run_local.ts b/pkg/templates/typescript/openai-computer-use/run_local.ts index 66cf30d2..e57382a9 100644 --- a/pkg/templates/typescript/openai-computer-use/run_local.ts +++ b/pkg/templates/typescript/openai-computer-use/run_local.ts @@ -4,6 +4,7 @@ import { resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; import { Agent } from './lib/agent'; import { KernelComputer } from './lib/kernel-computer'; +import { maybeStartReplay, maybeStopReplay } from './lib/replay'; import { createEventLogger, emitBrowserDeleteDone, @@ -31,6 +32,7 @@ export async function runLocalTest(args: string[] = process.argv.slice(2)): Prom const client = new Kernel({ apiKey: process.env.KERNEL_API_KEY }); const task = parseTask(args); + const replayEnabled = parseReplay(args); const debug = args.includes('--debug'); const onEvent = createEventLogger({ verbose: debug }); @@ -41,6 +43,10 @@ export async function runLocalTest(args: string[] = process.argv.slice(2)): Prom emitSessionState(onEvent, browser.session_id, browser.browser_live_view_url); const computer = new KernelComputer(client, browser.session_id, onEvent); + const replay = await maybeStartReplay(client, browser.session_id, { + enabled: replayEnabled, + onEvent, + }); try { await computer.goto('https://duckduckgo.com'); @@ -84,6 +90,10 @@ export async function runLocalTest(args: string[] = process.argv.slice(2)): Prom emitBrowserDeleteStarted(onEvent); const browserDeleteStartedAt = Date.now(); try { + const replayUrl = await maybeStopReplay(client, 
browser.session_id, replay, { onEvent }); + if (replayUrl) { + console.log(`> Replay URL: ${replayUrl}`); + } await client.browsers.deleteByID(browser.session_id); } finally { emitBrowserDeleteDone(onEvent, browserDeleteStartedAt); @@ -101,6 +111,14 @@ function parseTask(args: string[]): string { return task && task.length > 0 ? task : DEFAULT_TASK; } +function parseReplay(args: string[]): boolean { + const replayFromEquals = args.find((arg) => arg.startsWith('--replay='))?.slice('--replay='.length).trim(); + if (replayFromEquals) { + return !['0', 'false', 'no', 'off'].includes(replayFromEquals.toLowerCase()); + } + return args.includes('--replay'); +} + function isDirectRun(): boolean { const entry = process.argv[1]; if (!entry) return false; From 9f0f9b54658925910d6ba3a91923fa42c3fe5073 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Thu, 12 Mar 2026 15:14:14 -0400 Subject: [PATCH 17/17] Harden URL checks in the OpenAI CUA templates. Make clipboard-based blocklist lookups best-effort so browser runs don't fail on empty reads, and translate forward actions in pending batches to keep navigation support consistent. 
Made-with: Cursor --- .../python/openai-computer-use/agent/agent.py | 10 ++++++++-- .../computers/kernel_computer.py | 3 +++ .../typescript/openai-computer-use/lib/agent.ts | 13 +++++++++++-- .../openai-computer-use/lib/kernel-computer.ts | 4 ++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/pkg/templates/python/openai-computer-use/agent/agent.py b/pkg/templates/python/openai-computer-use/agent/agent.py index 400ce12b..3fac6159 100644 --- a/pkg/templates/python/openai-computer-use/agent/agent.py +++ b/pkg/templates/python/openai-computer-use/agent/agent.py @@ -340,8 +340,14 @@ def handle_item(self, item): } if self.computer.get_environment() == "browser": - current_url = self.computer.get_current_url() - check_blocklisted_url(current_url) + try: + current_url = self.computer.get_current_url() + check_blocklisted_url(current_url) + except Exception as exc: + self._emit_event( + "backend", + {"op": "get_current_url.skipped", "detail": str(exc)}, + ) return [call_output] return [] diff --git a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py index c70dc419..da2fd2d5 100644 --- a/pkg/templates/python/openai-computer-use/computers/kernel_computer.py +++ b/pkg/templates/python/openai-computer-use/computers/kernel_computer.py @@ -312,6 +312,9 @@ def _build_pending_batch(actions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: if action_type == "back": pending.extend(_back_batch_actions()) continue + if action_type == "forward": + pending.extend(_forward_batch_actions()) + continue if action_type in ("url", "screenshot"): continue raise ValueError(f"Unknown CUA action type: {action_type}") diff --git a/pkg/templates/typescript/openai-computer-use/lib/agent.ts b/pkg/templates/typescript/openai-computer-use/lib/agent.ts index b7234d21..4fa11fcf 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/agent.ts +++ 
b/pkg/templates/typescript/openai-computer-use/lib/agent.ts @@ -202,8 +202,17 @@ export class Agent { if (!this.ackCb(msg)) throw new Error(`Safety check failed: ${msg}`); } - const currentUrl = await this.computer.getCurrentUrl(); - utils.checkBlocklistedUrl(currentUrl); + if (this.computer.getEnvironment() === 'browser') { + try { + const currentUrl = await this.computer.getCurrentUrl(); + utils.checkBlocklistedUrl(currentUrl); + } catch (error) { + this.emit('backend', { + op: 'get_current_url.skipped', + detail: error instanceof Error ? error.message : String(error), + }); + } + } const screenshotOutput = { type: 'computer_screenshot', diff --git a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts index 0e2e96c1..94500ed0 100644 --- a/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts +++ b/pkg/templates/typescript/openai-computer-use/lib/kernel-computer.ts @@ -345,6 +345,10 @@ function buildPendingBatch(actions: CuaAction[]): BatchAction[] { pending.push(...backBatchActions()); continue; } + if (actionType === 'forward') { + pending.push(...forwardBatchActions()); + continue; + } if (actionType === 'url' || actionType === 'screenshot') { continue; }