diff --git a/pkg/create/templates.go b/pkg/create/templates.go index f8541a1..699b97f 100644 --- a/pkg/create/templates.go +++ b/pkg/create/templates.go @@ -87,8 +87,8 @@ var Templates = map[string]TemplateInfo{ Languages: []string{LanguageTypeScript, LanguagePython}, }, TemplateYutoriComputerUse: { - Name: "Yutori n1 Computer Use", - Description: "Implements a Yutori n1 computer use agent", + Name: "Yutori n1.5 Computer Use", + Description: "Implements a Yutori n1.5 computer use agent", Languages: []string{LanguageTypeScript, LanguagePython}, }, TemplateTzafonComputerUse: { diff --git a/pkg/templates/python/yutori/README.md b/pkg/templates/python/yutori/README.md index 7523aff..8f7df74 100644 --- a/pkg/templates/python/yutori/README.md +++ b/pkg/templates/python/yutori/README.md @@ -1,8 +1,8 @@ -# Kernel Python Sample App - Yutori n1 Computer Use +# Kernel Python Sample App - Yutori n1.5 Computer Use -This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API. +This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API. -[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. +[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. ## Setup @@ -55,9 +55,9 @@ kernel invoke python-yutori-cua cua-task --payload '{"query": "Enter https://exa ## Viewport Configuration -Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. +Yutori n1.5 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. -> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. 
+> **Note:** n1.5 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations. @@ -65,25 +65,36 @@ See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori. -## n1-latest Supported Actions +## n1.5-latest Supported Actions + +This template uses the `browser_tools_core-20260403` tool set — coordinate-based browser actions that operate on screenshots only. | Action | Description | |--------|-------------| -| `left_click` | Left mouse click at coordinates | -| `double_click` | Double-click at coordinates | -| `triple_click` | Triple-click at coordinates | +| `left_click` | Left mouse click at coordinates (supports `modifier`) | +| `double_click` | Double-click at coordinates (supports `modifier`) | +| `triple_click` | Triple-click at coordinates (supports `modifier`) | +| `middle_click` | Middle mouse click at coordinates | | `right_click` | Right mouse click at coordinates | +| `mouse_move` | Move mouse to coordinates without clicking | +| `mouse_down` | Press the left mouse button at coordinates | +| `mouse_up` | Release the left mouse button at coordinates | | `scroll` | Scroll page in a direction | | `type` | Type text into focused element | -| `key_press` | Send keyboard input | -| `hover` | Move mouse without clicking | +| `key_press` | Send a single key or key combination | +| `hold_key` | Hold a key for a duration | | `drag` | Click-and-drag operation | | `wait` | Pause for UI to update | | `refresh` | Reload current page | | `go_back` | Navigate back in history | +| `go_forward` | Navigate forward in history | | `goto_url` | Navigate to a URL | +### Disabled tools + +The DOM/Playwright-based "expanded" tools (`extract_elements`, 
`find`, `set_element_value`, `execute_js`) are intentionally disabled via the `disable_tools` request parameter — this template runs computer-use only and does not expose a Playwright page to the model. + ## Resources -- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1) +- [Yutori n1.5 API Documentation](https://docs.yutori.com/reference/n1-5) - [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/python/yutori/loop.py b/pkg/templates/python/yutori/loop.py index 066aafb..ad39024 100644 --- a/pkg/templates/python/yutori/loop.py +++ b/pkg/templates/python/yutori/loop.py @@ -1,14 +1,14 @@ """ -Yutori n1 Sampling Loop +Yutori n1.5 Sampling Loop -Implements the agent loop for Yutori's n1-latest computer use model. -n1-latest uses an OpenAI-compatible API with tool_calls: +Implements the agent loop for Yutori's n1.5-latest computer use model. +n1.5-latest uses an OpenAI-compatible API with tool_calls: - Actions are returned via tool_calls in the assistant message - Tool results use role: "tool" with matching tool_call_id - The model stops by returning content without tool_calls - Coordinates are returned in 1000x1000 space and need scaling -@see https://docs.yutori.com/reference/n1 +@see https://docs.yutori.com/reference/n1-5 """ import json @@ -17,12 +17,18 @@ from kernel import Kernel from openai import OpenAI -from tools import ComputerTool, N1Action, ToolResult +from tools import ComputerTool, N15Action, ToolResult + +# Tools that require a Playwright page / DOM access. The default core tool set +# already excludes them, but we also list them in `disable_tools` so the +# exclusion is explicit and survives if the default ever changes. 
+DISABLED_TOOLS = ["extract_elements", "find", "set_element_value", "execute_js"] +TOOL_SET = "browser_tools_core-20260403" async def sampling_loop( *, - model: str = "n1-latest", + model: str = "n1.5-latest", task: str, api_key: str, kernel: Kernel, @@ -69,6 +75,13 @@ async def sampling_loop( messages=conversation_messages, max_completion_tokens=max_completion_tokens, temperature=0.3, + # n1.5-specific knobs go in extra_body. + # tool_set selects the core (coordinate-based) tools. + # disable_tools is a defense-in-depth exclusion of DOM/Playwright tools. + extra_body={ + "tool_set": TOOL_SET, + "disable_tools": DISABLED_TOOLS, + }, ) except Exception as api_error: print(f"API call failed: {api_error}") @@ -108,7 +121,7 @@ async def sampling_loop( }) continue - action: N1Action = {"action_type": action_name, **args} + action: N15Action = {"action_type": action_name, **args} print(f"Executing action: {action_name}", args) scaled_action = _scale_coordinates(action, viewport_width, viewport_height) @@ -155,7 +168,7 @@ async def sampling_loop( } -def _scale_coordinates(action: N1Action, viewport_width: int, viewport_height: int) -> N1Action: +def _scale_coordinates(action: N15Action, viewport_width: int, viewport_height: int) -> N15Action: scaled = dict(action) if "coordinates" in scaled and scaled["coordinates"]: diff --git a/pkg/templates/python/yutori/main.py b/pkg/templates/python/yutori/main.py index 4679df8..21543d9 100644 --- a/pkg/templates/python/yutori/main.py +++ b/pkg/templates/python/yutori/main.py @@ -30,7 +30,7 @@ async def cua_task( payload: QueryInput, ) -> QueryOutput: """ - Process a user query using Yutori n1 Computer Use with Kernel's browser automation. + Process a user query using Yutori n1.5 Computer Use with Kernel's browser automation. 
Args: ctx: Kernel context containing invocation information @@ -58,7 +58,7 @@ async def cua_task( print("Kernel browser live view url:", session.live_view_url) loop_result = await sampling_loop( - model="n1-latest", + model="n1.5-latest", task=payload["query"], api_key=str(api_key), kernel=session.kernel, diff --git a/pkg/templates/python/yutori/tools/__init__.py b/pkg/templates/python/yutori/tools/__init__.py index 63da518..5a1a428 100644 --- a/pkg/templates/python/yutori/tools/__init__.py +++ b/pkg/templates/python/yutori/tools/__init__.py @@ -1,11 +1,11 @@ -"""Yutori n1 Computer Tools.""" +"""Yutori n1.5 Computer Tools.""" from .base import ToolError, ToolResult -from .computer import ComputerTool, N1Action +from .computer import ComputerTool, N15Action __all__ = [ "ToolError", "ToolResult", "ComputerTool", - "N1Action", + "N15Action", ] diff --git a/pkg/templates/python/yutori/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py index d078ade..18b3504 100644 --- a/pkg/templates/python/yutori/tools/computer.py +++ b/pkg/templates/python/yutori/tools/computer.py @@ -1,15 +1,17 @@ """ -Yutori n1 Computer Tool +Yutori n1.5 Computer Tool -Maps n1-latest action format to Kernel's Computer Controls API. +Maps n1.5-latest action format to Kernel's Computer Controls API. Screenshots are converted to WebP for better compression across multi-step trajectories. 
+ +@see https://docs.yutori.com/reference/n1-5 """ import asyncio import base64 import json from io import BytesIO -from typing import Literal, TypedDict +from typing import Any, Literal, TypedDict from kernel import Kernel from PIL import Image @@ -20,33 +22,38 @@ SCREENSHOT_DELAY_S = 0.3 ACTION_DELAY_S = 0.3 -N1ActionType = Literal[ +N15ActionType = Literal[ "left_click", "double_click", "triple_click", + "middle_click", "right_click", + "mouse_move", + "mouse_down", + "mouse_up", "scroll", "type", "key_press", - "hover", + "hold_key", "drag", "wait", "refresh", "go_back", + "go_forward", "goto_url", ] -class N1Action(TypedDict, total=False): - action_type: N1ActionType +class N15Action(TypedDict, total=False): + action_type: N15ActionType coordinates: tuple[int, int] | list[int] start_coordinates: tuple[int, int] | list[int] direction: Literal["up", "down", "left", "right"] amount: int text: str - press_enter_after: bool - clear_before_typing: bool - key_comb: str + key: str + modifier: str + duration: int url: str @@ -97,22 +104,27 @@ def __init__(self, kernel: Kernel, session_id: str, width: int = 1280, height: i self.height = height self.kiosk_mode = kiosk_mode - async def execute(self, action: N1Action) -> ToolResult: + async def execute(self, action: N15Action) -> ToolResult: action_type = action.get("action_type") handlers = { "left_click": lambda a: self._handle_click(a, "left", 1), "double_click": lambda a: self._handle_click(a, "left", 2), "triple_click": lambda a: self._handle_click(a, "left", 3), + "middle_click": lambda a: self._handle_click(a, "middle", 1), "right_click": lambda a: self._handle_click(a, "right", 1), + "mouse_move": self._handle_mouse_move, + "mouse_down": lambda a: self._handle_mouse_button(a, "down"), + "mouse_up": lambda a: self._handle_mouse_button(a, "up"), "scroll": self._handle_scroll, "type": self._handle_type, "key_press": self._handle_key_press, - "hover": self._handle_hover, + "hold_key": self._handle_hold_key, "drag": 
self._handle_drag, "wait": self._handle_wait, "refresh": self._handle_refresh, "go_back": self._handle_go_back, + "go_forward": self._handle_go_forward, "goto_url": self._handle_goto_url, } @@ -122,22 +134,51 @@ async def execute(self, action: N1Action) -> ToolResult: return await handler(action) - async def _handle_click(self, action: N1Action, button: str, num_clicks: int) -> ToolResult: + async def _handle_click(self, action: N15Action, button: str, num_clicks: int) -> ToolResult: + coords = self._get_coordinates(action.get("coordinates")) + modifier = action.get("modifier") + kwargs: dict[str, Any] = { + "x": coords["x"], + "y": coords["y"], + "button": button, + "click_type": "click", + "num_clicks": num_clicks, + } + if modifier: + kwargs["hold_keys"] = [self._map_key(modifier)] + + self.kernel.browsers.computer.click_mouse(self.session_id, **kwargs) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_mouse_move(self, action: N15Action) -> ToolResult: + coords = self._get_coordinates(action.get("coordinates")) + + self.kernel.browsers.computer.move_mouse( + self.session_id, + x=coords["x"], + y=coords["y"], + ) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_mouse_button(self, action: N15Action, click_type: str) -> ToolResult: coords = self._get_coordinates(action.get("coordinates")) self.kernel.browsers.computer.click_mouse( self.session_id, x=coords["x"], y=coords["y"], - button=button, - click_type="click", - num_clicks=num_clicks, + button="left", + click_type=click_type, ) await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() - async def _handle_scroll(self, action: N1Action) -> ToolResult: + async def _handle_scroll(self, action: N15Action) -> ToolResult: coords = self._get_coordinates(action.get("coordinates")) direction = action.get("direction") notches = max(action.get("amount", 3), 1) @@ -171,45 +212,26 @@ async def _handle_scroll(self, 
action: N1Action) -> ToolResult: screenshot_result["output"] = f"Scrolled {notches} wheel unit(s) {direction}." return screenshot_result - async def _handle_type(self, action: N1Action) -> ToolResult: + async def _handle_type(self, action: N15Action) -> ToolResult: text = action.get("text") if not text: raise ToolError("text is required for type action") - if action.get("clear_before_typing"): - self.kernel.browsers.computer.press_key( - self.session_id, - keys=["ctrl+a"], - ) - await asyncio.sleep(0.1) - self.kernel.browsers.computer.press_key( - self.session_id, - keys=["BackSpace"], - ) - await asyncio.sleep(0.1) - self.kernel.browsers.computer.type_text( self.session_id, text=text, delay=TYPING_DELAY_MS, ) - if action.get("press_enter_after"): - await asyncio.sleep(0.1) - self.kernel.browsers.computer.press_key( - self.session_id, - keys=["Return"], - ) - await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() - async def _handle_key_press(self, action: N1Action) -> ToolResult: - key_comb = action.get("key_comb") - if not key_comb: - raise ToolError("key_comb is required for key_press action") + async def _handle_key_press(self, action: N15Action) -> ToolResult: + key = action.get("key") + if not key: + raise ToolError("key is required for key_press action") - mapped_key = self._map_key(key_comb) + mapped_key = self._map_key(key) self.kernel.browsers.computer.press_key( self.session_id, @@ -219,19 +241,24 @@ async def _handle_key_press(self, action: N1Action) -> ToolResult: await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() - async def _handle_hover(self, action: N1Action) -> ToolResult: - coords = self._get_coordinates(action.get("coordinates")) + async def _handle_hold_key(self, action: N15Action) -> ToolResult: + key = action.get("key") + if not key: + raise ToolError("key is required for hold_key action") - self.kernel.browsers.computer.move_mouse( + mapped_key = self._map_key(key) + duration = action.get("duration") or 
1000 + + self.kernel.browsers.computer.press_key( self.session_id, - x=coords["x"], - y=coords["y"], + keys=[mapped_key], + duration=duration, ) await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() - async def _handle_drag(self, action: N1Action) -> ToolResult: + async def _handle_drag(self, action: N15Action) -> ToolResult: start_coords = self._get_coordinates(action.get("start_coordinates")) end_coords = self._get_coordinates(action.get("coordinates")) @@ -244,11 +271,13 @@ async def _handle_drag(self, action: N1Action) -> ToolResult: await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() - async def _handle_wait(self, action: N1Action) -> ToolResult: - await asyncio.sleep(2) + async def _handle_wait(self, action: N15Action) -> ToolResult: + duration = action.get("duration") + seconds = (duration / 1000) if duration and duration > 0 else 2 + await asyncio.sleep(seconds) return await self.screenshot() - async def _handle_refresh(self, action: N1Action) -> ToolResult: + async def _handle_refresh(self, action: N15Action) -> ToolResult: self.kernel.browsers.computer.press_key( self.session_id, keys=["F5"], @@ -256,7 +285,7 @@ async def _handle_refresh(self, action: N1Action) -> ToolResult: await asyncio.sleep(2) return await self.screenshot() - async def _handle_go_back(self, action: N1Action) -> ToolResult: + async def _handle_go_back(self, action: N15Action) -> ToolResult: self.kernel.browsers.computer.press_key( self.session_id, keys=["alt+Left"], @@ -264,7 +293,15 @@ async def _handle_go_back(self, action: N1Action) -> ToolResult: await asyncio.sleep(1.5) return await self.screenshot() - async def _handle_goto_url(self, action: N1Action) -> ToolResult: + async def _handle_go_forward(self, action: N15Action) -> ToolResult: + self.kernel.browsers.computer.press_key( + self.session_id, + keys=["alt+Right"], + ) + await asyncio.sleep(1.5) + return await self.screenshot() + + async def _handle_goto_url(self, action: N15Action) -> 
ToolResult: url = action.get("url") if not url: raise ToolError("url is required for goto_url action") diff --git a/pkg/templates/typescript/yutori/README.md b/pkg/templates/typescript/yutori/README.md index 92c009d..2d9f781 100644 --- a/pkg/templates/typescript/yutori/README.md +++ b/pkg/templates/typescript/yutori/README.md @@ -1,8 +1,8 @@ -# Kernel TypeScript Sample App - Yutori n1 Computer Use +# Kernel TypeScript Sample App - Yutori n1.5 Computer Use -This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API. +This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API. -[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. +[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. ## Setup @@ -55,9 +55,9 @@ kernel invoke ts-yutori-cua cua-task --payload '{"query": "Enter https://example ## Viewport Configuration -Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. +Yutori n1.5 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. -> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. +> **Note:** n1.5 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations. @@ -65,25 +65,36 @@ See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori. 
-## n1-latest Supported Actions +## n1.5-latest Supported Actions + +This template uses the `browser_tools_core-20260403` tool set — coordinate-based browser actions that operate on screenshots only. | Action | Description | |--------|-------------| -| `left_click` | Left mouse click at coordinates | -| `double_click` | Double-click at coordinates | -| `triple_click` | Triple-click at coordinates | +| `left_click` | Left mouse click at coordinates (supports `modifier`) | +| `double_click` | Double-click at coordinates (supports `modifier`) | +| `triple_click` | Triple-click at coordinates (supports `modifier`) | +| `middle_click` | Middle mouse click at coordinates | | `right_click` | Right mouse click at coordinates | +| `mouse_move` | Move mouse to coordinates without clicking | +| `mouse_down` | Press the left mouse button at coordinates | +| `mouse_up` | Release the left mouse button at coordinates | | `scroll` | Scroll page in a direction | | `type` | Type text into focused element | -| `key_press` | Send keyboard input | -| `hover` | Move mouse without clicking | +| `key_press` | Send a single key or key combination | +| `hold_key` | Hold a key for a duration | | `drag` | Click-and-drag operation | | `wait` | Pause for UI to update | | `refresh` | Reload current page | | `go_back` | Navigate back in history | +| `go_forward` | Navigate forward in history | | `goto_url` | Navigate to a URL | +### Disabled tools + +The DOM/Playwright-based "expanded" tools (`extract_elements`, `find`, `set_element_value`, `execute_js`) are intentionally disabled via the `disable_tools` request parameter — this template runs computer-use only and does not expose a Playwright page to the model. 
+ ## Resources -- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1) +- [Yutori n1.5 API Documentation](https://docs.yutori.com/reference/n1-5) - [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/typescript/yutori/index.ts b/pkg/templates/typescript/yutori/index.ts index 364e723..c38a1b5 100644 --- a/pkg/templates/typescript/yutori/index.ts +++ b/pkg/templates/typescript/yutori/index.ts @@ -47,7 +47,7 @@ app.action( try { // Run the sampling loop const { finalAnswer, messages } = await samplingLoop({ - model: 'n1-latest', + model: 'n1.5-latest', task: payload.query, apiKey: YUTORI_API_KEY, kernel, diff --git a/pkg/templates/typescript/yutori/loop.ts b/pkg/templates/typescript/yutori/loop.ts index e0f9479..f113b61 100644 --- a/pkg/templates/typescript/yutori/loop.ts +++ b/pkg/templates/typescript/yutori/loop.ts @@ -1,19 +1,25 @@ /** - * Yutori n1 Sampling Loop - * - * Implements the agent loop for Yutori's n1-latest computer use model. - * n1-latest uses an OpenAI-compatible API with tool_calls: + * Yutori n1.5 Sampling Loop + * + * Implements the agent loop for Yutori's n1.5-latest computer use model. + * n1.5-latest uses an OpenAI-compatible API with tool_calls: * - Actions are returned via tool_calls in the assistant message * - Tool results use role: "tool" with matching tool_call_id * - The model stops by returning content without tool_calls * - Coordinates are returned in 1000x1000 space and need scaling - * - * @see https://docs.yutori.com/reference/n1 + * + * @see https://docs.yutori.com/reference/n1-5 */ import OpenAI from 'openai'; import type { Kernel } from '@onkernel/sdk'; -import { ComputerTool, type N1Action, type ToolResult } from './tools/computer'; +import { ComputerTool, type N15Action, type ToolResult } from './tools/computer'; + +// Tools that require a Playwright page / DOM access. 
The default core tool set +// already excludes them, but we also list them in `disable_tools` so the +// exclusion is explicit and survives if the default ever changes. +const DISABLED_TOOLS = ['extract_elements', 'find', 'set_element_value', 'execute_js']; +const TOOL_SET = 'browser_tools_core-20260403'; interface SamplingLoopOptions { model?: string; @@ -34,7 +40,7 @@ interface SamplingLoopResult { } export async function samplingLoop({ - model = 'n1-latest', + model = 'n1.5-latest', task, apiKey, kernel, @@ -85,6 +91,14 @@ export async function samplingLoop({ messages: conversationMessages, max_completion_tokens: maxCompletionTokens, temperature: 0.3, + // n1.5-specific knobs are top-level request fields (not yet in OpenAI SDK types). + // The Node SDK has no `extra_body` option (that is Python-only): unknown keys are + // sent as-is, so an `extra_body` key would be nested instead of merged. + // tool_set selects the core (coordinate-based) tools. + // disable_tools is a defense-in-depth exclusion of DOM/Playwright tools. + // @ts-expect-error tool_set and disable_tools are Yutori extensions + tool_set: TOOL_SET, + disable_tools: DISABLED_TOOLS, }); } catch (apiError) { console.error('API call failed:', apiError); @@ -131,8 +145,8 @@ export async function samplingLoop({ continue; } - const action: N1Action = { - action_type: actionName as N1Action['action_type'], + const action: N15Action = { + action_type: actionName as N15Action['action_type'], ...args, }; @@ -190,7 +204,7 @@ export async function samplingLoop({ }; } -function scaleCoordinates(action: N1Action, viewportWidth: number, viewportHeight: number): N1Action { +function scaleCoordinates(action: N15Action, viewportWidth: number, viewportHeight: number): N15Action { const scaled = { ...action }; if (scaled.coordinates) { diff --git a/pkg/templates/typescript/yutori/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts index 5ba8e09..7e64b2b 100644 --- a/pkg/templates/typescript/yutori/tools/computer.ts +++ b/pkg/templates/typescript/yutori/tools/computer.ts @@ -1,8 +1,10 @@ /** - * Yutori n1 Computer Tool - * - * Maps n1-latest action format to 
Kernel's Computer Controls API. + * Yutori n1.5 Computer Tool + * + * Maps n1.5-latest action format to Kernel's Computer Controls API. * Screenshots are converted to WebP for better compression across multi-step trajectories. + * + * @see https://docs.yutori.com/reference/n1-5 */ import { Buffer } from 'buffer'; @@ -26,31 +28,36 @@ export class ToolError extends Error { } } -export type N1ActionType = +export type N15ActionType = | 'left_click' | 'double_click' | 'triple_click' + | 'middle_click' | 'right_click' + | 'mouse_move' + | 'mouse_down' + | 'mouse_up' | 'scroll' | 'type' | 'key_press' - | 'hover' + | 'hold_key' | 'drag' | 'wait' | 'refresh' | 'go_back' + | 'go_forward' | 'goto_url'; -export interface N1Action { - action_type: N1ActionType; +export interface N15Action { + action_type: N15ActionType; coordinates?: [number, number]; start_coordinates?: [number, number]; direction?: 'up' | 'down' | 'left' | 'right'; amount?: number; text?: string; - press_enter_after?: boolean; - clear_before_typing?: boolean; - key_comb?: string; + key?: string; + modifier?: string; + duration?: number; url?: string; } @@ -107,7 +114,7 @@ export class ComputerTool { this.kioskMode = kioskMode; } - async execute(action: N1Action): Promise { + async execute(action: N15Action): Promise { const { action_type } = action; switch (action_type) { @@ -117,24 +124,34 @@ export class ComputerTool { return this.handleClick(action, 'left', 2); case 'triple_click': return this.handleClick(action, 'left', 3); + case 'middle_click': + return this.handleClick(action, 'middle', 1); case 'right_click': return this.handleClick(action, 'right', 1); + case 'mouse_move': + return this.handleMouseMove(action); + case 'mouse_down': + return this.handleMouseButton(action, 'down'); + case 'mouse_up': + return this.handleMouseButton(action, 'up'); case 'scroll': return this.handleScroll(action); case 'type': return this.handleType(action); case 'key_press': return this.handleKeyPress(action); - case 
'hover': - return this.handleHover(action); + case 'hold_key': + return this.handleHoldKey(action); case 'drag': return this.handleDrag(action); case 'wait': - return this.handleWait(); + return this.handleWait(action); case 'refresh': return this.handleRefresh(); case 'go_back': return this.handleGoBack(); + case 'go_forward': + return this.handleGoForward(); case 'goto_url': return this.handleGotoUrl(action); default: @@ -142,22 +159,50 @@ export class ComputerTool { } } - private async handleClick(action: N1Action, button: 'left' | 'right', numClicks: number): Promise { + private async handleClick(action: N15Action, button: 'left' | 'right' | 'middle', numClicks: number): Promise { const coords = this.getCoordinates(action.coordinates); - + const holdKeys = action.modifier ? [this.mapKey(action.modifier)] : undefined; + await this.kernel.browsers.computer.clickMouse(this.sessionId, { x: coords.x, y: coords.y, button, click_type: 'click', num_clicks: numClicks, + ...(holdKeys ? { hold_keys: holdKeys } : {}), + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleMouseMove(action: N15Action): Promise { + const coords = this.getCoordinates(action.coordinates); + + await this.kernel.browsers.computer.moveMouse(this.sessionId, { + x: coords.x, + y: coords.y, + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleMouseButton(action: N15Action, clickType: 'down' | 'up'): Promise { + const coords = this.getCoordinates(action.coordinates); + + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x: coords.x, + y: coords.y, + button: 'left', + click_type: clickType, }); await this.sleep(SCREENSHOT_DELAY_MS); return this.screenshot(); } - private async handleScroll(action: N1Action): Promise { + private async handleScroll(action: N15Action): Promise { const coords = this.getCoordinates(action.coordinates); const direction = action.direction; const notches = 
Math.max(action.amount ?? 3, 1); @@ -199,46 +244,28 @@ export class ComputerTool { }; } - private async handleType(action: N1Action): Promise { + private async handleType(action: N15Action): Promise { const text = action.text; if (!text) { throw new ToolError('text is required for type action'); } - if (action.clear_before_typing) { - await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: ['ctrl+a'], - }); - await this.sleep(100); - await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: ['BackSpace'], - }); - await this.sleep(100); - } - await this.kernel.browsers.computer.typeText(this.sessionId, { text, delay: TYPING_DELAY_MS, }); - if (action.press_enter_after) { - await this.sleep(100); - await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: ['Return'], - }); - } - await this.sleep(SCREENSHOT_DELAY_MS); return this.screenshot(); } - private async handleKeyPress(action: N1Action): Promise { - const keyComb = action.key_comb; - if (!keyComb) { - throw new ToolError('key_comb is required for key_press action'); + private async handleKeyPress(action: N15Action): Promise { + const key = action.key; + if (!key) { + throw new ToolError('key is required for key_press action'); } - const mappedKey = this.mapKey(keyComb); + const mappedKey = this.mapKey(key); await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: [mappedKey], @@ -248,19 +275,25 @@ export class ComputerTool { return this.screenshot(); } - private async handleHover(action: N1Action): Promise { - const coords = this.getCoordinates(action.coordinates); + private async handleHoldKey(action: N15Action): Promise { + const key = action.key; + if (!key) { + throw new ToolError('key is required for hold_key action'); + } - await this.kernel.browsers.computer.moveMouse(this.sessionId, { - x: coords.x, - y: coords.y, + const mappedKey = this.mapKey(key); + const durationMs = action.duration && action.duration > 0 ? 
action.duration : 1000; + + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: [mappedKey], + duration: durationMs, }); await this.sleep(SCREENSHOT_DELAY_MS); return this.screenshot(); } - private async handleDrag(action: N1Action): Promise { + private async handleDrag(action: N15Action): Promise { const startCoords = this.getCoordinates(action.start_coordinates); const endCoords = this.getCoordinates(action.coordinates); @@ -273,8 +306,9 @@ export class ComputerTool { return this.screenshot(); } - private async handleWait(): Promise { - await this.sleep(2000); + private async handleWait(action: N15Action): Promise { + const durationMs = action.duration && action.duration > 0 ? action.duration : 2000; + await this.sleep(durationMs); return this.screenshot(); } @@ -296,7 +330,16 @@ export class ComputerTool { return this.screenshot(); } - private async handleGotoUrl(action: N1Action): Promise { + private async handleGoForward(): Promise { + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['alt+Right'], + }); + + await this.sleep(1500); + return this.screenshot(); + } + + private async handleGotoUrl(action: N15Action): Promise { const url = action.url; if (!url) { throw new ToolError('url is required for goto_url action');