diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md index 12504189..efcd1cfe 100644 --- a/.cursor/commands/qa.md +++ b/.cursor/commands/qa.md @@ -58,7 +58,7 @@ Here are all valid language + template combinations: | typescript | openai-computer-use | ts-openai-cua | ts-openai-cua | Yes | OPENAI_API_KEY | | typescript | gemini-computer-use | ts-gemini-cua | ts-gemini-cua | Yes | GOOGLE_API_KEY | | typescript | claude-agent-sdk | ts-claude-agent-sdk | ts-claude-agent-sdk | Yes | ANTHROPIC_API_KEY | -| typescript | yutori-computer-use | ts-yutori-cua | ts-yutori-cua | Yes | YUTORI_API_KEY | +| typescript | yutori | ts-yutori-cua | ts-yutori-cua | Yes | YUTORI_API_KEY | | python | sample-app | py-sample-app | python-basic | No | - | | python | gemini-computer-use | py-gemini-cua | python-gemini-cua | Yes | GOOGLE_API_KEY | @@ -68,7 +68,7 @@ Here are all valid language + template combinations: | python | openai-computer-use | py-openai-cua | python-openai-cua | Yes | OPENAI_API_KEY | | python | openagi-computer-use | py-openagi-cua | python-openagi-cua | Yes | OAGI_API_KEY | | python | claude-agent-sdk | py-claude-agent-sdk | py-claude-agent-sdk | Yes | ANTHROPIC_API_KEY | -| python | yutori-computer-use | py-yutori-cua | python-yutori-cua | Yes | YUTORI_API_KEY | +| python | yutori | py-yutori-cua | python-yutori-cua | Yes | YUTORI_API_KEY | > **Yutori:** Test both default browser and `"kiosk": true` (uses Playwright for goto_url when kiosk is enabled). @@ -86,7 +86,7 @@ Run each of these (they are non-interactive when all flags are provided): ../bin/kernel create -n ts-openai-cua -l typescript -t openai-computer-use ../bin/kernel create -n ts-gemini-cua -l typescript -t gemini-computer-use ../bin/kernel create -n ts-claude-agent-sdk -l typescript -t claude-agent-sdk -../bin/kernel create -n ts-yutori-cua -l typescript -t yutori-computer-use +../bin/kernel create -n ts-yutori-cua -l typescript -t yutori # Python templates ../bin/kernel create -n py-sample-app -l python -t sample-app @@ -97,7 +97,7 @@ Run each of these (they are non-interactive when all flags are provided): ../bin/kernel create -n py-openagi-cua -l python -t openagi-computer-use ../bin/kernel create -n py-claude-agent-sdk -l python -t claude-agent-sdk ../bin/kernel create -n py-gemini-cua -l python -t gemini-computer-use -../bin/kernel create -n py-yutori-cua -l python -t yutori-computer-use +../bin/kernel create -n py-yutori-cua -l python -t yutori ``` ## Step 5: Deploy Each Template diff --git a/pkg/create/templates.go b/pkg/create/templates.go index c9b07b7d..fb5845f3 100644 --- a/pkg/create/templates.go +++ b/pkg/create/templates.go @@ -18,7 +18,7 @@ const ( TemplateStagehand = "stagehand" TemplateOpenAGIComputerUse = "openagi-computer-use" TemplateClaudeAgentSDK = "claude-agent-sdk" - TemplateYutoriComputerUse = "yutori-computer-use" + TemplateYutoriComputerUse = "yutori" ) type TemplateInfo struct { diff --git a/pkg/templates/python/anthropic-computer-use/loop.py b/pkg/templates/python/anthropic-computer-use/loop.py index afee6f61..d5d8a249 100644 --- a/pkg/templates/python/anthropic-computer-use/loop.py +++ b/pkg/templates/python/anthropic-computer-use/loop.py @@ -50,6 +50,7 @@ class APIProvider(StrEnum): * As the initial step click on the search bar. * When viewing a page it can be helpful to zoom out so that you can see everything on the page. * Either that, or make sure you scroll down to see everything before deciding something isn't available. +* Scroll action: scroll_amount and the tool result are in wheel units (not pixels). * When using your computer function calls, they take a while to run and send back to you. * Where possible/feasible, try to chain multiple of these calls all into one function calls request. * The current date is {datetime.now().strftime("%A, %B %d, %Y")}. diff --git a/pkg/templates/python/anthropic-computer-use/tools/computer.py b/pkg/templates/python/anthropic-computer-use/tools/computer.py index d4a46d7d..27b3b088 100644 --- a/pkg/templates/python/anthropic-computer-use/tools/computer.py +++ b/pkg/templates/python/anthropic-computer-use/tools/computer.py @@ -370,21 +370,17 @@ async def __call__( else: x, y = self._last_mouse_position - # Each scroll_amount unit = 1 scroll wheel click ≈ 120 pixels (matches Anthropic's xdotool behavior) - scroll_factor = scroll_amount * 120 - + notches = max(scroll_amount or 1, 1) delta_x = 0 delta_y = 0 if scroll_direction == "up": - delta_y = -scroll_factor + delta_y = -notches elif scroll_direction == "down": - delta_y = scroll_factor + delta_y = notches elif scroll_direction == "left": - delta_x = -scroll_factor + delta_x = -notches elif scroll_direction == "right": - delta_x = scroll_factor - - print(f"Scrolling {abs(delta_x) if delta_x != 0 else abs(delta_y)} pixels {scroll_direction}") + delta_x = notches self.kernel.browsers.computer.scroll( id=self.session_id, @@ -393,7 +389,12 @@ async def __call__( delta_x=delta_x, delta_y=delta_y, ) - return await self.screenshot() + + await asyncio.sleep(0.2) + screenshot_result = await self.screenshot() + return screenshot_result.replace( + output=f"Scrolled {notches} wheel unit(s) {scroll_direction}." + ) if action in ("hold_key", "wait"): if duration is None or not isinstance(duration, (int, float)): diff --git a/pkg/templates/python/gemini-computer-use/main.py b/pkg/templates/python/gemini-computer-use/main.py index 870ff776..1f5bd81e 100644 --- a/pkg/templates/python/gemini-computer-use/main.py +++ b/pkg/templates/python/gemini-computer-use/main.py @@ -75,9 +75,8 @@ async def cua_task( } -# Run locally if executed directly (not imported as a module) -# Execute via: uv run main.py -if __name__ == "__main__": +# Run locally when not in Kernel invocation. Execute via: uv run main.py +if __name__ == "__main__" and not os.getenv("KERNEL_INVOCATION"): import asyncio async def main(): diff --git a/pkg/templates/python/gemini-computer-use/tools/computer.py b/pkg/templates/python/gemini-computer-use/tools/computer.py index 5cf309d8..60d4079f 100644 --- a/pkg/templates/python/gemini-computer-use/tools/computer.py +++ b/pkg/templates/python/gemini-computer-use/tools/computer.py @@ -21,6 +21,8 @@ TYPING_DELAY_MS = 12 SCREENSHOT_DELAY_SECS = 0.5 +PX_PER_NOTCH = 60 +MAX_NOTCHES_PER_ACTION = 17 class ComputerTool: @@ -131,22 +133,21 @@ async def execute_action( elif action_name == GeminiAction.SCROLL_DOCUMENT: if "direction" not in args: return ToolResult(error="scroll_document requires direction") - # Scroll at center of viewport center_x = self.screen_size.width // 2 center_y = self.screen_size.height // 2 - scroll_delta = 500 - delta_x, delta_y = 0, 0 + magnitude_px = args.get("magnitude", 400) + doc_notches = min(MAX_NOTCHES_PER_ACTION, max(1, round(magnitude_px / PX_PER_NOTCH))) direction = args["direction"] + delta_x = delta_y = 0 if direction == "down": - delta_y = scroll_delta + delta_y = doc_notches elif direction == "up": - delta_y = -scroll_delta + delta_y = -doc_notches elif direction == "right": - delta_x = scroll_delta + delta_x = doc_notches elif direction == "left": - delta_x = -scroll_delta - + delta_x = -doc_notches self.kernel.browsers.computer.scroll( self.session_id, x=center_x, @@ -164,24 +165,18 @@ async def execute_action( x = self.denormalize_x(args["x"]) y = self.denormalize_y(args["y"]) - # Denormalize magnitude if provided - magnitude = args.get("magnitude", 800) + magnitude_px = args.get("magnitude", 400) + notches = min(MAX_NOTCHES_PER_ACTION, max(1, round(magnitude_px / PX_PER_NOTCH))) direction = args["direction"] - if direction in ("up", "down"): - magnitude = self.denormalize_y(magnitude) - else: - magnitude = self.denormalize_x(magnitude) - - delta_x, delta_y = 0, 0 + delta_x = delta_y = 0 if direction == "down": - delta_y = magnitude + delta_y = notches elif direction == "up": - delta_y = -magnitude + delta_y = -notches elif direction == "right": - delta_x = magnitude + delta_x = notches elif direction == "left": - delta_x = -magnitude - + delta_x = -notches self.kernel.browsers.computer.scroll( self.session_id, x=x, diff --git a/pkg/templates/python/openagi-computer-use/kernel_handler.py b/pkg/templates/python/openagi-computer-use/kernel_handler.py index 564274ca..364a8762 100644 --- a/pkg/templates/python/openagi-computer-use/kernel_handler.py +++ b/pkg/templates/python/openagi-computer-use/kernel_handler.py @@ -36,13 +36,16 @@ class KernelActionHandler: - HOTKEY -> press_key(keys=[...]) - TYPE -> type_text(text=...) - SCROLL -> scroll(x, y, delta_y=...) + + Note: OpenAGI/Lux tends to emit scroll N times for "scroll by N" (e.g. 3 identical + [scroll] actions for "scroll down with amount 3"). We treat each scroll event as + one scroll unit (1 notch), so N events in a row = N notches without fighting the model. """ def __init__( self, session: "KernelBrowserSession", action_pause: float = 0.1, - scroll_amount: int = 100, wait_duration: float = 1.0, type_delay: int = 50, ): @@ -52,13 +55,11 @@ def __init__( Args: session: The Kernel browser session to control action_pause: Pause between actions in seconds - scroll_amount: Amount to scroll (pixels) wait_duration: Duration for wait actions in seconds type_delay: Delay between keystrokes in milliseconds """ self.session = session self.action_pause = action_pause - self.scroll_amount = scroll_amount self.wait_duration = wait_duration self.type_delay = type_delay @@ -239,21 +240,25 @@ def _execute_hotkey(self, keys: list[str]): keys=keys, ) - def _execute_scroll(self, x: int, y: int, direction: str): + def _execute_scroll(self, x: int, y: int, direction: str, notches: int = 1): """Execute a scroll action.""" - # Move to position first - self.session.kernel.browsers.computer.move_mouse( - id=self.session.session_id, - x=x, - y=y, - ) - # Scroll in the specified direction - delta_y = self.scroll_amount if direction == "up" else -self.scroll_amount + notches = max(notches, 1) + delta_x = 0 + delta_y = 0 + if direction == "up": + delta_y = -notches + elif direction == "down": + delta_y = notches + elif direction == "left": + delta_x = -notches + elif direction == "right": + delta_x = notches + self.session.kernel.browsers.computer.scroll( id=self.session.session_id, x=x, y=y, - delta_x=0, + delta_x=delta_x, delta_y=delta_y, ) @@ -298,7 +303,7 @@ def _execute_single_action(self, action: Action) -> None: case ActionType.SCROLL: x, y, direction = self._parse_scroll(arg) - self._execute_scroll(x, y, direction) + self._execute_scroll(x, y, direction, notches=1) case ActionType.FINISH: # Task completion - nothing to do @@ -316,32 +321,23 @@ def _execute_single_action(self, action: Action) -> None: print(f"Unknown action type: {action.type}") def _execute_action(self, action: Action) -> None: - """Execute an action, potentially multiple times.""" + """Execute an action, potentially multiple times. SCROLL: each event = 1 notch.""" count = action.count or 1 - for _ in range(count): self._execute_single_action(action) - # Small pause between repeated actions if count > 1: time.sleep(self.action_pause) async def __call__(self, actions: list[Action]) -> None: - """ - Execute a list of actions. - - Args: - actions: List of Action objects to execute - """ + """Execute a list of actions.""" if not self.session.session_id: raise RuntimeError("Browser session not initialized") for action in actions: try: - # Run the synchronous action execution in a thread pool await asyncio.get_event_loop().run_in_executor( None, self._execute_action, action ) - # Pause between actions await asyncio.sleep(self.action_pause) except Exception as e: print(f"Error executing action {action.type}: {e}") diff --git a/pkg/templates/python/yutori-computer-use/README.md b/pkg/templates/python/yutori/README.md similarity index 100% rename from pkg/templates/python/yutori-computer-use/README.md rename to pkg/templates/python/yutori/README.md diff --git a/pkg/templates/python/yutori-computer-use/_gitignore b/pkg/templates/python/yutori/_gitignore similarity index 100% rename from pkg/templates/python/yutori-computer-use/_gitignore rename to pkg/templates/python/yutori/_gitignore diff --git a/pkg/templates/python/yutori-computer-use/loop.py b/pkg/templates/python/yutori/loop.py similarity index 100% rename from pkg/templates/python/yutori-computer-use/loop.py rename to pkg/templates/python/yutori/loop.py diff --git a/pkg/templates/python/yutori-computer-use/main.py b/pkg/templates/python/yutori/main.py similarity index 100% rename from pkg/templates/python/yutori-computer-use/main.py rename to pkg/templates/python/yutori/main.py diff --git a/pkg/templates/python/yutori-computer-use/pyproject.toml b/pkg/templates/python/yutori/pyproject.toml similarity index 100% rename from pkg/templates/python/yutori-computer-use/pyproject.toml rename to pkg/templates/python/yutori/pyproject.toml diff --git a/pkg/templates/python/yutori-computer-use/session.py b/pkg/templates/python/yutori/session.py similarity index 100% rename from pkg/templates/python/yutori-computer-use/session.py rename to pkg/templates/python/yutori/session.py diff --git a/pkg/templates/python/yutori-computer-use/tools/__init__.py b/pkg/templates/python/yutori/tools/__init__.py similarity index 100% rename from pkg/templates/python/yutori-computer-use/tools/__init__.py rename to pkg/templates/python/yutori/tools/__init__.py diff --git a/pkg/templates/python/yutori-computer-use/tools/base.py b/pkg/templates/python/yutori/tools/base.py similarity index 100% rename from pkg/templates/python/yutori-computer-use/tools/base.py rename to pkg/templates/python/yutori/tools/base.py diff --git a/pkg/templates/python/yutori-computer-use/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py similarity index 95% rename from pkg/templates/python/yutori-computer-use/tools/computer.py rename to pkg/templates/python/yutori/tools/computer.py index 2d5784ec..d078aded 100644 --- a/pkg/templates/python/yutori-computer-use/tools/computer.py +++ b/pkg/templates/python/yutori/tools/computer.py @@ -140,24 +140,23 @@ async def _handle_click(self, action: N1Action, button: str, num_clicks: int) -> async def _handle_scroll(self, action: N1Action) -> ToolResult: coords = self._get_coordinates(action.get("coordinates")) direction = action.get("direction") - amount = action.get("amount", 3) + notches = max(action.get("amount", 3), 1) if direction not in ("up", "down", "left", "right"): raise ToolError(f"Invalid scroll direction: {direction}") - scroll_delta = amount * 100 - + # Backend (kernel-images) uses delta_x/delta_y as wheel-event repeat count (notches), not pixels. delta_x = 0 delta_y = 0 if direction == "up": - delta_y = -scroll_delta + delta_y = -notches elif direction == "down": - delta_y = scroll_delta + delta_y = notches elif direction == "left": - delta_x = -scroll_delta + delta_x = -notches elif direction == "right": - delta_x = scroll_delta + delta_x = notches self.kernel.browsers.computer.scroll( self.session_id, @@ -168,7 +167,9 @@ async def _handle_scroll(self, action: N1Action) -> ToolResult: ) await asyncio.sleep(SCREENSHOT_DELAY_S) - return await self.screenshot() + screenshot_result = await self.screenshot() + screenshot_result["output"] = f"Scrolled {notches} wheel unit(s) {direction}." + return screenshot_result async def _handle_type(self, action: N1Action) -> ToolResult: text = action.get("text") diff --git a/pkg/templates/typescript/anthropic-computer-use/loop.ts b/pkg/templates/typescript/anthropic-computer-use/loop.ts index cc209d1d..cd582414 100644 --- a/pkg/templates/typescript/anthropic-computer-use/loop.ts +++ b/pkg/templates/typescript/anthropic-computer-use/loop.ts @@ -18,6 +18,7 @@ const SYSTEM_PROMPT = ` * As the initial step click on the search bar. * When viewing a page it can be helpful to zoom out so that you can see everything on the page. * Either that, or make sure you scroll down to see everything before deciding something isn't available. +* Scroll action: scroll_amount and the tool result are in wheel units (not pixels). * When using your computer function calls, they take a while to run and send back to you. * Where possible/feasible, try to chain multiple of these calls all into one function calls request. * The current date is ${DateTime.now().toFormat('EEEE, MMMM d, yyyy')}. diff --git a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts index 580ea238..05ee40f4 100644 --- a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts @@ -295,41 +295,35 @@ export class ComputerTool implements BaseAnthropicTool { const scrollDirection = scrollDirectionParam || kwargs.scroll_direction; const scrollAmountValue = scrollAmount || scroll_amount; - if (!scrollDirection || !['up', 'down', 'left', 'right'].includes(scrollDirection)) { - throw new ToolError(`Scroll direction "${scrollDirection}" must be 'up', 'down', 'left', or 'right'`); + const dir = scrollDirection && typeof scrollDirection === 'string' && ['up', 'down', 'left', 'right'].includes(scrollDirection) ? scrollDirection : null; + if (!dir) { + throw new ToolError(`Scroll direction "${String(scrollDirection)}" must be 'up', 'down', 'left', or 'right'`); } if (typeof scrollAmountValue !== 'number' || scrollAmountValue < 0) { throw new ToolError(`Scroll amount "${scrollAmountValue}" must be a non-negative number`); } - const [x, y] = coordinate + const [x, y] = coordinate ? ActionValidator.validateAndGetCoordinates(coordinate) : this.lastMousePosition; + // Backend (kernel-images) uses delta_x/delta_y as wheel-event repeat count (notches), not pixels. + const notches = Math.max(scrollAmountValue ?? 1, 1); let delta_x = 0; let delta_y = 0; - // Each scroll_amount unit = 1 scroll wheel click ≈ 120 pixels (matches Anthropic's xdotool behavior) - const scrollDelta = (scrollAmountValue ?? 1) * 120; - - if (scrollDirection === 'down') { - delta_y = scrollDelta; - } else if (scrollDirection === 'up') { - delta_y = -scrollDelta; - } else if (scrollDirection === 'right') { - delta_x = scrollDelta; - } else if (scrollDirection === 'left') { - delta_x = -scrollDelta; - } + if (dir === 'down') delta_y = notches; + if (dir === 'up') delta_y = -notches; + if (dir === 'right') delta_x = notches; + if (dir === 'left') delta_x = -notches; - await this.kernel.browsers.computer.scroll(this.sessionId, { - x, - y, - delta_x, - delta_y, - }); + await this.kernel.browsers.computer.scroll(this.sessionId, { x, y, delta_x, delta_y }); - await new Promise(resolve => setTimeout(resolve, 500)); - return await this.screenshot(); + await new Promise(resolve => setTimeout(resolve, 200)); + const screenshotResult = await this.screenshot(); + return { + ...screenshotResult, + output: `Scrolled ${notches} wheel unit(s) ${dir}.`, + }; } if (action === Action.WAIT) { diff --git a/pkg/templates/typescript/gemini-computer-use/index.ts b/pkg/templates/typescript/gemini-computer-use/index.ts index 91f47e4a..dd55ff10 100644 --- a/pkg/templates/typescript/gemini-computer-use/index.ts +++ b/pkg/templates/typescript/gemini-computer-use/index.ts @@ -73,9 +73,8 @@ app.action( }, ); -// Run locally if executed directly (not imported as a module) -// Execute via: npx tsx index.ts -if (import.meta.url === `file://${process.argv[1]}`) { +// Run locally when not in Kernel invocation. Execute via: npx tsx index.ts +if (!process.env.KERNEL_INVOCATION && import.meta.url === `file://${process.argv[1]}`) { const testQuery = "Navigate to https://www.google.com and describe what you see"; console.log('Running local test with query:', testQuery); diff --git a/pkg/templates/typescript/gemini-computer-use/loop.ts b/pkg/templates/typescript/gemini-computer-use/loop.ts index ba3dc5ce..bc917966 100644 --- a/pkg/templates/typescript/gemini-computer-use/loop.ts +++ b/pkg/templates/typescript/gemini-computer-use/loop.ts @@ -5,6 +5,7 @@ import { GoogleGenAI, + Environment, type Content, type FunctionCall, type Part, @@ -103,7 +104,7 @@ export async function samplingLoop({ tools: [ { computerUse: { - environment: 'ENVIRONMENT_BROWSER', + environment: Environment.ENVIRONMENT_BROWSER, }, }, ], @@ -119,7 +120,7 @@ export async function samplingLoop({ } const candidate = response.candidates[0]; - if (!candidate.content) { + if (!candidate?.content) { console.log('No content in candidate'); break; } @@ -155,6 +156,7 @@ export async function samplingLoop({ // Execute function calls and collect results const functionResponses: Part[] = []; for (const fc of functionCalls) { + if (!fc.name) continue; const args = fc.args as GeminiFunctionArgs || {}; // Handle safety decisions if present @@ -262,7 +264,7 @@ function pruneOldScreenshots(contents: Content[]): void { // Iterate in reverse to find recent turns with screenshots for (let i = contents.length - 1; i >= 0; i--) { const content = contents[i]; - if (content.role !== 'user' || !content.parts) continue; + if (!content || content.role !== 'user' || !content.parts) continue; // Check if this turn has screenshots from predefined functions let hasScreenshot = false; diff --git a/pkg/templates/typescript/gemini-computer-use/session.ts b/pkg/templates/typescript/gemini-computer-use/session.ts index 627b4420..89e7a8bf 100644 --- a/pkg/templates/typescript/gemini-computer-use/session.ts +++ b/pkg/templates/typescript/gemini-computer-use/session.ts @@ -84,8 +84,8 @@ export class KernelBrowserSession { }, }); - this._sessionId = browser.session_id; - this._liveViewUrl = browser.browser_live_view_url; + this._sessionId = browser.session_id ?? null; + this._liveViewUrl = browser.browser_live_view_url ?? null; console.log(`Kernel browser created: ${this._sessionId}`); console.log(`Live view URL: ${this._liveViewUrl}`); @@ -138,7 +138,7 @@ export class KernelBrowserSession { const replays = await this.kernel.browsers.replays.list(this._sessionId); for (const replay of replays) { if (replay.replay_id === this._replayId) { - this._replayViewUrl = replay.replay_view_url; + this._replayViewUrl = replay.replay_view_url ?? null; replayReady = true; break; } diff --git a/pkg/templates/typescript/gemini-computer-use/tools/computer.ts b/pkg/templates/typescript/gemini-computer-use/tools/computer.ts index f415f72a..9c459513 100644 --- a/pkg/templates/typescript/gemini-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/gemini-computer-use/tools/computer.ts @@ -17,6 +17,8 @@ import { const TYPING_DELAY_MS = 12; const SCREENSHOT_DELAY_MS = 500; +const PX_PER_NOTCH = 60; +const MAX_NOTCHES_PER_ACTION = 17; /** * Computer tool that maps Gemini actions to Kernel's Computer Controls API. @@ -146,23 +148,22 @@ export class ComputerTool { if (!args.direction) { return { error: 'scroll_document requires direction' }; } - // Scroll at center of viewport const centerX = Math.round(this.screenSize.width / 2); const centerY = Math.round(this.screenSize.height / 2); - const scrollDelta = 500; - - let deltaX = 0; - let deltaY = 0; - if (args.direction === 'down') deltaY = scrollDelta; - else if (args.direction === 'up') deltaY = -scrollDelta; - else if (args.direction === 'right') deltaX = scrollDelta; - else if (args.direction === 'left') deltaX = -scrollDelta; + const magnitudePx = args.magnitude ?? 400; + const docNotches = Math.min(MAX_NOTCHES_PER_ACTION, Math.max(1, Math.round(magnitudePx / PX_PER_NOTCH))); + let docDx = 0; + let docDy = 0; + if (args.direction === 'down') docDy = docNotches; + else if (args.direction === 'up') docDy = -docNotches; + else if (args.direction === 'right') docDx = docNotches; + else if (args.direction === 'left') docDx = -docNotches; await this.kernel.browsers.computer.scroll(this.sessionId, { x: centerX, y: centerY, - delta_x: deltaX, - delta_y: deltaY, + delta_x: docDx, + delta_y: docDy, }); break; } @@ -178,26 +179,19 @@ export class ComputerTool { const x = this.denormalizeX(args.x); const y = this.denormalizeY(args.y); - // Denormalize magnitude if provided - let magnitude = args.magnitude ?? 800; - if (args.direction === 'up' || args.direction === 'down') { - magnitude = this.denormalizeY(magnitude); - } else { - magnitude = this.denormalizeX(magnitude); - } - - let deltaX = 0; - let deltaY = 0; - if (args.direction === 'down') deltaY = magnitude; - else if (args.direction === 'up') deltaY = -magnitude; - else if (args.direction === 'right') deltaX = magnitude; - else if (args.direction === 'left') deltaX = -magnitude; - + const magnitudePx = args.magnitude ?? 400; + const notches = Math.min(MAX_NOTCHES_PER_ACTION, Math.max(1, Math.round(magnitudePx / PX_PER_NOTCH))); + let atDx = 0; + let atDy = 0; + if (args.direction === 'down') atDy = notches; + else if (args.direction === 'up') atDy = -notches; + else if (args.direction === 'right') atDx = notches; + else if (args.direction === 'left') atDx = -notches; await this.kernel.browsers.computer.scroll(this.sessionId, { x, y, - delta_x: deltaX, - delta_y: deltaY, + delta_x: atDx, + delta_y: atDy, }); break; } diff --git a/pkg/templates/typescript/yutori-computer-use/README.md b/pkg/templates/typescript/yutori/README.md similarity index 100% rename from pkg/templates/typescript/yutori-computer-use/README.md rename to pkg/templates/typescript/yutori/README.md diff --git a/pkg/templates/typescript/yutori-computer-use/_gitignore b/pkg/templates/typescript/yutori/_gitignore similarity index 100% rename from pkg/templates/typescript/yutori-computer-use/_gitignore rename to pkg/templates/typescript/yutori/_gitignore diff --git a/pkg/templates/typescript/yutori-computer-use/index.ts b/pkg/templates/typescript/yutori/index.ts similarity index 100% rename from pkg/templates/typescript/yutori-computer-use/index.ts rename to pkg/templates/typescript/yutori/index.ts diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts b/pkg/templates/typescript/yutori/loop.ts similarity index 100% rename from pkg/templates/typescript/yutori-computer-use/loop.ts rename to pkg/templates/typescript/yutori/loop.ts diff --git a/pkg/templates/typescript/yutori-computer-use/package.json b/pkg/templates/typescript/yutori/package.json similarity index 100% rename from pkg/templates/typescript/yutori-computer-use/package.json rename to pkg/templates/typescript/yutori/package.json diff --git a/pkg/templates/typescript/yutori-computer-use/session.ts b/pkg/templates/typescript/yutori/session.ts similarity index 100% rename from pkg/templates/typescript/yutori-computer-use/session.ts rename to pkg/templates/typescript/yutori/session.ts diff --git a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts similarity index 96% rename from pkg/templates/typescript/yutori-computer-use/tools/computer.ts rename to pkg/templates/typescript/yutori/tools/computer.ts index 59b60d16..5ba8e09e 100644 --- a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/yutori/tools/computer.ts @@ -160,29 +160,27 @@ export class ComputerTool { private async handleScroll(action: N1Action): Promise { const coords = this.getCoordinates(action.coordinates); const direction = action.direction; - const amount = action.amount ?? 3; + const notches = Math.max(action.amount ?? 3, 1); if (!direction || !['up', 'down', 'left', 'right'].includes(direction)) { throw new ToolError(`Invalid scroll direction: ${direction}`); } - const scrollDelta = amount * 100; - let delta_x = 0; let delta_y = 0; switch (direction) { case 'up': - delta_y = -scrollDelta; + delta_y = -notches; break; case 'down': - delta_y = scrollDelta; + delta_y = notches; break; case 'left': - delta_x = -scrollDelta; + delta_x = -notches; break; case 'right': - delta_x = scrollDelta; + delta_x = notches; break; } @@ -194,7 +192,11 @@ export class ComputerTool { }); await this.sleep(SCREENSHOT_DELAY_MS); - return this.screenshot(); + const screenshotResult = await this.screenshot(); + return { + ...screenshotResult, + output: `Scrolled ${notches} wheel unit(s) ${direction}.`, + }; } private async handleType(action: N1Action): Promise { diff --git a/pkg/templates/typescript/yutori-computer-use/tsconfig.json b/pkg/templates/typescript/yutori/tsconfig.json similarity index 100% rename from pkg/templates/typescript/yutori-computer-use/tsconfig.json rename to pkg/templates/typescript/yutori/tsconfig.json