kernel · dprevoznik · Mar 3, 2026 · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026
diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md
@@ -58,7 +58,7 @@ Here are all valid language + template combinations:
 | typescript | openai-computer-use    | ts-openai-cua     | ts-openai-cua         | Yes            | OPENAI_API_KEY                 |
 | typescript | gemini-computer-use    | ts-gemini-cua     | ts-gemini-cua         | Yes            | GOOGLE_API_KEY                 |
 | typescript | claude-agent-sdk       | ts-claude-agent-sdk | ts-claude-agent-sdk | Yes            | ANTHROPIC_API_KEY              |
-| typescript | yutori-computer-use    | ts-yutori-cua     | ts-yutori-cua         | Yes            | YUTORI_API_KEY                 |
+| typescript | yutori                 | ts-yutori-cua     | ts-yutori-cua         | Yes            | YUTORI_API_KEY                 |
 
 | python     | sample-app             | py-sample-app     | python-basic          | No             | -                              |
 | python     | gemini-computer-use    | py-gemini-cua     | python-gemini-cua     | Yes            | GOOGLE_API_KEY                 |
@@ -68,7 +68,7 @@ Here are all valid language + template combinations:
 | python     | openai-computer-use    | py-openai-cua     | python-openai-cua     | Yes            | OPENAI_API_KEY                 |
 | python     | openagi-computer-use   | py-openagi-cua    | python-openagi-cua    | Yes            | OAGI_API_KEY                   |
 | python     | claude-agent-sdk       | py-claude-agent-sdk | py-claude-agent-sdk | Yes            | ANTHROPIC_API_KEY              |
-| python     | yutori-computer-use    | py-yutori-cua     | python-yutori-cua     | Yes            | YUTORI_API_KEY                 |
+| python     | yutori                 | py-yutori-cua     | python-yutori-cua     | Yes            | YUTORI_API_KEY                 |
 
 > **Yutori:** Test both default browser and `"kiosk": true` (uses Playwright for goto_url when kiosk is enabled).
 
@@ -86,7 +86,7 @@ Run each of these (they are non-interactive when all flags are provided):
 ../bin/kernel create -n ts-openai-cua -l typescript -t openai-computer-use
 ../bin/kernel create -n ts-gemini-cua -l typescript -t gemini-computer-use
 ../bin/kernel create -n ts-claude-agent-sdk -l typescript -t claude-agent-sdk
-../bin/kernel create -n ts-yutori-cua -l typescript -t yutori-computer-use
+../bin/kernel create -n ts-yutori-cua -l typescript -t yutori
 
 # Python templates
 ../bin/kernel create -n py-sample-app -l python -t sample-app
@@ -97,7 +97,7 @@ Run each of these (they are non-interactive when all flags are provided):
 ../bin/kernel create -n py-openagi-cua -l python -t openagi-computer-use
 ../bin/kernel create -n py-claude-agent-sdk -l python -t claude-agent-sdk
 ../bin/kernel create -n py-gemini-cua -l python -t gemini-computer-use
-../bin/kernel create -n py-yutori-cua -l python -t yutori-computer-use
+../bin/kernel create -n py-yutori-cua -l python -t yutori
 ```
 
 ## Step 5: Deploy Each Template

diff --git a/pkg/create/templates.go b/pkg/create/templates.go
@@ -18,7 +18,7 @@ const (
 	TemplateStagehand            = "stagehand"
 	TemplateOpenAGIComputerUse   = "openagi-computer-use"
 	TemplateClaudeAgentSDK       = "claude-agent-sdk"
-	TemplateYutoriComputerUse    = "yutori-computer-use"
+	TemplateYutoriComputerUse    = "yutori"
 )
 
 type TemplateInfo struct {

diff --git a/pkg/templates/python/anthropic-computer-use/loop.py b/pkg/templates/python/anthropic-computer-use/loop.py
@@ -50,6 +50,7 @@ class APIProvider(StrEnum):
 * As the initial step click on the search bar.
 * When viewing a page it can be helpful to zoom out so that you can see everything on the page.
 * Either that, or make sure you scroll down to see everything before deciding something isn't available.
+* Scroll action: scroll_amount and the tool result are in wheel units (not pixels).
 * When using your computer function calls, they take a while to run and send back to you.
 * Where possible/feasible, try to chain multiple of these calls all into one function calls request.
 * The current date is {datetime.now().strftime("%A, %B %d, %Y")}.

diff --git a/pkg/templates/python/anthropic-computer-use/tools/computer.py b/pkg/templates/python/anthropic-computer-use/tools/computer.py
@@ -370,21 +370,17 @@ async def __call__(
             else:
                 x, y = self._last_mouse_position
 
-            # Each scroll_amount unit = 1 scroll wheel click ≈ 120 pixels (matches Anthropic's xdotool behavior)
-            scroll_factor = scroll_amount * 120
-
+            notches = max(scroll_amount or 1, 1)
             delta_x = 0
             delta_y = 0
             if scroll_direction == "up":
-                delta_y = -scroll_factor
+                delta_y = -notches
             elif scroll_direction == "down":
-                delta_y = scroll_factor
+                delta_y = notches
             elif scroll_direction == "left":
-                delta_x = -scroll_factor
+                delta_x = -notches
             elif scroll_direction == "right":
-                delta_x = scroll_factor
-
-            print(f"Scrolling {abs(delta_x) if delta_x != 0 else abs(delta_y)} pixels {scroll_direction}")
+                delta_x = notches
 
             self.kernel.browsers.computer.scroll(
                 id=self.session_id,
@@ -393,7 +389,12 @@ async def __call__(
                 delta_x=delta_x,
                 delta_y=delta_y,
             )
-            return await self.screenshot()
+
+            await asyncio.sleep(0.2)
+            screenshot_result = await self.screenshot()
+            return screenshot_result.replace(
+                output=f"Scrolled {notches} wheel unit(s) {scroll_direction}."
+            )
 
         if action in ("hold_key", "wait"):
             if duration is None or not isinstance(duration, (int, float)):

diff --git a/pkg/templates/python/gemini-computer-use/main.py b/pkg/templates/python/gemini-computer-use/main.py
@@ -75,9 +75,8 @@ async def cua_task(
     }
 
 
-# Run locally if executed directly (not imported as a module)
-# Execute via: uv run main.py
-if __name__ == "__main__":
+# Run locally only when executed directly and not inside a Kernel invocation. Execute via: uv run main.py
+if __name__ == "__main__" and not os.getenv("KERNEL_INVOCATION"):
     import asyncio
 
     async def main():

diff --git a/pkg/templates/python/gemini-computer-use/tools/computer.py b/pkg/templates/python/gemini-computer-use/tools/computer.py
@@ -21,6 +21,8 @@
 
 TYPING_DELAY_MS = 12
 SCREENSHOT_DELAY_SECS = 0.5
+PX_PER_NOTCH = 60
+MAX_NOTCHES_PER_ACTION = 17
 
 
 class ComputerTool:
@@ -131,22 +133,21 @@ async def execute_action(
             elif action_name == GeminiAction.SCROLL_DOCUMENT:
                 if "direction" not in args:
                     return ToolResult(error="scroll_document requires direction")
-                # Scroll at center of viewport
                 center_x = self.screen_size.width // 2
                 center_y = self.screen_size.height // 2
-                scroll_delta = 500
 
-                delta_x, delta_y = 0, 0
+                magnitude_px = args.get("magnitude", 400)
+                doc_notches = min(MAX_NOTCHES_PER_ACTION, max(1, round(magnitude_px / PX_PER_NOTCH)))
                 direction = args["direction"]
+                delta_x = delta_y = 0
                 if direction == "down":
-                    delta_y = scroll_delta
+                    delta_y = doc_notches
                 elif direction == "up":
-                    delta_y = -scroll_delta
+                    delta_y = -doc_notches
                 elif direction == "right":
-                    delta_x = scroll_delta
+                    delta_x = doc_notches
                 elif direction == "left":
-                    delta_x = -scroll_delta
-
+                    delta_x = -doc_notches
                 self.kernel.browsers.computer.scroll(
                     self.session_id,
                     x=center_x,
@@ -164,24 +165,18 @@ async def execute_action(
                 x = self.denormalize_x(args["x"])
                 y = self.denormalize_y(args["y"])
 
-                # Denormalize magnitude if provided
-                magnitude = args.get("magnitude", 800)
+                magnitude_px = args.get("magnitude", 400)
+                notches = min(MAX_NOTCHES_PER_ACTION, max(1, round(magnitude_px / PX_PER_NOTCH)))
                 direction = args["direction"]
-                if direction in ("up", "down"):
-                    magnitude = self.denormalize_y(magnitude)
-                else:
-                    magnitude = self.denormalize_x(magnitude)
-
-                delta_x, delta_y = 0, 0
+                delta_x = delta_y = 0
                 if direction == "down":
-                    delta_y = magnitude
+                    delta_y = notches
                 elif direction == "up":
-                    delta_y = -magnitude
+                    delta_y = -notches
                 elif direction == "right":
-                    delta_x = magnitude
+                    delta_x = notches
                 elif direction == "left":
-                    delta_x = -magnitude
-
+                    delta_x = -notches
                 self.kernel.browsers.computer.scroll(
                     self.session_id,
                     x=x,

diff --git a/pkg/templates/python/openagi-computer-use/kernel_handler.py b/pkg/templates/python/openagi-computer-use/kernel_handler.py
@@ -36,13 +36,16 @@ class KernelActionHandler:
     - HOTKEY -> press_key(keys=[...])
     - TYPE -> type_text(text=...)
     - SCROLL -> scroll(x, y, delta_y=...)
+
+    Note: OpenAGI/Lux tends to emit N separate scroll actions for "scroll by N" (e.g. 3
+    identical [scroll] actions for "scroll down with amount 3"). We treat each scroll event
+    as one scroll unit (1 notch), so N consecutive events scroll N notches without fighting the model.
     """
 
     def __init__(
         self,
         session: "KernelBrowserSession",
         action_pause: float = 0.1,
-        scroll_amount: int = 100,
         wait_duration: float = 1.0,
         type_delay: int = 50,
     ):
@@ -52,13 +55,11 @@ def __init__(
         Args:
             session: The Kernel browser session to control
             action_pause: Pause between actions in seconds
-            scroll_amount: Amount to scroll (pixels)
             wait_duration: Duration for wait actions in seconds
             type_delay: Delay between keystrokes in milliseconds
         """
         self.session = session
         self.action_pause = action_pause
-        self.scroll_amount = scroll_amount
         self.wait_duration = wait_duration
         self.type_delay = type_delay
 
@@ -239,21 +240,25 @@ def _execute_hotkey(self, keys: list[str]):
             keys=keys,
         )
 
-    def _execute_scroll(self, x: int, y: int, direction: str):
+    def _execute_scroll(self, x: int, y: int, direction: str, notches: int = 1):
         """Execute a scroll action."""
-        # Move to position first
-        self.session.kernel.browsers.computer.move_mouse(
-            id=self.session.session_id,
-            x=x,
-            y=y,
-        )
-        # Scroll in the specified direction
-        delta_y = self.scroll_amount if direction == "up" else -self.scroll_amount
+        notches = max(notches, 1)
+        delta_x = 0
+        delta_y = 0
+        if direction == "up":
+            delta_y = -notches
+        elif direction == "down":
+            delta_y = notches
+        elif direction == "left":
+            delta_x = -notches
+        elif direction == "right":
+            delta_x = notches
+
         self.session.kernel.browsers.computer.scroll(
             id=self.session.session_id,
             x=x,
             y=y,
-            delta_x=0,
+            delta_x=delta_x,
             delta_y=delta_y,
         )
 
@@ -298,7 +303,7 @@ def _execute_single_action(self, action: Action) -> None:
 
             case ActionType.SCROLL:
                 x, y, direction = self._parse_scroll(arg)
-                self._execute_scroll(x, y, direction)
+                self._execute_scroll(x, y, direction, notches=1)
 
             case ActionType.FINISH:
                 # Task completion - nothing to do
@@ -316,32 +321,23 @@ def _execute_single_action(self, action: Action) -> None:
                 print(f"Unknown action type: {action.type}")
 
     def _execute_action(self, action: Action) -> None:
-        """Execute an action, potentially multiple times."""
+        """Execute an action, potentially multiple times. SCROLL: each event = 1 notch."""
         count = action.count or 1
-
         for _ in range(count):
             self._execute_single_action(action)
-            # Small pause between repeated actions
             if count > 1:
                 time.sleep(self.action_pause)
 
     async def __call__(self, actions: list[Action]) -> None:
-        """
-        Execute a list of actions.
-
-        Args:
-            actions: List of Action objects to execute
-        """
+        """Execute a list of actions."""
         if not self.session.session_id:
             raise RuntimeError("Browser session not initialized")
 
         for action in actions:
             try:
-                # Run the synchronous action execution in a thread pool
                 await asyncio.get_event_loop().run_in_executor(
                     None, self._execute_action, action
                 )
-                # Pause between actions
                 await asyncio.sleep(self.action_pause)
             except Exception as e:
                 print(f"Error executing action {action.type}: {e}")

diff --git a/...ates/python/yutori-computer-use/README.md → pkg/templates/python/yutori/README.md b/...ates/python/yutori-computer-use/README.md → pkg/templates/python/yutori/README.md
diff --git a/...tes/python/yutori-computer-use/_gitignore → pkg/templates/python/yutori/_gitignore b/...tes/python/yutori-computer-use/_gitignore → pkg/templates/python/yutori/_gitignore
diff --git a/...plates/python/yutori-computer-use/loop.py → pkg/templates/python/yutori/loop.py b/...plates/python/yutori-computer-use/loop.py → pkg/templates/python/yutori/loop.py
diff --git a/...plates/python/yutori-computer-use/main.py → pkg/templates/python/yutori/main.py b/...plates/python/yutori-computer-use/main.py → pkg/templates/python/yutori/main.py
diff --git a/...python/yutori-computer-use/pyproject.toml → pkg/templates/python/yutori/pyproject.toml b/...python/yutori-computer-use/pyproject.toml → pkg/templates/python/yutori/pyproject.toml
diff --git a/...tes/python/yutori-computer-use/session.py → pkg/templates/python/yutori/session.py b/...tes/python/yutori-computer-use/session.py → pkg/templates/python/yutori/session.py
diff --git a/...hon/yutori-computer-use/tools/__init__.py → ...templates/python/yutori/tools/__init__.py b/...hon/yutori-computer-use/tools/__init__.py → ...templates/python/yutori/tools/__init__.py
diff --git a/.../python/yutori-computer-use/tools/base.py → pkg/templates/python/yutori/tools/base.py b/.../python/yutori-computer-use/tools/base.py → pkg/templates/python/yutori/tools/base.py
diff --git a/...hon/yutori-computer-use/tools/computer.py → ...templates/python/yutori/tools/computer.py b/...hon/yutori-computer-use/tools/computer.py → ...templates/python/yutori/tools/computer.py
@@ -140,24 +140,23 @@ async def _handle_click(self, action: N1Action, button: str, num_clicks: int) ->
     async def _handle_scroll(self, action: N1Action) -> ToolResult:
         coords = self._get_coordinates(action.get("coordinates"))
         direction = action.get("direction")
-        amount = action.get("amount", 3)
+        notches = max(action.get("amount", 3), 1)
 
         if direction not in ("up", "down", "left", "right"):
             raise ToolError(f"Invalid scroll direction: {direction}")
 
-        scroll_delta = amount * 100
-
+        # Backend (kernel-images) uses delta_x/delta_y as wheel-event repeat count (notches), not pixels.
         delta_x = 0
         delta_y = 0
 
         if direction == "up":
-            delta_y = -scroll_delta
+            delta_y = -notches
         elif direction == "down":
-            delta_y = scroll_delta
+            delta_y = notches
         elif direction == "left":
-            delta_x = -scroll_delta
+            delta_x = -notches
         elif direction == "right":
-            delta_x = scroll_delta
+            delta_x = notches
 
         self.kernel.browsers.computer.scroll(
             self.session_id,
@@ -168,7 +167,9 @@ async def _handle_scroll(self, action: N1Action) -> ToolResult:
         )
 
         await asyncio.sleep(SCREENSHOT_DELAY_S)
-        return await self.screenshot()
+        screenshot_result = await self.screenshot()
+        screenshot_result["output"] = f"Scrolled {notches} wheel unit(s) {direction}."
+        return screenshot_result
 
     async def _handle_type(self, action: N1Action) -> ToolResult:
         text = action.get("text")

diff --git a/pkg/templates/typescript/anthropic-computer-use/loop.ts b/pkg/templates/typescript/anthropic-computer-use/loop.ts
@@ -18,6 +18,7 @@ const SYSTEM_PROMPT = `<SYSTEM_CAPABILITY>
 * As the initial step click on the search bar.
 * When viewing a page it can be helpful to zoom out so that you can see everything on the page.
 * Either that, or make sure you scroll down to see everything before deciding something isn't available.
+* Scroll action: scroll_amount and the tool result are in wheel units (not pixels).
 * When using your computer function calls, they take a while to run and send back to you.
 * Where possible/feasible, try to chain multiple of these calls all into one function calls request.
 * The current date is ${DateTime.now().toFormat('EEEE, MMMM d, yyyy')}.

diff --git a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts
@@ -295,41 +295,35 @@ export class ComputerTool implements BaseAnthropicTool {
       const scrollDirection = scrollDirectionParam || kwargs.scroll_direction;
       const scrollAmountValue = scrollAmount || scroll_amount;
 
-      if (!scrollDirection || !['up', 'down', 'left', 'right'].includes(scrollDirection)) {
-        throw new ToolError(`Scroll direction "${scrollDirection}" must be 'up', 'down', 'left', or 'right'`);
+      const dir = scrollDirection && typeof scrollDirection === 'string' && ['up', 'down', 'left', 'right'].includes(scrollDirection) ? scrollDirection : null;
+      if (!dir) {
+        throw new ToolError(`Scroll direction "${String(scrollDirection)}" must be 'up', 'down', 'left', or 'right'`);
       }
       if (typeof scrollAmountValue !== 'number' || scrollAmountValue < 0) {
         throw new ToolError(`Scroll amount "${scrollAmountValue}" must be a non-negative number`);
       }
 
-      const [x, y] = coordinate 
+      const [x, y] = coordinate
         ? ActionValidator.validateAndGetCoordinates(coordinate)
         : this.lastMousePosition;
 
+      // Backend (kernel-images) uses delta_x/delta_y as wheel-event repeat count (notches), not pixels.
+      const notches = Math.max(scrollAmountValue ?? 1, 1);
       let delta_x = 0;
       let delta_y = 0;
-      // Each scroll_amount unit = 1 scroll wheel click ≈ 120 pixels (matches Anthropic's xdotool behavior)
-      const scrollDelta = (scrollAmountValue ?? 1) * 120;
-
-      if (scrollDirection === 'down') {
-        delta_y = scrollDelta;
-      } else if (scrollDirection === 'up') {
-        delta_y = -scrollDelta;
-      } else if (scrollDirection === 'right') {
-        delta_x = scrollDelta;
-      } else if (scrollDirection === 'left') {
-        delta_x = -scrollDelta;
-      }
+      if (dir === 'down') delta_y = notches;
+      if (dir === 'up') delta_y = -notches;
+      if (dir === 'right') delta_x = notches;
+      if (dir === 'left') delta_x = -notches;
 
-      await this.kernel.browsers.computer.scroll(this.sessionId, {
-        x,
-        y,
-        delta_x,
-        delta_y,
-      });
+      await this.kernel.browsers.computer.scroll(this.sessionId, { x, y, delta_x, delta_y });
 
-      await new Promise(resolve => setTimeout(resolve, 500));
-      return await this.screenshot();
+      await new Promise(resolve => setTimeout(resolve, 200));
+      const screenshotResult = await this.screenshot();
+      return {
+        ...screenshotResult,
+        output: `Scrolled ${notches} wheel unit(s) ${dir}.`,
+      };
     }
 
     if (action === Action.WAIT) {