Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pkg/create/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ var Templates = map[string]TemplateInfo{
Languages: []string{LanguageTypeScript, LanguagePython},
},
TemplateYutoriComputerUse: {
Name: "Yutori n1 Computer Use",
Description: "Implements a Yutori n1 computer use agent",
Name: "Yutori n1.5 Computer Use",
Description: "Implements a Yutori n1.5 computer use agent",
Languages: []string{LanguageTypeScript, LanguagePython},
},
TemplateTzafonComputerUse: {
Expand Down
35 changes: 23 additions & 12 deletions pkg/templates/python/yutori/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Kernel Python Sample App - Yutori n1 Computer Use
# Kernel Python Sample App - Yutori n1.5 Computer Use

This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API.
This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API.

[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.
[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.

## Setup

Expand Down Expand Up @@ -55,35 +55,46 @@ kernel invoke python-yutori-cua cua-task --payload '{"query": "Enter https://exa

## Viewport Configuration

Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy.
Yutori recommends a **1280×800 (WXGA, 16:10)** viewport for n1.5 for best grounding accuracy.

> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions.
> **Note:** n1.5 outputs coordinates in a 1000×1000 relative space; this template automatically scales them to the actual viewport dimensions before executing each action.

See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations.

## Screenshots

Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori.

## n1-latest Supported Actions
## n1.5-latest Supported Actions

This template uses the `browser_tools_core-20260403` tool set — coordinate-based browser actions that operate on screenshots only.

| Action | Description |
|--------|-------------|
| `left_click` | Left mouse click at coordinates |
| `double_click` | Double-click at coordinates |
| `triple_click` | Triple-click at coordinates |
| `left_click` | Left mouse click at coordinates (supports `modifier`) |
| `double_click` | Double-click at coordinates (supports `modifier`) |
| `triple_click` | Triple-click at coordinates (supports `modifier`) |
| `middle_click` | Middle mouse click at coordinates |
| `right_click` | Right mouse click at coordinates |
| `mouse_move` | Move mouse to coordinates without clicking |
| `mouse_down` | Press the left mouse button at coordinates |
| `mouse_up` | Release the left mouse button at coordinates |
| `scroll` | Scroll page in a direction |
| `type` | Type text into focused element |
| `key_press` | Send keyboard input |
| `hover` | Move mouse without clicking |
| `key_press` | Send a single key or key combination |
| `hold_key` | Hold a key for a duration |
| `drag` | Click-and-drag operation |
| `wait` | Pause for UI to update |
| `refresh` | Reload current page |
| `go_back` | Navigate back in history |
| `go_forward` | Navigate forward in history |
| `goto_url` | Navigate to a URL |

### Disabled tools

The DOM/Playwright-based "expanded" tools (`extract_elements`, `find`, `set_element_value`, `execute_js`) are intentionally disabled via the `disable_tools` request parameter — this template runs computer-use only and does not expose a Playwright page to the model.

## Resources

- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1)
- [Yutori n1.5 API Documentation](https://docs.yutori.com/reference/n1-5)
- [Kernel Documentation](https://www.kernel.sh/docs/quickstart)
29 changes: 21 additions & 8 deletions pkg/templates/python/yutori/loop.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
"""
Yutori n1 Sampling Loop
Yutori n1.5 Sampling Loop

Implements the agent loop for Yutori's n1-latest computer use model.
n1-latest uses an OpenAI-compatible API with tool_calls:
Implements the agent loop for Yutori's n1.5-latest computer use model.
n1.5-latest uses an OpenAI-compatible API with tool_calls:
- Actions are returned via tool_calls in the assistant message
- Tool results use role: "tool" with matching tool_call_id
- The model stops by returning content without tool_calls
- Coordinates are returned in 1000x1000 space and need scaling

@see https://docs.yutori.com/reference/n1
@see https://docs.yutori.com/reference/n1-5
"""

import json
Expand All @@ -17,12 +17,18 @@
from kernel import Kernel
from openai import OpenAI

from tools import ComputerTool, N1Action, ToolResult
from tools import ComputerTool, N15Action, ToolResult

# Tools that require a Playwright page / DOM access. The default core tool set
# already excludes them, but we also list them in `disable_tools` so the
# exclusion is explicit and survives if the default ever changes.
DISABLED_TOOLS = ["extract_elements", "find", "set_element_value", "execute_js"]
TOOL_SET = "browser_tools_core-20260403"


async def sampling_loop(
*,
model: str = "n1-latest",
model: str = "n1.5-latest",
task: str,
api_key: str,
kernel: Kernel,
Expand Down Expand Up @@ -69,6 +75,13 @@ async def sampling_loop(
messages=conversation_messages,
max_completion_tokens=max_completion_tokens,
temperature=0.3,
# n1.5-specific knobs go in extra_body.
# tool_set selects the core (coordinate-based) tools.
# disable_tools is a defense-in-depth exclusion of DOM/Playwright tools.
extra_body={
"tool_set": TOOL_SET,
"disable_tools": DISABLED_TOOLS,
},
)
except Exception as api_error:
print(f"API call failed: {api_error}")
Expand Down Expand Up @@ -108,7 +121,7 @@ async def sampling_loop(
})
continue

action: N1Action = {"action_type": action_name, **args}
action: N15Action = {"action_type": action_name, **args}
print(f"Executing action: {action_name}", args)

scaled_action = _scale_coordinates(action, viewport_width, viewport_height)
Expand Down Expand Up @@ -155,7 +168,7 @@ async def sampling_loop(
}


def _scale_coordinates(action: N1Action, viewport_width: int, viewport_height: int) -> N1Action:
def _scale_coordinates(action: N15Action, viewport_width: int, viewport_height: int) -> N15Action:
scaled = dict(action)

if "coordinates" in scaled and scaled["coordinates"]:
Expand Down
4 changes: 2 additions & 2 deletions pkg/templates/python/yutori/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ async def cua_task(
payload: QueryInput,
) -> QueryOutput:
"""
Process a user query using Yutori n1 Computer Use with Kernel's browser automation.
Process a user query using Yutori n1.5 Computer Use with Kernel's browser automation.

Args:
ctx: Kernel context containing invocation information
Expand Down Expand Up @@ -58,7 +58,7 @@ async def cua_task(
print("Kernel browser live view url:", session.live_view_url)

loop_result = await sampling_loop(
model="n1-latest",
model="n1.5-latest",
task=payload["query"],
api_key=str(api_key),
kernel=session.kernel,
Expand Down
6 changes: 3 additions & 3 deletions pkg/templates/python/yutori/tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Yutori n1 Computer Tools."""
"""Yutori n1.5 Computer Tools."""

from .base import ToolError, ToolResult
from .computer import ComputerTool, N1Action
from .computer import ComputerTool, N15Action

__all__ = [
"ToolError",
"ToolResult",
"ComputerTool",
"N1Action",
"N15Action",
]
Loading
Loading