13 changes: 6 additions & 7 deletions main.py
@@ -1,8 +1,4 @@
 from windows_use.llms.google import ChatGoogle
-from windows_use.llms.anthropic import ChatAnthropic
-from windows_use.llms.ollama import ChatOllama
-from windows_use.llms.mistral import ChatMistral
-from windows_use.llms.azure_openai import ChatAzureOpenAI
 from windows_use.agent import Agent, Browser
 from dotenv import load_dotenv
 import os
@@ -11,8 +7,10 @@
 
 def main():
     api_key = os.getenv("GOOGLE_API_KEY")
-    # llm=ChatMistral(model='magistral-small-latest',api_key=api_key,temperature=0.7)
-    llm=ChatGoogle(model="gemini-2.5-flash-lite",thinking_budget=0, api_key=api_key, temperature=0.7)
+    # llm=ChatMistral(model='magistral-small-latest',api_key=api_key,temperature=0.0, profile="deterministic")
+    # Some external ChatGoogle implementations may not accept `profile`.
+    # Use a backward-compatible call without `profile` to avoid TypeError.
+    llm=ChatGoogle(model="gemini-2.5-flash", api_key=api_key, temperature=0.0)
     # llm=ChatAnthropic(model="claude-sonnet-4-5", api_key=api_key, temperature=0.7,max_tokens=1000)
     # llm=ChatOllama(model="qwen3-vl:235b-cloud",temperature=0.2)
     # llm=ChatAzureOpenAI(
@@ -23,7 +21,8 @@ def main():
     #     api_version=os.getenv("AOAI_API_VERSION", "2025-01-01-preview"),
     #     temperature=0.7
     # )
-    agent = Agent(llm=llm, browser=Browser.EDGE, use_vision=False, auto_minimize=True)
+    # Configure agent for deterministic, low-latency operation: fewer retries and steps.
+    agent = Agent(llm=llm, browser=Browser.EDGE, use_vision=False, auto_minimize=True, max_consecutive_failures=1, max_steps=10)
     agent.print_response(query=input("Enter a query: "))
 
 if __name__ == "__main__":
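Since `profile` support varies across `ChatGoogle` builds, a guarded construction avoids the `TypeError` the comment above warns about. A minimal sketch, assuming `profile="deterministic"` is the intended value (mirroring the `ChatMistral` comment):

```python
# Pass `profile` only when the installed ChatGoogle signature accepts it.
# The `profile="deterministic"` value is an assumption for illustration.
import inspect
import os

from windows_use.llms.google import ChatGoogle

def make_llm() -> ChatGoogle:
    api_key = os.getenv("GOOGLE_API_KEY")
    kwargs = dict(model="gemini-2.5-flash", api_key=api_key, temperature=0.0)
    # Only newer ChatGoogle signatures expose `profile`; older ones raise TypeError.
    if "profile" in inspect.signature(ChatGoogle.__init__).parameters:
        kwargs["profile"] = "deterministic"
    return ChatGoogle(**kwargs)
```

Signature introspection keeps the call site working against both old and new wrappers without a try/except around construction.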
8 changes: 8 additions & 0 deletions tests/unit/agent/test_deterministic_profile.py
@@ -0,0 +1,8 @@
+from windows_use.llms.google import ChatGoogle
+
+
+def test_google_llm_default_is_deterministic():
+    llm = ChatGoogle(model="test-model")
+    assert hasattr(llm, 'temperature')
+    assert llm.temperature == 0.0
+    assert hasattr(llm, 'profile')
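The new test pins the defaults only. A complementary check of the explicit path could look like this, assuming `ChatGoogle` accepts `profile` as a keyword, as the widened signature in `windows_use/llms/google.py` suggests:

```python
from windows_use.llms.google import ChatGoogle

def test_google_llm_accepts_explicit_profile():
    # Assumes the new keyword; the value "deterministic" is illustrative.
    llm = ChatGoogle(model="test-model", profile="deterministic")
    assert llm.profile == "deterministic"
```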
2 changes: 2 additions & 0 deletions windows_use/agent/prompt/action.md
@@ -5,6 +5,8 @@
 <action>
 <name>{action_name}</name>
 <input>{action_input}</input>
+<evidence>Provide the minimal observable evidence you expect after this action (e.g., "Dialog 'Save as' open", "Button 'OK' visible").</evidence>
+<post_check>Provide a one-line verification the agent must perform after the action to confirm success.</post_check>
 </action>
 </output>
 ```
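For illustration, a filled-in action block under the extended contract might read as follows (the tool name, input, and evidence strings are hypothetical):

```xml
<action>
<name>click</name>
<input>{"loc": [412, 310]}</input>
<evidence>Dialog 'Save as' open</evidence>
<post_check>Confirm a window titled 'Save as' is present in the UI tree.</post_check>
</action>
```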
35 changes: 33 additions & 2 deletions windows_use/agent/registry/service.py
@@ -1,6 +1,7 @@
 from windows_use.agent.registry.views import ToolResult
 from windows_use.agent.desktop.service import Desktop
 from windows_use.tool import Tool
+from windows_use.tool.service import ToolResult as RawToolResult
 from textwrap import dedent
 import json
 
@@ -31,8 +32,35 @@ def execute(self, tool_name: str, desktop: Desktop|None=None, **kwargs) -> ToolR
         if tool is None:
             return ToolResult(is_success=False, error=f"Tool '{tool_name}' not found.")
         try:
+            # Preprocess common alternative selectors (e.g., label -> loc)
+            if desktop and 'label' in kwargs and 'loc' not in kwargs:
+                try:
+                    label = int(kwargs.pop('label'))
+                    coords = desktop.get_coordinates_from_label(label)
+                    kwargs['loc'] = coords
+                except (IndexError, ValueError) as e:
+                    return ToolResult(is_success=False, error=f"Invalid label selector: {e}")
+
             args=tool.model.model_validate(kwargs)
-            content = tool.invoke(**({'desktop': desktop} | args.model_dump()))
-            return ToolResult(is_success=True, content=content)
+            raw = tool.invoke(**({'desktop': desktop} | args.model_dump()))
+            # If tool returned a raw ToolResult (from windows_use.tool.service), map it
+            if isinstance(raw, RawToolResult):
+                is_ok = raw.status.lower() in ("ok", "success")
+                content_str = None
+                if raw.evidence:
+                    # Prefer a readable representation of evidence (json is imported at module level)
+                    try:
+                        content_str = json.dumps(raw.evidence)
+                    except (TypeError, ValueError):
+                        content_str = str(raw.evidence)
+                elif raw.details:
+                    content_str = str(raw.details)
+                return ToolResult(is_success=is_ok, content=content_str, error=None if is_ok else str(raw.details or "tool reported failure"), confidence=raw.confidence, evidence=raw.evidence)
+            # If tool returned an arbitrary dict or string, normalize it
+            if isinstance(raw, dict):
+                return ToolResult(is_success=True, content=str(raw))
+            if isinstance(raw, str):
+                return ToolResult(is_success=True, content=raw)
+            return ToolResult(is_success=True, content=str(raw))
         except Exception as error:
             return ToolResult(is_success=False, error=str(error))
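The label-to-coordinates preprocessing is the subtle part of this hunk. A self-contained sketch of the same logic, with a plain dict standing in for `Desktop.get_coordinates_from_label` (whose exact return type, an `(x, y)` pair, is an assumption here):

```python
# Resolve a numbered element label into click coordinates before validation.
def resolve_label(kwargs: dict, coords_by_label: dict[int, tuple[int, int]]) -> dict:
    if 'label' in kwargs and 'loc' not in kwargs:
        label = int(kwargs.pop('label'))        # non-numeric label -> ValueError
        kwargs['loc'] = coords_by_label[label]  # unknown label -> KeyError in this stub
    return kwargs

print(resolve_label({"label": "17"}, {17: (412, 310)}))  # {'loc': (412, 310)}
```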
4 changes: 3 additions & 1 deletion windows_use/agent/registry/views.py
@@ -3,4 +3,6 @@
 class ToolResult(BaseModel):
     is_success: bool
     content: str | None = None
-    error: str | None = None
+    error: str | None = None
+    confidence: float | None = None
+    evidence: dict | None = None
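Both new fields are optional, so existing call sites keep working. For example:

```python
from windows_use.agent.registry.views import ToolResult

ok = ToolResult(is_success=True, content="clicked", confidence=0.98,
                evidence={"clicked": "button 'OK'"})
legacy = ToolResult(is_success=False, error="element not found")  # old shape still valid
```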
24 changes: 23 additions & 1 deletion windows_use/agent/service.py
@@ -86,11 +86,17 @@ def invoke(self,query: str)->AgentResult:
        for consecutive_failures in range(1, self.max_consecutive_failures + 1):
            try:
                llm_response = self.llm.invoke(messages+error_messages)
+                # Ensure LLM returned usable content
+                if not hasattr(llm_response, 'content') or llm_response.content is None:
+                    raise ValueError("LLM returned empty or invalid content")
                agent_data = xml_parser(llm_response)
                break
            except ValueError as e:
                error_messages.clear()
-                error_messages.append(llm_response)
+                # Append previous LLM content if available for context
+                if 'llm_response' in locals() and llm_response is not None:
+                    prev_content = getattr(llm_response, 'content', None) or str(llm_response)
+                    error_messages.append(HumanMessage(content=f"Previous response: {prev_content}"))
                error_messages.append(HumanMessage(content=f"Response rejected, invalid response format\nError: {e}\nAdhere to the format specified in <output_contract>"))
                logger.warning(f"[LLM]: Invalid response format, Retrying attempt {consecutive_failures}/{self.max_consecutive_failures}...")
                if consecutive_failures == self.max_consecutive_failures:
@@ -147,10 +153,26 @@ def invoke(self,query: str)->AgentResult:
            else:
                logger.info(f"[Tool] 🔧 Action: {action_name}({', '.join(f'{k}={v}' for k, v in params.items())})")
            action_response = self.registry.execute(tool_name=action_name, desktop=self.desktop, **params)
+            # Basic post-action verification and confidence handling
            observation = action_response.content if action_response.is_success else action_response.error
            logger.info(f"[Tool] 📝 Observation: {observation}\n")
            agent_data.observation = observation
 
+            # If tool returned a confidence and it's low, treat as non-success and request retry
+            low_confidence = False
+            if getattr(action_response, 'confidence', None) is not None:
+                try:
+                    conf = float(action_response.confidence)
+                    if conf < 0.95:
+                        low_confidence = True
+                except (TypeError, ValueError):
+                    pass
+
+            if low_confidence:
+                logger.warning(f"[Tool] ⚠️ Low confidence ({action_response.confidence}); requesting clarification/retry.")
+                # Convert to an observation that signals failure for the LLM to reconsider
+                agent_data.observation = f"LOW_CONFIDENCE: {observation}"
+
            desktop_state = self.desktop.get_state(use_vision=self.use_vision)
            human_prompt = Prompt.observation_prompt(query=query, agent_step=self.agent_step,
                                                     tool_result=action_response, desktop_state=desktop_state
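The 0.95 cutoff is hard-coded above. Factored out as a pure function, the gate behaves like this (a sketch for clarity, not code from the PR):

```python
def is_low_confidence(confidence, threshold: float = 0.95) -> bool:
    if confidence is None:
        return False            # tools that report no confidence are trusted
    try:
        return float(confidence) < threshold
    except (TypeError, ValueError):
        return False            # malformed confidence values are ignored

assert is_low_confidence(0.5) is True
assert is_low_confidence(None) is False
assert is_low_confidence("not-a-number") is False
```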
3 changes: 2 additions & 1 deletion windows_use/llms/anthropic.py
@@ -9,12 +9,13 @@
 
 @dataclass
 class ChatAnthropic(BaseChatLLM):
-    def __init__(self, model: str, api_key: str, thinking_budget:int=-1, temperature: float = 0.7, max_tokens: int = 8192, auth_token: str | None = None, base_url: str | None = None, timeout: float | None = None, max_retries: int = 3, default_headers: dict[str, str] | None = None, default_query: dict[str, object] | None = None, http_client: Client | None = None, strict_response_validation: bool = False):
+    def __init__(self, model: str, api_key: str, thinking_budget:int=-1, temperature: float = 0.0, profile: str | None = None, max_tokens: int = 8192, auth_token: str | None = None, base_url: str | None = None, timeout: float | None = None, max_retries: int = 3, default_headers: dict[str, str] | None = None, default_query: dict[str, object] | None = None, http_client: Client | None = None, strict_response_validation: bool = False):
         self.model = model
         self.api_key = api_key
         self.auth_token = auth_token
         self.max_tokens = max_tokens
         self.temperature = temperature
+        self.profile = profile
         self.base_url = base_url
         self.thinking_budget=thinking_budget
         self.timeout = timeout
4 changes: 3 additions & 1 deletion windows_use/llms/azure_openai.py
@@ -18,7 +18,8 @@ def __init__(
         api_key: str,
         model: str | None = None,
         api_version: str = "2024-10-21",
-        temperature: float = 0.7,
+        temperature: float = 0.0,
+        profile: str | None = None,
         max_retries: int = 3,
         timeout: float | None = None,
         default_headers: dict[str, str] | None = None,
@@ -31,6 +32,7 @@
         self.model = model
         self.api_version = api_version
         self.temperature = temperature
+        self.profile = profile
         self.max_retries = max_retries
         self.timeout = timeout
         self.default_headers = default_headers
3 changes: 2 additions & 1 deletion windows_use/llms/cerebras.py
@@ -10,10 +10,11 @@
 
 @dataclass
 class ChatCerebras(BaseChatLLM):
-    def __init__(self, model: str, api_key: str, temperature: float = 0.7, base_url: str | None = None, timeout: float | None = None, max_retries: int = 3, default_headers: dict[str, str] | None = None, default_query: dict[str, object] | None = None, http_client: Client | None = None, strict_response_validation: bool = False, warm_tcp_connection: bool = True):
+    def __init__(self, model: str, api_key: str, temperature: float = 0.0, profile: str | None = None, base_url: str | None = None, timeout: float | None = None, max_retries: int = 3, default_headers: dict[str, str] | None = None, default_query: dict[str, object] | None = None, http_client: Client | None = None, strict_response_validation: bool = False, warm_tcp_connection: bool = True):
         self.model = model
         self.api_key = api_key
         self.temperature = temperature
+        self.profile = profile
         self.base_url = base_url
         self.timeout = timeout
         self.max_retries = max_retries
3 changes: 2 additions & 1 deletion windows_use/llms/google.py
@@ -30,11 +30,12 @@ def run_async(coro):
 
 @dataclass
 class ChatGoogle(BaseChatLLM):
-    def __init__(self, model: str, thinking_budget: int=-1, api_key: str=None, vertexai: bool|None=None, project: str|None=None, location: str|None=None, credentials: Credentials|None=None,http_options: types.HttpOptions | types.HttpOptionsDict | None = None, debug_config: DebugConfig | None = None, temperature: float = 0.7):
+    def __init__(self, model: str, thinking_budget: int=-1, api_key: str=None, vertexai: bool|None=None, project: str|None=None, location: str|None=None, credentials: Credentials|None=None,http_options: types.HttpOptions | types.HttpOptionsDict | None = None, debug_config: DebugConfig | None = None, temperature: float = 0.0, profile: str | None = None):
         self.model = model
         self.api_key = api_key
         self.vertexai = vertexai
         self.temperature = temperature
+        self.profile = profile
         self.credentials = credentials
         self.project = project
         self.location = location
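Across all providers the pattern is identical: `temperature` now defaults to 0.0 and an optional `profile` is stored on the instance. Note that the diff only stores `profile`; nothing shown here reads it yet. A quick usage sketch against the new `ChatGoogle` signature (the API key is a placeholder):

```python
from windows_use.llms.google import ChatGoogle

llm = ChatGoogle(model="gemini-2.5-flash", api_key="...", profile="deterministic")
assert llm.temperature == 0.0 and llm.profile == "deterministic"
```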
3 changes: 2 additions & 1 deletion windows_use/llms/groq.py
@@ -11,10 +11,11 @@
 
 @dataclass
 class ChatGroq(BaseChatLLM):
-    def __init__(self, model: str, api_key: str, base_url: str|None=None, temperature: float = 0.7,max_retries: int = 3,timeout: int|None=None, default_headers: dict[str, str] | None = None, default_query: dict[str, object] | None = None, http_client: Client | None = None, strict_response_validation: bool = False):
+    def __init__(self, model: str, api_key: str, base_url: str|None=None, temperature: float = 0.0, profile: str | None = None, max_retries: int = 3,timeout: int|None=None, default_headers: dict[str, str] | None = None, default_query: dict[str, object] | None = None, http_client: Client | None = None, strict_response_validation: bool = False):
         self.model = model
         self.api_key = api_key
         self.temperature = temperature
+        self.profile = profile
         self.max_retries = max_retries
         self.base_url = base_url
         self.timeout = timeout
3 changes: 2 additions & 1 deletion windows_use/llms/mistral.py
@@ -9,10 +9,11 @@
 
 @dataclass
 class ChatMistral(BaseChatLLM):
-    def __init__(self, model: str, api_key: str, max_tokens: int|None=None, temperature: float = 0.7, server: Union[str, None] = None, server_url: Union[str, None] = None, url_params: Dict[str, str] = None, client: Type[HttpClient] = None, async_client: Type[AsyncHttpClient] = None,retry_config: OptionalNullable[RetryConfig] = None,timeout_ms: Union[int, None] = None,debug_logger: Union[logging.Logger, None] = None):
+    def __init__(self, model: str, api_key: str, max_tokens: int|None=None, temperature: float = 0.0, profile: str | None = None, server: Union[str, None] = None, server_url: Union[str, None] = None, url_params: Dict[str, str] = None, client: Type[HttpClient] = None, async_client: Type[AsyncHttpClient] = None,retry_config: OptionalNullable[RetryConfig] = None,timeout_ms: Union[int, None] = None,debug_logger: Union[logging.Logger, None] = None):
         self.model = model
         self.api_key = api_key
         self.temperature = temperature
+        self.profile = profile
         self.server = server
         self.max_tokens = max_tokens
         self.server_url = server_url
3 changes: 2 additions & 1 deletion windows_use/llms/ollama.py
@@ -7,11 +7,12 @@
 
 @dataclass
 class ChatOllama(BaseChatLLM):
-    def __init__(self,host: str|None=None, model: str|None=None, think:bool=False, temperature: float = 0.7,timeout: int|None=None):
+    def __init__(self,host: str|None=None, model: str|None=None, think:bool=False, temperature: float = 0.0, profile: str | None = None, timeout: int|None=None):
         self.host = host
         self.model = model
         self.think=think
         self.temperature = temperature
+        self.profile = profile
         self.timeout = timeout
         self._client = None
 
3 changes: 2 additions & 1 deletion windows_use/llms/open_router.py
@@ -11,10 +11,11 @@
 
 @dataclass
 class ChatOpenRouter(BaseChatLLM):
-    def __init__(self, model: str, api_key: str, base_url: str|None=None, temperature: float = 0.7,max_retries: int = 3,timeout: int|None=None, default_headers: dict[str, str] | None = None, default_query: dict[str, object] | None = None, http_client: Client | None = None, strict_response_validation: bool = False):
+    def __init__(self, model: str, api_key: str, base_url: str|None=None, temperature: float = 0.0, profile: str | None = None, max_retries: int = 3,timeout: int|None=None, default_headers: dict[str, str] | None = None, default_query: dict[str, object] | None = None, http_client: Client | None = None, strict_response_validation: bool = False):
         self.model = model
         self.api_key = api_key
         self.temperature = temperature
+        self.profile = profile
         self.max_retries = max_retries
         self.base_url = base_url
         self.timeout = timeout
3 changes: 2 additions & 1 deletion windows_use/llms/openai.py
@@ -11,10 +11,11 @@
 
 @dataclass
 class ChatOpenAI(BaseChatLLM):
-    def __init__(self, model: str, api_key: str, organization: str|None=None, project: str|None=None, base_url: str|None=None, websocket_base_url: str|None=None, temperature: float = 0.7,max_retries: int = 3,timeout: int|None=None, default_headers: dict[str, str] | None = None, default_query: dict[str, object] | None = None, http_client: Client | None = None, strict_response_validation: bool = False):
+    def __init__(self, model: str, api_key: str, organization: str|None=None, project: str|None=None, base_url: str|None=None, websocket_base_url: str|None=None, temperature: float = 0.0, profile: str | None = None, max_retries: int = 3,timeout: int|None=None, default_headers: dict[str, str] | None = None, default_query: dict[str, object] | None = None, http_client: Client | None = None, strict_response_validation: bool = False):
         self.model = model
         self.api_key = api_key
         self.temperature = temperature
+        self.profile = profile
         self.max_retries = max_retries
         self.organization = organization
         self.project = project
24 changes: 24 additions & 0 deletions windows_use/simulator.py
@@ -0,0 +1,24 @@
+from dataclasses import dataclass
+
+@dataclass
+class DummyDesktopState:
+    tree: dict
+    screenshot: bytes | None = None
+
+class Simulator:
+    """Simple deterministic simulator harness for unit tests."""
+    def __init__(self):
+        self.state = DummyDesktopState(tree={"apps": [], "interactive": []}, screenshot=None)
+
+    def step_click(self, target_id: str):
+        # Deterministic behavior: if the target exists, return success
+        for el in self.state.tree.get("interactive", []):
+            if el.get("id") == target_id:
+                return {"status": "ok", "evidence": {"clicked": target_id}, "confidence": 1.0}
+        return {"status": "error", "details": "element_not_found", "confidence": 0.0}
+
+    def add_element(self, el: dict):
+        self.state.tree.setdefault("interactive", []).append(el)
+
+    def get_state(self):
+        return self.state
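A short usage sketch of the harness; the returned dicts follow the raw `ToolResult` shape (`status`/`evidence`/`confidence`) that `windows_use/tool/service.py` normalizes:

```python
from windows_use.simulator import Simulator

sim = Simulator()
sim.add_element({"id": "btn-ok", "role": "button"})
assert sim.step_click("btn-ok") == {"status": "ok", "evidence": {"clicked": "btn-ok"}, "confidence": 1.0}
assert sim.step_click("missing")["status"] == "error"
```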
5 changes: 5 additions & 0 deletions windows_use/telemetry/views.py
@@ -25,4 +25,9 @@ class AgentTelemetryEvent(BaseTelemetryEvent):
     error: str | None=None
     event_name: str = "agent_event"
     is_success:bool=False
+    action_intent_confidence: float | None = None
+    action_success: bool | None = None
+    evidence_mismatch: bool | None = None
+    rerun_count: int | None = None
+    post_check_pass: bool | None = None
 
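A sketch of how the new per-action fields might be populated after the confidence gate runs (values are illustrative, and this assumes the remaining `BaseTelemetryEvent` fields are optional):

```python
from windows_use.telemetry.views import AgentTelemetryEvent

# Assumption: inherited fields not shown in this diff have defaults.
event = AgentTelemetryEvent(
    is_success=True,
    action_intent_confidence=0.97,
    action_success=True,
    evidence_mismatch=False,
    rerun_count=0,
    post_check_pass=True,
)
```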
21 changes: 19 additions & 2 deletions windows_use/tool/service.py
@@ -1,5 +1,5 @@
 from pydantic import BaseModel
-from typing import Any
+from typing import Any, Union
 
 class Tool:
     def __init__(self, name: str|None=None, description: str|None=None, args_schema:BaseModel|None=None):
@@ -28,4 +28,21 @@ def __call__(self, function):
         return self
 
     def invoke(self, *args, **kwargs):
-        return self.function(*args, **kwargs)
+        result = self.function(*args, **kwargs)
+        # Normalize and validate the result to a ToolResult when possible
+        if isinstance(result, dict):
+            try:
+                return ToolResult.model_validate(result)
+            except Exception:
+                return ToolResult(status="error", evidence={"raw": result}, confidence=0.0, details="invalid tool result schema")
+        if isinstance(result, ToolResult):
+            return result
+        # Wrap arbitrary return values
+        return ToolResult(status="ok", evidence={"result": result}, confidence=1.0)
+
+
+class ToolResult(BaseModel):
+    status: str
+    evidence: dict | None = None
+    confidence: float = 1.0
+    details: Union[str, dict, None] = None
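The normalization in `Tool.invoke` covers three return shapes: a dict is validated into `ToolResult`, an existing `ToolResult` passes through, and anything else is wrapped with `status="ok"`. A sketch exercising the first and third paths via the decorator (tool names here are hypothetical):

```python
from windows_use.tool.service import Tool, ToolResult

@Tool(name="ping", description="trivial demo tool")
def ping(desktop=None):
    return {"status": "ok", "evidence": {"pong": True}}  # dict -> validated ToolResult

result = ping.invoke()
assert isinstance(result, ToolResult) and result.status == "ok"

@Tool(name="answer", description="returns a bare value")
def answer(desktop=None):
    return 42  # arbitrary value -> wrapped with status="ok", confidence=1.0

assert answer.invoke().evidence == {"result": 42}
```

Defining `ToolResult` below `Tool` is safe because the name is only resolved when `invoke` runs, but moving the model above the class would read more naturally.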