From ecc657e227b36d148f86ce18d24195ed4baae362 Mon Sep 17 00:00:00 2001 From: papadie23 Date: Sun, 28 Jun 2026 13:33:34 +0300 Subject: [PATCH 1/2] Add DeepSeek integration and fix Linux/Wayland compatibility - Add `deepseek-with-ocr` model mode using text-only OCR approach (DeepSeek API doesn't support vision, so screen text is extracted via Tesseract/EasyOCR and sent as structured text) - Add `initialize_deepseek` config with OpenAI-compatible client - Add `SYSTEM_PROMPT_OCR_TEXT_ONLY` for text-only model guidance - Show DeepSeek reasoning tokens in terminal for transparency Fixes: - Replace broken X11 screenshot with flameshot (works on Wayland) with fallbacks to gnome-screenshot, mss, then ImageGrab - Add fuzzy text matching in OCR (difflib similarity, so an OCR misread like 'Gooale' still matches 'Google') - Return None instead of raising on text-not-found to avoid crashes - Cache EasyOCR reader globally to avoid re-downloading models each loop - Strip premature 'done' operations (model must verify before claiming success) - Smarter delays: 4s after enter/navigation, 2s base - Update requirements.txt pins to >= for Python 3.13 compatibility - Fix numpy 1.26.1 -> 1.26.2 (yanked) --- operate/config.py | 30 +++++ operate/models/apis.py | 260 ++++++++++++++++++++++++++++++++++-- operate/models/prompts.py | 65 +++++++++ operate/operate.py | 27 +++- operate/utils/ocr.py | 85 +++++++----- operate/utils/screenshot.py | 35 ++++- requirements.txt | 104 +++++++-------- 7 files changed, 506 insertions(+), 100 deletions(-) diff --git a/operate/config.py b/operate/config.py index 09f78da0..f51ac9c3 100644 --- a/operate/config.py +++ b/operate/config.py @@ -46,6 +46,9 @@ def __init__(self): self.qwen_api_key = ( None # instance variables are backups in case saving to a `.env` fails ) + self.deepseek_api_key = ( + None # instance variables are backups in case saving to a `.env` fails + ) def initialize_openai(self): if self.verbose: @@ -92,6 +95,28 @@ def initialize_qwen(self): client.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" return client + def 
initialize_deepseek(self): + if self.verbose: + print("[Config][initialize_deepseek]") + + if self.deepseek_api_key: + if self.verbose: + print("[Config][initialize_deepseek] using cached deepseek_api_key") + api_key = self.deepseek_api_key + else: + if self.verbose: + print( + "[Config][initialize_deepseek] no cached deepseek_api_key, try to get from env." + ) + api_key = os.getenv("DEEPSEEK_API_KEY") + + default_base = "https://api.deepseek.com" + base_url = os.getenv("DEEPSEEK_API_BASE_URL", default_base) + client = OpenAI(api_key=api_key, base_url=base_url) + client.api_key = api_key + client.base_url = base_url + return client + def initialize_google(self): if self.google_api_key: if self.verbose: @@ -149,6 +174,9 @@ def validation(self, model, voice_mode): "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" ) self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") + self.require_api_key( + "DEEPSEEK_API_KEY", "DeepSeek API key", model == "deepseek-with-ocr" + ) def require_api_key(self, key_name, key_description, is_required): key_exists = bool(os.environ.get(key_name)) @@ -177,6 +205,8 @@ def prompt_and_save_api_key(self, key_name, key_description): self.anthropic_api_key = key_value elif key_name == "QWEN_API_KEY": self.qwen_api_key = key_value + elif key_name == "DEEPSEEK_API_KEY": + self.deepseek_api_key = key_value self.save_api_key_to_env(key_name, key_value) load_dotenv() # Reload environment variables # Update the instance attribute with the new key diff --git a/operate/models/apis.py b/operate/models/apis.py index 23794fca..ed0425e0 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -8,6 +8,7 @@ import easyocr import ollama import pkg_resources +import pytesseract from PIL import Image from ultralytics import YOLO @@ -25,7 +26,7 @@ ) from operate.utils.ocr import get_text_coordinates, get_text_element from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot -from 
operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET +from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET, ANSI_YELLOW # Load configuration config = Config() @@ -62,6 +63,9 @@ async def get_next_action(model, messages, objective, session_id): if model == "claude-3": operation = await call_claude_3_with_ocr(messages, objective, model) return operation, None + if model == "deepseek-with-ocr": + operation = await call_deepseek_with_ocr(messages, objective, model) + return operation, None raise ModelNotRecognizedException(model) @@ -218,6 +222,11 @@ async def call_qwen_vl_with_ocr(messages, objective, model): text_element_index = get_text_element( result, text_to_click, screenshot_filename ) + if text_element_index is None: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found, skipping click. {ANSI_RESET}" + ) + continue coordinates = get_text_coordinates( result, text_element_index, screenshot_filename ) @@ -259,6 +268,218 @@ async def call_qwen_vl_with_ocr(messages, objective, model): traceback.print_exc() return gpt_4_fallback(messages, objective, model) +# Cache EasyOCR reader globally to avoid re-initializing every loop +_easyocr_reader = None + +def _get_easyocr_reader(): + global _easyocr_reader + if _easyocr_reader is None: + _easyocr_reader = easyocr.Reader(["en"]) + return _easyocr_reader + + +async def call_deepseek_with_ocr(messages, objective, model): + if config.verbose: + print("[call_deepseek_with_ocr]") + + try: + # Smarter delay: extra wait after enter/navigation, base otherwise + wait_time = 2 + if len(messages) >= 3: + try: + last_assistant = messages[-2].get("content", "") + last_ops = json.loads(last_assistant) + for op in last_ops: + keys = op.get("keys", []) + if any(k.lower() == "enter" for k in keys): + wait_time = 4 + break + except (json.JSONDecodeError, KeyError, IndexError): + pass + time.sleep(wait_time) + + client = 
config.initialize_deepseek() + + confirm_system_prompt(messages, objective, model) + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + capture_screen_with_cursor(screenshot_filename) + + # Use Tesseract for fast, accurate text extraction with bounding boxes + import pytesseract + from PIL import Image as PILImage + img = PILImage.open(screenshot_filename) + ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT) + + ocr_text_elements = [] + ocr_text_list = "" + idx = 0 + for i, text in enumerate(ocr_data["text"]): + t = text.strip() + if t and len(t) > 0 and int(ocr_data["conf"][i]) > 20: + x = ocr_data["left"][i] + ocr_data["width"][i] // 2 + y = ocr_data["top"][i] + ocr_data["height"][i] // 2 + ocr_text_elements.append({"index": idx, "text": t, "x": x, "y": y}) + ocr_text_list += f"[{idx}] \"{t}\"\n" + idx += 1 + + # Show what Tesseract sees on screen + print(f"\n{ANSI_GREEN}═══ SCREEN TEXT ═══{ANSI_RESET}") + if ocr_text_elements: + for e in ocr_text_elements: + print(f" [{e['index']}] \"{e['text']}\"") + else: + print(f" {ANSI_YELLOW}(no text detected){ANSI_RESET}") + print(f"{ANSI_GREEN}═══════════════════{ANSI_RESET}\n") + + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() + + if not ocr_text_elements: + # Check message history to understand what happened last + last_actions = "" + if len(messages) >= 3: + try: + last = json.loads(messages[-2].get("content", "[]")) + last_keys = [] + for op in last: + if op.get("operation") == "press": + last_keys.extend(op.get("keys", [])) + elif op.get("operation") == "write": + last_keys.append(f"typed '{op.get('content','')}'") + if last_keys: + last_actions = f" Your last actions: {', '.join(last_keys)}. 
" + except Exception: + pass + + screen_context = ( + f"SCREEN STATUS: No readable text detected on screen.{last_actions}" + f"If you just tried to launch an app, it may already be open - try Alt+Tab or check the panel/dock. " + f"If you're in terminal, you can run commands directly (e.g. type 'vivaldi' and press Enter). " + f"Keyboard shortcuts (Win/Super, Alt+Tab, Ctrl+T) are reliable even with no visible text." + ) + else: + screen_context = ( + f"Available text elements on screen ({len(ocr_text_elements)} found):\n" + f"{ocr_text_list}\n" + f"When you need to click, reference the exact text string from this list." + ) + + text_only_prompt = ( + f"{user_prompt}\n\n" + f"{screen_context}\n\n" + f"**REMEMBER** Only output valid JSON array. Do not append any other text." + ) + + messages.append({"role": "user", "content": text_only_prompt}) + + model_name = os.getenv("DEEPSEEK_MODEL_NAME", "deepseek-v4-pro") + response = client.chat.completions.create( + model=model_name, + messages=messages, + extra_body={"thinking": {"type": "enabled"}}, + ) + + # Show DeepSeek's reasoning/thought process + if hasattr(response.choices[0].message, "reasoning_content") and response.choices[0].message.reasoning_content: + print(f"\n{ANSI_BRIGHT_MAGENTA}[DeepSeek Reasoning]{ANSI_RESET}") + print(response.choices[0].message.reasoning_content[:500]) + print(f"{ANSI_BRIGHT_MAGENTA}[End Reasoning]{ANSI_RESET}\n") + + content = response.choices[0].message.content + + content = clean_json(content) + + content_str = content + + content = json.loads(content) + + processed_content = [] + skipped_clicks = False + + for operation in content: + if operation.get("operation") == "click": + text_to_click = operation.get("text") + if not text_to_click or text_to_click == "nothing to click": + continue + + if config.verbose: + print( + "[call_deepseek_with_ocr][click] text_to_click", + text_to_click, + ) + + # Find coordinates from Tesseract data via fuzzy match + from difflib import SequenceMatcher 
+ search_lower = text_to_click.lower().strip() + best_score = 0.0 + best_elem = None + for elem in ocr_text_elements: + score = SequenceMatcher(None, search_lower, elem["text"].lower()).ratio() + if score > best_score: + best_score = score + best_elem = elem + + if best_score >= 0.5 and best_elem: + img_w, img_h = PILImage.open(screenshot_filename).size + operation["x"] = round(best_elem["x"] / img_w, 3) + operation["y"] = round(best_elem["y"] / img_h, 3) + else: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found on screen (best match: {best_score:.2f}), skipping click. {ANSI_RESET}" + ) + skipped_clicks = True + continue + + operation["x"] = coordinates["x"] + operation["y"] = coordinates["y"] + + if config.verbose: + print( + "[call_deepseek_with_ocr][click] text_element_index", + text_element_index, + ) + print( + "[call_deepseek_with_ocr][click] coordinates", + coordinates, + ) + print( + "[call_deepseek_with_ocr][click] final operation", + operation, + ) + processed_content.append(operation) + + else: + processed_content.append(operation) + + # If clicks were skipped, tell the model to use keyboard navigation instead + if skipped_clicks: + assistant_message = { + "role": "assistant", + "content": content_str + + "\n[System: Some text elements were not found to click. 
Use keyboard shortcuts (Tab, Enter) or press operations instead of relying on clicks.]", + } + else: + assistant_message = {"role": "assistant", "content": content_str} + + messages.append(assistant_message) + + return processed_content + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] Error: {e} {ANSI_RESET}" + ) + if config.verbose: + traceback.print_exc() + return [] + + def call_gemini_pro_vision(messages, objective): """ Get the next action for Self-Operating Computer using Gemini Pro Vision @@ -382,6 +603,11 @@ async def call_gpt_4o_with_ocr(messages, objective, model): text_element_index = get_text_element( result, text_to_click, screenshot_filename ) + if text_element_index is None: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found, skipping click. {ANSI_RESET}" + ) + continue coordinates = get_text_coordinates( result, text_element_index, screenshot_filename ) @@ -490,6 +716,11 @@ async def call_gpt_4_1_with_ocr(messages, objective, model): text_element_index = get_text_element( result, text_to_click, screenshot_filename ) + if text_element_index is None: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found, skipping click. {ANSI_RESET}" + ) + continue coordinates = get_text_coordinates( result, text_element_index, screenshot_filename ) @@ -601,6 +832,11 @@ async def call_o1_with_ocr(messages, objective, model): text_element_index = get_text_element( result, text_to_click, screenshot_filename ) + if text_element_index is None: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found, skipping click. 
{ANSI_RESET}" + ) + continue coordinates = get_text_coordinates( result, text_element_index, screenshot_filename ) @@ -1077,17 +1313,21 @@ def get_last_assistant_message(messages): def gpt_4_fallback(messages, objective, model): if config.verbose: print("[gpt_4_fallback]") - system_prompt = get_system_prompt("gpt-4o", objective) - new_system_message = {"role": "system", "content": system_prompt} - # remove and replace the first message in `messages` with `new_system_message` - - messages[0] = new_system_message + try: + system_prompt = get_system_prompt("gpt-4o", objective) + new_system_message = {"role": "system", "content": system_prompt} + messages[0] = new_system_message - if config.verbose: - print("[gpt_4_fallback][updated]") - print("[gpt_4_fallback][updated] len(messages)", len(messages)) + if config.verbose: + print("[gpt_4_fallback][updated]") + print("[gpt_4_fallback][updated] len(messages)", len(messages)) - return call_gpt_4o(messages) + return call_gpt_4o(messages) + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] GPT-4 fallback also failed: {e} {ANSI_RESET}" + ) + raise def confirm_system_prompt(messages, objective, model): diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 8cea8f3e..28fafd8d 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -195,6 +195,62 @@ Objective: {objective} """ +SYSTEM_PROMPT_OCR_TEXT_ONLY = """ +You are operating a {operating_system} computer, using the same operating system as a human. + +You will receive a list of text elements currently visible on the screen. The previous message may show what actions you attempted and their results. Use this feedback to decide your next action. + +You have 4 possible operation actions available. The `pyautogui` library will execute your decision. Your output will be used in a `json.loads` loads statement. + +1. click - Click on text. Use the exact text string from the available text elements list. 
If nothing matches, use Tab/arrow keys to navigate instead. +``` +[{{ "thought": "write a thought here", "operation": "click", "text": "The exact text string to click" }}] +``` +2. write - Type text with your keyboard. Only use when you are sure the correct field is focused. +``` +[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] +``` +3. press - Use a hotkey or press a key (Tab, Enter, Escape, arrows, ctrl+t, etc.) +``` +[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] +``` +4. done - ONLY use this when the objective is verifiably complete. You must be certain the task succeeded. If unsure, take another action to check. +``` +[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] +``` + +CRITICAL RULES: +- Take 1-2 actions at most per response. Do NOT plan more than 2 steps ahead. +- NEVER output "done" unless you have clear evidence the task succeeded. +- After pressing Enter to navigate to a URL, STOP. Wait for the next screenshot before doing anything else. +- When writing into a Google Docs document, first make sure the document body is focused (click into it or press Tab). +- If you don't see expected text on screen, your previous action may have failed. React to what you actually see, not what you hoped would happen. +- Use keyboard shortcuts over clicks whenever possible (Tab to move between fields, Enter to submit, Escape to close). +- To create a new blank Google Doc, navigate directly to `docs.google.com/document/create` — no clicking needed. +- If a browser is already open, use Ctrl+T for a new tab instead of launching the browser again. 
+ +Example: Open a browser and go to a website (3 separate turns, NOT one turn): +``` +Turn 1: +[ + {{ "thought": "Opening app launcher to find browser", "operation": "press", "keys": {os_search_str} }} +] +Turn 2: +[ + {{ "thought": "Typing browser name", "operation": "write", "content": "Firefox" }}, + {{ "thought": "Pressing Enter to open browser", "operation": "press", "keys": ["enter"] }} +] +Turn 3 (after seeing browser has loaded): +[ + {{ "thought": "Opening a new tab", "operation": "press", "keys": [{cmd_string}, "t"] }}, + {{ "thought": "Typing the URL", "operation": "write", "content": "https://example.com" }}, + {{ "thought": "Pressing Enter to navigate", "operation": "press", "keys": ["enter"] }} +] +``` + +Objective: {objective} +""" + OPERATE_FIRST_MESSAGE_PROMPT = """ Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done @@ -232,6 +288,15 @@ def get_system_prompt(model, objective): os_search_str=os_search_str, operating_system=operating_system, ) + elif model == "deepseek-with-ocr": + + prompt = SYSTEM_PROMPT_OCR_TEXT_ONLY.format( + objective=objective, + cmd_string=cmd_string, + os_search_str=os_search_str, + operating_system=operating_system, + ) + elif model == "gpt-4-with-ocr" or model == "gpt-4.1-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl": prompt = SYSTEM_PROMPT_OCR.format( diff --git a/operate/operate.py b/operate/operate.py index c63d9851..60b0580f 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -112,9 +112,25 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): get_next_action(model, messages, objective, session_id) ) - stop = operate(operations, model) - if stop: - break + # Separate "done" from other operations + non_done_ops = [op for op in operations if op.get("operation", 
"").lower() != "done"] + done_ops = [op for op in operations if op.get("operation", "").lower() == "done"] + + # Execute non-done operations first + if non_done_ops: + stop = operate(non_done_ops, model) + if stop: + break + + # Only accept "done" if it was the ONLY operation (model is confident) + if done_ops and not non_done_ops: + stop = operate(done_ops, model) + if stop: + break + elif done_ops: + print( + f"{ANSI_YELLOW}[Self-Operating Computer] Ignored premature 'done' — still have actions to verify.{ANSI_RESET}" + ) loop_count += 1 if loop_count > 10: @@ -134,6 +150,7 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): def operate(operations, model): if config.verbose: print("[Self Operating Computer][operate]") + last_was_navigation = False for operation in operations: if config.verbose: print("[Self Operating Computer][operate] operation", operation) @@ -149,6 +166,8 @@ def operate(operations, model): keys = operation.get("keys") operate_detail = keys operating_system.press(keys) + if "enter" in [k.lower() for k in keys]: + last_was_navigation = True elif operate_type == "write": content = operation.get("content") operate_detail = content @@ -158,8 +177,8 @@ def operate(operations, model): y = operation.get("y") click_detail = {"x": x, "y": y} operate_detail = click_detail - operating_system.mouse(click_detail) + last_was_navigation = True elif operate_type == "done": summary = operation.get("summary") diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py index 937511b0..1391cdc9 100644 --- a/operate/utils/ocr.py +++ b/operate/utils/ocr.py @@ -2,65 +2,88 @@ from PIL import Image, ImageDraw import os from datetime import datetime +from difflib import SequenceMatcher # Load configuration config = Config() +def _fuzzy_match(text_a, text_b): + """Return similarity ratio between two strings (0.0 to 1.0)""" + return SequenceMatcher(None, text_a.lower(), text_b.lower()).ratio() + + def get_text_element(result, search_text, image_path): 
""" - Searches for a text element in the OCR results and returns its index. Also draws bounding boxes on the image. + Searches for a text element in the OCR results using exact substring match + followed by fuzzy matching. Returns index or None if not found. + Args: result (list): The list of results returned by EasyOCR. search_text (str): The text to search for in the OCR results. image_path (str): Path to the original image. Returns: - int: The index of the element containing the search text. - - Raises: - Exception: If the text element is not found in the results. + int or None: The index of the element containing the search text. """ if config.verbose: print("[get_text_element]") print("[get_text_element] search_text", search_text) - # Create /ocr directory if it doesn't exist ocr_dir = "ocr" if not os.path.exists(ocr_dir): os.makedirs(ocr_dir) - - # Open the original image image = Image.open(image_path) draw = ImageDraw.Draw(image) - found_index = None + search_lower = search_text.lower().strip() + + # Phase 1: exact substring match for index, element in enumerate(result): text = element[1] - box = element[0] - - if config.verbose: - # Draw bounding box in blue - draw.polygon([tuple(point) for point in box], outline="blue") - - if search_text in text: - found_index = index + if search_lower in text.lower(): if config.verbose: - print("[get_text_element][loop] found search_text, index:", index) + print(f"[get_text_element] exact match found at index {index}: '{text}'") + _draw_debug(result, index, image, draw, ocr_dir) + return index - if found_index is not None: + # Phase 2: fuzzy match (for OCR errors) + best_score = 0.0 + best_index = None + for index, element in enumerate(result): + text = element[1] + score = _fuzzy_match(search_lower, text.lower()) + if score > best_score: + best_score = score + best_index = index + + if best_score >= 0.6 and best_index is not None: if config.verbose: - # Draw bounding box of the found text in red - box = 
result[found_index][0] - draw.polygon([tuple(point) for point in box], outline="red") - # Save the image with bounding boxes - datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") - ocr_image_path = os.path.join(ocr_dir, f"ocr_image_{datetime_str}.png") - image.save(ocr_image_path) - print("[get_text_element] OCR image saved at:", ocr_image_path) - - return found_index - - raise Exception("The text element was not found in the image") + matched_text = result[best_index][1] + print(f"[get_text_element] fuzzy match (score={best_score:.2f}): '{matched_text}'") + _draw_debug(result, best_index, image, draw, ocr_dir) + return best_index + + if config.verbose: + print(f"[get_text_element] not found (best fuzzy score={best_score:.2f})") + + return None + + +def _draw_debug(result, index, image, draw, ocr_dir): + """Draw bounding boxes on the image for debug visualization.""" + if not config.verbose: + return + try: + for element in result: + draw.polygon([tuple(point) for point in element[0]], outline="blue") + box = result[index][0] + draw.polygon([tuple(point) for point in box], outline="red", width=3) + datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") + ocr_image_path = os.path.join(ocr_dir, f"ocr_image_{datetime_str}.png") + image.save(ocr_image_path) + print("[get_text_element] OCR debug image saved at:", ocr_image_path) + except Exception: + pass def get_text_coordinates(result, index, image_path): diff --git a/operate/utils/screenshot.py b/operate/utils/screenshot.py index 23d492f1..af58dc30 100644 --- a/operate/utils/screenshot.py +++ b/operate/utils/screenshot.py @@ -15,7 +15,40 @@ def capture_screen_with_cursor(file_path): screenshot = pyautogui.screenshot() screenshot.save(file_path) elif user_platform == "Linux": - # Use xlib to prevent scrot dependency for Linux + # Try flameshot first (works on both X11 and Wayland) + try: + result = subprocess.run( + ["flameshot", "full", "-p", file_path], + capture_output=True, + timeout=10, + ) + if 
result.returncode == 0 and os.path.exists(file_path): + return + except Exception: + pass + + # Fallback: try gnome-screenshot + try: + result = subprocess.run( + ["gnome-screenshot", "-f", file_path], + capture_output=True, + timeout=10, + ) + if result.returncode == 0 and os.path.exists(file_path): + return + except Exception: + pass + + # Fallback: try mss + try: + import mss + with mss.MSS() as sct: + sct.shot(output=file_path) + return + except Exception: + pass + + # Fallback: original X11 method screen = Xlib.display.Display().screen() size = screen.width_in_pixels, screen.height_in_pixels screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1])) diff --git a/requirements.txt b/requirements.txt index c7a646be..e68ced80 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,55 +1,51 @@ -annotated-types==0.6.0 -anyio==3.7.1 -certifi==2023.7.22 -charset-normalizer==3.3.2 -colorama==0.4.6 -contourpy==1.2.0 -cycler==0.12.1 -distro==1.8.0 -EasyProcess==1.1 -entrypoint2==1.1 -exceptiongroup==1.1.3 -fonttools==4.44.0 -h11==0.14.0 -httpcore==1.0.2 +annotated-types>=0.6.0 +anyio>=3.7.1 +certifi>=2023.7.22 +charset-normalizer>=3.3.2 +colorama>=0.4.6 +contourpy>=1.2.0 +cycler>=0.12.1 +distro>=1.8.0 +EasyProcess>=1.1 +entrypoint2>=1.1 +fonttools>=4.44.0 +h11>=0.14.0 +httpcore>=1.0.2 httpx>=0.25.2 -idna==3.4 -importlib-resources==6.1.1 -kiwisolver==1.4.5 -matplotlib==3.8.1 -MouseInfo==0.1.3 -mss==9.0.1 -numpy==1.26.1 -openai==1.2.3 -packaging==23.2 -Pillow==10.1.0 -prompt-toolkit==3.0.39 -PyAutoGUI==0.9.54 -pydantic==2.4.2 -pydantic_core==2.10.1 -PyGetWindow==0.0.9 -PyMsgBox==1.0.9 -pyparsing==3.1.1 -pyperclip==1.8.2 -PyRect==0.2.0 -pyscreenshot==3.1 -PyScreeze==0.1.29 -python3-xlib==0.15 -python-dateutil==2.8.2 -python-dotenv==1.0.0 -pytweening==1.0.7 -requests==2.31.0 -rubicon-objc==0.4.7 -six==1.16.0 -sniffio==1.3.0 -tqdm==4.66.1 -typing_extensions==4.8.0 -urllib3==2.0.7 -wcwidth==0.2.9 -zipp==3.17.0 -google-generativeai==0.3.0 -aiohttp==3.9.1 
-ultralytics==8.0.227 -easyocr==1.7.1 -ollama==0.1.6 -anthropic \ No newline at end of file +idna>=3.4 +kiwisolver>=1.4.5 +matplotlib>=3.8.1 +MouseInfo>=0.1.3 +mss>=9.0.1 +numpy>=1.26.2 +openai>=1.2.3 +packaging>=23.2 +Pillow>=10.1.0 +prompt-toolkit>=3.0.39 +PyAutoGUI>=0.9.54 +pydantic>=2.4.2 +pydantic_core>=2.10.1 +PyGetWindow>=0.0.9 +PyMsgBox>=1.0.9 +pyparsing>=3.1.1 +pyperclip>=1.8.2 +PyRect>=0.2.0 +pyscreenshot>=3.1 +PyScreeze>=0.1.29 +python3-xlib>=0.15 +python-dateutil>=2.8.2 +python-dotenv>=1.0.0 +pytweening>=1.0.7 +requests>=2.31.0 +six>=1.16.0 +sniffio>=1.3.0 +tqdm>=4.66.1 +typing_extensions>=4.8.0 +urllib3>=2.0.7 +wcwidth>=0.2.9 +google-generativeai>=0.3.0 +aiohttp>=3.9.1 +ultralytics>=8.0.227 +easyocr>=1.7.1 +ollama>=0.1.6 +anthropic From 0a4e26ef8842f7820f43d6907f5657824ea26c3d Mon Sep 17 00:00:00 2001 From: papadie23 Date: Sun, 28 Jun 2026 14:17:18 +0300 Subject: [PATCH 2/2] Fix coordinates crash, enable mouse clicks, improve focus prompt --- operate/models/apis.py | 28 +++++++++------------------- operate/models/prompts.py | 4 ++-- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index ed0425e0..b36771ad 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -429,30 +429,20 @@ async def call_deepseek_with_ocr(messages, objective, model): img_w, img_h = PILImage.open(screenshot_filename).size operation["x"] = round(best_elem["x"] / img_w, 3) operation["y"] = round(best_elem["y"] / img_h, 3) + if config.verbose: + print( + "[call_deepseek_with_ocr][click] matched:", + best_elem["text"], + "at", + operation["x"], + operation["y"], + ) + processed_content.append(operation) else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found on screen (best match: {best_score:.2f}), skipping click. 
{ANSI_RESET}" ) skipped_clicks = True - continue - - operation["x"] = coordinates["x"] - operation["y"] = coordinates["y"] - - if config.verbose: - print( - "[call_deepseek_with_ocr][click] text_element_index", - text_element_index, - ) - print( - "[call_deepseek_with_ocr][click] coordinates", - coordinates, - ) - print( - "[call_deepseek_with_ocr][click] final operation", - operation, - ) - processed_content.append(operation) else: processed_content.append(operation) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 28fafd8d..15f1923f 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -225,9 +225,9 @@ - After pressing Enter to navigate to a URL, STOP. Wait for the next screenshot before doing anything else. - When writing into a Google Docs document, first make sure the document body is focused (click into it or press Tab). - If you don't see expected text on screen, your previous action may have failed. React to what you actually see, not what you hoped would happen. -- Use keyboard shortcuts over clicks whenever possible (Tab to move between fields, Enter to submit, Escape to close). +- Use both clicks and keyboard shortcuts freely. Keyboard shortcuts (Tab, Enter, Escape, Ctrl+T) are reliable, but clicks are equally valid when you can identify text to click. - To create a new blank Google Doc, navigate directly to `docs.google.com/document/create` — no clicking needed. -- If a browser is already open, use Ctrl+T for a new tab instead of launching the browser again. +- If a browser is already open, click on a visible browser element (tab, address bar, bookmark) FIRST to focus the window, THEN use Ctrl+T or keyboard shortcuts. Example: Open a browser and go to a website (3 separate turns, NOT one turn): ```