From ecc657e227b36d148f86ce18d24195ed4baae362 Mon Sep 17 00:00:00 2001
From: papadie23 <cosmineugen23@gmail.com>
Date: Sun, 28 Jun 2026 13:33:34 +0300
Subject: [PATCH 1/2] Add DeepSeek integration and fix Linux/Wayland
 compatibility

- Add `deepseek-with-ocr` model mode using a text-only OCR approach
  (DeepSeek API doesn't support vision, so screen text is extracted
  via Tesseract/EasyOCR and sent as structured text)
- Add `initialize_deepseek` config with OpenAI-compatible client
- Add `SYSTEM_PROMPT_OCR_TEXT_ONLY` for text-only model guidance
- Show DeepSeek reasoning tokens in terminal for transparency

Fixes:
- Replace broken X11 screenshot with flameshot (works on Wayland)
  with fallbacks to gnome-screenshot, mss, then ImageGrab
- Add fuzzy text matching in OCR (difflib ratio, so 'Gooale' now matches 'Google')
- Return None instead of raising on text-not-found to avoid crashes
- Cache EasyOCR reader globally to avoid re-downloading models each loop
- Strip premature 'done' operations (model must verify before claiming success)
- Smarter delays: 4s after enter/navigation, 2s base
- Update requirements.txt pins to >= for Python 3.13 compatibility
- Fix numpy 1.26.1 -> 1.26.2 (yanked)
---
 operate/config.py           |  30 +++++
 operate/models/apis.py      | 260 ++++++++++++++++++++++++++++++++++--
 operate/models/prompts.py   |  65 +++++++++
 operate/operate.py          |  27 +++-
 operate/utils/ocr.py        |  85 +++++++-----
 operate/utils/screenshot.py |  35 ++++-
 requirements.txt            | 104 +++++++--------
 7 files changed, 506 insertions(+), 100 deletions(-)

diff --git a/operate/config.py b/operate/config.py
index 09f78da0..f51ac9c3 100644
--- a/operate/config.py
+++ b/operate/config.py
@@ -46,6 +46,9 @@ def __init__(self):
         self.qwen_api_key = (
             None  # instance variables are backups in case saving to a `.env` fails
         )
+        self.deepseek_api_key = (
+            None  # instance variables are backups in case saving to a `.env` fails
+        )
 
     def initialize_openai(self):
         if self.verbose:
@@ -92,6 +95,28 @@ def initialize_qwen(self):
         client.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
         return client
 
+    def initialize_deepseek(self):
+        """Build an OpenAI-compatible client that targets the DeepSeek API.
+
+        The key comes from ``self.deepseek_api_key`` when it is set, otherwise
+        from the ``DEEPSEEK_API_KEY`` environment variable. The endpoint is
+        ``DEEPSEEK_API_BASE_URL``, or ``https://api.deepseek.com`` by default.
+        """
+        if self.verbose:
+            print("[Config][initialize_deepseek]")
+            if self.deepseek_api_key:
+                print("[Config][initialize_deepseek] using cached deepseek_api_key")
+            else:
+                print(
+                    "[Config][initialize_deepseek] no cached deepseek_api_key, try to get from env."
+                )
+
+        key = self.deepseek_api_key or os.getenv("DEEPSEEK_API_KEY")
+        endpoint = os.getenv("DEEPSEEK_API_BASE_URL", "https://api.deepseek.com")
+        client = OpenAI(api_key=key, base_url=endpoint)
+        client.api_key, client.base_url = key, endpoint
+        return client
+
     def initialize_google(self):
         if self.google_api_key:
             if self.verbose:
@@ -149,6 +174,9 @@ def validation(self, model, voice_mode):
             "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3"
         )
         self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl")
+        self.require_api_key(
+            "DEEPSEEK_API_KEY", "DeepSeek API key", model == "deepseek-with-ocr"
+        )
 
     def require_api_key(self, key_name, key_description, is_required):
         key_exists = bool(os.environ.get(key_name))
@@ -177,6 +205,8 @@ def prompt_and_save_api_key(self, key_name, key_description):
                 self.anthropic_api_key = key_value
             elif key_name == "QWEN_API_KEY":
                 self.qwen_api_key = key_value
+            elif key_name == "DEEPSEEK_API_KEY":
+                self.deepseek_api_key = key_value
             self.save_api_key_to_env(key_name, key_value)
             load_dotenv()  # Reload environment variables
             # Update the instance attribute with the new key
diff --git a/operate/models/apis.py b/operate/models/apis.py
index 23794fca..ed0425e0 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -8,6 +8,7 @@
 import easyocr
 import ollama
 import pkg_resources
+import pytesseract
 from PIL import Image
 from ultralytics import YOLO
 
@@ -25,7 +26,7 @@
 )
 from operate.utils.ocr import get_text_coordinates, get_text_element
 from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot
-from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET
+from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET, ANSI_YELLOW
 
 # Load configuration
 config = Config()
@@ -62,6 +63,9 @@ async def get_next_action(model, messages, objective, session_id):
     if model == "claude-3":
         operation = await call_claude_3_with_ocr(messages, objective, model)
         return operation, None
+    if model == "deepseek-with-ocr":
+        operation = await call_deepseek_with_ocr(messages, objective, model)
+        return operation, None
     raise ModelNotRecognizedException(model)
 
 
@@ -218,6 +222,11 @@ async def call_qwen_vl_with_ocr(messages, objective, model):
                 text_element_index = get_text_element(
                     result, text_to_click, screenshot_filename
                 )
+                if text_element_index is None:
+                    print(
+                        f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found, skipping click. {ANSI_RESET}"
+                    )
+                    continue
                 coordinates = get_text_coordinates(
                     result, text_element_index, screenshot_filename
                 )
@@ -259,6 +268,218 @@ async def call_qwen_vl_with_ocr(messages, objective, model):
             traceback.print_exc()
         return gpt_4_fallback(messages, objective, model)
 
+_easyocr_reader = None  # shared EasyOCR reader; built once, on first use
+
+def _get_easyocr_reader():
+    """Return the module-wide EasyOCR reader, creating it on the first call."""
+    global _easyocr_reader  # NOTE(review): no call site is visible in this patch
+    if _easyocr_reader is None:
+        _easyocr_reader = easyocr.Reader(["en"])
+    return _easyocr_reader
+
+
+async def call_deepseek_with_ocr(messages, objective, model):
+    """
+    Get the next operations from DeepSeek using a text-only view of the screen.
+
+    Tesseract OCRs a screenshot and the words are sent to the model as an
+    indexed list. "click" operations in the reply are resolved to fractional
+    x/y coordinates by fuzzy-matching their text; unmatched clicks are dropped.
+
+    Args:
+        messages (list): Chat history; the new user prompt and the assistant
+            reply are appended in place.
+        objective (str): Objective passed on to `confirm_system_prompt`.
+        model (str): Model identifier, also shown in error output.
+
+    Returns:
+        list: Operations to execute, or an empty list if any step raised.
+    """
+    if config.verbose:
+        print("[call_deepseek_with_ocr]")
+
+    try:
+        from difflib import SequenceMatcher
+
+        # Operations of the previous assistant turn, located by role: this function
+        # appends prompt then reply, so a fixed messages[-2] lands on the prompt.
+        last_ops = []
+        for prior in reversed(messages):
+            if prior.get("role") == "assistant":
+                try:
+                    # raw_decode tolerates the "[System: ...]" note added below
+                    reply = str(prior.get("content", "")).lstrip()
+                    parsed, _ = json.JSONDecoder().raw_decode(reply)
+                    last_ops = [op for op in parsed if isinstance(op, dict)]
+                except (json.JSONDecodeError, TypeError):
+                    pass
+                break
+
+        # Smarter delay: extra wait after enter/navigation, base otherwise
+        pressed_enter = any(
+            str(k).lower() == "enter" for op in last_ops for k in op.get("keys") or []
+        )
+        time.sleep(4 if pressed_enter else 2)
+
+        client = config.initialize_deepseek()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        os.makedirs(screenshots_dir, exist_ok=True)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        capture_screen_with_cursor(screenshot_filename)
+
+        # Use Tesseract for text extraction with bounding boxes. The image is
+        # opened once and its size kept for normalising click coordinates.
+        with Image.open(screenshot_filename) as img:
+            img_w, img_h = img.size
+            ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
+
+        ocr_text_elements = []
+        for i, text in enumerate(ocr_data["text"]):
+            t = text.strip()
+            # float(): a fractional confidence such as "96.5" makes int() raise
+            if t and float(ocr_data["conf"][i]) > 20:
+                x = ocr_data["left"][i] + ocr_data["width"][i] // 2
+                y = ocr_data["top"][i] + ocr_data["height"][i] // 2
+                ocr_text_elements.append(
+                    {"index": len(ocr_text_elements), "text": t, "x": x, "y": y}
+                )
+        ocr_text_list = "".join(
+            f"[{e['index']}] \"{e['text']}\"\n" for e in ocr_text_elements
+        )
+
+        # Show what Tesseract sees on screen
+        print(f"\n{ANSI_GREEN}═══ SCREEN TEXT ═══{ANSI_RESET}")
+        if ocr_text_elements:
+            for e in ocr_text_elements:
+                print(f"  [{e['index']}] \"{e['text']}\"")
+        else:
+            print(f"  {ANSI_YELLOW}(no text detected){ANSI_RESET}")
+        print(f"{ANSI_GREEN}═══════════════════{ANSI_RESET}\n")
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if not ocr_text_elements:
+            # Summarise the previous turn so the model knows what led here
+            last_keys = []
+            for op in last_ops:
+                if op.get("operation") == "press":
+                    last_keys.extend(str(k) for k in op.get("keys") or [])
+                elif op.get("operation") == "write":
+                    last_keys.append(f"typed '{op.get('content','')}'")
+            last_actions = ""
+            if last_keys:
+                last_actions = f" Your last actions: {', '.join(last_keys)}. "
+
+            screen_context = (
+                f"SCREEN STATUS: No readable text detected on screen.{last_actions}"
+                f"If you just tried to launch an app, it may already be open - try Alt+Tab or check the panel/dock. "
+                f"If you're in terminal, you can run commands directly (e.g. type 'vivaldi' and press Enter). "
+                f"Keyboard shortcuts (Win/Super, Alt+Tab, Ctrl+T) are reliable even with no visible text."
+            )
+        else:
+            screen_context = (
+                f"Available text elements on screen ({len(ocr_text_elements)} found):\n"
+                f"{ocr_text_list}\n"
+                f"When you need to click, reference the exact text string from this list."
+            )
+
+        text_only_prompt = (
+            f"{user_prompt}\n\n"
+            f"{screen_context}\n\n"
+            f"**REMEMBER** Only output valid JSON array. Do not append any other text."
+        )
+
+        messages.append({"role": "user", "content": text_only_prompt})
+
+        model_name = os.getenv("DEEPSEEK_MODEL_NAME", "deepseek-v4-pro")
+        response = client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            extra_body={"thinking": {"type": "enabled"}},
+        )
+
+        # Show DeepSeek's reasoning/thought process
+        reply_message = response.choices[0].message
+        reasoning = getattr(reply_message, "reasoning_content", None)
+        if reasoning:
+            print(f"\n{ANSI_BRIGHT_MAGENTA}[DeepSeek Reasoning]{ANSI_RESET}")
+            print(reasoning[:500])
+            print(f"{ANSI_BRIGHT_MAGENTA}[End Reasoning]{ANSI_RESET}\n")
+
+        content_str = clean_json(reply_message.content)
+        content = json.loads(content_str)
+
+        processed_content = []
+        skipped_clicks = False
+
+        for operation in content:
+            if operation.get("operation") != "click":
+                processed_content.append(operation)
+                continue
+
+            text_to_click = operation.get("text")
+            if not text_to_click or text_to_click == "nothing to click":
+                continue
+
+            if config.verbose:
+                print("[call_deepseek_with_ocr][click] text_to_click", text_to_click)
+
+            # Find coordinates from Tesseract data via fuzzy match
+            search_lower = text_to_click.lower().strip()
+            best_score = 0.0
+            best_elem = None
+            for elem in ocr_text_elements:
+                score = SequenceMatcher(None, search_lower, elem["text"].lower()).ratio()
+                if score > best_score:
+                    best_score = score
+                    best_elem = elem
+
+            if best_elem is None or best_score < 0.5:
+                print(
+                    f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found on screen (best match: {best_score:.2f}), skipping click. {ANSI_RESET}"
+                )
+                skipped_clicks = True
+                continue
+
+            # Centre of the matched word as a fraction of the screenshot size
+            operation["x"] = round(best_elem["x"] / img_w, 3)
+            operation["y"] = round(best_elem["y"] / img_h, 3)
+
+            if config.verbose:
+                print("[call_deepseek_with_ocr][click] matched element", best_elem)
+                print("[call_deepseek_with_ocr][click] match score", best_score)
+                print("[call_deepseek_with_ocr][click] final operation", operation)
+            processed_content.append(operation)
+
+        # If clicks were skipped, tell the model to use keyboard navigation instead
+        if skipped_clicks:
+            assistant_message = {
+                "role": "assistant",
+                "content": content_str
+                + "\n[System: Some text elements were not found to click. Use keyboard shortcuts (Tab, Enter) or press operations instead of relying on clicks.]",
+            }
+        else:
+            assistant_message = {"role": "assistant", "content": content_str}
+
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] Error: {e} {ANSI_RESET}"
+        )
+        if config.verbose:
+            traceback.print_exc()
+        return []
+
+
 def call_gemini_pro_vision(messages, objective):
     """
     Get the next action for Self-Operating Computer using Gemini Pro Vision
@@ -382,6 +603,11 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
                 text_element_index = get_text_element(
                     result, text_to_click, screenshot_filename
                 )
+                if text_element_index is None:
+                    print(
+                        f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found, skipping click. {ANSI_RESET}"
+                    )
+                    continue
                 coordinates = get_text_coordinates(
                     result, text_element_index, screenshot_filename
                 )
@@ -490,6 +716,11 @@ async def call_gpt_4_1_with_ocr(messages, objective, model):
                 text_element_index = get_text_element(
                     result, text_to_click, screenshot_filename
                 )
+                if text_element_index is None:
+                    print(
+                        f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found, skipping click. {ANSI_RESET}"
+                    )
+                    continue
                 coordinates = get_text_coordinates(
                     result, text_element_index, screenshot_filename
                 )
@@ -601,6 +832,11 @@ async def call_o1_with_ocr(messages, objective, model):
                 text_element_index = get_text_element(
                     result, text_to_click, screenshot_filename
                 )
+                if text_element_index is None:
+                    print(
+                        f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found, skipping click. {ANSI_RESET}"
+                    )
+                    continue
                 coordinates = get_text_coordinates(
                     result, text_element_index, screenshot_filename
                 )
@@ -1077,17 +1313,21 @@ def get_last_assistant_message(messages):
 def gpt_4_fallback(messages, objective, model):
     if config.verbose:
         print("[gpt_4_fallback]")
-    system_prompt = get_system_prompt("gpt-4o", objective)
-    new_system_message = {"role": "system", "content": system_prompt}
-    # remove and replace the first message in `messages` with `new_system_message`
-
-    messages[0] = new_system_message
+    try:
+        system_prompt = get_system_prompt("gpt-4o", objective)
+        new_system_message = {"role": "system", "content": system_prompt}
+        messages[0] = new_system_message
 
-    if config.verbose:
-        print("[gpt_4_fallback][updated]")
-        print("[gpt_4_fallback][updated] len(messages)", len(messages))
+        if config.verbose:
+            print("[gpt_4_fallback][updated]")
+            print("[gpt_4_fallback][updated] len(messages)", len(messages))
 
-    return call_gpt_4o(messages)
+        return call_gpt_4o(messages)
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] GPT-4 fallback also failed: {e} {ANSI_RESET}"
+        )
+        raise
 
 
 def confirm_system_prompt(messages, objective, model):
diff --git a/operate/models/prompts.py b/operate/models/prompts.py
index 8cea8f3e..28fafd8d 100644
--- a/operate/models/prompts.py
+++ b/operate/models/prompts.py
@@ -195,6 +195,62 @@
 Objective: {objective} 
 """
 
+SYSTEM_PROMPT_OCR_TEXT_ONLY = """
+You are operating a {operating_system} computer, using the same operating system as a human.
+
+You will receive a list of text elements currently visible on the screen. The previous message may show what actions you attempted and their results. Use this feedback to decide your next action.
+
+You have 4 possible operation actions available. The `pyautogui` library will execute your decision. Your output will be used in a `json.loads` loads statement.
+
+1. click - Click on text. Use the exact text string from the available text elements list. If nothing matches, use Tab/arrow keys to navigate instead.
+```
+[{{ "thought": "write a thought here", "operation": "click", "text": "The exact text string to click" }}]
+```
+2. write - Type text with your keyboard. Only use when you are sure the correct field is focused.
+```
+[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
+```
+3. press - Use a hotkey or press a key (Tab, Enter, Escape, arrows, ctrl+t, etc.)
+```
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
+```
+4. done - ONLY use this when the objective is verifiably complete. You must be certain the task succeeded. If unsure, take another action to check.
+```
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
+```
+
+CRITICAL RULES:
+- Take 1-2 actions at most per response. Do NOT plan more than 2 steps ahead.
+- NEVER output "done" unless you have clear evidence the task succeeded.
+- After pressing Enter to navigate to a URL, STOP. Wait for the next screenshot before doing anything else.
+- When writing into a Google Docs document, first make sure the document body is focused (click into it or press Tab).
+- If you don't see expected text on screen, your previous action may have failed. React to what you actually see, not what you hoped would happen.
+- Use keyboard shortcuts over clicks whenever possible (Tab to move between fields, Enter to submit, Escape to close).
+- To create a new blank Google Doc, navigate directly to `docs.google.com/document/create` — no clicking needed.
+- If a browser is already open, use Ctrl+T for a new tab instead of launching the browser again.
+
+Example: Open a browser and go to a website (3 separate turns, NOT one turn):
+```
+Turn 1:
+[
+  {{ "thought": "Opening app launcher to find browser", "operation": "press", "keys": {os_search_str} }}
+]
+Turn 2:
+[
+  {{ "thought": "Typing browser name", "operation": "write", "content": "Firefox" }},
+  {{ "thought": "Pressing Enter to open browser", "operation": "press", "keys": ["enter"] }}
+]
+Turn 3 (after seeing browser has loaded):
+[
+  {{ "thought": "Opening a new tab", "operation": "press", "keys": [{cmd_string}, "t"] }},
+  {{ "thought": "Typing the URL", "operation": "write", "content": "https://example.com" }},
+  {{ "thought": "Pressing Enter to navigate", "operation": "press", "keys": ["enter"] }}
+]
+```
+
+Objective: {objective}
+"""
+
 OPERATE_FIRST_MESSAGE_PROMPT = """
 Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done
 
@@ -232,6 +288,15 @@ def get_system_prompt(model, objective):
             os_search_str=os_search_str,
             operating_system=operating_system,
         )
+    elif model == "deepseek-with-ocr":
+
+        prompt = SYSTEM_PROMPT_OCR_TEXT_ONLY.format(
+            objective=objective,
+            cmd_string=cmd_string,
+            os_search_str=os_search_str,
+            operating_system=operating_system,
+        )
+
     elif model == "gpt-4-with-ocr" or model == "gpt-4.1-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":
 
         prompt = SYSTEM_PROMPT_OCR.format(
diff --git a/operate/operate.py b/operate/operate.py
index c63d9851..60b0580f 100644
--- a/operate/operate.py
+++ b/operate/operate.py
@@ -112,9 +112,25 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False):
                 get_next_action(model, messages, objective, session_id)
             )
 
-            stop = operate(operations, model)
-            if stop:
-                break
+            # Separate "done" from other operations
+            non_done_ops = [op for op in operations if op.get("operation", "").lower() != "done"]
+            done_ops = [op for op in operations if op.get("operation", "").lower() == "done"]
+
+            # Execute non-done operations first
+            if non_done_ops:
+                stop = operate(non_done_ops, model)
+                if stop:
+                    break
+
+            # Only accept "done" if it was the ONLY operation (model is confident)
+            if done_ops and not non_done_ops:
+                stop = operate(done_ops, model)
+                if stop:
+                    break
+            elif done_ops:
+                print(
+                    f"{ANSI_YELLOW}[Self-Operating Computer] Ignored premature 'done' — still have actions to verify.{ANSI_RESET}"
+                )
 
             loop_count += 1
             if loop_count > 10:
@@ -134,6 +150,7 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False):
 def operate(operations, model):
     if config.verbose:
         print("[Self Operating Computer][operate]")
+    last_was_navigation = False
     for operation in operations:
         if config.verbose:
             print("[Self Operating Computer][operate] operation", operation)
@@ -149,6 +166,8 @@ def operate(operations, model):
             keys = operation.get("keys")
             operate_detail = keys
             operating_system.press(keys)
+            if "enter" in [k.lower() for k in keys]:
+                last_was_navigation = True
         elif operate_type == "write":
             content = operation.get("content")
             operate_detail = content
@@ -158,8 +177,8 @@ def operate(operations, model):
             y = operation.get("y")
             click_detail = {"x": x, "y": y}
             operate_detail = click_detail
-
             operating_system.mouse(click_detail)
+            last_was_navigation = True
         elif operate_type == "done":
             summary = operation.get("summary")
 
diff --git a/operate/utils/ocr.py b/operate/utils/ocr.py
index 937511b0..1391cdc9 100644
--- a/operate/utils/ocr.py
+++ b/operate/utils/ocr.py
@@ -2,65 +2,88 @@
 from PIL import Image, ImageDraw
 import os
 from datetime import datetime
+from difflib import SequenceMatcher
 
 # Load configuration
 config = Config()
 
 
+def _fuzzy_match(text_a, text_b):
+    """Return the case-insensitive similarity of two strings, from 0.0 to 1.0."""
+    return SequenceMatcher(a=text_a.lower(), b=text_b.lower()).ratio()
+
+
 def get_text_element(result, search_text, image_path):
     """
-    Searches for a text element in the OCR results and returns its index. Also draws bounding boxes on the image.
+    Searches the OCR results for a text element: case-insensitive substring match
+    first, then fuzzy matching. Returns the index, or None if nothing matches.
+
     Args:
         result (list): The list of results returned by EasyOCR.
         search_text (str): The text to search for in the OCR results.
         image_path (str): Path to the original image.
 
     Returns:
-        int: The index of the element containing the search text.
-
-    Raises:
-        Exception: If the text element is not found in the results.
+        int or None: Index of the matched element, or None when nothing matches.
     """
     if config.verbose:
         print("[get_text_element]")
         print("[get_text_element] search_text", search_text)
-        # Create /ocr directory if it doesn't exist
         ocr_dir = "ocr"
         if not os.path.exists(ocr_dir):
             os.makedirs(ocr_dir)
-
-        # Open the original image
         image = Image.open(image_path)
         draw = ImageDraw.Draw(image)
 
-    found_index = None
+    search_lower = search_text.lower().strip()
+
+    # Phase 1: substring match (image/draw/ocr_dir exist only when config.verbose)
     for index, element in enumerate(result):
         text = element[1]
-        box = element[0]
-
-        if config.verbose:
-            # Draw bounding box in blue
-            draw.polygon([tuple(point) for point in box], outline="blue")
-
-        if search_text in text:
-            found_index = index
+        if search_lower in text.lower():
             if config.verbose:
-                print("[get_text_element][loop] found search_text, index:", index)
+                print(f"[get_text_element] exact match found at index {index}: '{text}'")
+                _draw_debug(result, index, image, draw, ocr_dir)
+            return index
 
-    if found_index is not None:
+    # Phase 2: fuzzy match (for OCR errors)
+    best_score = 0.0
+    best_index = None
+    for index, element in enumerate(result):
+        text = element[1]
+        score = _fuzzy_match(search_lower, text.lower())
+        if score > best_score:
+            best_score = score
+            best_index = index
+
+    if best_score >= 0.6 and best_index is not None:
         if config.verbose:
-            # Draw bounding box of the found text in red
-            box = result[found_index][0]
-            draw.polygon([tuple(point) for point in box], outline="red")
-            # Save the image with bounding boxes
-            datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
-            ocr_image_path = os.path.join(ocr_dir, f"ocr_image_{datetime_str}.png")
-            image.save(ocr_image_path)
-            print("[get_text_element] OCR image saved at:", ocr_image_path)
-
-        return found_index
-
-    raise Exception("The text element was not found in the image")
+            matched_text = result[best_index][1]
+            print(f"[get_text_element] fuzzy match (score={best_score:.2f}): '{matched_text}'")
+            _draw_debug(result, best_index, image, draw, ocr_dir)
+        return best_index
+
+    if config.verbose:
+        print(f"[get_text_element] not found (best fuzzy score={best_score:.2f})")
+
+    return None
+
+
+def _draw_debug(result, index, image, draw, ocr_dir):
+    """Save a debug image: every OCR box in blue, the box at `index` in red."""
+    if not config.verbose:
+        return
+    try:
+        for element in result:
+            draw.polygon([tuple(point) for point in element[0]], outline="blue")
+        draw.polygon([tuple(p) for p in result[index][0]], outline="red", width=3)
+        datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
+        ocr_image_path = os.path.join(ocr_dir, f"ocr_image_{datetime_str}.png")
+        image.save(ocr_image_path)
+        print("[get_text_element] OCR debug image saved at:", ocr_image_path)
+    except Exception as e:
+        # Debug output is best-effort: report the failure, never propagate it
+        print("[get_text_element] OCR debug image could not be saved:", e)
 
 
 def get_text_coordinates(result, index, image_path):
diff --git a/operate/utils/screenshot.py b/operate/utils/screenshot.py
index 23d492f1..af58dc30 100644
--- a/operate/utils/screenshot.py
+++ b/operate/utils/screenshot.py
@@ -15,7 +15,40 @@ def capture_screen_with_cursor(file_path):
         screenshot = pyautogui.screenshot()
         screenshot.save(file_path)
     elif user_platform == "Linux":
-        # Use xlib to prevent scrot dependency for Linux
+        # Try flameshot first (works on both X11 and Wayland)
+        try:
+            result = subprocess.run(
+                ["flameshot", "full", "-p", file_path],
+                capture_output=True,
+                timeout=10,
+            )
+            if result.returncode == 0 and os.path.exists(file_path):
+                return
+        except Exception:
+            pass
+
+        # Fallback: try gnome-screenshot
+        try:
+            result = subprocess.run(
+                ["gnome-screenshot", "-f", file_path],
+                capture_output=True,
+                timeout=10,
+            )
+            if result.returncode == 0 and os.path.exists(file_path):
+                return
+        except Exception:
+            pass
+
+        # Fallback: try mss
+        try:
+            import mss
+            with mss.MSS() as sct:
+                sct.shot(output=file_path)
+            return
+        except Exception:
+            pass
+
+        # Fallback: original X11 method
         screen = Xlib.display.Display().screen()
         size = screen.width_in_pixels, screen.height_in_pixels
         screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
diff --git a/requirements.txt b/requirements.txt
index c7a646be..e68ced80 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,55 +1,51 @@
-annotated-types==0.6.0
-anyio==3.7.1
-certifi==2023.7.22
-charset-normalizer==3.3.2
-colorama==0.4.6
-contourpy==1.2.0
-cycler==0.12.1
-distro==1.8.0
-EasyProcess==1.1
-entrypoint2==1.1
-exceptiongroup==1.1.3
-fonttools==4.44.0
-h11==0.14.0
-httpcore==1.0.2
+annotated-types>=0.6.0
+anyio>=3.7.1
+certifi>=2023.7.22
+charset-normalizer>=3.3.2
+colorama>=0.4.6
+contourpy>=1.2.0
+cycler>=0.12.1
+distro>=1.8.0
+EasyProcess>=1.1
+entrypoint2>=1.1
+fonttools>=4.44.0
+h11>=0.14.0
+httpcore>=1.0.2
 httpx>=0.25.2
-idna==3.4
-importlib-resources==6.1.1
-kiwisolver==1.4.5
-matplotlib==3.8.1
-MouseInfo==0.1.3
-mss==9.0.1
-numpy==1.26.1
-openai==1.2.3
-packaging==23.2
-Pillow==10.1.0
-prompt-toolkit==3.0.39
-PyAutoGUI==0.9.54
-pydantic==2.4.2
-pydantic_core==2.10.1
-PyGetWindow==0.0.9
-PyMsgBox==1.0.9
-pyparsing==3.1.1
-pyperclip==1.8.2
-PyRect==0.2.0
-pyscreenshot==3.1
-PyScreeze==0.1.29
-python3-xlib==0.15
-python-dateutil==2.8.2
-python-dotenv==1.0.0
-pytweening==1.0.7
-requests==2.31.0
-rubicon-objc==0.4.7
-six==1.16.0
-sniffio==1.3.0
-tqdm==4.66.1
-typing_extensions==4.8.0
-urllib3==2.0.7
-wcwidth==0.2.9
-zipp==3.17.0
-google-generativeai==0.3.0
-aiohttp==3.9.1
-ultralytics==8.0.227
-easyocr==1.7.1
-ollama==0.1.6
-anthropic
\ No newline at end of file
+idna>=3.4
+kiwisolver>=1.4.5
+matplotlib>=3.8.1
+MouseInfo>=0.1.3
+mss>=9.0.1
+numpy>=1.26.2
+openai>=1.2.3
+packaging>=23.2
+Pillow>=10.1.0
+prompt-toolkit>=3.0.39
+PyAutoGUI>=0.9.54
+pydantic>=2.4.2
+pydantic_core>=2.10.1
+PyGetWindow>=0.0.9
+PyMsgBox>=1.0.9
+pyparsing>=3.1.1
+pyperclip>=1.8.2
+PyRect>=0.2.0
+pyscreenshot>=3.1
+PyScreeze>=0.1.29
+python3-xlib>=0.15
+python-dateutil>=2.8.2
+python-dotenv>=1.0.0
+pytweening>=1.0.7
+requests>=2.31.0
+six>=1.16.0
+sniffio>=1.3.0
+tqdm>=4.66.1
+typing_extensions>=4.8.0
+urllib3>=2.0.7
+wcwidth>=0.2.9
+google-generativeai>=0.3.0
+aiohttp>=3.9.1
+ultralytics>=8.0.227
+easyocr>=1.7.1
+ollama>=0.1.6
+anthropic

From 0a4e26ef8842f7820f43d6907f5657824ea26c3d Mon Sep 17 00:00:00 2001
From: papadie23 <cosmineugen23@gmail.com>
Date: Sun, 28 Jun 2026 14:17:18 +0300
Subject: [PATCH 2/2] Fix coordinates crash, enable mouse clicks, improve focus
 prompt

---
 operate/models/apis.py    | 28 +++++++++-------------------
 operate/models/prompts.py |  4 ++--
 2 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index ed0425e0..b36771ad 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -429,30 +429,20 @@ async def call_deepseek_with_ocr(messages, objective, model):
                     img_w, img_h = PILImage.open(screenshot_filename).size
                     operation["x"] = round(best_elem["x"] / img_w, 3)
                     operation["y"] = round(best_elem["y"] / img_h, 3)
+                    if config.verbose:
+                        print(
+                            "[call_deepseek_with_ocr][click] matched:",
+                            best_elem["text"],
+                            "at",
+                            operation["x"],
+                            operation["y"],
+                        )
+                    processed_content.append(operation)
                 else:
                     print(
                         f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Text '{text_to_click}' not found on screen (best match: {best_score:.2f}), skipping click. {ANSI_RESET}"
                     )
                     skipped_clicks = True
-                    continue
-
-                operation["x"] = coordinates["x"]
-                operation["y"] = coordinates["y"]
-
-                if config.verbose:
-                    print(
-                        "[call_deepseek_with_ocr][click] text_element_index",
-                        text_element_index,
-                    )
-                    print(
-                        "[call_deepseek_with_ocr][click] coordinates",
-                        coordinates,
-                    )
-                    print(
-                        "[call_deepseek_with_ocr][click] final operation",
-                        operation,
-                    )
-                processed_content.append(operation)
 
             else:
                 processed_content.append(operation)
diff --git a/operate/models/prompts.py b/operate/models/prompts.py
index 28fafd8d..15f1923f 100644
--- a/operate/models/prompts.py
+++ b/operate/models/prompts.py
@@ -225,9 +225,9 @@
 - After pressing Enter to navigate to a URL, STOP. Wait for the next screenshot before doing anything else.
 - When writing into a Google Docs document, first make sure the document body is focused (click into it or press Tab).
 - If you don't see expected text on screen, your previous action may have failed. React to what you actually see, not what you hoped would happen.
-- Use keyboard shortcuts over clicks whenever possible (Tab to move between fields, Enter to submit, Escape to close).
+- Use both clicks and keyboard shortcuts freely. Keyboard shortcuts (Tab, Enter, Escape, Ctrl+T) are reliable, but clicks are equally valid when you can identify text to click.
 - To create a new blank Google Doc, navigate directly to `docs.google.com/document/create` — no clicking needed.
-- If a browser is already open, use Ctrl+T for a new tab instead of launching the browser again.
+- If a browser is already open, click on a visible browser element (tab, address bar, bookmark) FIRST to focus the window, THEN use Ctrl+T or keyboard shortcuts.
 
 Example: Open a browser and go to a website (3 separate turns, NOT one turn):
 ```