Merged
29 commits
efe7f73
implemented text recognition (ocr)
ston1th Jan 17, 2024
a00dd41
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 17, 2024
ead3463
fixed typo in requirements.txt and make linter happy
ston1th Jan 17, 2024
d36e2d4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 17, 2024
c9766bf
keep pillow on windows and more linter fixes
ston1th Jan 17, 2024
8998360
use pathlib to read file
ston1th Jan 17, 2024
f25fc7e
fix str None comparison
ston1th Jan 17, 2024
2f8d298
rewrite some stuff
ston1th Feb 3, 2024
fddd0ae
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 3, 2024
cd2c212
switch back to tesseract
ston1th Feb 3, 2024
80140cb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 3, 2024
450b983
Merge branch 'dev' into ocr
Avasam Feb 3, 2024
feeb58e
internal logic changes
ston1th Feb 4, 2024
fb8ed6f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 4, 2024
da830c6
import subprocess globally
ston1th Feb 4, 2024
5674088
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 4, 2024
b5f6639
make linter happy
ston1th Feb 4, 2024
f1ba410
fixed typo in docstring
ston1th Feb 4, 2024
d1dfff0
input validation and comparison methods
ston1th Feb 10, 2024
f3c0e3e
fix linter
ston1th Feb 10, 2024
3a3015f
Merge branch 'dev' of https://github.com/Toufool/AutoSplit into ocr
ston1th Mar 21, 2024
75cd0e9
improvements to coordinates and matching methods
ston1th Mar 29, 2024
7be9a0e
fix ruff linter
ston1th Mar 29, 2024
043b6b5
Address my own PR comments and fix linting
Avasam Jun 15, 2024
c0b2920
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 15, 2024
f2b407a
Merge branch 'dev' into ocr
Avasam Jun 15, 2024
9cd0c2b
STARTUPINFO doesn't exist on Linux
Avasam Jun 16, 2024
2f03a90
More explicit platform check
Avasam Jun 16, 2024
797492f
Fix circular imports, mak OCR as clearly experimental
Avasam Jun 16, 2024
32 changes: 32 additions & 0 deletions README.md
@@ -226,6 +226,38 @@ You can have one (and only one) image with the keyword `reset` in its name. Auto

The Start Image is similar to the Reset Image. You can only have one Start Image, with the keyword `start_auto_splitter`. You can reload the image using the "`Reload Start Image`" button. The pause time is the number of seconds AutoSplit will wait before starting comparisons of the first split image. Delay times will be used to delay starting your timer after the threshold is met.

### Text Recognition (OCR)

You can use text recognition (OCR) as an alternative comparison method.
To use this feature, place a text file (`.txt`) in your splits folder instead of an image file, and write the text to look for into that file.

An example file name and content could look like this:

Filename: `001_start_auto_splitter.txt`

Content:

```toml
texts = ["complete any 2 encounters"]
top_left = 275
top_right = 540
bottom_left = 70
bottom_right = 95
fps_limit = 1
```

The `texts` field is an array and can contain more than one string to look for:

```toml
texts = ["look for me", "or this text"]
```

The `top_left`, `top_right`, `bottom_left` and `bottom_right` options define the rectangle in which the text you are looking for is expected to appear in the capture.

Note: This method can cause high CPU usage at the standard comparison FPS. You should therefore limit the comparison FPS to 1 or 2 using the `fps_limit` option when you use this method.
The size of the selected rectangle also impacts the CPU load (a bigger rectangle means more CPU load).

### Profiles

<!-- TODO: Profiles are saved under `%appdata%\AutoSplit\profiles` and -->
4 changes: 4 additions & 0 deletions scripts/requirements.txt
@@ -3,7 +3,9 @@
# Read /docs/build%20instructions.md for more information on how to install, run and build the python code.
#
# Dependencies:
easyocr
git+https://github.com/boppreh/keyboard.git#egg=keyboard # Fix install on macos and linux-ci https://github.com/boppreh/keyboard/pull/568
Levenshtein
numpy>=1.26 # Python 3.12 support
opencv-python-headless>=4.9.0.80 # Typing fixes
packaging
@@ -15,6 +17,8 @@ PySide6-Essentials>=6.6.0 ; sys_platform == 'win32' # Python 3.12 support
PySide6-Essentials<6.5.1 ; sys_platform == 'linux' # Wayland issue on Ubuntu 22.04 https://bugreports.qt.io/browse/QTBUG-114635
scipy>=1.11.2 # Python 3.12 support
toml
torch
torchvision
typing-extensions>=4.4.0 # @override decorator support
#
# Build and compile resources
16 changes: 13 additions & 3 deletions src/AutoSplit.py
@@ -307,7 +307,10 @@ def __reload_start_image(self, started_by_button: bool = False, wait_for_delay:
self.highest_similarity = 0.0
self.reset_highest_similarity = 0.0
self.split_below_threshold = False
self.timer_start_image.start(int(ONE_SECOND / self.settings_dict["fps_limit"]))
start_image_fps = self.settings_dict["fps_limit"]
if self.start_image.fps != 0:
start_image_fps = self.start_image.fps
self.timer_start_image.start(int(ONE_SECOND / start_image_fps))

QApplication.processEvents()

@@ -682,8 +685,12 @@ def __similarity_threshold_loop(self, number_of_split_images: int, dummy_splits_
self.undo_split_button.setEnabled(self.split_image_number != 0)
QApplication.processEvents()

fps = self.settings_dict["fps_limit"]
if self.split_image.fps != 0:
fps = self.split_image.fps

# Limit the number of times the comparison runs to reduce CPU usage
frame_interval = 1 / self.settings_dict["fps_limit"]
frame_interval = 1 / fps
# Use a time delta to have a consistent check interval
wait_delta_ms = int((frame_interval - (time() - start) % frame_interval) * ONE_SECOND)

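The two lines above pace the comparison loop; a standalone sketch of that computation (the helper name is hypothetical, and `ONE_SECOND` is assumed to be 1000 milliseconds, matching its use in `int(ONE_SECOND / start_image_fps)` earlier in the file):

```python
ONE_SECOND = 1000  # milliseconds (assumed value of AutoSplit's constant)


def wait_delta_ms(fps: float, elapsed_s: float) -> int:
    """Milliseconds to wait so comparisons stay on a fixed interval.

    Taking the elapsed time modulo the frame interval keeps the check
    cadence consistent even when a single comparison runs long.
    """
    frame_interval = 1 / fps
    return int((frame_interval - elapsed_s % frame_interval) * ONE_SECOND)
```

At 1 FPS, a comparison that took 0.25 s leaves 750 ms to wait before the next check.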
@@ -867,7 +874,10 @@ def __update_split_image(self, specific_image: AutoSplitImage | None = None):

# Get split image
self.split_image = specific_image or self.split_images_and_loop_number[0 + self.split_image_number][0]
if is_valid_image(self.split_image.byte_array):
if self.split_image.ocr:
text = "\nor\n".join(self.split_image.texts)
self.current_split_image.setText(f"Looking for OCR text:\n{text}")
Review thread:
- Avasam (Collaborator): I don't remember from the last time I tested: will this override any previous image being rendered on the QLabel? (Say someone has a regular split image followed by an OCR split.)
- ston1th (Contributor, Author): Good question, I did not test this. But just to be sure, we could set it to None first and then set the text.
- Avasam (Collaborator, Feb 3, 2024): Yeah, set_preview_image avoids clearing text if you send None, by design. Or you could call self.current_split_image.clear() yourself before setting the text. If setText already clears the image by itself, then you don't need to do anything.
- Avasam (Collaborator): I won't block on this given the TODO comment. Just bumping as a reminder that we should test it to confirm.
elif is_valid_image(self.split_image.byte_array):
set_preview_image(self.current_split_image, self.split_image.byte_array)

self.current_image_file_label.setText(self.split_image.filename)
42 changes: 40 additions & 2 deletions src/AutoSplitImage.py
@@ -5,6 +5,7 @@

import cv2
import numpy as np
import tomllib
from cv2.typing import MatLike

import error_messages
@@ -37,16 +38,23 @@ class AutoSplitImage:
filename: str
flags: int
loops: int
fps: int
image_type: ImageType
byte_array: MatLike | None = None
mask: MatLike | None = None
texts: list[str]
ocr: bool
# This value is internal, check for mask instead
_has_transparency = False
# These values should be overridden by some defaults if None. Use getters instead
__delay_time: float | None = None
__comparison_method: int | None = None
__pause_time: float | None = None
__similarity_threshold: float | None = None
__x: int
__xx: int
__y: int
__yy: int

def get_delay_time(self, default: "AutoSplit | int"):
"""Get image's delay time or fallback to the default value from spinbox."""
@@ -89,7 +97,18 @@ def __init__(self, path: str):
self.__comparison_method = comparison_method_from_filename(self.filename)
self.__pause_time = pause_from_filename(self.filename)
self.__similarity_threshold = threshold_from_filename(self.filename)
self.__read_image_bytes(path)
self.__x = 0
self.__xx = 0
self.__y = 0
self.__yy = 0
self.texts = []
self.fps = 0
self.ocr = False
if path.endswith(".txt"):
self.ocr = True
self.__parse_text_file(path)
else:
self.__read_image_bytes(path)

if START_KEYWORD in self.filename:
self.image_type = ImageType.START
@@ -98,6 +117,18 @@
else:
self.image_type = ImageType.SPLIT

def __parse_text_file(self, path: str):
with open(path, "rb") as f:
data = tomllib.load(f)
self.texts = data["texts"]
self.__x = data["top_left"]
self.__xx = data["top_right"]
self.__y = data["bottom_left"]
self.__yy = data["bottom_right"]
self.fps = 1
if "fps_limit" in data:
self.fps = data["fps_limit"]

def __read_image_bytes(self, path: str):
image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
if not is_valid_image(image):
Expand Down Expand Up @@ -140,7 +171,13 @@ def compare_with_capture(
default: "AutoSplit | int",
capture: MatLike | None,
):
"""Compare image with capture using image's comparison method. Falls back to combobox."""
"""
Compare image with capture using image's comparison method. Falls back to combobox.
For OCR text files: extract image text from rectangle position and compare it with the expected string.
"""
if self.ocr:
return extract_and_compare_text(capture[self.__y:self.__yy, self.__x:self.__xx], self.texts)

if not is_valid_image(self.byte_array) or not is_valid_image(capture):
return 0.0
resized_capture = cv2.resize(capture, self.byte_array.shape[1::-1])
@@ -155,6 +192,7 @@


if True:
from compare import extract_and_compare_text
from split_parser import (
comparison_method_from_filename,
delay_time_from_filename,
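The OCR branch in `compare_with_capture` crops the capture with `capture[self.__y:self.__yy, self.__x:self.__xx]`. NumPy slices rows first, so the `bottom_*` values select the vertical extent and the `top_*` values the horizontal one. A small sketch of that convention using the README example's coordinates:

```python
import numpy as np

# Fake 100x600 grayscale "capture": axis 0 is rows (y), axis 1 is columns (x)
capture = np.zeros((100, 600), dtype=np.uint8)

# Coordinates from the README example
top_left, top_right = 275, 540      # column (x) range
bottom_left, bottom_right = 70, 95  # row (y) range

# Same slicing order as compare_with_capture in this PR
region = capture[bottom_left:bottom_right, top_left:top_right]
print(region.shape)  # (25, 265): 95 - 70 rows by 540 - 275 columns
```

Only this smaller region is handed to the OCR reader, which is why a tighter rectangle lowers CPU load.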
35 changes: 32 additions & 3 deletions src/compare.py
@@ -1,11 +1,15 @@
from utils import BGRA_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image
from scipy import fft
from cv2.typing import MatLike
from math import sqrt

import cv2
import Levenshtein
import numpy as np
from cv2.typing import MatLike
from scipy import fft
from easyocr import Reader

OCR = Reader(["en"], gpu=False, verbose=False, download_enabled=False)

from utils import BGRA_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image

MAXRANGE = MAXBYTE + 1
CHANNELS = [ColorChannel.Red.value, ColorChannel.Green.value, ColorChannel.Blue.value]
@@ -126,6 +130,31 @@ def compare_phash(source: MatLike, capture: MatLike, mask: MatLike | None = None
return 1 - (hash_diff / 64.0)


def extract_and_compare_text(capture: MatLike, texts):
"""
Compares the extracted text of the given image and returns the similarity between the two texts.
The best match of all texts is returned.

@param capture: Image of any given shape as a numpy array
@param texts: a list of strings to match for
@return: The similarity between the text in the image and the text supplied as a number 0 to 1.
"""
# if the string is found 1:1 in the string extracted from the image a 1 is returned.
# otherwise the levenshtein ratio is calculated between the two strings and gets returned.
image_string = "".join(OCR.readtext(capture, detail=0)).lower().strip()

ratio = 0.0
for text in texts:
if text in image_string:
ratio = 1.0
break
ratio = max(ratio, Levenshtein.ratio(text, image_string))
# TODO: debug: remove me
if ratio > 0.9: # noqa: PLR2004
print(f"text from image ({ratio:,.2f}): {image_string}")
return ratio


def __compare_dummy(*_: object):
return 0.0

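The scoring rule in `extract_and_compare_text` (an exact substring scores 1.0; otherwise the best Levenshtein ratio across all expected texts wins) can be exercised without EasyOCR. A sketch that substitutes the stdlib `difflib.SequenceMatcher.ratio()` for the `Levenshtein` package (a comparable, though not identical, similarity measure; the function name is hypothetical):

```python
from difflib import SequenceMatcher


def score_extracted_text(image_string: str, texts: list[str]) -> float:
    """Score OCR-extracted text against the expected strings; best match wins."""
    image_string = image_string.lower().strip()
    best = 0.0
    for text in texts:
        if text in image_string:
            return 1.0  # expected text found verbatim in the extracted text
        best = max(best, SequenceMatcher(None, text, image_string).ratio())
    return best
```

Because of the ratio fallback, small OCR misreads (a dropped letter, say) still produce a score close to 1.0 that can clear the similarity threshold.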
2 changes: 1 addition & 1 deletion src/split_parser.py
@@ -208,7 +208,7 @@ def parse_and_validate_images(autosplit: "AutoSplit"):
else:
for image in split_images:
# Test for image without transparency
if not is_valid_image(image.byte_array):
if not image.ocr and not is_valid_image(image.byte_array):

def image_validity(filename: str):
return lambda: error_messages.image_validity(filename)