diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..5194b35
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+MIT License
+Copyright (c) 2025, Anonymous
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the “Software”), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+---
+This project includes and builds upon the BLIP model developed by Salesforce.com, Inc., which is licensed under the BSD 3-Clause License:
+BSD 3-Clause License
+Copyright (c) 2022, Salesforce.com, Inc.
+All rights reserved.
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution.
+3. Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
index a14c62b..d1fcde3 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,134 @@
+[MacPaw Research](https://research.macpaw.com)
+
# Screen2AX
-Repository for Screen2AX paper
+
+A research-driven project for generating accessibility metadata for macOS applications using computer vision and deep learning. Read more about the project in our [paper]().
+
+---
+
+## 📁 Datasets
+
+- [Screen2AX-Tree](https://huggingface.co/datasets/macpaw-research/Screen2AX-Tree)
+- [Screen2AX-Element](https://huggingface.co/datasets/macpaw-research/Screen2AX-Element)
+- [Screen2AX-Group](https://huggingface.co/datasets/macpaw-research/Screen2AX-Group)
+- [Screen2AX-Task](https://huggingface.co/datasets/macpaw-research/Screen2AX-Task)
+
+## 🤖 Models
+
+- [YOLOv11l — UI Elements Detection](https://huggingface.co/macpaw-research/yolov11l-ui-elements-detection)
+- [BLIP — UI Elements Captioning](https://huggingface.co/macpaw-research/blip-icon-captioning)
+- [YOLOv11l — UI Groups Detection](https://huggingface.co/macpaw-research/yolov11l-ui-groups-detection)
+
+---
+
+## 🛠 Requirements
+
+- macOS
+- Python (recommended ≥ 3.11)
+- Conda
+- Pip
+
+---
+
+## ⚙️ Installation
+
+Create and activate the project environment:
+
+```bash
+conda create -n screen2ax python=3.11
+conda activate screen2ax
+pip install -r requirements.txt
+```
+
+## 🚀 Usage
+
+> ⚠️ The first run may take longer due to model downloads and initial setup.
+
+### Accessibility generation
+Run the accessibility generation script:
+
+```bash
+python -m hierarchy_dl.hierarchy --help
+```
+#### Available Options
+
+```
+usage: hierarchy.py [-h] [--image IMAGE] [--save] [--filename FILENAME] [--save_dir SAVE_DIR] [--flat]
+
+options:
+ -h, --help show this help message and exit
+ --image IMAGE Path to the image
+ --save Save the result
+ --filename FILENAME Filename to save the result
+ --save_dir SAVE_DIR Directory to save the result. Default is './results/'
+ --flat Generate flat hierarchy (no groups)
+```
+
+##### Example
+Run the accessibility generation script on a screenshot of the Spotify app:
+
+```bash
+python -m hierarchy_dl.hierarchy --image ./screenshots/spotify.png --save --filename spotify.json
+```
+
+This will generate a JSON file with the accessibility hierarchy of the app in the `./results/` folder.
+
+### Screen Reader
+Run the screen reader:
+
+```bash
+python -m screen_reader.screen_reader --help
+```
+
+#### Available Options
+
+```
+usage: screen_reader.py [-h] [-b BUNDLE_ID] [-n NAME] [-dw] [-dh] [-r RATE] [-v VOICE] [-sa] [-sk SKIP_GROUPS]
+
+options:
+ -h, --help show this help message and exit
+ -b, --bundle_id BUNDLE_ID Bundle ID of the target application
+ -n, --name NAME Name of the target application (alternative to bundle_id)
+ -dw, --deactivate_welcome Skip the "Welcome to the ScreenReader." message
+ -dh, --deactivate_help Skip reading the help message on startup
+ -r, --rate RATE Set speech rate for macOS `say` command (default: 190)
+ -v, --voice VOICE Set voice for macOS `say` command (see `say -v "?" | grep en`)
+ -sa, --system_accessibility Use macOS system accessibility data instead of vision-generated
+ -sk, --skip-groups N Skip groups with fewer than N children (default: 5)
+```
+
+##### Example
+
+Run the screen reader for the Spotify app:
+```bash
+python -m screen_reader.screen_reader --name Spotify
+```
+
+## 📜 License
+### 🔍 YOLO Models
+The YOLO models used for UI elements and UI groups detection are licensed under the GNU Affero General Public License (AGPL). This is inherited from the original YOLO model licensing.
+
+### 🧠 BLIP Model
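+The BLIP model for captioning UI elements is provided under the MIT License. It builds upon the original BLIP model developed by Salesforce.com, Inc., which is licensed under the BSD 3-Clause License (see the [LICENSE](LICENSE) file).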
+
+### 📂 Datasets
+All datasets (Screen2AX-Tree, Screen2AX-Element, Screen2AX-Group, Screen2AX-Task) are released under the Apache 2.0 license.
+
+### 💻 Codebase
+All source code in this repository is licensed under the MIT License. See the [LICENSE](LICENSE) file for full terms and conditions.
+
+## 📚 Citation
+If you use this code in your research, please cite our paper:
+
+```bibtex
+...
+```
+
+## 🙌 Acknowledgements
+We would like to express our deepest gratitude to the Armed Forces of Ukraine. Your courage and unwavering defense of our country make it possible for us to live, work, and create in freedom. This work would not be possible without your sacrifice. Thank you.
+
+## MacPaw Research
+
+Visit our site to learn more 😉
+
+https://research.macpaw.com
diff --git a/hierarchy_dl/application.py b/hierarchy_dl/application.py
new file mode 100644
index 0000000..9aadc4e
--- /dev/null
+++ b/hierarchy_dl/application.py
@@ -0,0 +1,82 @@
+import time
+import threading
+
+import tkinter as tk
+
+from hierarchy import generate_hierarchy
+from screen_reader.screenshot import screenshot_app, open_app_in_foreground
+
+
+run = True
+thread = None
+
+
+def start_action():
+    """
+    Worker loop: screenshot the target app and generate its hierarchy, repeatedly.
+
+    Reads the bundle id from the module-level ``entry`` widget, then loops
+    until the module-level ``run`` flag is cleared (see ``stop_action``) or an
+    iteration raises. Every frame is saved by ``generate_hierarchy`` under
+    ``./result/<bundle_id>/``.
+    """
+    global run
+
+    bundle_id = entry.get()
+    open_app_in_foreground(bundle_id, wait_time=2)
+
+    run = True
+    frame = 0
+
+    while run:
+        try:
+            start = time.time()
+
+            # open_app_in_foreground is called again before every capture
+            open_app_in_foreground(bundle_id, wait_time=0.25)
+            screen_path = screenshot_app(bundle_id, "./screenshots/")[0]
+
+            # The result is written to disk (save=True); the returned tree is not needed here.
+            generate_hierarchy(screen_path, save=True, save_dir=f"./result/{bundle_id}/")
+
+            end = time.time()
+
+            frame += 1
+            print(f"Frame #{frame}, time taken: {end - start}")
+
+        except Exception as e:
+            # Best-effort loop: report the failure and stop capturing.
+            print(f"Error: {e}")
+            break
+
+
+def stop_action():
+    """Clear the ``run`` flag and wait for the worker thread (if one exists) to finish."""
+    global run
+
+    run = False
+    print(f"Stopping process")
+
+    worker = thread
+    if worker:
+        # Blocks until the worker leaves its loop.
+        worker.join()
+
+    print(f"Thread has stopped")
+
+
+def start_thread():
+    """
+    Start ``start_action`` in a daemon thread and remember it in ``thread``.
+
+    Does nothing (apart from printing a notice) while a previous worker is
+    still alive, so that ``stop_action`` never loses track of a running thread.
+    """
+    global thread
+
+    if thread is not None and thread.is_alive():
+        print("Process is already running")
+        return
+
+    thread = threading.Thread(target=start_action, daemon=True)
+    thread.start()
+
+
+if __name__ == "__main__":
+    # Main window
+    root = tk.Tk()
+    root.title("Bundle ID Manager")
+    root.geometry("300x200")
+
+    # Build the widgets first: input field with its label, a hint on how to
+    # look up a bundle id, and the two control buttons.
+    label = tk.Label(root, text="bundle_id:")
+    entry = tk.Entry(root)
+    suggestion = tk.Label(root, text="osascript -e 'id of app \"Spotify\"' \n e.g. com.spotify.client")
+    start_button = tk.Button(root, text="Start", command=start_thread)
+    stop_button = tk.Button(root, text="Stop", command=stop_action)
+
+    # Then stack them top-to-bottom in creation order.
+    for widget in (label, entry, suggestion, start_button, stop_button):
+        widget.pack(pady=5)
+
+    # Hand control to the Tk event loop
+    root.mainloop()
\ No newline at end of file
diff --git a/hierarchy_dl/blip.py b/hierarchy_dl/blip.py
new file mode 100644
index 0000000..5f5fb5f
--- /dev/null
+++ b/hierarchy_dl/blip.py
@@ -0,0 +1,25 @@
+import torch
+from PIL import Image
+from transformers import BlipProcessor, BlipForConditionalGeneration
+
+# Pick the compute backend: CUDA first, then Apple MPS, otherwise CPU.
+device = torch.device(
+    "cuda" if torch.cuda.is_available()
+    else "mps" if torch.backends.mps.is_available()
+    else "cpu"
+)
+
+print(f"Using device: {device}")
+
+# Processor and model are loaded once at import time; downloads are cached in cache_dir.
+model_path = "macpaw-research/blip-icon-captioning"
+cache_dir = "./.models"
+
+processor = BlipProcessor.from_pretrained(model_path, cache_dir=cache_dir)
+model = BlipForConditionalGeneration.from_pretrained(model_path, cache_dir=cache_dir)
+model = model.to(device)
+model.eval()
+
+@torch.no_grad()
+def generate_captions(images: list[Image.Image]) -> list[str]:
+    """
+    Caption a batch of images with the module-level BLIP ``processor``/``model``.
+
+    Args:
+        images: PIL images to caption.
+
+    Returns:
+        The decoded captions with special tokens stripped; an empty list when
+        ``images`` is empty.
+    """
+    # Avoid handing the processor an empty batch.
+    if not images:
+        return []
+
+    inputs = processor(images, return_tensors="pt").to(device)
+    # generation is limited to 25 new tokens per caption
+    outputs = model.generate(**inputs, max_new_tokens=25)
+    return processor.batch_decode(outputs, skip_special_tokens=True)
diff --git a/hierarchy_dl/hierarchy.py b/hierarchy_dl/hierarchy.py
new file mode 100644
index 0000000..19797e7
--- /dev/null
+++ b/hierarchy_dl/hierarchy.py
@@ -0,0 +1,120 @@
+import os
+import json
+import time
+from os import path
+from typing import Optional
+
+import numpy as np
+from PIL import Image
+from ocrmac import ocrmac
+from ultralytics import YOLO
+
+from hierarchy_dl.utils import *
+
+from huggingface_hub import hf_hub_download
+# Directory where the downloaded model weights are cached.
+cache_dir = "./.models"
+
+# UI elements detector: download the weights, then load them.
+ui_elements_model_path = hf_hub_download(
+    repo_id="macpaw-research/yolov11l-ui-elements-detection",
+    filename="ui-elements-detection.pt",
+    cache_dir=cache_dir,
+)
+ui_elements_model = YOLO(ui_elements_model_path)
+
+# UI groups detector: same procedure.
+ui_groups_model_path = hf_hub_download(
+    repo_id="macpaw-research/yolov11l-ui-groups-detection",
+    filename="ui-groups-detection.pt",
+    cache_dir=cache_dir,
+)
+ui_groups_model = YOLO(ui_groups_model_path)
+
+
+
+def generate_hierarchy(
+    img: str | Image.Image | np.ndarray,
+    save_dir: str = "./results/",
+    save: bool = False,
+    filename: Optional[str] = None,
+    flat: bool = False
+) -> UIElement:
+    """
+    Generate UI hierarchy from an image
+
+    Args:
+        img: Path to an image file, a PIL image, or a numpy array.
+        save_dir: Directory the JSON result is written to.
+        save: Write the result to disk.
+        filename: Name of the JSON file; passing one also triggers saving.
+            Defaults to "<image basename>.json" for path inputs and to
+            "<current timestamp>.json" otherwise.
+        flat: Return every element as a direct child of a single "Screen"
+            group instead of building a tree from the detected groups.
+
+    Returns:
+        Root UIElement of the generated hierarchy.
+
+    Raises:
+        TypeError: If ``img`` is not a str, PIL image, or numpy array.
+    """
+    # load image
+    if isinstance(img, str):
+        img_pil = Image.open(img)
+    elif isinstance(img, np.ndarray):
+        img_pil = Image.fromarray(img)
+    elif isinstance(img, Image.Image):
+        img_pil = img
+    else:
+        # Fail early with a clear message instead of an UnboundLocalError below.
+        raise TypeError(
+            f"img must be a path, a PIL image or a numpy array, got {type(img).__name__}"
+        )
+
+    width, height = img_pil.size
+
+    # detect ui elements
+    element_boxes = ui_elements_model(img_pil, verbose=False)[0].boxes
+    ui_elements = [UIElement(box, cls) for box, cls in zip(element_boxes.xyxy, element_boxes.cls)]
+
+    # detect ui groups; they are only consumed by build_tree, so skip the model in flat mode
+    if flat:
+        ui_groups = []
+    else:
+        group_boxes = ui_groups_model(img_pil, conf=0.5, verbose=False)[0].boxes
+        ui_groups = [UIElement(box, "Group") for box in group_boxes.xyxy]
+
+    # ocr
+    annotations = ocrmac.OCR(img_pil, language_preference=['en-US']).recognize(px=True)
+    annotations = [UIElement(box, "Text", value=val) for val, _, box in annotations]
+
+    # merge texts and elements
+    annotations = group_texts(annotations)
+    ui_elements = merge_text_and_elements(ui_elements, annotations, iou_threshold=0.2)
+
+    # icons
+    ui_elements = caption_buttons(ui_elements, img_pil, batch_size=16)
+
+    if not flat:
+        # build tree
+        tree = build_tree(ui_groups, ui_elements, (width, height), iou_threshold=0.0)
+        clean_tree(tree)
+
+        # a root with a single child is redundant: return that child instead
+        if len(tree.children) == 1:
+            tree = tree.children[0]
+    else:
+        # flat mode: order elements by squared distance of their top-left corner from the origin
+        ui_elements.sort(key=lambda x: x.box[0] ** 2 + x.box[1] ** 2)
+        tree = UIElement(
+            box=[0, 0, width, height],
+            cls="Group",
+            value="Screen"
+        )
+        tree.children = ui_elements
+
+    if save or filename:
+        os.makedirs(save_dir, exist_ok=True)
+
+        if not filename:
+            if isinstance(img, str):
+                filename = f"{path.basename(img)}.json"
+            else:
+                filename = f"{time.time()}.json"
+
+        full_path = path.join(save_dir, filename)
+
+        with open(full_path, "w", encoding='utf-8') as f:
+            json.dump(tree.to_dict(), f, indent=4)
+
+    return tree
+
+if __name__ == "__main__":
+    import argparse
+
+    # Command-line entry point: parse the options and run the generation once.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--image", type=str, default="./screen.png", help="Path to the image")
+    parser.add_argument("--save", action="store_true", help="Save the result")
+    parser.add_argument("--filename", type=str, default=None, help="Filename to save the result")
+    parser.add_argument("--save_dir", type=str, default="./results/", help="Directory to save the result. Default is './results/'")
+    parser.add_argument("--flat", action="store_true", help="Generate flat hierarchy (no groups)")
+    args = parser.parse_args()
+
+    tree = generate_hierarchy(
+        args.image,
+        save_dir=args.save_dir,
+        save=args.save,
+        filename=args.filename,
+        flat=args.flat,
+    )
\ No newline at end of file
diff --git a/hierarchy_dl/requirements.txt b/hierarchy_dl/requirements.txt
new file mode 100644
index 0000000..5fc7fbe
--- /dev/null
+++ b/hierarchy_dl/requirements.txt
@@ -0,0 +1,78 @@
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.10
+aiosignal==1.2.0
+attrs==24.3.0
+Bottleneck==1.4.2
+Brotli==1.0.9
+certifi==2025.1.31
+charset-normalizer==3.3.2
+click==8.1.8
+contourpy==1.3.1
+cycler==0.12.1
+datasets==2.12.0
+Deprecated==1.2.13
+dill==0.3.6
+filelock==3.13.1
+fonttools==4.56.0
+frozenlist==1.5.0
+fsspec==2024.12.0
+gmpy2==2.2.1
+huggingface-hub==0.24.6
+idna==3.7
+ImageHash==4.3.2
+importlib-metadata==7.0.1
+Jinja2==3.1.5
+kiwisolver==1.4.8
+MarkupSafe==3.0.2
+matplotlib==3.10.1
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.14
+networkx==3.4.2
+numexpr==2.10.1
+numpy==1.26.4
+ocrmac==1.0.0
+opencv-python==4.11.0.86
+opentelemetry-api==1.26.0
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+propcache==0.2.0
+psutil==7.0.0
+py-cpuinfo==9.0.0
+pyarrow==19.0.0
+pyobjc-core==11.0
+pyobjc-framework-Cocoa==11.0
+pyobjc-framework-CoreML==11.0
+pyobjc-framework-Quartz==11.0
+pyobjc-framework-Vision==11.0
+pyparsing==3.2.1
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+pytz==2024.1
+PyWavelets==1.8.0
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+responses==0.13.3
+safetensors==0.4.5
+scipy==1.15.2
+seaborn==0.13.2
+setuptools==75.8.0
+six==1.16.0
+sympy==1.13.1
+tokenizers==0.21.2
+torch==2.6.0
+torchvision==0.21.0
+tqdm==4.67.1
+transformers==4.48.0
+typing_extensions==4.12.2
+tzdata==2023.3
+ultralytics==8.3.82
+ultralytics-thop==2.0.14
+urllib3==2.3.0
+wheel==0.45.1
+wrapt==1.17.0
+xxhash==3.5.0
+yarl==1.18.0
+zipp==3.21.0
diff --git a/hierarchy_dl/system_buttons.png b/hierarchy_dl/system_buttons.png
new file mode 100644
index 0000000..7ca0709
Binary files /dev/null and b/hierarchy_dl/system_buttons.png differ
diff --git a/hierarchy_dl/utils.py b/hierarchy_dl/utils.py
new file mode 100644
index 0000000..504ad36
--- /dev/null
+++ b/hierarchy_dl/utils.py
@@ -0,0 +1,353 @@
+from PIL import Image
+from ocrmac import ocrmac
+from imagehash import average_hash, ImageHash
+import cv2
+import numpy as np
+
+from .blip import generate_captions
+
+
+# Module-level caption cache used by caption_buttons: maps the average hash of an icon crop to its caption.
+icons_cache = {}
+
+
+def iou(box1: list[int], box2: list[int]) -> float:
+    """
+    Intersection over Union of two [x1, y1, x2, y2] boxes
+
+    Returns 0 when the union area is zero.
+    """
+    ax1, ay1, ax2, ay2 = box1
+    bx1, by1, bx2, by2 = box2
+
+    # Extents of the overlap rectangle, clamped at zero for disjoint boxes
+    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
+    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
+    overlap = overlap_w * overlap_h
+
+    # Union = both areas minus the part counted twice
+    area_a = abs((ax2 - ax1) * (ay2 - ay1))
+    area_b = abs((bx2 - bx1) * (by2 - by1))
+    union = area_a + area_b - overlap
+
+    return overlap / union if union != 0 else 0
+
+
+class UIElement:
+    """
+    Node of the generated UI hierarchy: a bounding box with a class, an
+    optional text value and a list of child elements.
+    """
+
+    # Known class names; ``cls`` stores an index into this list.
+    names = [
+        'AXButton', 'AXDisclosureTriangle', 'AXImage',
+        'AXLink', 'AXTextArea', 'Text',
+        'Group'
+    ]
+
+    def __init__(self, box: list[int], cls: int | str, value: str | None = None):
+        """
+        Args:
+            box: [x1, y1, x2, y2]; every coordinate is converted with int().
+            cls: Class index, or a class name listed in ``names``.
+            value: Text / caption of the element; an empty string is stored as None.
+
+        Raises:
+            ValueError: If ``cls`` is a string that is not in ``names``.
+        """
+        self.value = value or None
+        self.box = list(map(int, box))  # [x1, y1, x2, y2]
+        self.cls = self.names.index(cls) if isinstance(cls, str) else int(cls)
+        self.children: list[UIElement] = []
+
+    @property
+    def area(self) -> int:
+        """Absolute area of the bounding box."""
+        return abs((self.box[2] - self.box[0]) * (self.box[3] - self.box[1]))
+
+    def iou(self, other: "UIElement") -> float:
+        """Intersection over Union with ``other`` (0 when the union area is zero)."""
+        # Same formula as the module-level iou(); delegate instead of duplicating it.
+        return iou(self.box, other.box)
+
+    def merge(self, other: "UIElement"):
+        """Grow this element's box in place so that it also covers ``other``'s box."""
+        self.box[0] = min(self.box[0], other.box[0])
+        self.box[1] = min(self.box[1], other.box[1])
+        self.box[2] = max(self.box[2], other.box[2])
+        self.box[3] = max(self.box[3], other.box[3])
+
+    def to_dict(self) -> dict:
+        """Return the element and its whole subtree as nested dicts and lists."""
+        return {
+            "cls": self.names[self.cls],
+            "value": self.value,
+            "box": self.box,
+            "children": [child.to_dict() for child in self.children]
+        }
+
+    def __dict__(self):
+        # NOTE(review): a *method* named __dict__ shadows the attribute mapping
+        # that vars() normally returns. It is kept only so existing
+        # ``element.__dict__()`` callers keep working; prefer to_dict().
+        return self.to_dict()
+
+    def __repr__(self):
+        return str(self.to_dict())
+
+
+def group_texts(annotations: list[UIElement], max_height_frac: int = 2) -> list[UIElement]:
+    """
+    Group rows of texts in a single annotation (paragraph)
+
+    An annotation is merged into an earlier one when it starts close to the
+    earlier one's bottom edge, both have similar heights (ratio within
+    ``1 / max_height_frac`` .. ``max_height_frac``) and they overlap on the
+    X-axis. Values are joined with a newline and the box is grown to cover both.
+
+    The list is sorted and modified in place; the same list is returned.
+    """
+    # Sort annotations by their vertical position (top Y-coordinate)
+    annotations.sort(key=lambda x: x.box[1])
+
+    i = 0
+    while i < len(annotations):
+        current = annotations[i]
+        ref_height = abs(current.box[3] - current.box[1])
+        j = i + 1
+
+        while j < len(annotations):
+            candidate = annotations[j]
+            box1, box2 = current.box, candidate.box
+            height2 = abs(box2[3] - box2[1])
+
+            # A zero-height box cannot take part in the height-ratio check below
+            # (it would divide by zero), so it is never merged.
+            if height2 == 0:
+                j += 1
+                continue
+
+            # Check if annotations are close on the Y-axis
+            if abs(box1[3] - box2[1]) > height2 * 0.5:
+                j += 1
+                continue
+
+            # Check if the height difference is reasonable (similar heights)
+            if not (1 / max_height_frac) <= (ref_height / height2) <= max_height_frac:  # 1/2 <= h1/h2 <= 2
+                j += 1
+                continue
+
+            # Check if annotations overlap on the X-axis
+            if (box1[0] <= box2[0] <= box1[2] or box1[0] <= box2[2] <= box1[2] or
+                    box2[0] <= box1[0] <= box2[2] or box2[0] <= box1[2] <= box2[2]):
+                # Merge the two annotations; skip missing values so that None
+                # neither raises nor ends up as the text "None".
+                parts = [v for v in (current.value, candidate.value) if v]
+                current.value = "\n".join(parts) or None
+                current.merge(candidate)
+
+                # The merged row becomes the height reference for the next row;
+                # j is not advanced because the list shifted left.
+                ref_height = height2
+                del annotations[j]
+                continue
+
+            j += 1
+        i += 1
+
+    return annotations
+
+
+def merge_text_and_elements(elements: list[UIElement], texts: list[UIElement], iou_threshold=0.2) -> list[UIElement]:
+    """
+    Merge texts with elements (buttons, images, etc.) based on IoU
+
+    Every text is compared with all ``elements``. If its best IoU exceeds
+    ``iou_threshold``, the text value is appended to that element's value
+    (newline-separated) and the element's box is grown to cover the text.
+    Texts without a match are appended to ``elements`` as standalone entries.
+
+    ``elements`` is modified in place and returned.
+    """
+    remaining_texts = []
+
+    for text in texts:
+        # default=... keeps max() from raising when no elements were detected at all
+        max_iou, best_match = max(
+            ((text.iou(element), element) for element in elements),
+            key=lambda x: x[0],
+            default=(0.0, None),
+        )
+
+        if best_match is not None and max_iou > iou_threshold:
+            best_match.value = f"{best_match.value}\n{text.value}" if best_match.value else text.value
+            best_match.merge(text)
+        else:
+            remaining_texts.append(text)
+
+    # Unmatched texts are added only after the loop, so texts are never matched against each other.
+    elements.extend(remaining_texts)
+    return elements
+
+
+def caption_buttons(ui_elements: list[UIElement], image: Image.Image, batch_size: int = 16) -> list[UIElement]:
+    """
+    Generate captions for buttons
+
+    Fills ``value`` of every element of class 0 or 2 (AXButton / AXImage in
+    ``UIElement.names``) that has no value yet, trying in this order:
+
+    1. the module-level ``icons_cache`` (keyed by the average hash of the crop),
+    2. template matching of the close / minimize / zoom buttons in the
+       top-left corner of the screenshot,
+    3. OCR of the cropped element,
+    4. ``generate_captions`` on the remaining crops, ``batch_size`` at a time.
+
+    OCR and generated captions are stored in ``icons_cache``.
+
+    Returns:
+        The same ``ui_elements`` list, with values filled in place.
+    """
+    import os  # local import: only needed to locate the bundled template image
+
+    # find elements that need to be captioned; crop once and hash the crop
+    pending: list[tuple[UIElement, Image.Image, ImageHash]] = []
+    for element in ui_elements:
+        if element.cls in (0, 2) and not element.value:
+            cropped = image.crop(element.box)
+            pending.append((element, cropped, average_hash(cropped)))
+
+    # check cache table
+    uncached_elements: list[tuple[UIElement, Image.Image, ImageHash]] = []
+    for item in pending:
+        element, _, img_hash = item
+        if img_hash in icons_cache:
+            element.value = icons_cache[img_hash]
+        else:
+            uncached_elements.append(item)
+
+    # caption close, minimize, and maximize buttons
+    # The template ships next to this module; resolve it from there so the
+    # lookup does not depend on the current working directory.
+    template_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "system_buttons.png")
+    system_buttons_img = cv2.imread(template_path, cv2.IMREAD_GRAYSCALE)
+
+    # cv2.imread yields None when the file cannot be read; skip this step then.
+    if system_buttons_img is not None and uncached_elements:
+        # only the top-left 250x85 px corner of the screenshot is searched
+        corner_gray = cv2.cvtColor(np.array(image)[:85, :250], cv2.COLOR_RGB2GRAY)  # already in RGB
+
+        buttons_edges = cv2.Canny(system_buttons_img, 50, 200)
+        corner_edges = cv2.Canny(corner_gray, 50, 200)
+
+        # find the system buttons using pattern matching. Find area with biggest match
+        # NOTE(review): the argument order is kept exactly as before (button-template
+        # edges first, screenshot edges second); presumably this relies on OpenCV
+        # accepting the smaller image in either position - TODO confirm.
+        result = cv2.matchTemplate(buttons_edges, corner_edges, cv2.TM_CCOEFF_NORMED)
+        _, max_val, _, max_loc = cv2.minMaxLoc(result)
+
+        if max_val > 0.4:
+            x, y = max_loc
+
+            r = 28       # side of one button box, px
+            offset = 12  # horizontal gap between button boxes, px
+
+            system_buttons = [
+                ([x, y, x + r, y + r], "Close"),
+                ([x + r + offset, y, x + offset + 2 * r, y + r], "Minimize"),
+                ([x + 2 * (r + offset), y, x + 2 * (r + offset) + r, y + r], "Zoom"),
+            ]
+
+            uncached_elements.sort(key=lambda item: item[0].box[0] ** 2 + item[0].box[1] ** 2)
+
+            for element, _, _ in uncached_elements:
+                for location, name in system_buttons:
+                    if iou(element.box, location) > 0.1 and element.value is None:
+                        element.value = name
+                        break
+
+            # remove elements that have been captioned
+            uncached_elements = [item for item in uncached_elements if item[0].value is None]
+
+    # try with ocr
+    remaining_elements: list[tuple[UIElement, Image.Image, ImageHash]] = []
+    for element, cropped, img_hash in uncached_elements:
+        vals = ocrmac.OCR(cropped, language_preference=['en-US'], recognition_level="accurate").recognize()
+
+        # keep only texts recognized with full confidence
+        recognized_texts = [val for val, conf, _ in vals if conf == 1.0]
+
+        if recognized_texts:
+            element.value = "\n".join(recognized_texts)
+            icons_cache[img_hash] = element.value
+        else:
+            remaining_elements.append((element, cropped, img_hash))
+
+    # generate captions using DeepLearning
+    for i in range(0, len(remaining_elements), batch_size):
+        batch = remaining_elements[i:i+batch_size]
+        crops = [cropped for _, cropped, _ in batch]
+
+        captions = generate_captions(crops)
+
+        for (el, _, img_hash), caption in zip(batch, captions):
+            el.value = caption
+            icons_cache[img_hash] = caption
+
+    return ui_elements
+
+
+def build_tree(
+ ui_groups: list[UIElement],
+ ui_elements: list[UIElement],
+ size: tuple[int, int],
+ iou_threshold=0.1
+) -> UIElement:
+ """
+ Build a tree from a list of UI elements and groups
+
+ Each element becomes a child of the group it overlaps most when that IoU
+ exceeds ``iou_threshold``; otherwise it stays at the top level. Top-level
+ items (groups included) are then repeatedly merged into any overlapping
+ Group that follows them in area order, growing that group's box, until a
+ full pass merges nothing. Whatever remains is attached to a new root Group
+ covering ``[0, 0, size[0], size[1]]``, which is returned.
+
+ The passed UIElement objects are modified in place (children and boxes).
+ """
+ remaining_elements: list[UIElement] = []
+
+ # Assign elements to groups based on IoU
+ if ui_groups:
+ for element in ui_elements:
+ max_iou, best_group = max(
+ ((group.iou(element), group) for group in ui_groups), key=lambda x: x[0]
+ )
+
+ if max_iou > iou_threshold:
+ best_group.children.append(element)
+ else:
+ remaining_elements.append(element)
+ else:
+ remaining_elements.extend( ui_elements )
+
+ # Include groups in remaining elements and sort by area (smallest first)
+ remaining_elements.extend( ui_groups )
+ remaining_elements.sort(key=lambda x: x.area)
+
+ merge_occurred = True
+ # Continue looping until no merge occurs in a full pass.
+ while merge_occurred:
+ merge_occurred = False
+ i = 0
+
+ # Use a while loop to manage indices when items are removed.
+ while i < len(remaining_elements):
+ A = remaining_elements[i]
+
+ # Check for a larger element to merge A into.
+ for j in range(i + 1, len(remaining_elements)):
+ B = remaining_elements[j]
+
+ # Only Groups can absorb other items; any overlap at all (IoU > 0) is enough.
+ if A.iou(B) > 0 and B.cls == B.names.index("Group"):
+ # Merge A into B: update B's bounding box and add A as a child.
+ B.merge(A)
+ B.children.append(A)
+
+ # Remove A from the list.
+ remaining_elements.pop(i)
+ merge_occurred = True
+
+ # Re-sort remaining_elements as B's area may have increased.
+ remaining_elements.sort(key=lambda x: x.area) # O(n log n), but inserting 1 element could be O(n)
+
+ # Break out to restart checking from the beginning.
+ break
+
+ # for/else: this branch runs only when the inner loop finished without a break.
+ else:
+ # Only increment if A wasn't merged, because removal shifts indices.
+ i += 1
+
+
+ # Everything left at the top level hangs off a synthetic root sized like the image.
+ root = UIElement([0, 0, size[0], size[1]], "Group")
+ root.children = remaining_elements
+
+ return root
+
+
+def clean_tree(tree: UIElement):
+    """
+    Delete empty groups if they are leafs
+
+    Works recursively and in place on ``tree.children``:
+    - a Group without children is removed,
+    - a Group with exactly one child is replaced by that child,
+    - finally the children are sorted by the squared distance between their
+      top-left corner and the parent's top-left corner.
+    """
+    i = 0
+
+    while i < len(tree.children):
+        child = tree.children[i]
+
+        if child.children:
+            clean_tree(child)
+
+        # Look the Group class id up by name (as build_tree does) instead of hard-coding 6.
+        if child.cls == child.names.index("Group"):
+            if not child.children:
+                # index i now holds the next sibling, so do not advance
+                del tree.children[i]
+                continue
+
+            if len(child.children) == 1:
+                # the promoted child is re-examined on the next iteration
+                tree.children[i] = child.children[0]
+                continue
+
+        i += 1
+
+    # Sort children by distance from the parent
+    tree.children.sort(key=lambda x: (x.box[0] - tree.box[0]) ** 2 + (x.box[1] - tree.box[1]) ** 2)
diff --git a/hierarchy_heuristics/box.py b/hierarchy_heuristics/box.py
new file mode 100644
index 0000000..70853c9
--- /dev/null
+++ b/hierarchy_heuristics/box.py
@@ -0,0 +1,233 @@
+class BBox:
+    """Bounding box stored as a tuple of ints, read as ``(x1, y1, x2, y2)``."""
+
+    def __init__(self, box):
+        coords = tuple(int(value) for value in box)
+        self.box: tuple[int, int, int, int] = coords
+
+    def __iter__(self):
+        return iter(self.box)
+
+    def __getitem__(self, idx):
+        return self.box[idx]
+
+    # --- single coordinates -------------------------------------------------
+
+    @property
+    def x1(self):
+        return self.box[0]
+
+    @property
+    def y1(self):
+        return self.box[1]
+
+    @property
+    def x2(self):
+        return self.box[2]
+
+    @property
+    def y2(self):
+        return self.box[3]
+
+    # --- corners and size ---------------------------------------------------
+
+    @property
+    def top_left(self):
+        return self.box[:2]
+
+    @property
+    def bottom_right(self):
+        return self.box[2:]
+
+    @property
+    def width(self):
+        return self.box[2] - self.box[0]
+
+    @property
+    def height(self):
+        return self.box[3] - self.box[1]
+
+    @property
+    def area(self):
+        return self.width * self.height
+
+    def iou(self, other):
+        """Intersection over Union with ``other``; 0 when the union is zero."""
+        left = max(self.box[0], other.box[0])
+        top = max(self.box[1], other.box[1])
+        right = min(self.box[2], other.box[2])
+        bottom = min(self.box[3], other.box[3])
+
+        # negative extents mean the boxes do not overlap
+        overlap = max(0, right - left) * max(0, bottom - top)
+        total = self.area + other.area - overlap
+
+        return overlap / total if total != 0 else 0
+
+    def merge_bboxes(self, other, inplace=False):
+        """
+        Smallest box covering both ``self`` and ``other``.
+
+        With ``inplace=True`` this box is updated and returned; otherwise a
+        new BBox is returned and ``self`` stays untouched.
+        """
+        merged = (
+            min(self.box[0], other.box[0]),
+            min(self.box[1], other.box[1]),
+            max(self.box[2], other.box[2]),
+            max(self.box[3], other.box[3]),
+        )
+
+        if not inplace:
+            return BBox(merged)
+
+        self.box = merged
+        return self
+
+    def y_distance(self, other: "BBox"):
+        """Smallest absolute difference between any horizontal edge of each box."""
+        return min(
+            abs(mine - theirs)
+            for mine in (self.y1, self.y2)
+            for theirs in (other.y1, other.y2)
+        )
+
+    def x_distance(self, other: "BBox"):
+        """Smallest absolute difference between any vertical edge of each box."""
+        return min(
+            abs(mine - theirs)
+            for mine in (self.x1, self.x2)
+            for theirs in (other.x1, other.x2)
+        )
+
+
+class Text(BBox):
+    """A text string together with its bounding box."""
+
+    def __init__(self, bbox, text: str) -> None:
+        super().__init__(bbox)
+        self.text_value = text
+
+    def __repr__(self) -> str:
+        return f"Text(box={self.box}, text={self.text_value})"
+
+    def __bool__(self):
+        # A Text is truthy only when it carries a non-empty string.
+        return True if self.text_value else False
+
+    def __add__(self, other):
+        """
+        Concatenate two texts (space-separated) over the union of their boxes.
+
+        If either operand is falsy, the other one is returned unchanged.
+        """
+        if not other:
+            return self
+
+        if not self:
+            return other
+
+        left, top = min(self.x1, other.x1), min(self.y1, other.y1)
+        right, bottom = max(self.x2, other.x2), max(self.y2, other.y2)
+        joined = " ".join((self.text_value, other.text_value))
+
+        return Text((left, top, right, bottom), joined)
+
+
+class UIBox(BBox):
+    """Abstract class for UI elements"""
+
+    # class id -> class name; the id is the position in this sequence
+    id2class = dict(enumerate([
+        "AXButton",
+        "AXCheckBox",
+        "AXComboBox",
+        "AXHeading",
+        "AXImage",
+        "AXLink",
+        "AXRadioButton",
+        "AXScrollBar",
+        "AXSlider",
+        "AXStaticText",
+        "AXTextField",
+        "TextGroup",
+        "OCRText",
+        "Group",
+    ]))
+
+    # reverse lookup: class name -> class id
+    class2id = dict(zip(id2class.values(), id2class.keys()))
+
+    def __init__(self, box):
+        super().__init__(box)
+        self.parent = None
+        self.cls = None
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(box={self.box}, cls={self.cls})"
+
+    def merge(self, other, cls=None, inplace=False):
+        # subclasses provide the actual merge
+        raise NotImplementedError
+
+    def other_is_inside(self, other, margin=0):
+        """
+        Check that both corners of ``other`` lie inside this box shrunk by ``margin``.
+
+        ``margin`` is one number applied to all four sides, or a tuple
+        ``(x1, y1, x2, y2)`` with one margin per side. The shrunken top-left
+        corner is never allowed to go below 0.
+        """
+        if isinstance(margin, tuple):
+            margin_x1, margin_y1, margin_x2, margin_y2 = margin
+        else:
+            margin_x1 = margin_y1 = margin_x2 = margin_y2 = margin
+
+        left = max(self.x1 + margin_x1, 0)
+        top = max(self.y1 + margin_y1, 0)
+        right = self.x2 - margin_x2
+        bottom = self.y2 - margin_y2
+
+        xs_inside = left <= other.x1 <= right and left <= other.x2 <= right
+        ys_inside = top <= other.y1 <= bottom and top <= other.y2 <= bottom
+
+        return xs_inside and ys_inside
+
+
+class Group(UIBox):
+    """UI element that holds other UI elements as ``children``."""
+
+    group_types = ["window", "row", "text", "column"]
+
+    def __init__(self, box, children=None, group_type="text"):
+        super().__init__(box)
+        self.cls = self.class2id["Group"]
+        self.group_type = group_type
+        self.children = children or []
+
+    def __repr__(self):
+        return f"Group(box={self.box}, group_type={self.group_type}, children={self.children})"
+
+    def append(self, child):
+        """Add ``child`` to this group and point its ``parent`` at the group."""
+        self.children.append(child)
+        child.parent = self
+
+    def merge(self, other, cls=None, inplace=False):
+        """
+        Combine this group with ``other`` (union of boxes, children concatenated).
+
+        With ``inplace=True`` this group is updated (its class becomes ``cls``
+        when given) and returned; otherwise a new Group is returned. A falsy
+        ``other`` returns ``self`` unchanged.
+        """
+        if not other:
+            return self
+
+        # ``cls or self.cls`` would discard an explicit class id 0
+        cls = self.cls if cls is None else cls
+
+        if inplace:
+            self.merge_bboxes(other, inplace=True)
+            self.cls = cls
+            self.children += other.children
+            return self
+
+        # NOTE(review): the new Group gets the default group_type and class;
+        # neither ``cls`` nor ``self.group_type`` is carried over - confirm intended.
+        return Group(self.merge_bboxes(other).box, self.children + other.children)
+
+    def finalize_bbox(self):
+        """
+        Shrink/grow the box to the tight bounds of the children and return self.
+
+        A group without children keeps its current box.
+        """
+        if not self.children:
+            return self
+
+        # Real min/max over the children, without fixed 9999 / 0 sentinels
+        # that would clamp the result.
+        self.box = (
+            min(child.x1 for child in self.children),
+            min(child.y1 for child in self.children),
+            max(child.x2 for child in self.children),
+            max(child.y2 for child in self.children),
+        )
+        return self
+
+
+class Box(UIBox):
+    """Single UI element: a box with a class id and an attached ``Text``."""
+
+    def __init__(self, box, cls, text: Text | None = None):
+        super().__init__(box)
+        self.cls: int = int(cls)
+        # a missing or empty text is replaced by an empty Text over the same box
+        self.text = text or Text(box, "")
+
+    def __repr__(self):
+        return f"Box(box={self.box}, cls={self.cls}, text={self.text})"
+
+    def merge(self, other, cls=None, inplace=False):
+        """
+        Combine this box with ``other`` (union of boxes, texts added together).
+
+        Args:
+            other: Box to merge with; a falsy value returns ``self`` unchanged.
+            cls: Class id of the result; defaults to this box's class.
+            inplace: Update and return this box instead of creating a new one.
+        """
+        if not other:
+            return self
+
+        # ``cls or self.cls`` would silently ignore class id 0 (AXButton)
+        cls = self.cls if cls is None else cls
+
+        if inplace:
+            self.merge_bboxes(other, inplace=True)
+            self.cls = cls
+            self.text = self.text + other.text
+            return self
+
+        return Box(self.merge_bboxes(other).box, cls, self.text + other.text)
diff --git a/hierarchy_heuristics/group.ipynb.py b/hierarchy_heuristics/group.ipynb.py
new file mode 100644
index 0000000..672c5ef
--- /dev/null
+++ b/hierarchy_heuristics/group.ipynb.py
@@ -0,0 +1,186 @@
+## Imports
+import json
+import os
+import signal
+from glob import glob
+
+import cv2
+from tqdm import tqdm
+
+from box import *
+from metrics import *
+from utils import group_elements
+
# Tunable thresholds forwarded as keyword arguments to ``group_elements``.
# The bracketed range after each value is the interval the author annotated.
config = dict(
    # YOLO model
    ui_model_conf=0.3,  # [0, 1]
    # OCR model
    ocr_conf_threshold=0.3,  # [0, 1]
    # bind_text_and_boxes
    bind_text_and_boxes_iou_threshold=0.05,  # [0, 1]
    # merge_overlapping_boxes
    merge_overlapping_boxes_iou_threshold=0.3,  # [0, 1]
    # clickability confidence
    clickability_model_conf=0.3,  # [0, 1]
    # merge_images_and_captions
    merge_images_and_captions_x_overlap_percent_threshold=0.25,  # [0, 1]
    merge_images_and_captions_y_distance_threshold=0.02,  # [0, 1]
    merge_images_and_captions_y_overlap_percent_threshold=0.4,  # [0, 1]
    merge_images_and_captions_x_distance_threshold=0.02,  # [0, 1]
    # group_by_column
    group_by_column_y_distance_coefficient=1.25,  # [0, inf)
    group_by_column_width_threshold=40,  # [0, inf)
    group_by_column_max_width_coefficient=0.55,  # [0, 1]
    # add_color_groups
    add_color_groups_min_box_threshold=0.03,  # [0, 1]
    add_color_groups_max_box_threshold=0.95,  # [0, 1]
    add_color_groups_color_diff_threshold=2,  # [0, 255]
    add_color_groups_area_threshold=0.01,  # [0, 1]
    # group_by_row
    group_by_row_y_distance_threshold=50,  # [0, inf)
    group_by_row_height_threshold=0.8,  # [0, 1]
)
+
if __name__ == "__main__":
    # ``np`` previously reached this script only through the star imports
    # above; import it explicitly so the script does not depend on that.
    import numpy as np

    screenshot_glob = "./test/ideal-test/*/*/*.png"

    # Remove renders left over from earlier runs.
    for stale_path in glob(screenshot_glob):
        if "predicted" in stale_path:
            os.remove(stale_path)

    images = sorted(path for path in glob(screenshot_glob) if "segmented" not in path)
    jsons = sorted(glob("./test/ideal-test/*/*/*_simplified.json"))

    # Screenshots and ground-truth trees are paired purely by sorted position,
    # so a count mismatch means some pairs are probably wrong.
    if len(images) != len(jsons):
        print(
            f"Warning: {len(images)} screenshots but {len(jsons)} ground-truth files; "
            "unpaired entries are ignored"
        )

    dataset = list(zip(images, jsons))

    def handler(signum, frame):
        """SIGALRM handler: abort the running GED computation."""
        raise Exception("Timeout")

    apps = [group_elements(img, **config) for img, _ in tqdm(dataset)]

    apps_json = []
    for _, json_path in tqdm(dataset):
        # Context manager so the file handle is closed deterministically.
        with open(json_path) as json_file:
            apps_json.append(json.load(json_file))

    # NOTE: tuples are handed to OpenCV as (B, G, R). Several of the names
    # below describe the tuple read as RGB, so the rendered color has its red
    # and blue channels swapped relative to the comment.
    bgr_colors = [
        (255, 0, 0),  # Blue
        (0, 255, 0),  # Green
        (0, 0, 255),  # Red
        (255, 255, 0),  # Yellow
        (0, 255, 255),  # Cyan
        (255, 0, 255),  # Magenta
        (128, 0, 0),  # Maroon
        (0, 128, 0),  # Dark Green
        (0, 0, 128),  # Navy
        (128, 128, 0),  # Olive
        (128, 0, 128),  # Purple
        (0, 128, 128),  # Teal
        (192, 192, 192),  # Silver
        (128, 128, 128),  # Gray
        (255, 165, 0),  # Orange
        (255, 192, 203),  # Pink
        (210, 105, 30),  # Chocolate
        (34, 139, 34),  # Forest Green
        (255, 215, 0),  # Gold
        (135, 206, 250),  # Sky Blue
    ]

    def plot_app(img, app, depth=0):
        """Draw ``app`` and, recursively, its children onto ``img``; return ``img``."""
        color_id = app.cls
        label = Box.id2class[app.cls]

        if isinstance(app, Group):
            label += f"[{app.group_type} {depth}]"
            # Offset the color by group kind; unknown kinds share one extra slot.
            color_id += (
                Group.group_types.index(app.group_type)
                if app.group_type in Group.group_types
                else len(Group.group_types)
            )

        color = bgr_colors[color_id % len(bgr_colors)]

        x1, y1, x2, y2 = app.box
        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)

        text_color = (255, 255, 255)

        (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)

        # Filled banner above the box, then the label on top of it.
        cv2.rectangle(img, (x1, y1 - 20), (x1 + w, y1), color, -1)
        cv2.putText(
            img, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, text_color, 1
        )

        if hasattr(app, "children"):
            for child in app.children:
                plot_app(img, child, depth + 1)

        return img

    # cv2.imwrite fails silently when the target directory is missing.
    os.makedirs("./test-output", exist_ok=True)

    for app, (img, _) in zip(apps, dataset):
        image = cv2.imread(img)
        image = plot_app(image, app)

        segmented_image_path = f"{img[:-4]}_simplified-segmented.png"
        original_segmented_image = cv2.imread(segmented_image_path)

        # Ground-truth render on the left, prediction on the right.
        output_image = np.concatenate((original_segmented_image, image), axis=1)
        img_name = f"{img.split('/')[-1]}"
        cv2.imwrite(f"./test-output/{img_name}", output_image)

    metrics = []
    leaf_metrics = []
    geds = []
    mIoU = []

    # The handler never changes, so install it once instead of per iteration.
    signal.signal(signal.SIGALRM, handler)

    for app, app_json in tqdm(zip(apps, apps_json), total=len(apps)):
        ## F1 for edges
        metrics.append(get_metrics(app, app_json, iou_threshold=0.3, leaf=False))
        leaf_metrics.append(get_metrics(app, app_json, iou_threshold=0.3, leaf=True))

        ## GED
        # Give the GED computation 10 seconds; on timeout or failure fall back
        # to the number of ground-truth edges.
        try:
            signal.alarm(10)  # 10 seconds
            try:
                ged = calc_ged(app, app_json, iou_threshold=0.5)
            finally:
                # Always cancel the alarm; otherwise a non-timeout failure
                # leaves it pending and it fires inside unrelated code.
                signal.alarm(0)

            geds.append(ged)

        except Exception:
            gt_edges = []
            get_gt_edges(app_json, gt_edges)
            geds.append(len(gt_edges))

        ## Average IoU for groups
        mIoU.append(mean_groups_iou(app, app_json))

    # ## Results
    print(
        f"""
    Mean precision {np.mean([i[0] for i in metrics]):.2f} ± {np.std([i[0] for i in metrics]):.2f}
    Mean recall {np.mean([i[1] for i in metrics]):.2f} ± {np.std([i[1] for i in metrics]):.2f}
    Mean F1 score {np.mean([i[2] for i in metrics]):.2f} ± {np.std([i[2] for i in metrics]):.2f}
    """
    )

    print(
        f"""
    Mean leaf precision {np.mean([i[0] for i in leaf_metrics]):.2f} ± {np.std([i[0] for i in leaf_metrics]):.2f}
    Mean leaf recall {np.mean([i[1] for i in leaf_metrics]):.2f} ± {np.std([i[1] for i in leaf_metrics]):.2f}
    Mean leaf F1 score {np.mean([i[2] for i in leaf_metrics]):.2f} ± {np.std([i[2] for i in leaf_metrics]):.2f}
    """
    )

    print(f"Mean GEDs {np.mean(geds):.2f} ± {np.std(geds):.2f}")

    print(f"Mean average IoU for groups {np.mean(mIoU):.2f} ± {np.std(mIoU):.2f}")
diff --git a/hierarchy_heuristics/main.py b/hierarchy_heuristics/main.py
new file mode 100644
index 0000000..88dc05d
--- /dev/null
+++ b/hierarchy_heuristics/main.py
@@ -0,0 +1,45 @@
+import cv2
+from utils import group_elements, plot_app, plot_all_boxes
+import time
+
# Tunable thresholds forwarded as keyword arguments to ``group_elements``.
config = {
    # YOLO model
    "ui_model_conf": 0.3,  # [0, 1]
    # OCR model
    "ocr_conf_threshold": 0.3,  # [0, 1]
    # bind_text_and_boxes
    "bind_text_and_boxes_iou_threshold": 0.05,  # [0, 1]
    # merge_overlapping_boxes
    "merge_overlapping_boxes_iou_threshold": 0.3,  # [0, 1]
    # clickability confidence
    "clickability_model_conf": 0.3,  # [0, 1]
    # merge_images_and_captions
    "merge_images_and_captions_x_overlap_percent_threshold": 0.25,  # [0, 1]
    "merge_images_and_captions_y_distance_threshold": 0.02,  # [0, 1]
    "merge_images_and_captions_y_overlap_percent_threshold": 0.4,  # [0, 1]
    "merge_images_and_captions_x_distance_threshold": 0.02,  # [0, 1]
    # group_by_column
    "group_by_column_y_distance_coefficient": 1.25,  # [0, inf)
    "group_by_column_width_threshold": 40,  # [0, inf)
    "group_by_column_max_width_coefficient": 0.55,  # [0, 1]
    # add_color_groups
    "add_color_groups_min_box_threshold": 0.03,  # [0, 1]
    "add_color_groups_max_box_threshold": 0.95,  # [0, 1]
    "add_color_groups_color_diff_threshold": 2,  # [0, 255]
    "add_color_groups_area_threshold": 0.01,  # [0, 1]
    # group_by_row
    "group_by_row_y_distance_threshold": 50,  # [0, inf)
    "group_by_row_height_threshold": 0.8,  # [0, 1]
}

if __name__ == "__main__":
    image_path = "./visual-test-images/test4.png"
    img = cv2.imread(image_path)

    # cv2.imread returns None instead of raising when the file is missing or
    # unreadable; fail here with a clear message rather than deep inside the
    # grouping pipeline.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # perf_counter is monotonic, so the measurement is immune to clock changes.
    start = time.perf_counter()
    app = group_elements(img, verbose=False, **config)
    print("Time:", time.perf_counter() - start)

    img = plot_all_boxes(img, app)
    cv2.imshow("image", img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
\ No newline at end of file
diff --git a/hierarchy_heuristics/metrics.py b/hierarchy_heuristics/metrics.py
new file mode 100644
index 0000000..2e42ba4
--- /dev/null
+++ b/hierarchy_heuristics/metrics.py
@@ -0,0 +1,182 @@
+import networkx as nx
+import numpy as np
+
+
def get_predicted_edges(app, edges, leaf=False):
    """Collect ``(parent, child)`` pairs of a predicted hierarchy into ``edges``.

    Args:
        app: predicted node; children are read from its ``children`` attribute.
        edges: list that receives the pairs (also returned).
        leaf: when true, keep only pairs whose child has no children.

    Returns:
        ``edges``, filled in post-order (a child's subtree precedes its own edge).
    """
    for child in getattr(app, "children", None) or []:
        # Forward ``leaf``: without it only the root level was filtered and
        # every deeper level contributed all of its edges.
        get_predicted_edges(child, edges, leaf=leaf)

        child_is_leaf = not getattr(child, "children", None)
        if not leaf or child_is_leaf:
            edges.append((app, child))

    return edges
+
+
def get_gt_edges(app, edges, leaf=False):
    """Collect ``(parent, child)`` pairs of a ground-truth tree into ``edges``.

    Args:
        app: ground-truth node as a dict; children are read from ``"children"``.
        edges: list that receives the pairs (also returned).
        leaf: when true, keep only pairs whose child has no children.

    Returns:
        ``edges``, filled in post-order (a child's subtree precedes its own edge).
    """
    # ``or []`` also covers a node whose "children" entry is None.
    for child in app.get("children") or []:
        # Forward ``leaf``: without it only the root level was filtered and
        # every deeper level contributed all of its edges.
        get_gt_edges(child, edges, leaf=leaf)

        child_is_leaf = not child.get("children")
        if not leaf or child_is_leaf:
            edges.append((app, child))

    return edges
+
+
def iou(box1, box2):
    """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

    Two boxes whose union has no area (for example both degenerate) yield 0
    instead of raising ``ZeroDivisionError``.
    """
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    # Clamp each side at 0 so disjoint boxes contribute no intersection.
    intersection = max(0, x2 - x1) * max(0, y2 - y1)
    union = (
        (box1[2] - box1[0]) * (box1[3] - box1[1])
        + (box2[2] - box2[0]) * (box2[3] - box2[1])
        - intersection
    )

    if union <= 0:
        return 0.0

    return intersection / union
+
+
def get_metrics(app, app_json, iou_threshold=0.3, leaf=False):
    """Return ``(precision, recall, f1)`` for predicted versus ground-truth edges.

    An edge is represented by the box of its parent node. A predicted edge is
    a true positive when its best IoU against any ground-truth edge reaches
    ``iou_threshold``; a ground-truth edge is a false negative when no
    predicted edge reaches the threshold against it.
    """
    predicted_edges = get_predicted_edges(app, [], leaf=leaf)
    truth_edges = get_gt_edges(app_json, [], leaf=leaf)

    predicted_boxes = [edge[0].box for edge in predicted_edges]
    truth_boxes = [edge[0]["xyxy_retina"] for edge in truth_edges]

    # Best overlap per predicted edge; the leading 0 is the value used when
    # there is nothing to compare against.
    tp = sum(
        1
        for pred_box in predicted_boxes
        if max([0, *(iou(pred_box, gt_box) for gt_box in truth_boxes)]) >= iou_threshold
    )
    fp = len(predicted_boxes) - tp

    fn = sum(
        1
        for gt_box in truth_boxes
        if max([0, *(iou(pred_box, gt_box) for pred_box in predicted_boxes)]) < iou_threshold
    )

    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    if precision + recall != 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1
+
+
def node_match_function_wrapper(iou_threshold):
    """Build a node-match predicate for graph edit distance.

    The returned callable receives two node attribute dicts and reports
    whether the IoU of their ``"box"`` entries is at least ``iou_threshold``.
    """
    return lambda node1, node2: iou(node1["box"], node2["box"]) >= iou_threshold
+
+
def calc_ged(app, app_json, iou_threshold=0.3):
    """Approximate the graph edit distance between prediction and ground truth.

    Both hierarchies are turned into undirected graphs whose nodes carry a
    ``box`` attribute; two nodes match when their IoU reaches
    ``iou_threshold``.

    Returns:
        The first distance yielded by ``nx.optimize_graph_edit_distance``.
    """
    predicted_edges = []
    get_predicted_edges(app, predicted_edges)

    gt_edges = []
    get_gt_edges(app_json, gt_edges)

    predicted_graph = nx.Graph()
    for parent, child in predicted_edges:
        for node in (parent, child):
            if node not in predicted_graph.nodes:
                predicted_graph.add_node(node, box=node.box)

        predicted_graph.add_edge(parent, child)

    gt_graph = nx.Graph()
    for parent, child in gt_edges:
        for node in (parent, child):
            # Graph nodes must be hashable. The previous key, a tuple of the
            # dict's items, contains list values such as "xyxy_retina" and
            # therefore raised TypeError. The dict's identity is a stable,
            # unique key while ``app_json`` is alive.
            key = id(node)
            if key not in gt_graph.nodes:
                gt_graph.add_node(key, box=node["xyxy_retina"])

        gt_graph.add_edge(id(parent), id(child))

    return next(
        nx.optimize_graph_edit_distance(
            predicted_graph,
            gt_graph,
            node_match=node_match_function_wrapper(iou_threshold),
        )
    )
+
+
def get_groups(app, groups):
    """Append every predicted node that has at least one child to ``groups``.

    Nodes are visited depth-first and a node is added after its descendants.
    Returns ``groups``.
    """
    if not hasattr(app, "children"):
        return groups

    members = app.children
    for member in members:
        get_groups(member, groups)

    if members:
        groups.append(app)

    return groups
+
+
def get_gt_groups(app, groups):
    """Append every ground-truth node with a non-empty ``"children"`` list to ``groups``.

    Nodes are visited depth-first and a node is added after its descendants.
    Returns ``groups``.
    """
    if "children" not in app:
        return groups

    members = app["children"]
    for member in members:
        get_gt_groups(member, groups)

    if members:
        groups.append(app)

    return groups
+
+
def mean_groups_iou(app, app_json):
    """Return the mean, over predicted groups, of the best IoU with any ground-truth group.

    A predicted group scores 0 when the ground truth contains no groups, and
    the result is 0 when nothing was predicted.
    """
    groups = get_groups(app, [])
    gt_groups = get_gt_groups(app_json, [])

    # ``default=0`` avoids ValueError from max() on an empty ground truth.
    ious = [
        max(
            (iou(group.box, gt_group["xyxy_retina"]) for gt_group in gt_groups),
            default=0,
        )
        for group in groups
    ]

    # NOTE: only the prediction -> ground-truth direction is scored; ground
    # truth groups that no prediction matches do not lower the result.
    return np.mean(ious) if ious else 0
diff --git a/hierarchy_heuristics/requirements.txt b/hierarchy_heuristics/requirements.txt
new file mode 100644
index 0000000..6e83597
--- /dev/null
+++ b/hierarchy_heuristics/requirements.txt
@@ -0,0 +1,9 @@
+matplotlib==3.9.2
+networkx==3.4.1
+numpy==2.1.2
+ocrmac==0.1.6
+opencv-python==4.10.0.84
+pillow==11.0.0
+tqdm==4.66.5
+ultralytics==8.3.18
+gdown==5.2.0
\ No newline at end of file
diff --git a/hierarchy_heuristics/simplify_tree.py b/hierarchy_heuristics/simplify_tree.py
new file mode 100644
index 0000000..e5ff293
--- /dev/null
+++ b/hierarchy_heuristics/simplify_tree.py
@@ -0,0 +1,327 @@
+import json
+import math
+
+import cv2
+from bs4 import Tag
+
+
def extract_ids_from_output(item):
    """Flatten a nested dict/list structure into the list of integers it contains.

    Dict values and list elements are searched recursively in order; anything
    that is not a dict, list or int contributes nothing.
    """
    if isinstance(item, dict):
        nested = item.values()
    elif isinstance(item, list):
        nested = item
    elif isinstance(item, int):
        return [item]
    else:
        return []

    return [number for part in nested for number in extract_ids_from_output(part)]
+
+
def create_xml_element(element):
    """Convert an accessibility-tree dict into a ``Tag``, pruning empty groups.

    Returns ``None`` for an ``AXGroup`` that has no name, description, value
    or surviving children. An ``AXRow`` whose only child is an ``AXCell`` with
    a single child is replaced by that grandchild, whose ``role_description``
    is rewritten to ``"merged <row description>"``.
    """
    role = element["role"]
    is_group = role == "AXGroup"

    # Only include elements with meaningful content
    has_content = any(
        [
            element.get("name"),
            element.get("description"),
            element.get("value"),
            element.get("children"),
        ]
    )
    if is_group and not has_content:
        return None

    raw_value = element.get("value")
    attrib = {
        "role_description": element.get("role_description", ""),
        "id": element["id"],
        "name": element.get("name") or "",
        "description": element.get("description") or "",
        "value": "" if raw_value is None else str(element["value"]),
        "position": element.get("position", "0.0;0.0"),
        "size": element.get("size", "0.0;0.0"),
    }

    xml_element = Tag(name=role, attrs=attrib)

    converted = (create_xml_element(child) for child in element.get("children", []))
    meaningful_children = [child for child in converted if child is not None]

    # Merge condition: a row that wraps exactly one cell with exactly one child
    if (
        role == "AXRow"
        and len(meaningful_children) == 1
        and meaningful_children[0].name == "AXCell"
    ):
        cell_children = list(meaningful_children[0].children)
        if len(cell_children) == 1:
            merged_element = cell_children[0]
            merged_element["role_description"] = f"merged {attrib['role_description']}"
            return merged_element

    for child in meaningful_children:
        xml_element.append(child)

    # Remove elements that are groups without meaningful children
    if is_group and not list(xml_element.children):
        return None

    return xml_element
+
+
def pretty_print_xml(xml_tree):
    """Return a printable form of ``xml_tree``.

    ``None`` becomes ``""``, strings are returned unchanged, and any other
    object is rendered through its ``prettify()`` method.
    """
    if xml_tree is None:
        return ""

    # isinstance also covers str subclasses, which ``type(x) == str`` missed
    # and then sent to ``prettify()``.
    if isinstance(xml_tree, str):
        return xml_tree

    return xml_tree.prettify()
+
+
def map_ids(element, id_mapping, current_id=1):
    """Renumber ``element`` and its descendants with consecutive ids.

    Ids are assigned in pre-order starting at ``current_id``; each element's
    ``"id"`` is overwritten with the new number as a string and
    ``id_mapping[new_number]`` records the id it had before.

    Returns:
        The next unused id.
    """
    id_mapping[current_id] = element["id"]
    element["id"] = str(current_id)

    next_id = current_id + 1
    for direct_child in element.find_all(recursive=False):
        next_id = map_ids(direct_child, id_mapping, next_id)

    return next_id
+
+
def json_to_xml(json_obj):
    """Convert an accessibility-tree dict to XML and renumber its elements.

    Returns:
        ``(root_element, id_mapping)``. ``root_element`` is ``None`` (and the
        mapping empty) when the whole tree was pruned; otherwise the mapping
        goes from new sequential id to the original id.
    """
    id_mapping = {}
    root_element = create_xml_element(json_obj)

    if root_element is not None:
        # Map ids of elements
        map_ids(root_element, id_mapping)

    return root_element, id_mapping
+
+
def add_ids_to_json(json_obj, curr_id=0):
    """Assign a unique integer ``"id"`` to ``json_obj`` and every descendant, in place.

    Ids increase in pre-order but are not consecutive: the counter is advanced
    once more after each child subtree, which leaves gaps. That numbering is
    kept as is.

    Args:
        json_obj: tree node; children are read from ``"children"`` (a missing
            or ``None`` entry is treated as "no children").
        curr_id: id to give to ``json_obj``.

    Returns:
        ``(json_obj, next_id)``, where ``next_id`` is one past this node's subtree.
    """
    json_obj["id"] = curr_id
    curr_id += 1

    # Leaves in raw dumps may omit "children"; do not raise KeyError on them.
    for child in json_obj.get("children") or []:
        _, curr_id = add_ids_to_json(child, curr_id)
        curr_id += 1

    return json_obj, curr_id
+
+
def format_json(json_object):
    """Normalize a nested dict into ``{"@attr": ..., "@children": [...]}`` form.

    Keys starting with ``"@"`` are copied as attributes. Every other key is
    treated as a role name: a dict value is one child and a list value is
    several, each formatted recursively and tagged with ``"@role"``. Scalar
    values under a non-``"@"`` key are copied unchanged.

    The input is not modified; a new structure is returned.
    """
    formatted = {"@children": []}

    for key, value in json_object.items():
        if key.startswith("@"):
            formatted[key] = value
            continue

        if isinstance(value, dict):
            elements = [value]
        elif isinstance(value, list):
            elements = value
        else:
            # NOTE(review): scalar content used to crash here; it is kept
            # under its key. Confirm this is the wanted representation.
            formatted[key] = value
            continue

        for element in elements:
            # Use the formatted copy: the recursive result was previously
            # dropped and the raw value (or the whole list) was appended.
            child = format_json(element)
            child["@role"] = key
            formatted["@children"].append(child)

    return formatted
+
+
def _coerce_coordinate(value, positive_limit):
    """Turn one float coordinate into an int; NaN and -inf become 0, +inf becomes ``positive_limit``."""
    if math.isnan(value):
        return 0
    if math.isinf(value):
        return 0 if value < 0 else positive_limit
    return int(value)


def xml2dict(xml_object: "Tag", width, height):
    """Convert an XML element tree into nested dicts with pixel boxes.

    ``position`` and ``size`` attributes (``"x;y"`` strings) are replaced by
    ``xyxy`` and by ``xyxy_retina``, which is ``xyxy`` doubled. ``width`` and
    ``height`` are in the doubled coordinate space: an ``AXWindow`` always
    spans ``[0, 0, width, height]`` there. Non-finite coordinates are clamped
    (see ``_coerce_coordinate``).

    Returns:
        A dict with ``role``, the element's remaining attributes, ``children``
        (converted recursively), ``xyxy`` and ``xyxy_retina``.
    """
    dict_object = {"role": xml_object.name, **xml_object.attrs, "children": list()}
    raw_position = xml_object.get("position", "0.0;0.0")
    raw_size = xml_object.get("size", "0.0;0.0")

    # An empty attribute string falls back to the origin / zero size.
    position = [float(part) for part in raw_position.split(";")] if raw_position else [0.0, 0.0]
    size = [float(part) for part in raw_size.split(";")] if raw_size else [0.0, 0.0]

    x = _coerce_coordinate(position[0], width // 2)
    y = _coerce_coordinate(position[1], height // 2)

    # An infinite extent is cut at the screen edge. The sign test uses the
    # size itself on both axes; the x axis previously tested position[0].
    w = _coerce_coordinate(size[0], width // 2 - x)
    h = _coerce_coordinate(size[1], height // 2 - y)

    # pop() rather than del: the attributes may be absent (defaults above).
    dict_object.pop("position", None)
    dict_object.pop("size", None)

    dict_object["xyxy"] = [x, y, x + w, y + h]
    dict_object["xyxy_retina"] = [2 * x, 2 * y, 2 * (x + w), 2 * (y + h)]

    if dict_object["role"] == "AXWindow":
        dict_object["xyxy"] = [0, 0, width // 2, height // 2]
        dict_object["xyxy_retina"] = [0, 0, width, height]

    for child in xml_object.children:
        dict_object["children"].append(xml2dict(child, width, height))

    return dict_object
+
+
def simplify_tree(json_path: str, width, height):
    """Simplify the accessibility tree stored at ``json_path`` and save it.

    The tree is numbered, pruned through the XML conversion, converted back
    to dicts with pixel boxes, and written next to the input with the last
    five characters of the path (expected to be ``.json``) replaced by
    ``_simplified.json``.
    """
    with open(json_path, "r") as source:
        raw_tree = json.load(source)

    numbered_tree, _ = add_ids_to_json(raw_tree)
    root_element, _id_mapping = json_to_xml(numbered_tree)

    # convert to dict
    simplified_tree = xml2dict(root_element, width, height)

    # save
    output_path = f"{json_path[:-5]}_simplified.json"
    with open(output_path, "w") as sink:
        json.dump(simplified_tree, sink, indent=2)
+
+
# Accessibility roles known to the plotter; a role's position in this list
# selects its color. (The name shadows the stdlib ``types`` module but is kept
# because other code may import it.)
types = [
    "AXComboBox",
    "AXLink",
    "AXMenuBar",
    "AXPage",
    "AXHeading",
    "AXListMarker",
    "AXList",
    "AXOpaqueProviderGroup",
    "AXDateTimeArea",
    "AXSlider",
    "AXWindow",
    "AXDisclosureTriangle",
    "AXSheet",
    "AXMenu",
    "AXMenuButton",
    "No role",
    "AXCell",
    "AXColorWell",
    "AXTextField",
    "AXIncrementor",
    "AXScrollArea",
    "AXButton",
    "AXPopover",
    "AXColumn",
    "JavaAxIgnore",
    "AXRadioButton",
    "AXLevelIndicator",
    "AXMenuItem",
    "AXStaticText",
    "AXRadioGroup",
    "AXGroup",
    "AXScrollBar",
    "AXSplitGroup",
    "AXToolbar",
    "AXRuler",
    "AXProgressIndicator",
    "AXValueIndicator",
    "AXTabGroup",
    "AXGrowArea",
    "AXImage",
    "AXRow",
    "AXGenericElement",
    "AXWebArea",
    "AXCheckBox",
    "AXOutline",
    "AXGrid",
    "AXBrowser",
    "AXSplitter",
    "AXBusyIndicator",
    "AXUnknown",
    "AXTextArea",
    "AXPopUpButton",
    "AXTable",
]

# Drawing palette. NOTE: tuples are handed to OpenCV as (B, G, R). Several of
# the names below describe the tuple read as RGB, so the rendered color has
# its red and blue channels swapped relative to the comment.
bgr_colors = [
    (255, 0, 0),  # Blue
    (0, 255, 0),  # Green
    (0, 0, 255),  # Red
    (255, 255, 0),  # Yellow
    (0, 255, 255),  # Cyan
    (255, 0, 255),  # Magenta
    (128, 0, 0),  # Maroon
    (0, 128, 0),  # Dark Green
    (0, 0, 128),  # Navy
    (128, 128, 0),  # Olive
    (128, 0, 128),  # Purple
    (0, 128, 128),  # Teal
    (192, 192, 192),  # Silver
    (128, 128, 128),  # Gray
    (255, 165, 0),  # Orange
    (255, 192, 203),  # Pink
    (210, 105, 30),  # Chocolate
    (34, 139, 34),  # Forest Green
    (255, 215, 0),  # Gold
    (135, 206, 250),  # Sky Blue
]


def plot_json(img, json_object, save_path=None):
    """Draw a ground-truth tree onto ``img`` in place, optionally saving it.

    Args:
        img: image to draw on.
        json_object: node with ``role``, ``xyxy_retina`` and ``children``.
        save_path: when given, the image is written there after this node and
            all of its descendants have been drawn.
    """
    role = json_object["role"]

    # A role missing from ``types`` gets the slot after the last known role
    # instead of aborting the render with ValueError.
    try:
        role_index = types.index(role)
    except ValueError:
        role_index = len(types)
    color = bgr_colors[role_index % len(bgr_colors)]

    x1, y1, x2, y2 = json_object["xyxy_retina"]
    cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)

    label = role
    text_color = (255, 255, 255)

    (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)

    # Filled banner above the box, then the label on top of it.
    cv2.rectangle(img, (x1, y1 - 20), (x1 + w, y1), color, -1)
    cv2.putText(img, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, text_color, 1)

    # Children are drawn without save_path so the file is written only once.
    for child in json_object["children"]:
        plot_json(img, child)

    if save_path is not None:
        cv2.imwrite(save_path, img)
diff --git a/hierarchy_heuristics/utils.py b/hierarchy_heuristics/utils.py
new file mode 100644
index 0000000..a08b22c
--- /dev/null
+++ b/hierarchy_heuristics/utils.py
@@ -0,0 +1,785 @@
+from collections import defaultdict
+import os
+
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image
+from ocrmac import ocrmac
+from ultralytics import YOLO
+import gdown
+
+from box import *
+
# Make sure the weights directory exists, fetch any missing weight file from
# Google Drive, then load both detectors once at import time.
os.makedirs("models", exist_ok=True)

for _weights_path, _weights_url, _notice in (
    (
        "./models/ui_types_best_v4.pt",
        "https://drive.google.com/uc?id=1kFmzkda5k-88Lp59P4LTRSq1HDzyHyQ5",
        "Downloading element detection model...",
    ),
    (
        "./models/clickability.pt",
        "https://drive.google.com/uc?id=1Q0GPtEgqpFmXOlIVC4Z2wJAzjLToWCLH",
        "Downloading clickability model...",
    ),
):
    if os.path.exists(_weights_path):
        continue
    print(_notice)
    gdown.download(_weights_url, _weights_path, quiet=False)

model = YOLO("./models/ui_types_best_v4.pt")
clickability_model = YOLO("./models/clickability.pt")
+
+
def plot_app(image, app, curr_depth=0, depths=None):
    """Render a hierarchy as one image per nesting depth.

    Args:
        image: source image; each depth layer starts as a copy of it.
        app: a ``Group`` or an iterable of boxes/groups to draw.
        curr_depth: nesting depth of ``app`` (0 for the initial call).
        depths: list that receives the layer images. A new list is created
            when omitted; a list passed to a depth-0 call is cleared first.

    Returns:
        ``depths``; entry ``d`` shows all elements found at depth ``d``.
    """
    # The accumulator used to be a shared mutable default, so the list
    # returned by one call was wiped by the next call.
    if depths is None:
        depths = []

    if curr_depth == 0:
        depths.clear()

    if curr_depth >= len(depths):
        # First element seen at this depth: start a new layer.
        img = image.copy()
        depths.append(img)
    else:
        img = depths[curr_depth]

    if isinstance(app, Group):
        app = [app]

    for box in app:
        if isinstance(box, Group):
            # Pass the accumulator down explicitly so nested layers land in
            # the same list the caller receives.
            plot_app(image, box.children, curr_depth=curr_depth + 1, depths=depths)
            cv2.rectangle(img, box.top_left, box.bottom_right, (0, 255, 0), 2)
        else:
            cv2.rectangle(img, box.top_left, box.bottom_right, (255, 0, 0), 2)

        # add class name and rectangle behind it so it is more visible
        cv2.rectangle(img, (box.x1, box.y1), (box.x2, box.y1 - 20), (0, 0, 0), -1)
        cv2.putText(
            img,
            Box.id2class[box.cls],
            (box.x1, box.y1),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.9,
            (255, 255, 255),
            2,
        )

    return depths
+
+
def plot_all_boxes(image, app, depth = 0):
    """Draw ``app`` and all of its descendants onto ``image``.

    Each element is labelled with its nesting depth at its top-left corner;
    groups get one outline color and all other elements another.

    Returns:
        The image that was drawn on.
    """
    is_group = isinstance(app, Group)
    outline = (255, 0, 0) if is_group else (0, 0, 255)

    cv2.putText(image, str(depth), app.top_left, cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 0), 2)
    cv2.rectangle(image, app.top_left, app.bottom_right, outline, 2)

    if is_group:
        for member in app.children:
            image = plot_all_boxes(image, member, depth + 1)

    return image
+
+
+# ## Find texts, bind with boxes and merge
def ocr_image(img, conf_threshold=0.5) -> list[Text]:
    """Run OCR on ``img`` and return the recognized texts as ``Text`` objects.

    Only results whose confidence is strictly greater than ``conf_threshold``
    are kept. Recognition uses the ``en-US`` language preference and requests
    pixel coordinates (``px=True``).
    """
    recognizer = ocrmac.OCR(Image.fromarray(img), language_preference=["en-US"])
    detections = recognizer.recognize(px=True)

    confident_texts = []
    for text, conf, box in detections:
        if conf > conf_threshold:
            confident_texts.append(Text(box, text))

    return confident_texts
+
+
def bind_text_and_boxes(
    boxes: list[Box], text_boxes: list[Text], iou_threshold=0.5
) -> list[Box]:
    """Attach each OCR text to the detected box it overlaps most.

    A text is added to the ``text`` of the box with the highest IoU, provided
    that IoU is strictly greater than ``iou_threshold``. Otherwise the text
    becomes a new ``OCRText`` box appended to ``boxes``, where later texts can
    match it too.

    Returns:
        ``boxes`` (modified in place).
    """
    for text_box in text_boxes:
        best_iou, best_box = 0, None

        # Keep the first box with the strictly largest overlap.
        for candidate in boxes:
            overlap = text_box.iou(candidate)
            if overlap > best_iou:
                best_iou, best_box = overlap, candidate

        if best_iou > iou_threshold:
            best_box.text += text_box
            continue

        boxes.append(Box(text_box, Box.class2id["OCRText"], text=text_box))

    return boxes
+
+
def mergable(box1: UIBox, box2: UIBox) -> int:
    """Decide whether two boxes may be merged and which class the result gets.

    Rules, checked in order:
      1. equal classes merge into that class;
      2. a text box (OCRText / AXStaticText) merges into a text-bearing
         control and takes the control's class;
      3. a button and an image merge into a button.

    Returns:
        int: -1 if not mergable, class id if mergable
    """
    if box1.cls == box2.cls:
        return box1.cls

    class_id = Box.class2id

    text_like = (class_id["OCRText"], class_id["AXStaticText"])
    absorbs_text = tuple(
        class_id[name]
        for name in (
            "AXButton",
            "AXLink",
            "AXStaticText",
            "AXRadioButton",
            "AXCheckBox",
            "AXComboBox",
            "AXTextField",
            "AXHeading",
        )
    )

    # Try (box1 = text, box2 = control) first, then the mirrored pairing.
    for text_side, control_side in ((box1, box2), (box2, box1)):
        if text_side.cls in text_like and control_side.cls in absorbs_text:
            return control_side.cls

    # TODO: forbid. Make image a child of button
    button = class_id["AXButton"]
    for button_side, image_side in ((box1, box2), (box2, box1)):
        if button_side.cls == button and image_side.cls == class_id["AXImage"]:
            return button

    return -1
+
+
def merge_overlapping_boxes(boxes: list[UIBox], iou_threshold=0.5):
    """Merge overlapping boxes of compatible classes, in place.

    Boxes are sorted by ``y1``. Each box is compared with the boxes after it;
    at the first one that overlaps it on both axes, is class-compatible (see
    ``mergable``) and has IoU strictly above ``iou_threshold``, the earlier
    box is merged into the later one and removed from the list.

    Returns:
        ``boxes`` (the same list, modified).
    """
    # sort by y1
    boxes.sort(key=lambda box: box.y1)

    current = 0
    while current < len(boxes):
        absorbed = False

        for later in range(current + 1, len(boxes)):
            upper, lower = boxes[current], boxes[later]

            # x-overlap
            if not (upper.x1 < lower.x2 and lower.x1 < upper.x2):
                continue

            merge_cls = mergable(upper, lower)

            # y-overlap, compatible classes, and enough overlap
            if (
                upper.y2 > lower.y1
                and merge_cls != -1
                and upper.iou(lower) > iou_threshold
            ):
                lower.merge(upper, inplace=True, cls=merge_cls)
                del boxes[current]
                absorbed = True
                break

        # After a merge the next box has slid into ``current``; stay on it.
        if not absorbed:
            current += 1

    return boxes
+
+
def text_grouping(boxes: list[Box]) -> list[UIBox]:
    """Group vertically stacked texts into ``"text"`` groups, in place.

    https://docs-assets.developer.apple.com/ml-research/papers/screen-recognition-chi-2021.pdf

    We group a text T1 with a text T2 below it if they satisfy:
        1) they have x-overlap, and
        2) the y-distance between the two texts is less than a threshold;
        we choose min(T1.height, T2.height) plus a fixed slack of 15.

    Texts are chained: each candidate is compared with the member most
    recently added to the group. Grouped boxes are removed from ``boxes`` and
    the group is appended instead.

    Returns:
        ``boxes`` (the same list, modified).
    """
    text_boxes = [
        box
        for box in boxes
        if (hasattr(box, "text") and box.text)
        or (isinstance(box, Group) and box.group_type == "text")
    ]

    text_boxes.sort(key=lambda box: box.text.y1)

    for i, box1 in enumerate(text_boxes):
        # A box already placed in an earlier group must not seed another one:
        # that re-parented it, or reset its parent to None while it was still
        # listed as a child of the earlier group.
        if box1.parent:
            continue

        group = Group(box1.text.box, group_type="text")
        group.append(box1)

        for box2 in text_boxes[i + 1 :]:
            if box2.parent:
                continue

            # Compare against the text most recently added to the chain.
            bbox = group.children[-1].text

            x_overlap = (
                bbox.x1 < box2.text.x1 < bbox.x2
                or bbox.x1 < box2.text.x2 < bbox.x2
                or box2.text.x1 < bbox.x1 < box2.text.x2
                or box2.text.x1 < bbox.x2 < box2.text.x2
            )
            if not x_overlap:
                continue

            y_distance = bbox.y_distance(box2.text)
            y_threshold = min(bbox.height, box2.text.height) + 15

            if y_distance < y_threshold:
                if box2 in boxes:
                    boxes.remove(box2)

                group.merge_bboxes(box2.text, inplace=True)
                group.append(box2)

        if len(group.children) > 1:
            if box1 in boxes:
                boxes.remove(box1)

            group.finalize_bbox()
            boxes.append(group)

        else:
            # Nothing joined: undo the provisional parent link set by append().
            box1.parent = None

    return boxes
+
+
+# ## Create image groups (image + text)
+# Also with button
def get_overlap_percent(image: "BBox", text: "BBox", coordinate="x") -> float:
    """Return how much two boxes overlap along one axis, as a fraction.

    The overlap length is divided by the larger of the two extents on that
    axis. For any ``coordinate`` other than ``"x"`` the y axis is used, and
    the result is 0 when ``text`` is more than 10% taller than ``image``.
    Zero-size boxes yield 0 instead of raising ``ZeroDivisionError``.
    """
    if coordinate == "x":
        overlap = max(0, min(image.x2, text.x2) - max(image.x1, text.x1))
        longest = max(image.width, text.width)
    else:
        if text.height > image.height * 1.1:
            return 0

        overlap = max(0, min(image.y2, text.y2) - max(image.y1, text.y1))
        longest = max(image.height, text.height)

    if longest <= 0:
        return 0

    return overlap / longest
+
+
def merge_images_and_captions(
    boxes: list[UIBox],
    screen_shape: list[int, int],
    x_overlap_percent_threshold=0.25,
    y_distance_threshold=0.02,
    y_overlap_percent_threshold=0.4,
    x_distance_threshold=0.02,
) -> list[UIBox]:
    """Group images and controls with their nearby captions, in place.

    A text joins an element's group when either
      * it overlaps the group horizontally by more than
        ``x_overlap_percent_threshold`` and its top or bottom edge lies within
        ``y_distance_threshold * screen_height`` of the group's bottom edge, or
      * it overlaps the group vertically by more than
        ``y_overlap_percent_threshold`` and lies within
        ``x_distance_threshold * screen_width`` of it horizontally.

    Args:
        boxes: top-level elements; grouped members are removed and the new
            groups appended.
        screen_shape: ``(height, width)`` of the screenshot.

    Returns:
        ``boxes`` (the same list, modified).
    """
    screen_height, screen_width = screen_shape
    y_threshold = y_distance_threshold * screen_height
    x_threshold = x_distance_threshold * screen_width

    # mergable elements
    elements_with_caption = (
        Box.class2id["AXImage"],
        Box.class2id["AXButton"],
        Box.class2id["AXRadioButton"],
        Box.class2id["AXCheckBox"],
        Box.class2id["AXComboBox"],
    )

    def claimed(box):
        """True when ``box`` already belongs to a group other than a text group."""
        parent = box.parent
        return bool(
            parent
            and parent.cls == Box.class2id["Group"]
            and parent.group_type != "text"
        )

    def caption_below(group, text):
        """True when ``text`` is horizontally aligned with and vertically adjacent to ``group``."""
        if get_overlap_percent(group, text, coordinate="x") <= x_overlap_percent_threshold:
            return False
        # distance between image and text. image must be above text
        y_distance = min(abs(group.y2 - text.y1), abs(group.y2 - text.y2))
        return y_distance < y_threshold

    def label_beside(group, text):
        """True when ``text`` is vertically aligned with and horizontally adjacent to ``group``."""
        if get_overlap_percent(group, text, coordinate="y") <= y_overlap_percent_threshold:
            return False
        # image might be on the left or right of text
        return group.x_distance(text) < x_threshold

    images = [box for box in boxes if box.cls in elements_with_caption]
    text_boxes = [
        box
        for box in boxes
        if (hasattr(box, "text") and box.text)
        or (isinstance(box, Group) and box.group_type == "text")
    ]

    images.sort(key=lambda box: box.y1, reverse=True)
    text_boxes.sort(key=lambda box: box.y1, reverse=True)

    for image in images:
        # An element already attached as another element's caption must not
        # seed its own group; that duplicated it or reset its parent to None.
        if claimed(image):
            continue

        group = Group(
            image.box, children=[image], group_type=Box.id2class[image.cls][2:]
        )

        for text in text_boxes:
            # A control that carries text is in both lists; without this
            # check it was attached to its own group as a caption.
            if text is image:
                continue

            if claimed(text):
                continue

            # One combined test so a text matching both rules is attached
            # once, not once per rule.
            if caption_below(group, text) or label_beside(group, text):
                if text in boxes:
                    boxes.remove(text)

                group.merge_bboxes(text, inplace=True)
                group.append(text)  # also sets text.parent

        if len(group.children) > 1:
            boxes.append(group)
            image.parent = group

            if image in boxes:
                boxes.remove(image)

        else:
            image.parent = None

    return boxes
+
+
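+# ## Group by columns
+# Candidate criteria for putting elements into the same column:
+# 1) they are close to each other along the y axis
+# 2) they have approximately the same x1 (maybe x2 too?)
+# 3) they have approximately the same height?
+# 4) they have the same type?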
+#
def group_by_column(
    boxes: list[UIBox],
    y_distance_coefficient=1.25,
    width_threshold=40,
    max_width_coefficient=0.55,
) -> list[UIBox]:
    """Collect vertically adjacent, edge-aligned boxes into ``"column"`` groups, in place.

    Each box in turn seeds a column. Another box joins when
      * its y-distance to the column is below ``y_distance_coefficient`` times
        the smaller of the two heights,
      * its left or right edge is within ``width_threshold`` of the column's, and
      * the narrower of the two is wider than ``max_width_coefficient`` times
        the wider one.

    Members are removed from ``boxes`` and finished columns are appended, so a
    column can itself be picked up by a later pass of the loop.

    Returns:
        ``boxes`` (the same list, modified).
    """
    boxes.sort(key=lambda box: box.x1)

    seed_idx = 0
    while seed_idx < len(boxes):
        seed = boxes[seed_idx]
        column = Group(seed.box, children=[seed], group_type="column")

        probe_idx = 0
        while probe_idx < len(boxes):
            if probe_idx == seed_idx:
                probe_idx += 1
                continue

            candidate = boxes[probe_idx]
            shorter = min(column.height, candidate.height)

            if column.y_distance(candidate) < y_distance_coefficient * shorter:
                wider = max(column.width, candidate.width)
                narrower = min(column.width, candidate.width)

                edge_aligned = (
                    abs(column.x1 - candidate.x1) < width_threshold
                    or abs(column.x2 - candidate.x2) < width_threshold
                )

                if edge_aligned and wider * max_width_coefficient < narrower:
                    if candidate in boxes:
                        # Step back so the increment below lands on the
                        # element that slides into this slot.
                        probe_idx -= 1
                        boxes.remove(candidate)

                    # The removed element sat before the seed: follow the shift.
                    if seed_idx > probe_idx:
                        seed_idx -= 1

                    column.merge_bboxes(candidate, inplace=True)
                    column.append(candidate)

            probe_idx += 1

        if len(column.children) > 1:
            if seed in boxes:
                boxes.remove(seed)
                seed_idx -= 1

            seed.parent = column
            boxes.append(column)

        else:
            seed.parent = None

        seed_idx += 1

    return boxes
+
+
+# create histogram of colors in image
+
+
def color_histogram(img, threshold):
    """Bucket the values of ``img`` into clusters of similar intensity.

    Distinct values are scanned in ascending order. The smallest remaining
    value becomes a cluster's base color and absorbs every remaining value
    within ``threshold`` of it.

    Args:
        img: array of pixel values (flattened by ``np.unique``).
        threshold: maximum distance from the base color inside one cluster.

    Returns:
        ``(colors, areas, total)``: base colors and their pixel counts, both
        ordered by decreasing count, and the sum of all counts.
    """
    colors, areas = np.unique(img, return_counts=True)

    merged_colors = []
    merged_areas = []

    while colors.size > 0:
        base_color = colors[0]

        # Compare in float so unsigned pixel types cannot wrap around.
        similar = np.abs(colors.astype(np.float64) - float(base_color)) <= threshold
        # The base always belongs to its own cluster; this also guarantees
        # progress when ``threshold`` is negative.
        similar[0] = True

        # The mask already includes the base color; adding areas[0] on top,
        # as before, counted the base color's pixels twice.
        merged_colors.append(base_color)
        merged_areas.append(areas[similar].sum())

        # Remove merged colors from the list
        colors = colors[~similar]
        areas = areas[~similar]

    # Sort colors by area in descending order
    order = np.argsort(merged_areas)[::-1]
    sorted_colors = [merged_colors[i] for i in order]
    sorted_areas = [merged_areas[i] for i in order]

    return sorted_colors, sorted_areas, sum(sorted_areas)
+
+
def get_mask(img, threshold=2, area_threshold=0.01):
    """Yield a binary mask for each dominant color of ``img``.

    Colors come from ``color_histogram`` in order of decreasing area.
    Iteration stops at the first color whose histogram share, or whose actual
    mask coverage, is below ``area_threshold``.

    Yields:
        ``(mask, percentage)``: a ``uint8`` mask (255 where the pixel is
        within ``threshold`` of the color, else 0) and the fraction of
        ``img`` it covers.
    """
    palette, palette_areas, palette_total = color_histogram(img, threshold)

    for color, area in zip(palette, palette_areas):
        if area / palette_total < area_threshold:
            break

        # int16 so that subtracting/adding the threshold cannot wrap around;
        # the window is then clamped to the valid 0..255 range.
        window_low = np.int16([color]) - threshold
        window_high = np.int16([color]) + threshold
        window_low = np.where(window_low < 0, 0, window_low)
        window_high = np.where(window_high > 255, 255, window_high)

        selected = (img >= window_low) & (img <= window_high)

        percentage = np.sum(np.sum(selected)) / img.size
        if percentage < area_threshold:
            break

        yield selected.astype(np.uint8) * 255, percentage
+
+
def add_color_groups(
    img,
    boxes: list[UIBox],
    verbose=0,
    min_box_threshold=0.03,
    max_box_threshold=0.95,
    color_diff_threshold=2,
    area_threshold=0.01,
):
    """Group boxes that sit on the same uniformly colored region, in place.

    Regions are found per dominant color (see ``get_mask``): the mask is
    opened with a 10x10 kernel and the bounding rectangle of every external
    contour whose area lies strictly between ``min_box_threshold`` and
    ``max_box_threshold`` of ``img.size`` becomes a ``"color"`` group. Boxes
    without a parent that lie inside a region (margin 10) move into the
    smallest such region; regions containing other regions are then nested.

    Args:
        img: image to analyse — presumably single-channel, since
            ``img.size`` is used as the pixel count; confirm against caller.
        boxes: top-level elements; modified in place.
        verbose: 0 silent, >=1 prints progress, >=2 plots accepted regions,
            >=3 plots every opened mask.

    Returns:
        ``boxes`` (the same list, modified).
    """
    min_area = min_box_threshold * img.size
    max_area = max_box_threshold * img.size

    kernel = np.ones((10, 10), np.uint8)

    color_groups: list[Group] = []

    for mask, percentage in get_mask(img, color_diff_threshold, area_threshold):
        if verbose >= 1:
            print(f"Percentage: {percentage * 100:.2f}%")

        original_mask = mask.copy()

        # Opening removes specks and thin strokes such as text.
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)

        if verbose >= 3:
            mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2RGB)
            plt.imshow(mask)
            plt.show()
            mask = cv2.cvtColor(mask, cv2.COLOR_RGB2GRAY)

        # [-2] picks the contour list whether findContours returns 2 or 3 values.
        contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[
            -2
        ]
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            box_area = w * h

            if not (min_area < box_area < max_area):
                continue

            group = Group((x, y, x + w, y + h), group_type="color")
            color_groups.append(group)

            if verbose >= 1:
                print(f" - Box area: {box_area / img.size * 100:.5f}%")

            if verbose >= 2:
                mask_copy = original_mask.copy()
                mask_copy = cv2.cvtColor(mask_copy, cv2.COLOR_GRAY2BGR)
                cv2.rectangle(mask_copy, (x, y), (x + w, y + h), (255, 0, 0), 2)
                plt.imshow(mask_copy)
                plt.show()

        if verbose >= 1:
            print("End")

    # Smallest regions first, so a box lands in the tightest region around it.
    color_groups.sort(key=lambda group: group.area)

    # add boxes to color groups
    for color_group in color_groups:
        # Iterate over a snapshot because ``boxes`` shrinks as boxes are
        # claimed. The previous index loop hit ``continue`` without advancing
        # and spun forever on the first box that already had a parent.
        for box in list(boxes):
            if box.parent:
                continue

            # box is inside color box
            if color_group.other_is_inside(box, margin=10):
                color_group.append(box)
                boxes.remove(box)

    color_groups = [group for group in color_groups if group.children]

    # merge color groups: nest every region that lies inside another one
    i = 0
    while i < len(color_groups):
        outer = color_groups[i]
        j = 0

        while j < len(color_groups):
            if i == j:
                j += 1
                continue

            inner = color_groups[j]

            if outer.other_is_inside(inner):
                outer.append(inner)
                color_groups.remove(inner)
                j -= 1

                # The removed region sat before ``outer``: follow the shift.
                if i > j:
                    i -= 1

            j += 1

        i += 1

    boxes += color_groups

    return boxes
+
+
+# ## Group by row
+def group_by_row(app: Group, y_distance_threshold=20, height_threshold=0.8) -> Group:
+ app.children.sort(key=lambda group: group.x1)
+
+ for child in app.children:
+ if isinstance(child, Group):
+ group_by_row(child, y_distance_threshold, height_threshold)
+
+ i = 0
+ while i < len(app.children):
+ box1 = app.children[i]
+
+ row_group = Group(box1.box, group_type="row")
+ prev_parent = box1.parent
+ row_group.append(box1)
+
+ j = 0
+ while j < len(app.children):
+ box2 = app.children[j]
+
+ if box1 is box2:
+ j += 1
+ continue
+
+ # if they have +- same y1 or y2 coordinate
+ if (
+ abs(box2.y1 - row_group.y1) <= y_distance_threshold
+ or abs(box2.y2 - row_group.y2) <= y_distance_threshold
+ ):
+ # if they have +- same height
+ min_height = min(row_group.height, box2.height)
+ max_height = max(row_group.height, box2.height)
+
+ if min_height / max_height > height_threshold:
+ if box2 in app.children:
+ app.children.remove(box2)
+ j -= 1
+
+ if i > j:
+ i -= 1
+
+ row_group.merge_bboxes(box2, inplace=True)
+ row_group.append(box2)
+
+ j += 1
+
+ if len(row_group.children) > 1:
+ app.children[i] = row_group
+ else:
+ box1.parent = prev_parent
+
+ i += 1
+
+ return app
+
+
+def add_clickability(image, boxes, verbose, conf=0.3):
+ clickability_pred = clickability_model(image, verbose=verbose, conf=conf)[0]
+ clickability_boxes = zip(clickability_pred.boxes.xyxy, clickability_pred.boxes.cls)
+ clickability_boxes = [(BBox(box), cls) for box, cls in clickability_boxes]
+
+ for box in boxes:
+ if box.cls in (
+ Box.class2id["AXStaticText"],
+ Box.class2id["OCRText"],
+ Box.class2id["AXImage"],
+ ):
+ max_iou = 0
+ max_elem = None
+
+ for bbox, cls in clickability_boxes:
+ curr_iou = box.iou(bbox)
+
+ if curr_iou > 0.1 and curr_iou > max_iou:
+ max_iou = curr_iou
+ max_elem = (bbox, cls)
+
+ if max_elem and max_elem[1] == 1:
+ box.cls = Box.class2id["AXButton"]
+ box.merge_bboxes(max_elem[0], inplace=True)
+
+ return boxes
+
+
+def group_elements(img: np.ndarray, **kwargs) -> Group:
+ verbose = kwargs.get("verbose", False)
+ preds = model(img, conf=kwargs.get("ui_model_conf", 0.3), verbose=verbose)[0]
+
+ img = cv2.cvtColor(preds.orig_img, cv2.COLOR_BGR2RGB)
+ img_gray = cv2.cvtColor(img.copy(), cv2.COLOR_BGR2GRAY)
+
+ boxes, classes = (
+ preds.boxes.xyxy,
+ preds.boxes.cls,
+ )
+ boxes = [Box(box, cls) for box, cls in zip(boxes, classes)]
+
+ # use OCR to get more boxes
+ ocr_text = ocr_image(img, conf_threshold=kwargs.get("ocr_conf_threshold", 0.3))
+
+ boxes = bind_text_and_boxes(
+ boxes,
+ ocr_text,
+ iou_threshold=kwargs.get("bind_text_and_boxes_iou_threshold", 0.05),
+ )
+
+ # # delete boxes with iou > threshold
+ boxes = merge_overlapping_boxes(
+ boxes, iou_threshold=kwargs.get("merge_overlapping_boxes_iou_threshold", 0.3)
+ )
+
+ # add clickability
+ boxes = add_clickability(
+ img, boxes, verbose, conf=kwargs.get("clickability_model_conf", 0.3)
+ )
+
+ # # text grouping
+ boxes = text_grouping(boxes)
+
+ boxes = merge_images_and_captions(
+ boxes,
+ img.shape[:2],
+ x_overlap_percent_threshold=kwargs.get(
+ "merge_images_and_captions_x_overlap_percent_threshold", 0.25
+ ),
+ y_distance_threshold=kwargs.get(
+ "merge_images_and_captions_y_distance_threshold", 0.02
+ ),
+ y_overlap_percent_threshold=kwargs.get(
+ "merge_images_and_captions_y_overlap_percent_threshold", 0.4
+ ),
+ x_distance_threshold=kwargs.get(
+ "merge_images_and_captions_x_distance_threshold", 0.02
+ ),
+ )
+
+ boxes = group_by_column(
+ boxes,
+ y_distance_coefficient=kwargs.get(
+ "group_by_column_y_distance_coefficient", 1.25
+ ),
+ width_threshold=kwargs.get("group_by_column_width_threshold", 40),
+ max_width_coefficient=kwargs.get("group_by_column_max_width_coefficient", 0.55),
+ )
+
+ # # color groups
+ boxes = add_color_groups(
+ img_gray,
+ boxes,
+ verbose=0,
+ min_box_threshold=kwargs.get("add_color_groups_min_box_threshold", 0.03),
+ max_box_threshold=kwargs.get("add_color_groups_max_box_threshold", 0.95),
+ color_diff_threshold=kwargs.get("add_color_groups_color_diff_threshold", 2),
+ area_threshold=kwargs.get("add_color_groups_area_threshold", 0.01),
+ )
+
+ app = Group((0, 0, img.shape[1], img.shape[0]), children=boxes, group_type="Window")
+
+ app = group_by_row(
+ app,
+ y_distance_threshold=kwargs.get("group_by_row_y_distance_threshold", 10),
+ height_threshold=kwargs.get("group_by_row_height_threshold", 0.8),
+ )
+
+ if verbose:
+ depths = plot_app(img, app)
+
+ for i, img_plot in enumerate(depths):
+ print(f"Depth: {i}")
+ plt.figure(figsize=(15, 15))
+ plt.imshow(img_plot)
+ plt.show()
+
+ return app
diff --git a/output_visualisation/example_data/custom_acc.json b/output_visualisation/example_data/custom_acc.json
new file mode 100644
index 0000000..37ff305
--- /dev/null
+++ b/output_visualisation/example_data/custom_acc.json
@@ -0,0 +1,2915 @@
+{
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 0,
+ 0,
+ 3456,
+ 1976
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 173,
+ 14,
+ 261,
+ 82
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "go to news | news | news",
+ "box": [
+ 171,
+ 0,
+ 261,
+ 101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3101,
+ 13,
+ 3180,
+ 89
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "add to cart | go to cart | go to cart",
+ "box": [
+ 3097,
+ 0,
+ 3185,
+ 101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2998,
+ 8,
+ 3091,
+ 91
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "edit | edit text | edit text",
+ "box": [
+ 3006,
+ 0,
+ 3092,
+ 101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3187,
+ 12,
+ 3287,
+ 92
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "click to save | save | save the article",
+ "box": [
+ 3190,
+ 0,
+ 3275,
+ 100
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3275,
+ 15,
+ 3398,
+ 89
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "delete | delete the file | delete the file",
+ "box": [
+ 3276,
+ 0,
+ 3361,
+ 100
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "save | save the image | save the image",
+ "box": [
+ 3362,
+ 1,
+ 3447,
+ 98
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3187,
+ 12,
+ 3287,
+ 92
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "click to save | save | save the article",
+ "box": [
+ 3190,
+ 0,
+ 3275,
+ 100
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 5,
+ 98,
+ 261,
+ 140
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Al Pro",
+ "box": [
+ 25,
+ 110,
+ 95,
+ 135
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 13,
+ 140,
+ 260,
+ 195
+ ],
+ "children": [
+ {
+ "cls": "AXImage",
+ "value": "(AI",
+ "box": [
+ 35,
+ 153,
+ 70,
+ 185
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Chats",
+ "box": [
+ 80,
+ 160,
+ 155,
+ 185
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXImage",
+ "value": "add a | add to chat | send message",
+ "box": [
+ 32,
+ 150,
+ 67,
+ 190
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 4,
+ 252,
+ 260,
+ 308
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "\uf8ff Referral",
+ "box": [
+ 35,
+ 264,
+ 178,
+ 296
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXImage",
+ "value": "add to cart | add to cart | go to cart",
+ "box": [
+ 33,
+ 264,
+ 66,
+ 296
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 7,
+ 195,
+ 260,
+ 252
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "Help",
+ "box": [
+ 70,
+ 208,
+ 140,
+ 240
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXImage",
+ "value": "go to search | search",
+ "box": [
+ 34,
+ 208,
+ 66,
+ 240
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2,
+ 308,
+ 258,
+ 365
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "M About",
+ "box": [
+ 35,
+ 321,
+ 160,
+ 352
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 288,
+ 271,
+ 823,
+ 320
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "New Chat",
+ "box": [
+ 311,
+ 284,
+ 437,
+ 312
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Now",
+ "box": [
+ 758,
+ 285,
+ 813,
+ 305
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 287,
+ 318,
+ 820,
+ 371
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "New Chat",
+ "box": [
+ 311,
+ 330,
+ 437,
+ 356
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "11:48",
+ "box": [
+ 753,
+ 331,
+ 813,
+ 356
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 288,
+ 271,
+ 823,
+ 320
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "New Chat",
+ "box": [
+ 311,
+ 284,
+ 437,
+ 312
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Now",
+ "box": [
+ 758,
+ 285,
+ 813,
+ 305
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3022,
+ 847,
+ 3443,
+ 924
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Guide",
+ "box": [
+ 3099,
+ 867,
+ 3214,
+ 912
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 283,
+ 208,
+ 825,
+ 269
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "\u2022 History",
+ "box": [
+ 316,
+ 235,
+ 442,
+ 260
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2829,
+ 969,
+ 3443,
+ 1093
+ ],
+ "children": [
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 2822,
+ 967,
+ 3431,
+ 1101
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Set up a budget\nfor a student.",
+ "box": [
+ 2863,
+ 997,
+ 3064,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to the home page | go to the home page | go to home page",
+ "box": [
+ 2829,
+ 960,
+ 3410,
+ 1101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1589,
+ 966,
+ 2198,
+ 1094
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Start conversations\nconfidently at events.",
+ "box": [
+ 1632,
+ 997,
+ 1898,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to home page | start conversations | start conversations",
+ "box": [
+ 1588,
+ 964,
+ 2201,
+ 1099
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2204,
+ 965,
+ 2820,
+ 1095
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Organize vegetarian meals\nfor a busy student.",
+ "box": [
+ 2245,
+ 997,
+ 2576,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to home page | open the homepage",
+ "box": [
+ 2208,
+ 962,
+ 2816,
+ 1100
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2211,
+ 958,
+ 3402,
+ 1097
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2829,
+ 969,
+ 3443,
+ 1093
+ ],
+ "children": [
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 2822,
+ 967,
+ 3431,
+ 1101
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Set up a budget\nfor a student.",
+ "box": [
+ 2863,
+ 997,
+ 3064,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to the home page | go to the home page | go to home page",
+ "box": [
+ 2829,
+ 960,
+ 3410,
+ 1101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2204,
+ 965,
+ 2820,
+ 1095
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Organize vegetarian meals\nfor a busy student.",
+ "box": [
+ 2245,
+ 997,
+ 2576,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to home page | open the homepage",
+ "box": [
+ 2208,
+ 962,
+ 2816,
+ 1100
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1661,
+ 842,
+ 3456,
+ 951
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Help",
+ "box": [
+ 1883,
+ 867,
+ 1974,
+ 917
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Plan",
+ "box": [
+ 2501,
+ 872,
+ 2581,
+ 907
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3022,
+ 847,
+ 3443,
+ 924
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Guide",
+ "box": [
+ 3099,
+ 867,
+ 3214,
+ 912
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1575,
+ 952,
+ 3189,
+ 1100
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1589,
+ 966,
+ 2198,
+ 1094
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Start conversations\nconfidently at events.",
+ "box": [
+ 1632,
+ 997,
+ 1898,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to home page | start conversations | start conversations",
+ "box": [
+ 1588,
+ 964,
+ 2201,
+ 1099
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2211,
+ 958,
+ 3402,
+ 1097
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2829,
+ 969,
+ 3443,
+ 1093
+ ],
+ "children": [
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 2822,
+ 967,
+ 3431,
+ 1101
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Set up a budget\nfor a student.",
+ "box": [
+ 2863,
+ 997,
+ 3064,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to the home page | go to the home page | go to home page",
+ "box": [
+ 2829,
+ 960,
+ 3410,
+ 1101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2204,
+ 965,
+ 2820,
+ 1095
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Organize vegetarian meals\nfor a busy student.",
+ "box": [
+ 2245,
+ 997,
+ 2576,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to home page | open the homepage",
+ "box": [
+ 2208,
+ 962,
+ 2816,
+ 1100
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 835,
+ 844,
+ 3164,
+ 947
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "2/ Write",
+ "box": [
+ 1179,
+ 865,
+ 1371,
+ 920
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1661,
+ 842,
+ 3456,
+ 951
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Help",
+ "box": [
+ 1883,
+ 867,
+ 1974,
+ 917
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Plan",
+ "box": [
+ 2501,
+ 872,
+ 2581,
+ 907
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3022,
+ 847,
+ 3443,
+ 924
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Guide",
+ "box": [
+ 3099,
+ 867,
+ 3214,
+ 912
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 762,
+ 0,
+ 3214,
+ 103
+ ],
+ "children": [
+ {
+ "cls": "AXImage",
+ "value": "+",
+ "box": [
+ 766,
+ 26,
+ 815,
+ 75
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "GPT-40 v",
+ "box": [
+ 884,
+ 40,
+ 1019,
+ 65
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to the previous page | go to the previous page | go to the previous page",
+ "box": [
+ 853,
+ 18,
+ 1035,
+ 87
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3101,
+ 13,
+ 3180,
+ 89
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "add to cart | go to cart | go to cart",
+ "box": [
+ 3097,
+ 0,
+ 3185,
+ 101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2998,
+ 8,
+ 3091,
+ 91
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "edit | edit text | edit text",
+ "box": [
+ 3006,
+ 0,
+ 3092,
+ 101
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 792,
+ 1809,
+ 3315,
+ 1961
+ ],
+ "children": [
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 872,
+ 1786,
+ 1900,
+ 1818
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 874,
+ 1787,
+ 1502,
+ 1819
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Message",
+ "box": [
+ 894,
+ 1900,
+ 1014,
+ 1930
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 0,
+ 107,
+ 275,
+ 1944
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "v2.0.5 (78)",
+ "box": [
+ 30,
+ 1910,
+ 145,
+ 1936
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to settings | settings",
+ "box": [
+ 190,
+ 1901,
+ 267,
+ 1960
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to previous | select the number | select number",
+ "box": [
+ 0,
+ 1898,
+ 181,
+ 1964
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 5,
+ 98,
+ 261,
+ 140
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Al Pro",
+ "box": [
+ 25,
+ 110,
+ 95,
+ 135
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 13,
+ 140,
+ 260,
+ 195
+ ],
+ "children": [
+ {
+ "cls": "AXImage",
+ "value": "(AI",
+ "box": [
+ 35,
+ 153,
+ 70,
+ 185
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Chats",
+ "box": [
+ 80,
+ 160,
+ 155,
+ 185
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXImage",
+ "value": "add a | add to chat | send message",
+ "box": [
+ 32,
+ 150,
+ 67,
+ 190
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 4,
+ 252,
+ 260,
+ 308
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "\uf8ff Referral",
+ "box": [
+ 35,
+ 264,
+ 178,
+ 296
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXImage",
+ "value": "add to cart | add to cart | go to cart",
+ "box": [
+ 33,
+ 264,
+ 66,
+ 296
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 7,
+ 195,
+ 260,
+ 252
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "Help",
+ "box": [
+ 70,
+ 208,
+ 140,
+ 240
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXImage",
+ "value": "go to search | search",
+ "box": [
+ 34,
+ 208,
+ 66,
+ 240
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2,
+ 308,
+ 258,
+ 365
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "M About",
+ "box": [
+ 35,
+ 321,
+ 160,
+ 352
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 277,
+ 204,
+ 842,
+ 1873
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 287,
+ 318,
+ 820,
+ 371
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "New Chat",
+ "box": [
+ 311,
+ 330,
+ 437,
+ 356
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "11:48",
+ "box": [
+ 753,
+ 331,
+ 813,
+ 356
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 288,
+ 271,
+ 823,
+ 320
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "New Chat",
+ "box": [
+ 311,
+ 284,
+ 437,
+ 312
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Now",
+ "box": [
+ 758,
+ 285,
+ 813,
+ 305
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 283,
+ 208,
+ 825,
+ 269
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "\u2022 History",
+ "box": [
+ 316,
+ 235,
+ 442,
+ 260
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 835,
+ 844,
+ 3164,
+ 947
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "2/ Write",
+ "box": [
+ 1179,
+ 865,
+ 1371,
+ 920
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1661,
+ 842,
+ 3456,
+ 951
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Help",
+ "box": [
+ 1883,
+ 867,
+ 1974,
+ 917
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Plan",
+ "box": [
+ 2501,
+ 872,
+ 2581,
+ 907
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3022,
+ 847,
+ 3443,
+ 924
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Guide",
+ "box": [
+ 3099,
+ 867,
+ 3214,
+ 912
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 792,
+ 1809,
+ 3315,
+ 1961
+ ],
+ "children": [
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 872,
+ 1786,
+ 1900,
+ 1818
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 874,
+ 1787,
+ 1502,
+ 1819
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Message",
+ "box": [
+ 894,
+ 1900,
+ 1014,
+ 1930
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 850,
+ 0,
+ 3451,
+ 864
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3275,
+ 15,
+ 3398,
+ 89
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "delete | delete the file | delete the file",
+ "box": [
+ 3276,
+ 0,
+ 3361,
+ 100
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "save | save the image | save the image",
+ "box": [
+ 3362,
+ 1,
+ 3447,
+ 98
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3187,
+ 12,
+ 3287,
+ 92
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "click to save | save | save the article",
+ "box": [
+ 3190,
+ 0,
+ 3275,
+ 100
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 762,
+ 0,
+ 3214,
+ 103
+ ],
+ "children": [
+ {
+ "cls": "AXImage",
+ "value": "+",
+ "box": [
+ 766,
+ 26,
+ 815,
+ 75
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "GPT-40 v",
+ "box": [
+ 884,
+ 40,
+ 1019,
+ 65
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to the previous page | go to the previous page | go to the previous page",
+ "box": [
+ 853,
+ 18,
+ 1035,
+ 87
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3101,
+ 13,
+ 3180,
+ 89
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "add to cart | go to cart | go to cart",
+ "box": [
+ 3097,
+ 0,
+ 3185,
+ 101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2998,
+ 8,
+ 3091,
+ 91
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "edit | edit text | edit text",
+ "box": [
+ 3006,
+ 0,
+ 3092,
+ 101
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 833,
+ 960,
+ 3456,
+ 1967
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Pen a dog's diary\nfor a day.",
+ "box": [
+ 1014,
+ 997,
+ 1230,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to previous | go to previous page | previous page",
+ "box": [
+ 3372,
+ 1886,
+ 3431,
+ 1946
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1575,
+ 952,
+ 3189,
+ 1100
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1589,
+ 966,
+ 2198,
+ 1094
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Start conversations\nconfidently at events.",
+ "box": [
+ 1632,
+ 997,
+ 1898,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to home page | start conversations | start conversations",
+ "box": [
+ 1588,
+ 964,
+ 2201,
+ 1099
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2211,
+ 958,
+ 3402,
+ 1097
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2829,
+ 969,
+ 3443,
+ 1093
+ ],
+ "children": [
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 2822,
+ 967,
+ 3431,
+ 1101
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Set up a budget\nfor a student.",
+ "box": [
+ 2863,
+ 997,
+ 3064,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to the home page | go to the home page | go to home page",
+ "box": [
+ 2829,
+ 960,
+ 3410,
+ 1101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2204,
+ 965,
+ 2820,
+ 1095
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Organize vegetarian meals\nfor a busy student.",
+ "box": [
+ 2245,
+ 997,
+ 2576,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to home page | open the homepage",
+ "box": [
+ 2208,
+ 962,
+ 2816,
+ 1100
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 277,
+ 204,
+ 842,
+ 1873
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 287,
+ 318,
+ 820,
+ 371
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "New Chat",
+ "box": [
+ 311,
+ 330,
+ 437,
+ 356
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "11:48",
+ "box": [
+ 753,
+ 331,
+ 813,
+ 356
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 288,
+ 271,
+ 823,
+ 320
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "New Chat",
+ "box": [
+ 311,
+ 284,
+ 437,
+ 312
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Now",
+ "box": [
+ 758,
+ 285,
+ 813,
+ 305
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 283,
+ 208,
+ 825,
+ 269
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "\u2022 History",
+ "box": [
+ 316,
+ 235,
+ 442,
+ 260
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 835,
+ 844,
+ 3164,
+ 947
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "2/ Write",
+ "box": [
+ 1179,
+ 865,
+ 1371,
+ 920
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1661,
+ 842,
+ 3456,
+ 951
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Help",
+ "box": [
+ 1883,
+ 867,
+ 1974,
+ 917
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Plan",
+ "box": [
+ 2501,
+ 872,
+ 2581,
+ 907
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3022,
+ 847,
+ 3443,
+ 924
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Guide",
+ "box": [
+ 3099,
+ 867,
+ 3214,
+ 912
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 792,
+ 1809,
+ 3315,
+ 1961
+ ],
+ "children": [
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 872,
+ 1786,
+ 1900,
+ 1818
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 874,
+ 1787,
+ 1502,
+ 1819
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Message",
+ "box": [
+ 894,
+ 1900,
+ 1014,
+ 1930
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 0,
+ 19,
+ 3438,
+ 1976
+ ],
+ "children": [
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 299,
+ 121,
+ 820,
+ 185
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Chats",
+ "box": [
+ 306,
+ 39,
+ 396,
+ 65
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Q Search",
+ "box": [
+ 311,
+ 139,
+ 447,
+ 171
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to the next page | go to the next page",
+ "box": [
+ 34,
+ 34,
+ 66,
+ 66
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 173,
+ 14,
+ 261,
+ 82
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "go to news | news | news",
+ "box": [
+ 171,
+ 0,
+ 261,
+ 101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 0,
+ 107,
+ 275,
+ 1944
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "v2.0.5 (78)",
+ "box": [
+ 30,
+ 1910,
+ 145,
+ 1936
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to settings | settings",
+ "box": [
+ 190,
+ 1901,
+ 267,
+ 1960
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to previous | select the number | select number",
+ "box": [
+ 0,
+ 1898,
+ 181,
+ 1964
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 5,
+ 98,
+ 261,
+ 140
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Al Pro",
+ "box": [
+ 25,
+ 110,
+ 95,
+ 135
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 13,
+ 140,
+ 260,
+ 195
+ ],
+ "children": [
+ {
+ "cls": "AXImage",
+ "value": "(AI",
+ "box": [
+ 35,
+ 153,
+ 70,
+ 185
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Chats",
+ "box": [
+ 80,
+ 160,
+ 155,
+ 185
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXImage",
+ "value": "add a | add to chat | send message",
+ "box": [
+ 32,
+ 150,
+ 67,
+ 190
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 4,
+ 252,
+ 260,
+ 308
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "\uf8ff Referral",
+ "box": [
+ 35,
+ 264,
+ 178,
+ 296
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXImage",
+ "value": "add to cart | add to cart | go to cart",
+ "box": [
+ 33,
+ 264,
+ 66,
+ 296
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 7,
+ 195,
+ 260,
+ 252
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "Help",
+ "box": [
+ 70,
+ 208,
+ 140,
+ 240
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXImage",
+ "value": "go to search | search",
+ "box": [
+ 34,
+ 208,
+ 66,
+ 240
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2,
+ 308,
+ 258,
+ 365
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "M About",
+ "box": [
+ 35,
+ 321,
+ 160,
+ 352
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 850,
+ 0,
+ 3451,
+ 864
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3275,
+ 15,
+ 3398,
+ 89
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "delete | delete the file | delete the file",
+ "box": [
+ 3276,
+ 0,
+ 3361,
+ 100
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "save | save the image | save the image",
+ "box": [
+ 3362,
+ 1,
+ 3447,
+ 98
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3187,
+ 12,
+ 3287,
+ 92
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "click to save | save | save the article",
+ "box": [
+ 3190,
+ 0,
+ 3275,
+ 100
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 762,
+ 0,
+ 3214,
+ 103
+ ],
+ "children": [
+ {
+ "cls": "AXImage",
+ "value": "+",
+ "box": [
+ 766,
+ 26,
+ 815,
+ 75
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "GPT-40 v",
+ "box": [
+ 884,
+ 40,
+ 1019,
+ 65
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to the previous page | go to the previous page | go to the previous page",
+ "box": [
+ 853,
+ 18,
+ 1035,
+ 87
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3101,
+ 13,
+ 3180,
+ 89
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "add to cart | go to cart | go to cart",
+ "box": [
+ 3097,
+ 0,
+ 3185,
+ 101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2998,
+ 8,
+ 3091,
+ 91
+ ],
+ "children": [
+ {
+ "cls": "AXButton",
+ "value": "edit | edit text | edit text",
+ "box": [
+ 3006,
+ 0,
+ 3092,
+ 101
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 833,
+ 960,
+ 3456,
+ 1967
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Pen a dog's diary\nfor a day.",
+ "box": [
+ 1014,
+ 997,
+ 1230,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to previous | go to previous page | previous page",
+ "box": [
+ 3372,
+ 1886,
+ 3431,
+ 1946
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1575,
+ 952,
+ 3189,
+ 1100
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1589,
+ 966,
+ 2198,
+ 1094
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Start conversations\nconfidently at events.",
+ "box": [
+ 1632,
+ 997,
+ 1898,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to home page | start conversations | start conversations",
+ "box": [
+ 1588,
+ 964,
+ 2201,
+ 1099
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2211,
+ 958,
+ 3402,
+ 1097
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2829,
+ 969,
+ 3443,
+ 1093
+ ],
+ "children": [
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 2822,
+ 967,
+ 3431,
+ 1101
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Set up a budget\nfor a student.",
+ "box": [
+ 2863,
+ 997,
+ 3064,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to the home page | go to the home page | go to home page",
+ "box": [
+ 2829,
+ 960,
+ 3410,
+ 1101
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 2204,
+ 965,
+ 2820,
+ 1095
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Organize vegetarian meals\nfor a busy student.",
+ "box": [
+ 2245,
+ 997,
+ 2576,
+ 1058
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXButton",
+ "value": "go to home page | open the homepage",
+ "box": [
+ 2208,
+ 962,
+ 2816,
+ 1100
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 277,
+ 204,
+ 842,
+ 1873
+ ],
+ "children": [
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 287,
+ 318,
+ 820,
+ 371
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "New Chat",
+ "box": [
+ 311,
+ 330,
+ 437,
+ 356
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "11:48",
+ "box": [
+ 753,
+ 331,
+ 813,
+ 356
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 288,
+ 271,
+ 823,
+ 320
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "New Chat",
+ "box": [
+ 311,
+ 284,
+ 437,
+ 312
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Now",
+ "box": [
+ 758,
+ 285,
+ 813,
+ 305
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 283,
+ 208,
+ 825,
+ 269
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "\u2022 History",
+ "box": [
+ 316,
+ 235,
+ 442,
+ 260
+ ],
+ "children": []
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 835,
+ 844,
+ 3164,
+ 947
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "2/ Write",
+ "box": [
+ 1179,
+ 865,
+ 1371,
+ 920
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 1661,
+ 842,
+ 3456,
+ 951
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Help",
+ "box": [
+ 1883,
+ 867,
+ 1974,
+ 917
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Plan",
+ "box": [
+ 2501,
+ 872,
+ 2581,
+ 907
+ ],
+ "children": []
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 3022,
+ 847,
+ 3443,
+ 924
+ ],
+ "children": [
+ {
+ "cls": "Text",
+ "value": "Guide",
+ "box": [
+ 3099,
+ 867,
+ 3214,
+ 912
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "cls": "Group",
+ "value": null,
+ "box": [
+ 792,
+ 1809,
+ 3315,
+ 1961
+ ],
+ "children": [
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 872,
+ 1786,
+ 1900,
+ 1818
+ ],
+ "children": []
+ },
+ {
+ "cls": "AXTextArea",
+ "value": null,
+ "box": [
+ 874,
+ 1787,
+ 1502,
+ 1819
+ ],
+ "children": []
+ },
+ {
+ "cls": "Text",
+ "value": "Message",
+ "box": [
+ 894,
+ 1900,
+ 1014,
+ 1930
+ ],
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/output_visualisation/example_data/screenshot.png b/output_visualisation/example_data/screenshot.png
new file mode 100644
index 0000000..5c027ec
Binary files /dev/null and b/output_visualisation/example_data/screenshot.png differ
diff --git a/output_visualisation/example_data/system_acc.json b/output_visualisation/example_data/system_acc.json
new file mode 100644
index 0000000..bfce793
--- /dev/null
+++ b/output_visualisation/example_data/system_acc.json
@@ -0,0 +1,1037 @@
+{
+ "name": "Chats",
+ "role": "AXWindow",
+ "description": null,
+ "role_description": "standard window",
+ "value": null,
+ "position": "0.00;38.00",
+ "size": "1728;988",
+ "children": [
+ {
+ "name": null,
+ "role": "AXGroup",
+ "description": null,
+ "role_description": "group",
+ "value": null,
+ "position": "0.00;0.00",
+ "size": "1728;988",
+ "children": [
+ {
+ "name": null,
+ "role": "AXSplitGroup",
+ "description": null,
+ "role_description": "split group",
+ "value": null,
+ "position": "0.00;0.00",
+ "size": "1728;988",
+ "children": [
+ {
+ "name": null,
+ "role": "AXGroup",
+ "description": null,
+ "role_description": "group",
+ "value": null,
+ "position": "0.00;0.00",
+ "size": "140;988",
+ "children": [
+ {
+ "name": null,
+ "role": "AXScrollArea",
+ "description": null,
+ "role_description": "scroll area",
+ "value": null,
+ "position": "0.00;52.00",
+ "size": "140;896",
+ "children": [
+ {
+ "name": null,
+ "role": "AXOutline",
+ "description": "Sidebar",
+ "role_description": "outline",
+ "value": null,
+ "position": "0.00;52.00",
+ "size": "140;896",
+ "children": [
+ {
+ "name": null,
+ "role": "AXRow",
+ "description": null,
+ "role_description": "outline row",
+ "value": null,
+ "position": "0.00;52.00",
+ "size": "140;19",
+ "children": [
+ {
+ "name": null,
+ "role": "AXCell",
+ "description": null,
+ "role_description": "cell",
+ "value": null,
+ "position": "10.00;52.00",
+ "size": "120;19",
+ "children": [
+ {
+ "name": null,
+ "role": "AXGroup",
+ "description": null,
+ "role_description": "group",
+ "value": null,
+ "position": "14.00;52.00",
+ "size": "124;19",
+ "children": [
+ {
+ "name": null,
+ "role": "AXHeading",
+ "description": "AI Pro",
+ "role_description": "heading",
+ "value": null,
+ "position": "14.00;54.50",
+ "size": "33;14",
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXRow",
+ "description": null,
+ "role_description": "outline row",
+ "value": null,
+ "position": "0.00;71.00",
+ "size": "140;28",
+ "children": [
+ {
+ "name": null,
+ "role": "AXCell",
+ "description": null,
+ "role_description": "cell",
+ "value": null,
+ "position": "10.00;71.00",
+ "size": "120;28",
+ "children": [
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Chats",
+ "position": "19.00;76.79",
+ "size": "56;16",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXRow",
+ "description": null,
+ "role_description": "outline row",
+ "value": null,
+ "position": "0.00;99.00",
+ "size": "140;28",
+ "children": [
+ {
+ "name": null,
+ "role": "AXCell",
+ "description": null,
+ "role_description": "cell",
+ "value": null,
+ "position": "10.00;99.00",
+ "size": "120;28",
+ "children": [
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Help",
+ "position": "19.50;105.00",
+ "size": "48;16",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXRow",
+ "description": null,
+ "role_description": "outline row",
+ "value": null,
+ "position": "0.00;127.00",
+ "size": "140;28",
+ "children": [
+ {
+ "name": null,
+ "role": "AXCell",
+ "description": null,
+ "role_description": "cell",
+ "value": null,
+ "position": "10.00;127.00",
+ "size": "120;28",
+ "children": [
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Referral",
+ "position": "20.00;133.04",
+ "size": "68;16",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXRow",
+ "description": null,
+ "role_description": "outline row",
+ "value": null,
+ "position": "0.00;155.00",
+ "size": "140;28",
+ "children": [
+ {
+ "name": null,
+ "role": "AXCell",
+ "description": null,
+ "role_description": "cell",
+ "value": null,
+ "position": "10.00;155.00",
+ "size": "120;28",
+ "children": [
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "About",
+ "position": "19.00;161.00",
+ "size": "58;16",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXColumn",
+ "description": null,
+ "role_description": "column",
+ "value": null,
+ "position": "10.00;52.00",
+ "size": "120;896",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "v2.0.5 (78)",
+ "role_description": "button",
+ "value": null,
+ "position": "16.00;957.00",
+ "size": "57;14",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Settings",
+ "role_description": "button",
+ "value": null,
+ "position": "108.00;956.00",
+ "size": "16;16",
+ "children": []
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXSplitter",
+ "description": null,
+ "role_description": "splitter",
+ "value": 140.0,
+ "position": "140.00;52.00",
+ "size": "1;936",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXGroup",
+ "description": null,
+ "role_description": "group",
+ "value": null,
+ "position": "141.00;0.00",
+ "size": "280;988",
+ "children": [
+ {
+ "name": null,
+ "role": "AXScrollArea",
+ "description": null,
+ "role_description": "scroll area",
+ "value": null,
+ "position": "141.00;52.00",
+ "size": "280;936",
+ "children": [
+ {
+ "name": null,
+ "role": "AXTable",
+ "description": null,
+ "role_description": "list",
+ "value": null,
+ "position": "141.00;52.00",
+ "size": "280;936",
+ "children": [
+ {
+ "name": null,
+ "role": "AXRow",
+ "description": null,
+ "role_description": "table row",
+ "value": null,
+ "position": "141.00;62.00",
+ "size": "280;30",
+ "children": [
+ {
+ "name": null,
+ "role": "AXCell",
+ "description": null,
+ "role_description": "cell",
+ "value": null,
+ "position": "157.00;62.00",
+ "size": "248;30",
+ "children": [
+ {
+ "name": null,
+ "role": "AXTextField",
+ "description": null,
+ "role_description": "search text field",
+ "value": "",
+ "position": "150.00;62.00",
+ "size": "262;30",
+ "children": [
+ {
+ "name": "",
+ "role": "AXButton",
+ "description": "Search",
+ "role_description": "button",
+ "value": null,
+ "position": "152.00;66.00",
+ "size": "25;22",
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXRow",
+ "description": null,
+ "role_description": "table row",
+ "value": null,
+ "position": "141.00;112.00",
+ "size": "280;0",
+ "children": [
+ {
+ "name": null,
+ "role": "AXCell",
+ "description": null,
+ "role_description": "cell",
+ "value": null,
+ "position": "141.00;112.00",
+ "size": "280;0",
+ "children": [
+ {
+ "name": null,
+ "role": "AXGroup",
+ "description": null,
+ "role_description": "group",
+ "value": null,
+ "position": "157.00;112.00",
+ "size": "248;0",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXRow",
+ "description": null,
+ "role_description": "table row",
+ "value": null,
+ "position": "141.00;112.00",
+ "size": "280;24",
+ "children": [
+ {
+ "name": null,
+ "role": "AXCell",
+ "description": null,
+ "role_description": "cell",
+ "value": null,
+ "position": "157.00;112.00",
+ "size": "248;24",
+ "children": [
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "History",
+ "position": "159.00;116.94",
+ "size": "60;14",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXRow",
+ "description": null,
+ "role_description": "table row",
+ "value": null,
+ "position": "141.00;136.00",
+ "size": "280;24",
+ "children": [
+ {
+ "name": null,
+ "role": "AXCell",
+ "description": null,
+ "role_description": "cell",
+ "value": null,
+ "position": "157.00;136.00",
+ "size": "248;24",
+ "children": [
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "New Chat, Now",
+ "role_description": "button",
+ "value": null,
+ "position": "157.00;140.00",
+ "size": "248;16",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXRow",
+ "description": null,
+ "role_description": "table row",
+ "value": null,
+ "position": "141.00;160.00",
+ "size": "280;24",
+ "children": [
+ {
+ "name": null,
+ "role": "AXCell",
+ "description": null,
+ "role_description": "cell",
+ "value": null,
+ "position": "157.00;160.00",
+ "size": "248;24",
+ "children": [
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "New Chat, 11:48",
+ "role_description": "button",
+ "value": null,
+ "position": "157.00;164.00",
+ "size": "248;16",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXColumn",
+ "description": null,
+ "role_description": "column",
+ "value": null,
+ "position": "151.00;52.00",
+ "size": "260;936",
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXSplitter",
+ "description": null,
+ "role_description": "splitter",
+ "value": 280.0,
+ "position": "421.00;52.00",
+ "size": "1;936",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXGroup",
+ "description": null,
+ "role_description": "group",
+ "value": null,
+ "position": "422.00;0.00",
+ "size": "1306;988",
+ "children": [
+ {
+ "name": null,
+ "role": "AXScrollArea",
+ "description": null,
+ "role_description": "scroll area",
+ "value": null,
+ "position": "438.00;76.00",
+ "size": "1274;834",
+ "children": [
+ {
+ "name": null,
+ "role": "AXOpaqueProviderGroup",
+ "description": null,
+ "role_description": "list",
+ "value": null,
+ "position": "-436.00;429.00",
+ "size": "2148;122",
+ "children": [
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Create",
+ "position": "-335.00;430.50",
+ "size": "100;28",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Devise a zero-gravity sport, for space travelers.",
+ "role_description": "button",
+ "value": null,
+ "position": "-436.00;477.50",
+ "size": "300;74",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Explain",
+ "position": "-29.00;429.00",
+ "size": "104;32",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Describe rainbows, to a 5-year-old.",
+ "role_description": "button",
+ "value": null,
+ "position": "-128.00;477.50",
+ "size": "300;74",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Entertain",
+ "position": "267.50;431.50",
+ "size": "128;26",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Reimagine a classic fairy tale, set in the modern world.",
+ "role_description": "button",
+ "value": null,
+ "position": "180.00;477.50",
+ "size": "300;74",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Write",
+ "position": "593.00;431.25",
+ "size": "91;26",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Pen a dog's diary, for a day.",
+ "role_description": "button",
+ "value": null,
+ "position": "488.00;477.50",
+ "size": "300;74",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Help",
+ "position": "908.00;431.25",
+ "size": "79;26",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Start conversations, confidently at events.",
+ "role_description": "button",
+ "value": null,
+ "position": "796.00;477.50",
+ "size": "300;74",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Plan",
+ "position": "1220.00;431.50",
+ "size": "70;27",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Organize vegetarian meals, for a busy student.",
+ "role_description": "button",
+ "value": null,
+ "position": "1104.00;477.50",
+ "size": "300;74",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Guide",
+ "position": "1518.50;431.50",
+ "size": "89;26",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Set up a budget, for a student.",
+ "role_description": "button",
+ "value": null,
+ "position": "1412.00;477.50",
+ "size": "300;74",
+ "children": []
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXScrollBar",
+ "description": null,
+ "role_description": "scroll bar",
+ "value": 1.0,
+ "position": "438.00;895.00",
+ "size": "1274;15",
+ "children": [
+ {
+ "name": null,
+ "role": "AXValueIndicator",
+ "description": null,
+ "role_description": "value indicator",
+ "value": 1.0,
+ "position": "949.50;895.00",
+ "size": "760;15",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": null,
+ "role_description": "increment arrow button",
+ "value": null,
+ "position": "438.00;895.00",
+ "size": "0;0",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": null,
+ "role_description": "decrement arrow button",
+ "value": null,
+ "position": "438.00;895.00",
+ "size": "0;0",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": null,
+ "role_description": "increment page button",
+ "value": null,
+ "position": "1710.00;895.00",
+ "size": "2;15",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": null,
+ "role_description": "decrement page button",
+ "value": null,
+ "position": "438.00;895.00",
+ "size": "511;15",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXScrollArea",
+ "description": null,
+ "role_description": "scroll area",
+ "value": null,
+ "position": "422.00;52.00",
+ "size": "1306;882",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Message",
+ "position": "450.00;951.00",
+ "size": "54;16",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXScrollArea",
+ "description": null,
+ "role_description": "scroll area",
+ "value": null,
+ "position": "444.00;951.00",
+ "size": "1226;16",
+ "children": [
+ {
+ "name": null,
+ "role": "AXTextArea",
+ "description": null,
+ "role_description": "text entry area",
+ "value": "",
+ "position": "444.00;951.00",
+ "size": "1211;16",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXScrollBar",
+ "description": null,
+ "role_description": "scroll bar",
+ "value": 0.0,
+ "position": "1655.00;951.00",
+ "size": "15;16",
+ "children": []
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Arrow Up Circle",
+ "role_description": "button",
+ "value": null,
+ "position": "1688.00;947.00",
+ "size": "24;24",
+ "children": []
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXToolbar",
+ "description": null,
+ "role_description": "toolbar",
+ "value": null,
+ "position": "0.00;0.00",
+ "size": "1728;52",
+ "children": [
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Hide Sidebar",
+ "role_description": "button",
+ "value": null,
+ "position": "87.00;0.00",
+ "size": "42;52",
+ "children": [
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Hide Sidebar",
+ "role_description": "button",
+ "value": null,
+ "position": "96.50;14.00",
+ "size": "24;18",
+ "children": []
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "New Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "377.00;0.00",
+ "size": "36;52",
+ "children": [
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "New Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "386.00;14.00",
+ "size": "18;17",
+ "children": []
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXGroup",
+ "description": null,
+ "role_description": "group",
+ "value": null,
+ "position": "430.00;0.00",
+ "size": "88;52",
+ "children": [
+ {
+ "name": "GPT-4o",
+ "role": "AXPopUpButton",
+ "description": null,
+ "role_description": "pop up button",
+ "value": null,
+ "position": "434.00;12.00",
+ "size": "80;28",
+ "children": []
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Edit Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "1507.00;0.00",
+ "size": "36;52",
+ "children": [
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Edit Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "1516.00;14.00",
+ "size": "18;17",
+ "children": []
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Pin Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "1552.00;0.00",
+ "size": "37;52",
+ "children": [
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Pin Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "1561.00;14.00",
+ "size": "19;22",
+ "children": []
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Clear Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "1598.00;0.00",
+ "size": "39;52",
+ "children": [
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Clear Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "1607.50;14.00",
+ "size": "20;20",
+ "children": []
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Delete Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "1637.00;0.00",
+ "size": "37;52",
+ "children": [
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Delete Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "1646.00;14.00",
+ "size": "19;22",
+ "children": []
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Share Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "1683.00;0.00",
+ "size": "37;52",
+ "children": [
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": "Share Chat",
+ "role_description": "button",
+ "value": null,
+ "position": "1692.00;14.00",
+ "size": "19;23",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": null,
+ "role_description": "close button",
+ "value": null,
+ "position": "19.00;18.00",
+ "size": "14;16",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": null,
+ "role_description": "full screen button",
+ "value": null,
+ "position": "59.00;18.00",
+ "size": "14;16",
+ "children": [
+ {
+ "name": null,
+ "role": "AXGroup",
+ "description": null,
+ "role_description": "group",
+ "value": null,
+ "position": "59.00;18.00",
+ "size": "14;16",
+ "children": [
+ {
+ "name": null,
+ "role": "AXGroup",
+ "description": null,
+ "role_description": "group",
+ "value": null,
+ "position": "59.00;18.00",
+ "size": "14;16",
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": null,
+ "role": "AXButton",
+ "description": null,
+ "role_description": "minimize button",
+ "value": null,
+ "position": "39.00;18.00",
+ "size": "14;16",
+ "children": []
+ },
+ {
+ "name": null,
+ "role": "AXStaticText",
+ "description": null,
+ "role_description": "text",
+ "value": "Chats",
+ "position": "149.00;0.00",
+ "size": "228;52",
+ "children": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/output_visualisation/visualiser_app.py b/output_visualisation/visualiser_app.py
new file mode 100644
index 0000000..eef88a3
--- /dev/null
+++ b/output_visualisation/visualiser_app.py
@@ -0,0 +1,756 @@
+"""
+This is a Streamlit app for visualizing accessibility metadata.
+
+To run the app, use the following command:
+streamlit run visualiser_app.py
+
+Also, please select dark theme in the settings.
+"""
+
+
+import streamlit as st
+import json
+import os
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from PIL import Image
+import plotly.express as px
+
class StreamlitAccessibilityVisualizer:
    """
    Overlay accessibility elements from two JSON sources on an app screenshot.

    Extracted elements are kept per data source in `self.elements`:
    - 'custom': nodes with a `box` of [x1, y1, x2, y2] and a `cls` type label
    - 'system': nodes with `position` ("x;y") and `size` ("width;height") strings
      and a `role` type label; coordinates are multiplied by `system_scale_factor`

    Any data source name other than 'custom' is treated as 'system' when picking
    the type label, matching the two-way branches used throughout the class.
    """

    def __init__(self, image_path, custom_json_path, system_json_path, system_scale_factor=2.0):
        """
        Initialize the accessibility visualizer with file paths

        Parameters:
        - image_path: Path to the app screenshot image
        - custom_json_path: Path to the custom accessibility JSON file
        - system_json_path: Path to the system accessibility JSON file
        - system_scale_factor: Scale factor to apply to system data coordinates (default: 2.0)
        """
        self.image_path = image_path
        self.custom_json_path = custom_json_path
        self.system_json_path = system_json_path
        self.custom_data = None
        self.system_data = None
        self.elements = {'custom': [], 'system': []}
        self.fig = None
        self.system_scale_factor = system_scale_factor

        # Load data
        self.load_data()
        self.extract_elements()

    def load_data(self):
        """Load both JSON documents and the screenshot, recording the image size"""
        # JSON is UTF-8 by specification; do not depend on the platform default encoding
        with open(self.custom_json_path, 'r', encoding='utf-8') as f:
            self.custom_data = json.load(f)

        with open(self.system_json_path, 'r', encoding='utf-8') as f:
            self.system_data = json.load(f)

        # Image.open() is lazy and keeps the file open. Read the pixels now so the
        # image stays usable (and the handle is released) even if the caller
        # deletes the file afterwards.
        image = Image.open(self.image_path)
        image.load()
        self.image = image
        self.img_width, self.img_height = self.image.size

    @staticmethod
    def _type_key(data_source):
        """Return the element key that holds the type label for a data source"""
        return 'cls' if data_source == 'custom' else 'role'

    @classmethod
    def _type_label(cls, element, data_source):
        """
        Return the type label of an element as a string.

        A missing, null or empty label becomes 'Unknown' so that labels can
        always be sorted and compared with each other.
        """
        raw = element.get(cls._type_key(data_source))
        return 'Unknown' if raw is None or raw == '' else str(raw)

    @classmethod
    def _hover_text(cls, element, data_source):
        """
        Return the label shown when hovering an element.

        Preference order: the element's value, then (system data only) its name,
        then its type label.
        """
        candidates = [element.get('value')]
        if data_source != 'custom':
            candidates.append(element.get('name'))
        for candidate in candidates:
            if candidate:
                return candidate
        return cls._type_label(element, data_source)

    def extract_elements_from_custom(self, node, elements, parent=None, depth=0):
        """
        Recursively extract elements with bounding boxes from custom data

        Parameters:
        - node: Current node of the custom tree
        - elements: List that extracted element dicts are appended to
        - parent: Parent node (accepted for call compatibility; not used)
        - depth: Nesting depth of `node`, recorded on the element
        """
        if not isinstance(node, dict):
            return  # null or malformed child: nothing to extract

        box = node.get('box')
        # A missing/null box, or one that is not exactly four values, yields no element
        if isinstance(box, (list, tuple)) and len(box) == 4:
            x1, y1, x2, y2 = box
            width = x2 - x1
            height = y2 - y1

            # Only include elements with non-zero width and height
            if width > 0 and height > 0:
                elements.append({
                    'x0': x1,
                    'y0': y1,
                    'x1': x2,
                    'y1': y2,
                    'width': width,
                    'height': height,
                    'value': node.get('value', None),
                    'cls': node.get('cls', None),
                    'depth': depth
                })

        # Process children recursively
        for child in node.get('children') or []:
            self.extract_elements_from_custom(child, elements, node, depth + 1)

    def extract_elements_from_system(self, node, elements, parent=None, depth=0):
        """
        Recursively extract elements with position and size from system data

        Parameters:
        - node: Current node of the system tree
        - elements: List that extracted element dicts are appended to
        - parent: Parent node (accepted for call compatibility; not used)
        - depth: Nesting depth of `node`, recorded on the element
        """
        if not isinstance(node, dict):
            return  # null or malformed child: nothing to extract

        position = node.get('position')
        size = node.get('size')
        # Both fields are "a;b" strings; JSON null (None) or any other type is skipped
        if isinstance(position, str) and isinstance(size, str):
            pos = position.split(';')
            dims = size.split(';')
            if len(pos) == 2 and len(dims) == 2:
                try:
                    # Apply scaling factor to adjust system coordinates to match the screenshot scale
                    x = float(pos[0]) * self.system_scale_factor
                    y = float(pos[1]) * self.system_scale_factor
                    width = float(dims[0]) * self.system_scale_factor
                    height = float(dims[1]) * self.system_scale_factor
                except ValueError:
                    width = height = 0  # Non-numeric coordinates: skip this node

                # Only include elements with non-zero width and height
                if width > 0 and height > 0:
                    elements.append({
                        'x0': x,
                        'y0': y,
                        'x1': x + width,
                        'y1': y + height,
                        'width': width,
                        'height': height,
                        'value': node.get('value', None),
                        'name': node.get('name', None),
                        'role': node.get('role', None),
                        'description': node.get('description', None),
                        'depth': depth
                    })

        # Process children recursively
        for child in node.get('children') or []:
            self.extract_elements_from_system(child, elements, node, depth + 1)

    def extract_elements(self):
        """Extract elements from both data sources, replacing any previous extraction"""
        # Start from empty lists so a repeated call does not duplicate elements
        self.elements = {'custom': [], 'system': []}

        self.extract_elements_from_custom(self.custom_data, self.elements['custom'])
        self.extract_elements_from_system(self.system_data, self.elements['system'])

        print(f"Extracted {len(self.elements['custom'])} elements from custom data")
        print(f"Extracted {len(self.elements['system'])} elements from system data")

    def create_figure_config(self):
        """Create a configuration object for the Plotly figure"""
        return {
            'displayModeBar': True,
            'displaylogo': False,
            'modeBarButtonsToAdd': ['toggleHover'],
            'modeBarButtonsToRemove': ['lasso2d', 'select2d'],
            'toImageButtonOptions': {
                'format': 'png',
                'filename': 'accessibility_visualization',
                'height': 800,
                'width': 1200,
                'scale': 2
            }
        }

    def create_plotly_figure(self, data_source, element_types=None, min_size=0, max_depth=None):
        """
        Create a plotly figure with the app screenshot as background

        The figure is stored on `self.fig` (replacing any previous one) and
        contains no bounding boxes yet; call `add_bounding_boxes` afterwards.

        Parameters:
        - data_source: 'custom' or 'system' (used for the figure title)
        - element_types: Accepted for signature symmetry with add_bounding_boxes; not used here
        - min_size: Accepted for signature symmetry with add_bounding_boxes; not used here
        - max_depth: Accepted for signature symmetry with add_bounding_boxes; not used here

        Returns:
        - fig: Plotly figure
        """
        # Create figure
        self.fig = go.Figure()

        # Calculate aspect ratio and set figure size
        aspect_ratio = self.img_height / self.img_width
        display_width = min(1200, self.img_width)  # Limit max width for large images
        display_height = int(display_width * aspect_ratio)

        # Add the screenshot as a background image
        self.fig.add_layout_image(
            dict(
                source=self.image,
                xref="x",
                yref="y",
                x=0,
                y=0,
                sizex=self.img_width,
                sizey=self.img_height,
                sizing="contain",  # "contain" to preserve aspect ratio
                opacity=1,
                layer="below"
            )
        )

        # Set axes properties
        self.fig.update_xaxes(
            range=[0, self.img_width],
            showticklabels=False,
            showgrid=False,
            zeroline=False,
            scaleanchor="y",  # Lock the aspect ratio
            scaleratio=1      # 1:1 aspect ratio
        )

        self.fig.update_yaxes(
            range=[self.img_height, 0],  # Inverted y-axis to match image coordinates
            showticklabels=False,
            showgrid=False,
            zeroline=False
        )

        # Update layout with a dark theme
        self.fig.update_layout(
            title=f"Accessibility Visualization ({data_source.capitalize()} Data)",
            title_font=dict(size=20, color="white"),
            autosize=False,  # Use fixed size instead of autosize
            width=display_width,
            height=display_height,
            margin=dict(l=0, r=0, t=40, b=0),
            hovermode="closest",
            paper_bgcolor="#111111",
            plot_bgcolor="#111111",
            font=dict(color="white"),
            legend=dict(
                title_font=dict(size=14),
                font=dict(size=12),
                bgcolor="rgba(0,0,0,0.5)",
                bordercolor="rgba(255,255,255,0.2)",
                borderwidth=1,
                itemsizing='constant',  # Make legend items all the same size
                itemwidth=30,
                orientation='v',
                yanchor='top',
                y=1,
                xanchor='right',
                x=1.1,
                tracegroupgap=5
            ),
            modebar=dict(
                bgcolor="rgba(0,0,0,0)",
                color="white",
                activecolor="#636EFA"
            ),
            dragmode="pan",  # Set default interaction mode to pan
        )

        return self.fig

    def _filter_elements(self, data_source, element_types=None, min_size=0, max_depth=None):
        """
        Return a new list of the data source's elements that pass the filters.

        Parameters:
        - data_source: 'custom' or 'system'
        - element_types: Type labels to keep; None or an empty list keeps every type
        - min_size: Minimum area (width*height); values <= 0 disable the filter
        - max_depth: Maximum depth to keep; None disables the filter
        """
        selected = list(self.elements[data_source])

        if element_types:
            wanted = set(element_types)
            selected = [e for e in selected if self._type_label(e, data_source) in wanted]

        if min_size > 0:
            selected = [e for e in selected if e.get('width', 0) * e.get('height', 0) >= min_size]

        if max_depth is not None:
            selected = [e for e in selected if e.get('depth', 0) <= max_depth]

        return selected

    def _add_outline(self, element, color):
        """Draw an unfilled rectangle around one element on the current figure"""
        self.fig.add_shape(
            type="rect",
            x0=element['x0'],
            y0=element['y0'],
            x1=element['x1'],
            y1=element['y1'],
            line=dict(
                color=color,
                width=1,  # Thinner line
            ),
            fillcolor="rgba(0,0,0,0)",
            opacity=0.9
        )

    def add_bounding_boxes(self, data_source, element_types=None, min_size=0, max_depth=None):
        """
        Add bounding boxes to the plotly figure

        Requires `create_plotly_figure` to have been called first, because the
        shapes and traces are added to `self.fig`.

        Parameters:
        - data_source: 'custom' or 'system'
        - element_types: List of element types to include (cls for custom, role for system);
          None or an empty list includes every type
        - min_size: Minimum element size (width*height) to include
        - max_depth: Maximum depth of elements to include
        """
        # Deepest first, so shapes of shallower elements are added last (drawn on top)
        ordered = sorted(
            self._filter_elements(data_source, element_types, min_size, max_depth),
            key=lambda e: e.get('depth', 0),
            reverse=True
        )

        # Colours are assigned over ALL types of the data source, not only the
        # filtered ones, so a type keeps its colour when the filters change
        all_element_types = sorted(set(self._type_label(e, data_source) for e in self.elements[data_source]))
        colors = px.colors.qualitative.Plotly  # Use Plotly's built-in color scale
        color_map = {elem_type: colors[i % len(colors)] for i, elem_type in enumerate(all_element_types)}

        # Group the visible elements by type in a single pass (depth order is kept)
        by_type = {}
        for elem in ordered:
            # Skip very large elements that might be the background
            width = elem['width'] if 'width' in elem else elem['x1'] - elem['x0']
            height = elem['height'] if 'height' in elem else elem['y1'] - elem['y0']
            if width > self.img_width * 0.95 and height > self.img_height * 0.95:
                continue
            by_type.setdefault(self._type_label(elem, data_source), []).append(elem)

        # Container elements get an outline but no hover trace / legend entry
        group_label = 'Group' if data_source == 'custom' else 'AXGroup'

        for elem_type in all_element_types:
            if element_types and elem_type not in element_types:
                continue

            members = by_type.get(elem_type)
            if not members:
                continue

            color = color_map[elem_type]

            if elem_type != group_label:
                # One marker at each box centre carries the hover label; the marker
                # uses the type colour so the legend swatch matches the outlines
                self.fig.add_trace(
                    go.Scatter(
                        x=[(e['x0'] + e['x1']) / 2 for e in members],
                        y=[(e['y0'] + e['y1']) / 2 for e in members],
                        mode='markers',
                        marker=dict(size=2, color=color),
                        hoverinfo='text',
                        hovertemplate="%{hovertext}",
                        hovertext=[self._hover_text(e, data_source) for e in members],
                        name=elem_type,
                        showlegend=True,
                        legendgroup=elem_type
                    )
                )

            for elem in members:
                self._add_outline(elem, color)

    def get_element_types(self, data_source):
        """Get all element types for a data source, sorted ('Unknown' for missing labels)"""
        source_key = 'custom' if data_source == 'custom' else 'system'
        return sorted(set(self._type_label(e, data_source) for e in self.elements[source_key]))

    def get_max_depth(self, data_source):
        """Get the maximum depth of elements for a data source (0 when there are none)"""
        source_key = 'custom' if data_source == 'custom' else 'system'
        return max((e.get('depth', 0) for e in self.elements[source_key]), default=0)
+
+
+# Streamlit app
def main():
    """
    Build the Streamlit page: upload controls, filters, the annotated
    screenshot and the statistics/help tabs.

    The visualizer takes file paths, so the three uploads are written to a
    private temporary directory. That directory is removed when rendering
    finishes, including when rendering raises, and it is unique per run so
    concurrent sessions cannot overwrite each other's files.
    """
    import tempfile  # function-scope import: only this entry point needs a scratch directory

    # Configure the page with a dark theme and expanded layout
    st.set_page_config(
        layout="wide",
        page_title="Accessibility Visualization",
        page_icon="🔍",
        initial_sidebar_state="collapsed",
        menu_items={
            'Get Help': 'https://github.com/yourusername/accessibility-visualizer',
            'Report a bug': 'https://github.com/yourusername/accessibility-visualizer/issues',
            'About': "Interactive visualization tool for accessibility metadata"
        }
    )

    # Custom CSS for dark theme and better spacing
    st.markdown("""

    """, unsafe_allow_html=True)

    # Main app title with icon
    st.markdown("""
    # 🔍 Interactive Accessibility Visualization
    Visualize and analyze accessibility metadata from custom and system sources.
    """)

    # File Selection (in a collapsible section)
    with st.expander("File Selection", expanded=False):
        # File upload controls in a horizontal layout
        upload_col1, upload_col2, upload_col3 = st.columns(3)

        with upload_col1:
            # Screenshot image upload
            uploaded_image = st.file_uploader("Upload Screenshot Image", type=["png", "jpg", "jpeg"])

        with upload_col2:
            # Custom JSON upload
            uploaded_custom_json = st.file_uploader("Upload Custom Accessibility JSON", type=["json"])

        with upload_col3:
            # System JSON upload
            uploaded_system_json = st.file_uploader("Upload System Accessibility JSON", type=["json"])

    # Always visible filters (not in collapsible section)
    # Create a 3-column layout for the always-visible filters
    filter_col1, filter_col2, filter_col3 = st.columns(3)

    with filter_col1:
        # Data source selection (custom vs system)
        data_source = st.radio("Data Source", ["custom", "system"], horizontal=True)

    # Set system_scale_factor to a default value (not displayed in UI)
    system_scale_factor = 2.0

    # Until all three files are uploaded, only show the instructions
    if uploaded_image is None or uploaded_custom_json is None or uploaded_system_json is None:
        _render_getting_started()
        return

    with tempfile.TemporaryDirectory() as scratch_dir:
        image_path, custom_json_path, system_json_path = _save_uploads(
            scratch_dir, uploaded_image, uploaded_custom_json, uploaded_system_json
        )

        # Create visualizer instance with the fixed scale factor
        try:
            visualizer = StreamlitAccessibilityVisualizer(
                image_path,
                custom_json_path,
                system_json_path,
                system_scale_factor=system_scale_factor
            )
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError is a ValueError; unreadable or unrecognised
            # image files raise OSError subclasses
            st.error(f"Could not load the uploaded files: {exc}")
            return

        # Everything that may read the uploaded files runs before the directory is removed
        _render_dashboard(visualizer, data_source, filter_col2, filter_col3)


def _render_getting_started():
    """Show the usage instructions displayed while uploads are missing."""
    st.markdown("### How to Use This Tool")
    st.markdown("""
    This tool visualizes accessibility metadata from both custom and system JSON files overlaid on an app screenshot.

    #### Getting Started:
    1. Click on "File Selection" to expand the upload section
    2. Upload your screenshot image (PNG, JPG)
    3. Upload your custom accessibility JSON file
    4. Upload your system accessibility JSON file

    #### Features:
    - **Switch Data Sources**: Toggle between custom and system accessibility data
    - **Filter Elements**: Show/hide specific element types
    - **Depth Control**: Filter elements by their depth in the hierarchy
    - **Interactive Hover**: Mouse over elements to see detailed information
    """)


def _save_uploads(directory, uploaded_image, uploaded_custom_json, uploaded_system_json):
    """Write the three uploads into `directory` and return their paths (image, custom, system)."""
    targets = (
        ("temp_image.png", uploaded_image),
        ("temp_custom.json", uploaded_custom_json),
        ("temp_system.json", uploaded_system_json),
    )
    paths = []
    for filename, upload in targets:
        path = os.path.join(directory, filename)
        with open(path, "wb") as f:
            f.write(upload.getbuffer())
        paths.append(path)
    return tuple(paths)


def _render_dashboard(visualizer, data_source, type_column, depth_column):
    """Render the remaining filters, the annotated screenshot and the tabs below it."""
    # Get element types for the selected data source
    element_types = visualizer.get_element_types(data_source)

    with type_column:
        # Element type filter
        selected_element_types = st.multiselect(
            "Element types to display",
            options=element_types,
            default=element_types
        )

    # Set min_size to 0 (no minimum size filter)
    min_size = 0

    with depth_column:
        # Depth filter
        max_depth = visualizer.get_max_depth(data_source)
        if max_depth > 0:
            depth_filter = st.slider(
                "Maximum element depth",
                min_value=0,
                max_value=max_depth,
                value=max_depth,
                step=1
            )
        else:
            # A 0..0 slider offers no choice; NOTE(review): Streamlit may also
            # reject min_value == max_value - confirm for the pinned version
            depth_filter = max_depth

    # Generate visualization
    fig = visualizer.create_plotly_figure(
        data_source=data_source,
        element_types=selected_element_types,
        min_size=min_size,
        max_depth=depth_filter
    )

    # Add bounding boxes and other elements
    visualizer.add_bounding_boxes(
        data_source=data_source,
        element_types=selected_element_types,
        min_size=min_size,
        max_depth=depth_filter
    )

    # Display the visualization in full width
    st.plotly_chart(fig, use_container_width=True, config=visualizer.create_figure_config())

    # Add space between visualization and statistics
    st.markdown("---")

    # Create tabs for statistics and help information
    stats_tab, help_tab = st.tabs(["Element Statistics", "Help & Information"])

    with stats_tab:
        _render_statistics(visualizer, data_source, min_size, depth_filter, max_depth)

    with help_tab:
        _render_help()


def _render_statistics(visualizer, data_source, min_size, depth_filter, max_depth):
    """Render the type-count table, type pie chart and depth histogram."""
    type_key = 'cls' if data_source == 'custom' else 'role'
    source_key = 'custom' if data_source == 'custom' else 'system'

    # Count elements by type; a null label is reported as 'Unknown' instead of
    # being dropped by value_counts()
    df = pd.DataFrame([
        {"Type": e.get(type_key) or 'Unknown', "Depth": e.get('depth', 0)}
        for e in visualizer.elements[source_key]
        if e.get('width', 0) * e.get('height', 0) >= min_size and e.get('depth', 0) <= depth_filter
    ])

    if df.empty:
        st.info("No elements match the current filters.")
        return

    type_counts = df['Type'].value_counts().reset_index()
    type_counts.columns = ['Element Type', 'Count']

    # Create three columns for the statistics
    chart_cols = st.columns(3)

    with chart_cols[0]:
        # Display counts table without index numbers
        st.subheader(f"Element Type Counts \n ({data_source.capitalize()})")
        st.dataframe(
            type_counts[['Element Type', 'Count']].set_index('Element Type'),
            use_container_width=True
        )

    with chart_cols[1]:
        # Display pie chart of element types
        st.subheader(f"Distribution of Element Types \n ({data_source.capitalize()})")
        fig_pie = px.pie(
            type_counts,
            values='Count',
            names='Element Type',
            color_discrete_sequence=px.colors.qualitative.Plotly
        )
        # Improve pie chart appearance
        fig_pie.update_traces(textposition='inside', textinfo='percent+label')
        fig_pie.update_layout(
            margin=dict(t=30, b=0, l=0, r=0),
            showlegend=False,
            paper_bgcolor="#1a1a1a",
            plot_bgcolor="#1a1a1a",
            font=dict(color="white")
        )
        st.plotly_chart(fig_pie, use_container_width=True, config={'displayModeBar': False})

    with chart_cols[2]:
        # Display depth histogram
        st.subheader(f"Element Depth Distribution \n ({data_source.capitalize()})")
        fig_hist = px.histogram(
            df,
            x='Depth',
            nbins=max_depth+1,
            color_discrete_sequence=['#636EFA']
        )
        fig_hist.update_layout(
            xaxis_title="Depth",
            yaxis_title="Count",
            margin=dict(t=30, b=0, l=0, r=0),
            paper_bgcolor="#1a1a1a",
            plot_bgcolor="#1a1a1a",
            font=dict(color="white")
        )
        st.plotly_chart(fig_hist, use_container_width=True, config={'displayModeBar': False})


def _render_help():
    """Render the static help text shown in the 'Help & Information' tab."""
    st.markdown("""
    ## How to Use This Tool

    This interactive visualization tool helps you analyze accessibility metadata from both custom and system sources, overlaid on an application screenshot.

    ### Getting Started

    1. **Upload Files** (in the File Selection section):
    - Screenshot image (PNG, JPG)
    - Custom accessibility JSON file
    - System accessibility JSON file

    2. **Configure Visualization**:
    - Switch between custom and system data sources
    - Filter elements by type
    - Filter elements by hierarchy depth

    3. **Interact with Visualization**:
    - Hover over elements to see detailed information
    - Zoom in/out using the mousewheel
    - Pan by clicking and dragging
    - Use the toolbar to reset view, download as PNG, etc.

    ### Understanding the Data

    - **Custom Data**: Uses `box` attribute with [x1, y1, x2, y2] coordinates
    - **System Data**: Uses `position` (x;y) and `size` (width;height) attributes
    - **Element Types**:
    - Custom: Group, Text, AXButton, AXImage, etc.
    - System: AXWindow, AXGroup, AXButton, etc.
    - **Depth**: Indicates the nesting level in the element hierarchy

    ### Element Information on Hover

    When hovering over elements, you'll see different information depending on the data source:

    - **Custom data**: Type, Value, Position, Size, Depth
    - **System data**: Role, Name, Value, Description, Position, Size, Depth
    """)


if __name__ == "__main__":
    main()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0b60907
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,235 @@
+altair==5.5.0
+atomacos==3.3.0
+attrs==25.3.0
+blinker==1.9.0
+cachetools==5.5.2
+certifi==2025.1.31
+charset-normalizer==3.4.1
+click==8.1.8
+contourpy==1.3.1
+cycler==0.12.1
+filelock==3.17.0
+fonttools==4.56.0
+fsspec==2025.2.0
+future==1.0.0
+gitdb==4.0.12
+GitPython==3.1.44
+huggingface-hub==0.29.1
+idna==3.10
+ImageHash==4.3.2
+iniconfig==2.1.0
+Jinja2==3.1.5
+jsonschema==4.24.0
+jsonschema-specifications==2025.4.1
+kiwisolver==1.4.8
+macapptree==0.0.2
+MarkupSafe==3.0.2
+matplotlib==3.10.1
+mpmath==1.3.0
+narwhals==1.46.0
+networkx==3.4.2
+numpy==1.26.4
+ocrmac==1.0.0
+opencv-python==4.11.0.86
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+plotly==5.9.0
+pluggy==1.5.0
+protobuf==5.29.5
+psutil==7.0.0
+py-cpuinfo==9.0.0
+pyarrow==20.0.0
+PyAutoGUI==0.9.41
+pydeck==0.9.1
+PyGetWindow==0.0.4
+PyMsgBox==1.0.9
+pynput==1.8.0
+pyobjc==10.3.1
+pyobjc-core==10.3.1
+pyobjc-framework-Accessibility==10.3.1
+pyobjc-framework-Accounts==10.3.1
+pyobjc-framework-AddressBook==10.3.1
+pyobjc-framework-AdServices==10.3.1
+pyobjc-framework-AdSupport==10.3.1
+pyobjc-framework-AppleScriptKit==10.3.1
+pyobjc-framework-AppleScriptObjC==10.3.1
+pyobjc-framework-ApplicationServices==10.3.1
+pyobjc-framework-AppTrackingTransparency==10.3.1
+pyobjc-framework-AudioVideoBridging==10.3.1
+pyobjc-framework-AuthenticationServices==10.3.1
+pyobjc-framework-AutomaticAssessmentConfiguration==10.3.1
+pyobjc-framework-Automator==10.3.1
+pyobjc-framework-AVFoundation==10.3.1
+pyobjc-framework-AVKit==10.3.1
+pyobjc-framework-AVRouting==10.3.1
+pyobjc-framework-BackgroundAssets==10.3.1
+pyobjc-framework-BrowserEngineKit==10.3.1
+pyobjc-framework-BusinessChat==10.3.1
+pyobjc-framework-CalendarStore==10.3.1
+pyobjc-framework-CallKit==10.3.1
+pyobjc-framework-CFNetwork==10.3.1
+pyobjc-framework-Cinematic==10.3.1
+pyobjc-framework-ClassKit==10.3.1
+pyobjc-framework-CloudKit==10.3.1
+pyobjc-framework-Cocoa==10.3.1
+pyobjc-framework-Collaboration==10.3.1
+pyobjc-framework-ColorSync==10.3.1
+pyobjc-framework-Contacts==10.3.1
+pyobjc-framework-ContactsUI==10.3.1
+pyobjc-framework-CoreAudio==10.3.1
+pyobjc-framework-CoreAudioKit==10.3.1
+pyobjc-framework-CoreBluetooth==10.3.1
+pyobjc-framework-CoreData==10.3.1
+pyobjc-framework-CoreHaptics==10.3.1
+pyobjc-framework-CoreLocation==10.3.1
+pyobjc-framework-CoreMedia==10.3.1
+pyobjc-framework-CoreMediaIO==10.3.1
+pyobjc-framework-CoreMIDI==10.3.1
+pyobjc-framework-CoreML==10.3.1
+pyobjc-framework-CoreMotion==10.3.1
+pyobjc-framework-CoreServices==10.3.1
+pyobjc-framework-CoreSpotlight==10.3.1
+pyobjc-framework-CoreText==10.3.1
+pyobjc-framework-CoreWLAN==10.3.1
+pyobjc-framework-CryptoTokenKit==10.3.1
+pyobjc-framework-DataDetection==10.3.1
+pyobjc-framework-DeviceCheck==10.3.1
+pyobjc-framework-DictionaryServices==10.3.1
+pyobjc-framework-DiscRecording==10.3.1
+pyobjc-framework-DiscRecordingUI==10.3.1
+pyobjc-framework-DiskArbitration==10.3.1
+pyobjc-framework-DVDPlayback==10.3.1
+pyobjc-framework-EventKit==10.3.1
+pyobjc-framework-ExceptionHandling==10.3.1
+pyobjc-framework-ExecutionPolicy==10.3.1
+pyobjc-framework-ExtensionKit==10.3.1
+pyobjc-framework-ExternalAccessory==10.3.1
+pyobjc-framework-FileProvider==10.3.1
+pyobjc-framework-FileProviderUI==10.3.1
+pyobjc-framework-FinderSync==10.3.1
+pyobjc-framework-FSEvents==10.3.1
+pyobjc-framework-GameCenter==10.3.1
+pyobjc-framework-GameController==10.3.1
+pyobjc-framework-GameKit==10.3.1
+pyobjc-framework-GameplayKit==10.3.1
+pyobjc-framework-HealthKit==10.3.1
+pyobjc-framework-ImageCaptureCore==10.3.1
+pyobjc-framework-InputMethodKit==10.3.1
+pyobjc-framework-InstallerPlugins==10.3.1
+pyobjc-framework-InstantMessage==10.3.1
+pyobjc-framework-Intents==10.3.1
+pyobjc-framework-IntentsUI==10.3.1
+pyobjc-framework-IOBluetooth==10.3.1
+pyobjc-framework-IOBluetoothUI==10.3.1
+pyobjc-framework-IOSurface==10.3.1
+pyobjc-framework-iTunesLibrary==10.3.1
+pyobjc-framework-KernelManagement==10.3.1
+pyobjc-framework-LatentSemanticMapping==10.3.1
+pyobjc-framework-LaunchServices==10.3.1
+pyobjc-framework-libdispatch==10.3.1
+pyobjc-framework-libxpc==10.3.1
+pyobjc-framework-LinkPresentation==10.3.1
+pyobjc-framework-LocalAuthentication==10.3.1
+pyobjc-framework-LocalAuthenticationEmbeddedUI==10.3.1
+pyobjc-framework-MailKit==10.3.1
+pyobjc-framework-MapKit==10.3.1
+pyobjc-framework-MediaAccessibility==10.3.1
+pyobjc-framework-MediaLibrary==10.3.1
+pyobjc-framework-MediaPlayer==10.3.1
+pyobjc-framework-MediaToolbox==10.3.1
+pyobjc-framework-Metal==10.3.1
+pyobjc-framework-MetalFX==10.3.1
+pyobjc-framework-MetalKit==10.3.1
+pyobjc-framework-MetalPerformanceShaders==10.3.1
+pyobjc-framework-MetalPerformanceShadersGraph==10.3.1
+pyobjc-framework-MetricKit==10.3.1
+pyobjc-framework-MLCompute==10.3.1
+pyobjc-framework-ModelIO==10.3.1
+pyobjc-framework-MultipeerConnectivity==10.3.1
+pyobjc-framework-NaturalLanguage==10.3.1
+pyobjc-framework-NetFS==10.3.1
+pyobjc-framework-Network==10.3.1
+pyobjc-framework-NetworkExtension==10.3.1
+pyobjc-framework-NotificationCenter==10.3.1
+pyobjc-framework-OpenDirectory==10.3.1
+pyobjc-framework-OSAKit==10.3.1
+pyobjc-framework-OSLog==10.3.1
+pyobjc-framework-PassKit==10.3.1
+pyobjc-framework-PencilKit==10.3.1
+pyobjc-framework-PHASE==10.3.1
+pyobjc-framework-Photos==10.3.1
+pyobjc-framework-PhotosUI==10.3.1
+pyobjc-framework-PreferencePanes==10.3.1
+pyobjc-framework-PushKit==10.3.1
+pyobjc-framework-Quartz==10.3.1
+pyobjc-framework-QuickLookThumbnailing==10.3.1
+pyobjc-framework-ReplayKit==10.3.1
+pyobjc-framework-SafariServices==10.3.1
+pyobjc-framework-SafetyKit==10.3.1
+pyobjc-framework-SceneKit==10.3.1
+pyobjc-framework-ScreenCaptureKit==10.3.1
+pyobjc-framework-ScreenSaver==10.3.1
+pyobjc-framework-ScreenTime==10.3.1
+pyobjc-framework-ScriptingBridge==10.3.1
+pyobjc-framework-SearchKit==10.3.1
+pyobjc-framework-Security==10.3.1
+pyobjc-framework-SecurityFoundation==10.3.1
+pyobjc-framework-SecurityInterface==10.3.1
+pyobjc-framework-SensitiveContentAnalysis==10.3.1
+pyobjc-framework-ServiceManagement==10.3.1
+pyobjc-framework-SharedWithYou==10.3.1
+pyobjc-framework-SharedWithYouCore==10.3.1
+pyobjc-framework-ShazamKit==10.3.1
+pyobjc-framework-Social==10.3.1
+pyobjc-framework-SoundAnalysis==10.3.1
+pyobjc-framework-Speech==10.3.1
+pyobjc-framework-SpriteKit==10.3.1
+pyobjc-framework-StoreKit==10.3.1
+pyobjc-framework-Symbols==10.3.1
+pyobjc-framework-SyncServices==10.3.1
+pyobjc-framework-SystemConfiguration==10.3.1
+pyobjc-framework-SystemExtensions==10.3.1
+pyobjc-framework-ThreadNetwork==10.3.1
+pyobjc-framework-UniformTypeIdentifiers==10.3.1
+pyobjc-framework-UserNotifications==10.3.1
+pyobjc-framework-UserNotificationsUI==10.3.1
+pyobjc-framework-VideoSubscriberAccount==10.3.1
+pyobjc-framework-VideoToolbox==10.3.1
+pyobjc-framework-Virtualization==10.3.1
+pyobjc-framework-Vision==10.3.1
+pyobjc-framework-WebKit==10.3.1
+pyparsing==3.2.1
+PyRect==0.2.0
+PyScreeze==0.1.19
+pytest==7.4.4
+python-dateutil==2.9.0.post0
+pytweening==1.2.0
+pytz==2025.1
+PyWavelets==1.8.0
+PyYAML==6.0.2
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rpds-py==0.26.0
+safetensors==0.5.3
+scipy==1.15.2
+seaborn==0.13.2
+six==1.17.0
+smmap==5.0.2
+streamlit==1.44.1
+sympy==1.13.1
+tenacity==9.1.2
+tokenizers==0.21.2
+toml==0.10.2
+torch==2.6.0
+torchvision==0.21.0
+tornado==6.5.1
+tqdm==4.67.1
+transformers==4.48.0
+typing_extensions==4.12.2
+tzdata==2025.1
+ultralytics==8.3.82
+ultralytics-thop==2.0.14
+Unidecode==1.3.8
+urllib3==2.3.0
diff --git a/screen_reader/screen_reader.py b/screen_reader/screen_reader.py
new file mode 100644
index 0000000..0222ad5
--- /dev/null
+++ b/screen_reader/screen_reader.py
@@ -0,0 +1,384 @@
+import os
+import time
+import argparse
+import subprocess
+import tkinter as tk
+from typing import Tuple, Dict
+
+from AppKit import NSApp
+from PIL import Image, ImageChops
+from pynput.mouse import Button, Controller
+from macapptree import get_tree, get_app_bundle
+
+from hierarchy_dl.hierarchy import generate_hierarchy
+
+from screen_reader.utils import *
+from screen_reader.screenshot import (
+ get_app_info,
+ open_app_in_foreground,
+ screenshot_app,
+ get_app_name,
+)
+
+
+# Global mouse controller
+mouse = Controller()
+
+# Global variables and constants
+# parse_arguments() overwrites BUNDLE_ID, SAY_RATE, WELCOME, HELP,
+# SYSTEM_ACCESSIBILITY, VOICE and SKIP_GROUPS_SIZE from the command line.
+# Bundle identifier of the application being read.
+BUNDLE_ID = None
+# Passed to `say -r` by read_message().
+SAY_RATE = 190
+# Whether main() reads WELCOME_MESSAGE / HELP_MESSAGE at startup.
+WELCOME = True
+HELP = True
+# When True, get_accessibility_data() uses get_tree() instead of generate_hierarchy().
+SYSTEM_ACCESSIBILITY = False
+# Passed to `say -v` by read_message().
+VOICE = "Daniel"
+# Passed as `n` to update_accessibility_data().
+SKIP_GROUPS_SIZE = 5
+
+WELCOME_MESSAGE = "Welcome to the ScreenReader."
+# NOTE(review): "[[slnc 500]]" is presumably an embedded speech command for a
+# 500 ms pause — confirm against the `say` documentation.
+HELP_MESSAGE = (
+ "Use the arrow keys to navigate the UI elements. [[slnc 500]]\n"
+ "- Press the down arrow to navigate inside a group.\n"
+ "- Press the up arrow to navigate to the parent group.\n"
+ "- Press the left and right arrows to navigate to the previous and next elements.\n"
+ "- Press the space bar to click on a button or link.\n"
+ "- Press h to hear the instructions again.\n"
+ "- Press q to quit the ScreenReader."
+)
+# Last screenshot taken by get_accessibility_data(), kept for change detection.
+SCREEN = None
+# Handle of the most recent `say` process. It is started with an empty
+# utterance at import time so read_message() can always wait on or terminate it.
+SAY_PROCESS: subprocess.Popen = subprocess.Popen(["say", ""])
+# Node that currently has the focus; empty until main() loads the hierarchy.
+ACTIVE_ELEMENT: UIElement = {}
+
+
+def make_click_through(window: tk.Tk) -> None:
+    """
+    Makes the application's first native window ignore mouse events.
+
+    Args:
+        window (tk.Tk): The Tkinter window. The body does not reference it; it
+            always acts on the first entry of NSApp().windows().
+    """
+    NSApp().windows()[0].setIgnoresMouseEvents_(True)
+
+
+def wake_up(root: tk.Tk) -> None:
+    """
+    Restores, raises and focuses the Tkinter window, then activates the app.
+
+    Args:
+        root (tk.Tk): The Tkinter root window to bring to the foreground.
+    """
+    # Same order as before: un-minimise, raise, then grab the focus.
+    for bring_forward in (root.deiconify, root.lift, root.focus_force):
+        bring_forward()
+
+    NSApp.activateIgnoringOtherApps_(True)
+    root.update()
+
+
+def create_app(position: Tuple[int, int, int, int]) -> Tuple[tk.Tk, tk.Canvas]:
+    """
+    Builds the transparent, borderless overlay window and its canvas.
+
+    Args:
+        position (Tuple[int, int, int, int]): (x, y, width, height) of the
+            overlay; every entry is converted with int().
+
+    Returns:
+        Tuple[tk.Tk, tk.Canvas]: The Tkinter root window and the canvas packed
+        into it.
+    """
+    overlay = tk.Tk()
+    overlay.title("ScreenReader")
+    overlay.overrideredirect(True)
+    overlay.wm_attributes("-transparent", True)
+    overlay.config(bg="systemTransparent")
+
+    left, top, width, height = (int(value) for value in position)
+    overlay.geometry(f"{width}x{height}+{left}+{top}")
+
+    surface = tk.Canvas(overlay, width=width, height=height, bg="systemTransparent")
+    surface.pack()
+
+    # Click-through is switched on 100 ms later rather than immediately.
+    overlay.after(100, make_click_through, overlay)
+    return overlay, surface
+
+
+def get_message(node: UIElement) -> str:
+    """
+    Generates the spoken message for a node of the accessibility tree.
+
+    The wording depends on the node class (a leading "AX" is dropped): groups,
+    buttons, images, links and texts each have their own template, anything
+    else gets a generic one. When the node has siblings, its position among
+    them is included.
+
+    Args:
+        node (UIElement): A node from the accessibility data.
+
+    Returns:
+        str: The generated message.
+    """
+    # A missing or None value is spoken as nothing rather than as the word "None".
+    node_value = node.get("value") or ""
+    node_cls = node.get("cls", "")
+
+    if node_cls.startswith("AX"):
+        node_cls = node_cls[2:]
+
+    is_group = node_cls.endswith("Group")
+    children = node.get("children", [])
+
+    # The position must be known before the message is built: the group
+    # template embeds it, and used to receive an empty string.
+    place = ""
+    parent = node.get("parent")
+    siblings = parent.get("children", []) if parent else []
+    if len(siblings) > 1:
+        object_type = "group" if is_group else "element"
+        place = f"{ordinal(node['index'])} {object_type} out of {len(siblings)}"
+        place = f"[[slnc 500]] {place}"
+
+    if is_group:
+        message = f"{node_cls} with {len(children)} items. {'Groups content: ' if node_value else ''} {node_value}. {place}. [[slnc 500]] You are currently in a group. To interact with the items, press the down arrow."
+    elif node_cls.endswith("Button"):
+        message = f"{node_cls} with description {node_value}. You are currently on a button. To click, press the space bar."
+    elif node_cls.endswith("Image"):
+        message = f"{node_cls} with description {node_value}."
+    elif node_cls.endswith("Link"):
+        message = f"{node_cls} {node_value}. You are currently on a link. To click, press the space bar."
+    elif node_cls.endswith("Text"):
+        message = f"{node_cls} detected: {node_value}. "
+    else:
+        message = f"{node_cls}, {node_value}. {f'[[slnc 500]] {node_cls} with {len(children)} items.' if children else ''}"
+
+    # Groups already carry their position inside the template above.
+    if message and not is_group:
+        message += f" {place}."
+
+    return message
+
+
+def read_message(message: str, wait: bool = False) -> None:
+    """
+    Speaks a message through the macOS 'say' command and prints it.
+
+    Args:
+        message (str): The message to be read. Nothing happens when it is empty.
+        wait (bool): If True, the speech in progress is allowed to finish first;
+            otherwise it is terminated.
+    """
+    global SAY_PROCESS
+
+    if message:
+        if not wait:
+            SAY_PROCESS.terminate()
+        else:
+            SAY_PROCESS.wait()
+
+        print(message)
+        say_command = ["say", "-r", str(SAY_RATE), "-v", VOICE, message]
+        SAY_PROCESS = subprocess.Popen(say_command)
+
+
+def on_press(key: str, canvas: tk.Canvas, root: tk.Tk) -> Dict:
+    """
+    Handles key press events for navigating the accessibility tree and triggering actions.
+
+    Args:
+        key (str): The key that was pressed (compared case-insensitively).
+        canvas (tk.Canvas): The canvas on which the highlight rectangle is drawn.
+        root (tk.Tk): The Tkinter root window.
+
+    Returns:
+        Dict: The active node after the key has been handled.
+
+    Raises:
+        SystemExit: When "q" is pressed, after the window has been destroyed.
+    """
+    global ACTIVE_ELEMENT
+    active = ACTIVE_ELEMENT
+    old_active = active
+    key = key.lower()
+
+    # Arrow keys that simply follow one link of the current node.
+    link_for_key = {"up": "parent", "left": "prev", "right": "next"}
+
+    if key == "down" and active.get("children"):
+        active = active["children"][0]
+    elif key in link_for_key and active.get(link_for_key[key]):
+        active = active[link_for_key[key]]
+    elif key == "space":
+        # Click the centre of the active box, offset by the overlay's position.
+        x = root.winfo_x() + (active["box"][0] + active["box"][2]) // 2
+        y = root.winfo_y() + (active["box"][1] + active["box"][3]) // 2
+
+        open_app_in_foreground(BUNDLE_ID)
+        mouse.position = (x, y)
+        time.sleep(0.5)
+        mouse.click(Button.left)
+        time.sleep(0.5)
+
+        # The click may have changed the UI, so the hierarchy is refreshed.
+        active = get_accessibility_data(BUNDLE_ID)
+        wake_up(root)
+    elif key == "q":
+        root.destroy()
+        os.system("killall say")
+        # The `exit` builtin only exists when the site module is loaded.
+        raise SystemExit(0)
+    elif key == "h":
+        read_message(HELP_MESSAGE)
+
+    # Nodes reference each other (parent/children/prev/next), so they are
+    # compared by identity; `!=` would walk the cyclic structure.
+    if active is not old_active:
+        os.system("clear")
+        print(active["cls"], active["value"], active["box"])
+
+        canvas.delete("all")
+        # Same group test as get_message(), so "AXGroup" is highlighted as a group too.
+        color = "red" if active["cls"].endswith("Group") else "green"
+        canvas.create_rectangle(*active["box"], outline=color)
+
+        message = get_message(active)
+        read_message(message)
+
+    return active
+
+
+def find_element(element: UIElement, data: UIElement) -> UIElement:
+    """
+    Searches the tree for a node whose box overlaps the given element's box.
+
+    Args:
+        element (UIElement): The element to look for.
+        data (UIElement): Root of the tree to search.
+
+    Returns:
+        UIElement: The first visited node whose IoU with the element exceeds
+        0.8, or `data` itself when no node qualifies.
+    """
+    wanted_box = element["box"]
+    pending = [data]
+
+    while pending:
+        candidate = pending.pop()
+
+        if iou(wanted_box, candidate["box"]) > 0.8:
+            return candidate
+
+        if "children" in candidate:
+            pending.extend(candidate["children"])
+
+    return data
+
+
+def map_system_accessibility_to_ui_element(accessibility: dict) -> UIElement:
+    """
+    Converts a system accessibility tree, in place, to the UIElement layout.
+
+    For every node: "role" becomes "cls"; "visible_bbox" (or "bbox" when that
+    key is absent) becomes "box"; role description, description and value are
+    joined into "value"; every other key except "children" is removed.
+
+    Args:
+        accessibility (dict): Root node of the system accessibility tree.
+
+    Returns:
+        UIElement: The same root object, converted.
+    """
+    kept_keys = {"cls", "box", "value", "children"}
+    pending = [accessibility]
+
+    while pending:
+        # Each node is converted independently, so the visiting order does not
+        # matter; popping from the end avoids the O(n) cost of list.pop(0).
+        element = pending.pop()
+
+        if element["children"]:
+            pending.extend(element["children"])
+
+        element["cls"] = element.pop("role")
+
+        box = element.pop("visible_bbox") if "visible_bbox" in element else element.pop("bbox")
+        # Coordinates are doubled here because update_accessibility_data()
+        # halves every box afterwards.
+        element["box"] = tuple(2 * int(coord) for coord in box) if box else (0, 0, 0, 0)
+
+        description = f"{element['role_description'] or ''} {element['description'] or ''} {element['value'] or ''}"
+        element["value"] = description.strip()
+
+        for key in set(element) - kept_keys:
+            del element[key]
+
+    return accessibility
+
+def get_accessibility_data(bundle_id: str) -> UIElement:
+    """
+    Generates accessibility data by taking a screenshot of the target app and
+    processing it.
+
+    When the new screenshot has the same size and pixels as the previous one,
+    the current ACTIVE_ELEMENT is returned unchanged. Otherwise the hierarchy
+    is rebuilt (from the system accessibility tree or from the screenshot,
+    depending on SYSTEM_ACCESSIBILITY) and the node overlapping the previously
+    active element is returned, falling back to the root.
+
+    Args:
+        bundle_id (str): The bundle ID of the target application.
+
+    Returns:
+        UIElement: The node that should become active.
+    """
+    global SCREEN, ACTIVE_ELEMENT
+
+    open_app_in_foreground(bundle_id)
+    screen_path = screenshot_app(bundle_id, "./screenshots/")[0]
+    image = Image.open(screen_path)
+
+    # ImageChops.difference only looks at the area both images share, so a
+    # resized window has to be treated as changed before pixels are compared.
+    if SCREEN is not None and SCREEN.size == image.size:
+        diff = ImageChops.difference(SCREEN.convert("RGB"), image.convert("RGB"))
+        if not diff.getbbox():
+            return ACTIVE_ELEMENT
+
+    SCREEN = image
+
+    # Generate hierarchy
+    if SYSTEM_ACCESSIBILITY:
+        data = get_tree(bundle_id)
+        data = map_system_accessibility_to_ui_element(data)
+    else:
+        data = generate_hierarchy(SCREEN).to_dict()
+
+    # add prev, next and parent links
+    active = update_accessibility_data(data, n=SKIP_GROUPS_SIZE)
+
+    # Try to find the previously active element on the new screen
+    if ACTIVE_ELEMENT:
+        active = find_element(ACTIVE_ELEMENT, active)
+
+    return active
+
+
+def parse_arguments():
+    """
+    Parses command-line arguments into the module-level settings.
+
+    Sets BUNDLE_ID, WELCOME, HELP, SAY_RATE, SYSTEM_ACCESSIBILITY, VOICE and
+    SKIP_GROUPS_SIZE. Exits with a usage error when no target application can
+    be determined from --bundle_id or --name.
+    """
+    global BUNDLE_ID, WELCOME, HELP, SAY_RATE, SYSTEM_ACCESSIBILITY, VOICE, SKIP_GROUPS_SIZE
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-b", "--bundle_id", type=str, help="The bundle ID of the target application")
+    parser.add_argument("-n", "--name", type=str, help="Name of the target application (alternative to bundle_id)")
+    parser.add_argument("-dw", "--deactivate_welcome", action="store_true", help="Deactivate welcome message. Do not read \"Welcome to the ScreenReader.\" at startup.")
+    parser.add_argument("-dh", "--deactivate_help", action="store_true", help="Deactivate help message. Do not read the help message at startup.")
+    parser.add_argument("-r", "--rate", type=int, default=SAY_RATE, help=f"The speech rate for the 'say' command. Default: {SAY_RATE}")
+    parser.add_argument("-v", "--voice", help="The voice to use for the 'say' command. See 'say -v \"?\" | grep en' for a list of available voices.", default=VOICE)
+    parser.add_argument("-sa", "--system_accessibility", action="store_true", help="Use system accessibility data instead of generating it")
+    parser.add_argument("-sk", "--skip-groups", type=int, default=SKIP_GROUPS_SIZE, help=f"Skip groups with less than n children. Default: {SKIP_GROUPS_SIZE}")
+    args = parser.parse_args()
+
+    # An explicit bundle ID wins; the name is only resolved when it is missing.
+    if args.name and not args.bundle_id:
+        BUNDLE_ID = get_app_bundle(args.name)
+    else:
+        BUNDLE_ID = args.bundle_id
+
+    # Without a target there is nothing to read; fail here instead of later
+    # with an unrelated error.
+    if not BUNDLE_ID:
+        parser.error("a target application is required: pass --bundle_id or --name")
+
+    WELCOME = not args.deactivate_welcome
+    HELP = not args.deactivate_help
+    SAY_RATE = args.rate
+    SYSTEM_ACCESSIBILITY = args.system_accessibility
+    VOICE = args.voice
+    SKIP_GROUPS_SIZE = args.skip_groups
+
+
+def main():
+    """
+    Main entry point for the ScreenReader application.
+
+    Parses the command line, optionally reads the welcome and help messages,
+    loads the UI hierarchy of the target application, creates the overlay
+    window over it and enters the Tk main loop with a key handler installed.
+    """
+    parse_arguments()
+
+    global ACTIVE_ELEMENT, SAY_PROCESS, WELCOME, HELP, WELCOME_MESSAGE, HELP_MESSAGE, BUNDLE_ID
+
+    if WELCOME:
+        read_message(WELCOME_MESSAGE)
+
+    if HELP:
+        read_message(HELP_MESSAGE, wait=True)
+
+    ACTIVE_ELEMENT = get_accessibility_data(BUNDLE_ID)
+
+    # app_info is (window id, window name, (x, y, width, height)).
+    app_info = get_app_info(BUNDLE_ID)
+    app_name = get_app_name(BUNDLE_ID)
+    children_count = len(ACTIVE_ELEMENT.get("children", []))
+    ACTIVE_ELEMENT["value"] = (
+        f"{app_name}, global window, {app_info[1].replace('_', ' ')}. "
+        f"Press down to view {children_count} items."
+    )
+
+    root, canvas = create_app(app_info[2])
+    canvas.create_rectangle(*ACTIVE_ELEMENT["box"], outline="red")
+
+    def on_press_listener(key: str) -> None:
+        global ACTIVE_ELEMENT
+        print(key)
+        ACTIVE_ELEMENT = on_press(key, canvas, root)
+
+    # Tk rejects an empty event sequence; "<Key>" delivers every key press,
+    # and the handler receives its keysym ("Down", "space", "q", ...).
+    root.bind("<Key>", lambda event: on_press_listener(event.keysym))
+    wake_up(root)
+
+    message = get_message(ACTIVE_ELEMENT)
+    read_message(message, wait=True)
+    root.mainloop()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/screen_reader/screenshot.py b/screen_reader/screenshot.py
new file mode 100644
index 0000000..e79c96f
--- /dev/null
+++ b/screen_reader/screenshot.py
@@ -0,0 +1,154 @@
+import os
+import time
+import Quartz
+import AppKit
+import subprocess
+from typing import Iterable, List, Dict, AnyStr, Union, Iterator, Tuple
+
+class ScreencaptureEx(Exception):
+    """Raised by take_screenshot() when the screencapture command fails."""
+
+# One window description as returned by get_window_info(): a mapping from
+# kCGWindow* keys to their values.
+WindowInfo = Dict[AnyStr, Union[AnyStr, int]]
+
+# NOTE(review): not referenced by any code visible in this file — confirm it is still needed.
+USER_OPTS_STR = "exclude_desktop on_screen_only"
+# Extension that screenshot_app() requests for the screenshot files.
+FILE_EXT = "png"
+# Shell command template for capturing a single window, selected by {window}, into {filename}.
+COMMAND = 'screencapture {options} -l {window} -o "{filename}"'
+# Exit status that take_screenshot() treats as success.
+SUCCESS = 0
+# Window name that gen_window_ids() skips.
+STATUS_BAR_WINDOW_IDENTIFIER = "Item-0"
+
+
+def get_window_info() -> List[WindowInfo]:
+    """Returns the Quartz window list for on-screen windows, desktop elements excluded."""
+    list_options = (
+        Quartz.kCGWindowListOptionAll
+        | Quartz.kCGWindowListExcludeDesktopElements
+        | Quartz.kCGWindowListOptionOnScreenOnly
+    )
+    return Quartz.CGWindowListCopyWindowInfo(list_options, Quartz.kCGNullWindowID)
+
+
+def gen_ids_from_info(
+    windows: Iterable[WindowInfo],
+) -> List[Tuple[int, str, str, Tuple[float, float, float, float]]]:
+    """
+    Extracts the fields of interest from a list of window descriptions.
+
+    Args:
+        windows: Window dictionaries keyed by kCGWindow* names.
+
+    Returns:
+        One (window number, owner name, window name, (x, y, width, height))
+        tuple per window. Missing names default to "", missing bounds to 0.
+    """
+    result = []
+    for win_dict in windows:
+        owner = win_dict.get("kCGWindowOwnerName", "")
+        num = win_dict.get("kCGWindowNumber", "")
+        name = win_dict.get("kCGWindowName", "")
+        # The default must be a mapping: a string default cannot be indexed by key.
+        bounds = win_dict.get("kCGWindowBounds") or {}
+        box = tuple(bounds.get(key, 0) for key in ("X", "Y", "Width", "Height"))
+
+        result.append((num, owner, name, box))
+    return result
+
+
+def gen_window_ids(
+    parent: str,
+) -> List[Tuple[int, str, Tuple[float, float, float, float]]]:
+    """
+    Lists the windows owned by the given application.
+
+    Args:
+        parent: Owner (application) name, matched case-insensitively.
+
+    Returns:
+        One (window number, window name, (x, y, width, height)) tuple per
+        matching window. Spaces in the window name are replaced by
+        underscores; the status bar window is skipped.
+    """
+    windows = get_window_info()
+    parent = parent.lower()
+    result = []
+
+    for num, owner, window_name, bounds in gen_ids_from_info(windows):
+        if parent != owner.lower():
+            continue
+
+        if window_name == STATUS_BAR_WINDOW_IDENTIFIER:
+            print(f"Skipping status bar window: {num}")
+            continue
+
+        result.append((num, window_name.replace(" ", "_"), tuple(bounds)))
+
+    return result
+
+
+def take_screenshot(window: int, filename: str, output_folder: str) -> str:
+    """
+    Captures one window into a file with the screencapture command.
+
+    Args:
+        window: Identifier of the window to capture.
+        filename: Name of the file to create inside output_folder.
+        output_folder: Target directory; created when it does not exist.
+
+    Returns:
+        The path of the screenshot file.
+
+    Raises:
+        ScreencaptureEx: If screencapture cannot be started or exits with a
+            non-zero status.
+    """
+    filename = os.path.join(output_folder, filename)
+
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+
+    # The file name contains the window title, which is arbitrary external
+    # text, so the command is run from an argument list without a shell.
+    command = ["screencapture", "-l", str(window), "-o", filename]
+    try:
+        completed = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
+    except OSError as err:
+        raise ScreencaptureEx(f"Error: screencapture output: {err}") from err
+
+    if completed.returncode != SUCCESS:
+        raise ScreencaptureEx(f"Error: screencapture output: {completed.stdout.strip()}")
+
+    return filename
+
+
+def get_filename(window_name, extension) -> str:
+    """Builds "<window_name>:<current time, two decimals>.<extension>"."""
+    stamp = f"{time.time():.2f}"
+    return f"{window_name}:{stamp}.{extension}"
+
+
+def gen_windows(
+    application_name: str,
+) -> List[Tuple[int, str, Tuple[float, float, float, float]]]:
+    """
+    Returns the windows of an application, reporting when there are none.
+
+    Args:
+        application_name: Owner name passed on to gen_window_ids().
+
+    Returns:
+        The list produced by gen_window_ids(); empty when no window matches,
+        in which case a notice is printed.
+    """
+    # gen_window_ids() already returns a fresh list, so no copy is needed.
+    windows = gen_window_ids(application_name)
+    if not windows:
+        print(f"Window with parent {application_name} not found.")
+    return windows
+
+
+def screenshot_windows(
+    app_name: str,
+    output_folder: str,
+    extension: str = "",
+) -> Iterator[str]:
+    """
+    Lazily screenshots every window of an application.
+
+    Yields:
+        The path returned by take_screenshot() for each window, in the order
+        gen_windows() lists them.
+    """
+    for window_id, title, _bounds in gen_windows(app_name):
+        target_name = get_filename(title, extension)
+        yield take_screenshot(window_id, target_name, output_folder)
+
+
+def screenshot_application_windows(name: str, output_folder: str, extension: str) -> List[str]:
+    """Screenshots all windows of the application and returns the file paths as a list."""
+    return list(screenshot_windows(name, output_folder, extension))
+
+
+def running_app(app_bundle):
+    """Returns the first running application with the given bundle identifier, or None."""
+    candidates = AppKit.NSWorkspace.sharedWorkspace().runningApplications()
+    return next(
+        (app for app in candidates if app.bundleIdentifier() == app_bundle),
+        None,
+    )
+
+
+def screenshot_app(app_bundle: str, output_folder: str) -> List[str]:
+    """
+    Screenshots every window of the running application with the given bundle ID.
+
+    Args:
+        app_bundle: Bundle identifier of the application.
+        output_folder: Directory for the screenshots; created when missing.
+
+    Returns:
+        The paths of the screenshot files, one per window.
+
+    Raises:
+        ScreencaptureEx: If no running application has this bundle identifier.
+    """
+    assert app_bundle is not None, "Application bundle is not specified"
+    assert output_folder is not None, "Output folder is not specified"
+
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+
+    app = running_app(app_bundle)
+    # running_app() returns None for an unknown bundle; report that clearly
+    # instead of failing on the attribute access below.
+    if app is None:
+        raise ScreencaptureEx(f"Application with bundle ID {app_bundle} is not running")
+
+    return screenshot_application_windows(app.localizedName(), output_folder, FILE_EXT)
+
+
+# def get_window_position(processIdentifier):
+# window_list = Quartz.CGWindowListCopyWindowInfo(
+# Quartz.kCGWindowListOptionOnScreenOnly,
+# Quartz.kCGNullWindowID
+# )
+
+# for window in window_list:
+# if window['kCGWindowOwnerPID'] == processIdentifier:
+# return window['kCGWindowBounds']['X'], window['kCGWindowBounds']['Y']
+
+# return None
+
+
+def open_app_in_foreground(app_bundle: str, wait_time: float = 2):
+    """
+    Opens the application with the given bundle ID and then waits.
+
+    Args:
+        app_bundle: Bundle identifier handed to `open -b`.
+        wait_time: Seconds to sleep after the command has been issued.
+    """
+    # An argument list keeps the bundle ID from being interpreted by a shell.
+    # As before, the exit status of `open` is not checked.
+    subprocess.run(["open", "-b", str(app_bundle)])
+    time.sleep(wait_time)
+
+
+def get_app_name(bundle_id: str):
+    """Returns the localized name of the running application with this bundle ID."""
+    return running_app(bundle_id).localizedName()
+
+def get_app_info(bundle_id: str):
+    """Returns the last entry gen_windows() lists for the application: (id, name, bounds)."""
+    app_windows = gen_windows(get_app_name(bundle_id))
+    return app_windows[-1]
diff --git a/screen_reader/utils.py b/screen_reader/utils.py
new file mode 100644
index 0000000..6f73605
--- /dev/null
+++ b/screen_reader/utils.py
@@ -0,0 +1,207 @@
+from typing import List, TypedDict, Dict, Optional
+
+class UIElement(TypedDict):
+ """A node of the UI hierarchy that the screen reader navigates."""
+ # Bounding box as four coordinates; iou() unpacks it as (x1, y1, x2, y2).
+ box: List[int]
+ # Element class / role name; group nodes have a name ending in "Group".
+ cls: str
+ # Text describing the element; may be None.
+ value: Optional[str]
+ # Enclosing node; update_accessibility_data() sets it to None for the root.
+ parent: "UIElement"
+ # Child nodes.
+ children: List["UIElement"]
+ # Previous / next sibling; the links wrap around at both ends of the sibling list.
+ prev: "UIElement"
+ next: "UIElement"
+ # 1-based position among the siblings.
+ index: int
+
+def ordinal(n: int) -> str:
+    """
+    Formats an integer with its English ordinal suffix (e.g., 2 -> '2nd').
+    """
+    last_two = n % 100
+    if last_two < 11 or last_two > 13:
+        # Only a final 1, 2 or 3 gets a special suffix.
+        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
+    else:
+        # 11, 12 and 13 are the exceptions: always "th".
+        suffix = "th"
+
+    return f"{n}{suffix}"
+
+
+def iou(box1: List[int], box2: List[int]) -> float:
+    """
+    Computes the Intersection over Union (IoU) of two boxes.
+
+    Args:
+        box1 (List[int]): First box as (x1, y1, x2, y2).
+        box2 (List[int]): Second box as (x1, y1, x2, y2).
+
+    Returns:
+        float: Overlap area divided by the area of the union, or 0 when the
+        union has no area.
+    """
+    ax1, ay1, ax2, ay2 = box1
+    bx1, by1, bx2, by2 = box2
+
+    # Extent of the overlap along each axis; clamped to 0 when disjoint.
+    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
+    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
+    overlap = overlap_w * overlap_h
+
+    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - overlap
+
+    return overlap / union if union != 0 else 0
+
+
+def remove_small_groups(node: Dict, n: int):
+    """
+    Flattens groups with fewer than n children into their parent, in place.
+
+    Subtrees are processed first. Every child of `node` whose class ends in
+    "group" (case-insensitive) and that has fewer than n children is replaced
+    by its own children. The children are then re-sorted by position and their
+    index / prev / next links are rebuilt.
+
+    Args:
+        node (Dict): The node whose subtree is flattened.
+        n (int): Minimum number of children a group needs to be kept.
+    """
+    if "children" not in node:
+        return
+
+    # Bottom-up: flatten the subtrees before looking at the direct children.
+    for sub in node["children"]:
+        remove_small_groups(sub, n)
+
+    def is_small_group(item: Dict) -> bool:
+        return (
+            "children" in item
+            and len(item["children"]) < n
+            and item["cls"].lower().endswith("group")
+        )
+
+    pos = 0
+    while pos < len(node["children"]):
+        current = node["children"][pos]
+
+        if not is_small_group(current):
+            pos += 1
+            continue
+
+        # Hand the group's children over to this node and detach the group.
+        promoted = current["children"]
+        for grandchild in promoted:
+            grandchild["parent"] = node
+
+        current["parent"] = None
+        current["children"] = []
+        current["to_remove"] = True
+
+        # `pos` is not advanced, so the promoted entries are examined next.
+        siblings = node["children"]
+        node["children"] = siblings[:pos] + promoted + siblings[pos + 1:]
+
+    def sort_heuristic(c: Dict) -> float:
+        """
+        Sort key derived from the top-left corner of the child's box.
+        """
+        x, y = c["box"][0], c["box"][1]
+        parent = c["parent"]
+
+        if parent is None:
+            return y ** 2 + 0.5 * x ** 2
+
+        # Measured relative to the parent's top-left corner.
+        parent_x, parent_y = parent["box"][0], parent["box"][1]
+        return (y - parent_y) + 0.3 * (x - parent_x)
+
+    node["children"].sort(key=sort_heuristic)
+
+    # Drop anything still marked for removal.
+    node["children"] = [c for c in node["children"] if "to_remove" not in c]
+
+    # Rebuild the circular sibling links and the 1-based indices.
+    total = len(node["children"])
+    for j, c in enumerate(node["children"]):
+        c["index"] = j + 1
+        c["prev"] = node["children"][j - 1]  # j - 1 == -1 wraps to the last child
+        c["next"] = node["children"][(j + 1) % total]
+
+
+def add_description_to_groups(node: Dict, max_len: int = 100) -> None:
+    """
+    Adds a description to group nodes based on their children.
+
+    The description of a group is the comma-separated list of its children's
+    values (children whose value is None are skipped). When the list grows
+    beyond max_len characters it is cut to max_len and ", and other..." is
+    appended. Subtrees are processed first, with the same limit.
+
+    Args:
+        node (Dict): The node to process; modified in place.
+        max_len (int): Maximum length of the joined values before truncation.
+    """
+    if "children" not in node:
+        return
+
+    # The limit is passed down so nested groups are truncated like the root.
+    for child in node["children"]:
+        add_description_to_groups(child, max_len)
+
+    if not node["cls"].lower().endswith("group"):
+        return
+
+    descriptions = [
+        child["value"] for child in node["children"] if child["value"] is not None
+    ]
+
+    value = descriptions[0] if descriptions else ""
+    for item in descriptions[1:]:
+        value = f"{value}, {item}" if value else item
+
+        # Stop collecting once the limit is exceeded; the overlong text is
+        # kept so the truncation below marks it with the suffix.
+        if len(value) > max_len:
+            break
+
+    if len(value) > max_len:
+        value = value[:max_len] + ", and other..."
+
+    node["value"] = value
+
+
+def update_accessibility_data(data: Dict, n: int = 0) -> UIElement:
+    """
+    Updates the accessibility data by adding parent, next, and previous links to nodes.
+    Also, remove groups that have less than n children.
+
+    Every box is halved (integer division), children are sorted by position,
+    and a child value containing "|" is reduced to the part before the first
+    "|". Finally small groups are flattened and group descriptions are
+    generated with a limit of 30 characters.
+
+    Args:
+        data (Dict): The original accessibility data; modified in place.
+        n (int): The minimum number of children a group must have to be retained.
+            Defaults to 0.
+
+    Returns:
+        Dict: The updated accessibility data (the same object as `data`).
+    """
+    data["parent"] = None
+    data["next"] = None
+    data["prev"] = None
+    data["index"] = 1
+
+    stack = [data]
+    while stack:
+        node = stack.pop()
+
+        # Scale down the bounding box coordinates.
+        node["box"] = [coord // 2 for coord in node["box"]]
+
+        if "children" not in node:
+            continue
+
+        children = node["children"]
+        children.sort(key=lambda x: (x["box"][1] ** 2 + 0.5 * x["box"][0] ** 2))
+
+        last = len(children) - 1
+        for i, child in enumerate(children):
+            child["parent"] = node
+            child["index"] = i + 1
+            # Sibling links wrap around at both ends.
+            child["prev"] = children[i - 1] if i > 0 else children[-1]
+            child["next"] = children[i + 1] if i < last else children[0]
+
+            if child["value"] and "|" in child["value"]:
+                child["value"] = child["value"].split("|")[0].strip()
+
+            stack.append(child)
+
+    # Remove groups with less than n children
+    remove_small_groups(data, n)
+
+    add_description_to_groups(data, max_len=30)
+
+    return data
\ No newline at end of file