From 2f2bf19d4d9c61c95557c29b8d2f3dd098861d7e Mon Sep 17 00:00:00 2001
From: Mike Gvozdev <mv.gvozdev@gmail.com>
Date: Sun, 16 Nov 2025 19:41:44 -0500
Subject: [PATCH 1/2] Fix LLM looping on complex graphics

---
 .../object-detection-llm.py                   | 81 +++++++++++--------
 .../object-detection.schema.json              | 58 +++++--------
 utils/llm/prompts.py                          | 34 ++++----
 3 files changed, 79 insertions(+), 94 deletions(-)

diff --git a/preprocessors/object-detection-llm/object-detection-llm.py b/preprocessors/object-detection-llm/object-detection-llm.py
index b2930c8c..34687190 100644
--- a/preprocessors/object-detection-llm/object-detection-llm.py
+++ b/preprocessors/object-detection-llm/object-detection-llm.py
@@ -68,33 +68,30 @@ def normalize_bbox(bbox, width, height):
     ]
 
 
-def process_objects(objects, threshold):
+def process_objects(qwen_output, width, height, threshold):
     """
-    Process detected objects by filtering, transforming, and enriching them.
+    Transform Qwen object detection output to IMAGE schema format.
 
-    - Filters objects by confidence threshold
+    - Transforms from Qwen format (bbox_2d, label) to IMAGE format
+    - Normalizes bounding boxes to [0,1] range
+    - Sets every object's confidence to the given threshold value
     - Normalizes labels (replaces underscores with spaces)
-    - Renumbers IDs sequentially
     - Calculates geometric properties (area, centroid)
+    - Keeps every object; no confidence-based filtering is applied
 
     Args:
-        objects (list): List of detected objects with confidence scores
+        qwen_output (list): Qwen detection output with bbox_2d and label
+        width (int): Image width for normalization
+        height (int): Image height for normalization
         threshold (float): Minimum confidence score (0-1)
 
     Returns:
         list: Processed objects with computed properties
     """
     processed = []
-    for obj in objects:
-        if obj.get("confidence", 0) >= threshold:
-            obj['type'] = obj['type'].replace('_', ' ')
-            processed.append(obj)
-
-    # Renumber IDs sequentially after filtering
-    for idx, obj in enumerate(processed):
-        obj['ID'] = idx
-
-        x1, y1, x2, y2 = obj["dimensions"]
+    for idx, item in enumerate(qwen_output):
+        # Normalize bounding box
+        x1, y1, x2, y2 = normalize_bbox(item["bbox_2d"], width, height)
 
         # Calculate area (width * height)
         area = (x2 - x1) * (y2 - y1)
@@ -103,13 +100,20 @@ def process_objects(objects, threshold):
         centroid_x = (x1 + x2) / 2
         centroid_y = (y1 + y2) / 2
 
-        # Create object entry according to schema
-        obj["area"] = area
-        obj["centroid"] = [centroid_x, centroid_y]
+        # Create object entry according to IMAGE schema
+        obj = {
+            "ID": idx,
+            "type": item["label"].replace('_', ' '),
+            "dimensions": [x1, y1, x2, y2],
+            "confidence": threshold,
+            "area": area,
+            "centroid": [centroid_x, centroid_y]
+        }
+
+        processed.append(obj)
 
     logging.debug(
-        f"Processed {len(objects)} objects to {len(processed)} "
-        f"objects with confidence >= {threshold}"
+        f"Processed {len(qwen_output)} objects from Qwen output"
     )
     return processed
 
@@ -155,35 +159,42 @@ def detect_objects():
     if error:
         return jsonify(error), error["code"]
 
+    stop_tokens = [
+        "<|im_end|>",          # Qwen's end token
+        "<|endoftext|>",        # Alternative end token
+        "\n\n\n",               # Triple newline
+        "```",                  # Code block end
+    ]
+
     try:
         # Get object info
-        object_json = llm_client.chat_completion(
+        qwen_output = llm_client.chat_completion(
             prompt=OBJECT_DETECTION_PROMPT,
             image_base64=base64_image,
             json_schema=BBOX_RESPONSE_SCHEMA,
-            temperature=0.0,
-            parse_json=True
+            temperature=0.5,
+            parse_json=True,
+            stop=stop_tokens
         )
 
-        if object_json is None or len(object_json.get("objects", [])) == 0:
+        logging.debug(f"Qwen output received: {qwen_output}")
+
+        if qwen_output is None or len(qwen_output) == 0:
             logging.error("Failed to extract objects from the graphic.")
             return jsonify({"error": "No objects extracted"}), 204
 
-        # Normalize bounding boxes
+        # Transform Qwen format to IMAGE schema format
         width, height = pil_image.size
-        for obj in object_json["objects"]:
-            # Normalize bounding boxes
-            obj["dimensions"] = normalize_bbox(
-                obj["dimensions"], width, height
-            )
-
-        # Filter objects by confidence threshold, add area and centroid,
-        # remove underscores from labels, and renumber IDs
-        object_json["objects"] = process_objects(
-            object_json["objects"],
+        processed_objects = process_objects(
+            qwen_output,
+            width,
+            height,
             CONF_THRESHOLD
         )
 
+        # Wrap in "objects" for schema compliance
+        object_json = {"objects": processed_objects}
+
         logging.pii(f"Normalized output: {object_json}")
 
         # Data schema validation
diff --git a/preprocessors/object-detection-llm/object-detection.schema.json b/preprocessors/object-detection-llm/object-detection.schema.json
index ed9a6d92..35133565 100644
--- a/preprocessors/object-detection-llm/object-detection.schema.json
+++ b/preprocessors/object-detection-llm/object-detection.schema.json
@@ -1,45 +1,23 @@
 {
   "$schema": "http://json-schema.org/draft-07/schema",
-  "type": "object",
+  "type": "array",
   "title": "Object Detection Data",
-  "description": "Detected object data with bounding boxes.",
-  "definitions": {
-    "object": {
-      "type": "object",
-      "title": "BoundingBoxItem",
-      "properties": {
-        "ID": {
-          "description": "A number identifying this object in the set.",
-          "type": "integer"
-        },
-        "type": {
-          "description": "The type of object detected (e.g., 'person', 'car').",
-          "type": "string"
-        },
-        "dimensions": {
-          "description": "Bounding box coordinates of this object [x1, y1, x2, y2].",
-          "type": "array",
-          "items": { "type": "number" },
-          "minItems": 4,
-          "maxItems": 4,
-          "additionalItems": false
-        },
-        "confidence": {
-          "description": "Confidence in the correctness of this object's data (0-1).",
-          "type": "number",
-          "minimum": 0,
-          "maximum": 1
-        }
+  "description": "Detected object data with bounding boxes in Qwen format.",
+  "items": {
+    "type": "object",
+    "properties": {
+      "bbox_2d": {
+        "description": "Bounding box coordinates [x1, y1, x2, y2].",
+        "type": "array",
+        "items": { "type": "number" },
+        "minItems": 4,
+        "maxItems": 4
       },
-      "required": ["ID", "type", "dimensions", "confidence"]
-    }
-  },
-  "properties": {
-    "objects": {
-      "description": "The set of detected objects in the image.",
-      "type": "array",
-      "items": { "$ref": "#/definitions/object" }
-    }
-  },
-  "required": ["objects"]
+      "label": {
+        "description": "The type of object detected (e.g., 'person', 'car').",
+        "type": "string"
+      }
+    },
+    "required": ["bbox_2d", "label"]
+  }
 }
\ No newline at end of file
diff --git a/utils/llm/prompts.py b/utils/llm/prompts.py
index ce686ce7..44f7be73 100644
--- a/utils/llm/prompts.py
+++ b/utils/llm/prompts.py
@@ -10,39 +10,35 @@
 """
 # Object detection
 OBJECT_DETECTION_PROMPT = """
-Give the bounding boxes for the objects found in this image.
+Step 1:
+Determine from 0 to 10 major and important objects in the image.
+Focus ONLY on the objects that are clearly visible and identifiable.
+
+Step 2:
+Give the bounding boxes for the objects determined in the first step.
 Output a only JSON list of bounding boxes where each entry contains:
-- the unique numeric ID in the key "ID",
-- the object label in the key "type",
-- the pixel coordinates of a 2D bounding box in the key "dimensions",
-- and the confidence score in the key "confidence".
+- the pixel coordinates of a 2D bounding box in the key "bbox_2d",
+- the object label in the key "label".
 
 Example:
 ```json
-{
-  "objects": [
+[
     {
-        "ID": 0,
-        "type": "car",
-        "dimensions": [120, 200, 300, 450],
-        "confidence": 0.92
+        "bbox_2d": [120, 200, 300, 450],
+        "label": "car"
     },
     {
-        "ID": 1,
-        "type": "person",
-        "dimensions": [50, 100, 120, 300],
-        "confidence": 0.95
+        "bbox_2d": [50, 100, 120, 300],
+        "label": "person"
     }
-  ]
-}
-
+]
 ```
 Ensure that the bounding boxes are in the format [x1, y1, x2, y2].
 
 Rules:
 1. Focus ONLY on the major and important objects in the image.
 2. The graphic can contain any number of objects, from zero to many.
-3. If no objects are detected, return an empty list: {"objects": []}.
+3. If no objects are detected, return an empty list: [].
 4. Use simple and common object labels (e.g., "car", "person", "tree").
 5. Include ONLY objects that are clearly visible and identifiable.
 6. Multiple objects can have the same confidence score.

From 85c822026bd2ad299690ffd1393d6e274869d832 Mon Sep 17 00:00:00 2001
From: Mike Gvozdev <44172510+gvzdv@users.noreply.github.com>
Date: Tue, 25 Nov 2025 11:15:42 -0500
Subject: [PATCH 2/2] Clarify argument descriptions in object detection
 function

---
 preprocessors/object-detection-llm/object-detection-llm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/preprocessors/object-detection-llm/object-detection-llm.py b/preprocessors/object-detection-llm/object-detection-llm.py
index 34687190..011a6e15 100644
--- a/preprocessors/object-detection-llm/object-detection-llm.py
+++ b/preprocessors/object-detection-llm/object-detection-llm.py
@@ -81,8 +81,8 @@ def process_objects(qwen_output, width, height, threshold):
 
     Args:
         qwen_output (list): Qwen detection output with bbox_2d and label
-        width (int): Image width for normalization
-        height (int): Image height for normalization
+        width (int): Image width in pixels for normalization
+        height (int): Image height in pixels for normalization
         threshold (float): Minimum confidence score (0-1)
 
     Returns: