From a13998554a3d98d011f35cab5395e286e942bbdd Mon Sep 17 00:00:00 2001
From: siromermer <siromermer@gmail.com>
Date: Sun, 24 May 2026 15:21:24 +0300
Subject: [PATCH] Add GroundingDINO ONNX Runtime Python example

---
 python/README.md                              |   4 +
 python/models/grounding_dino/.gitignore       |   1 +
 python/models/grounding_dino/README.md        |  70 +++++
 .../infer_grounding_dino_onnxruntime.py       | 258 ++++++++++++++++++
 python/models/grounding_dino/requirements.txt |   6 +
 5 files changed, 339 insertions(+)
 create mode 100644 python/models/grounding_dino/.gitignore
 create mode 100644 python/models/grounding_dino/README.md
 create mode 100644 python/models/grounding_dino/infer_grounding_dino_onnxruntime.py
 create mode 100644 python/models/grounding_dino/requirements.txt

diff --git a/python/README.md b/python/README.md
index 4e5897e7..fe7b750a 100644
--- a/python/README.md
+++ b/python/README.md
@@ -14,3 +14,7 @@ These samples show very minimal API usage that is not execution provider specifi
 ## AzureML
  
 [Question answering with BERT on AzureML](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/python/azureml)
+
+## Models
+
+[GroundingDINO zero-shot object detection with ONNX Runtime](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/python/models/grounding_dino)
diff --git a/python/models/grounding_dino/.gitignore b/python/models/grounding_dino/.gitignore
new file mode 100644
index 00000000..55d84739
--- /dev/null
+++ b/python/models/grounding_dino/.gitignore
@@ -0,0 +1 @@
+output.jpg
diff --git a/python/models/grounding_dino/README.md b/python/models/grounding_dino/README.md
new file mode 100644
index 00000000..8dab8d68
--- /dev/null
+++ b/python/models/grounding_dino/README.md
@@ -0,0 +1,70 @@
+# GroundingDINO with ONNX Runtime
+
+This example runs GroundingDINO zero-shot object detection with ONNX Runtime.
+It uses Hugging Face Transformers for preprocessing and post-processing, and
+ONNX Runtime for model execution.
+
+## Model
+
+The default model is
+[`onnx-community/grounding-dino-tiny-ONNX`](https://huggingface.co/onnx-community/grounding-dino-tiny-ONNX).
+The example downloads the processor files and, unless `--model-path` is given, only the requested ONNX file from the Hugging Face Hub.
+
+The default ONNX file is `onnx/model_quantized.onnx`. It is smaller than the
+full precision model and is suitable for CPU execution.
+
+## Setup
+
+Install the dependencies (the script requires Python 3.10 or newer):
+
+```bash
+pip install onnxruntime
+pip install -r requirements.txt
+```
+
+For GPU execution, install the ONNX Runtime GPU package instead of the CPU
+package, then run the script with `--provider CUDAExecutionProvider`:
+
+```bash
+pip install onnxruntime-gpu
+pip install -r requirements.txt
+```
+
+## Run
+
+```bash
+python infer_grounding_dino_onnxruntime.py \
+  --image http://images.cocodataset.org/val2017/000000039769.jpg \
+  --text "a cat. a remote control." \
+  --output output.jpg
+```
+
+Run with a specific execution provider:
+
+```bash
+python infer_grounding_dino_onnxruntime.py \
+  --provider CPUExecutionProvider \
+  --image http://images.cocodataset.org/val2017/000000039769.jpg \
+  --text "a cat. a remote control."
+```
+
+Run with a local ONNX file (`--model-repo` still supplies the processor files):
+
+```bash
+python infer_grounding_dino_onnxruntime.py \
+  --model-path path/to/model.onnx \
+  --model-repo onnx-community/grounding-dino-tiny-ONNX \
+  --image path/to/image.jpg \
+  --text "a cat. a remote control."
+```
+
+## Output
+
+The script prints:
+
+- ONNX Runtime execution provider.
+- Model input and output names.
+- Input and output types and shapes.
+- Detected text labels, confidence scores, and bounding boxes.
+
+It also writes an annotated image to `--output`.
diff --git a/python/models/grounding_dino/infer_grounding_dino_onnxruntime.py b/python/models/grounding_dino/infer_grounding_dino_onnxruntime.py
new file mode 100644
index 00000000..662af315
--- /dev/null
+++ b/python/models/grounding_dino/infer_grounding_dino_onnxruntime.py
@@ -0,0 +1,258 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import argparse
+from io import BytesIO
+from pathlib import Path
+from urllib.parse import urlparse
+
+import numpy as np
+import onnxruntime as ort
+import requests
+import torch
+from huggingface_hub import hf_hub_download
+from PIL import Image, ImageDraw
+from transformers import AutoProcessor
+from transformers.models.grounding_dino.modeling_grounding_dino import (
+    GroundingDinoObjectDetectionOutput,
+)
+
+
+ORT_DTYPE_TO_NUMPY = {  # ONNX Runtime input type string -> NumPy dtype to feed
+    "tensor(bool)": np.bool_,
+    "tensor(float)": np.float32,
+    "tensor(double)": np.float64,
+    "tensor(int32)": np.int32,
+    "tensor(int64)": np.int64,
+}  # cast_for_ort raises TypeError for any type string not listed here
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the GroundingDINO example.
+
+    Every option has a default except ``--model-path``, which stays ``None``
+    unless it is passed explicitly.
+    """
+    parser = argparse.ArgumentParser(
+        description="Run GroundingDINO ONNX inference with ONNX Runtime."
+    )
+    option_specs = {
+        "--model-repo": {
+            "default": "onnx-community/grounding-dino-tiny-ONNX",
+            "help": "Hugging Face model repository containing the processor files.",
+        },
+        "--onnx-file": {
+            "default": "onnx/model_quantized.onnx",
+            "help": "ONNX filename inside --model-repo. Ignored when --model-path is set.",
+        },
+        "--model-path": {
+            "type": Path,
+            "help": "Local ONNX model path. If unset, the file is downloaded from --model-repo.",
+        },
+        "--image": {
+            "default": "http://images.cocodataset.org/val2017/000000039769.jpg",
+            "help": "Image URL or local image path.",
+        },
+        "--text": {
+            "default": "a cat. a remote control.",
+            "help": "Grounding text. Separate object names with periods.",
+        },
+        "--provider": {
+            "default": "CPUExecutionProvider",
+            "help": "ONNX Runtime execution provider.",
+        },
+        "--box-threshold": {
+            "type": float,
+            "default": 0.3,
+            "help": "Minimum object confidence score.",
+        },
+        "--text-threshold": {
+            "type": float,
+            "default": 0.3,
+            "help": "Minimum token confidence score for text label extraction.",
+        },
+        "--output": {
+            "type": Path,
+            "default": Path("output.jpg"),
+            "help": "Annotated output image path.",
+        },
+    }
+    for flag, spec in option_specs.items():
+        parser.add_argument(flag, **spec)
+    return parser.parse_args()
+
+
+def resolve_model_path(
+    model_repo: str, onnx_file: str, model_path: Path | None
+) -> Path:
+    """Return ``model_path`` if given, else download ``onnx_file`` from the Hub."""
+    if model_path is None:
+        # No local model supplied: fetch the single ONNX file from the repo.
+        return Path(hf_hub_download(repo_id=model_repo, filename=onnx_file))
+    return model_path
+
+
+def load_image(source: str) -> Image.Image:
+    """Load an RGB image from an http(s) URL or, otherwise, a local path."""
+    stream = source  # anything that is not an http(s) URL is opened as a path
+    if urlparse(source).scheme in ("http", "https"):
+        reply = requests.get(source, timeout=30)
+        reply.raise_for_status()
+        stream = BytesIO(reply.content)
+    rgb = Image.open(stream).convert("RGB")
+    height_width = rgb.size[::-1]  # PIL size is (width, height)
+    print(f"Loaded image: Type: PIL.Image.Image, Shape: {height_width + (3,)}")
+    return rgb
+
+
+def create_session(model_path: Path, provider: str) -> ort.InferenceSession:
+    """Create a session on ``provider``; raise ``ValueError`` if unavailable."""
+    installed = ort.get_available_providers()
+    if provider not in installed:
+        raise ValueError(
+            f"Requested provider '{provider}' is not available. "
+            f"Available providers: {installed}"
+        )
+    ort_session = ort.InferenceSession(str(model_path), providers=[provider])
+    print(f"ONNX model path: {model_path}")
+    print(f"ONNX Runtime providers: {ort_session.get_providers()}")
+    return ort_session
+
+
+def cast_for_ort(name: str, value: np.ndarray, ort_type: str) -> np.ndarray:
+    """Return ``value`` as an array of the NumPy dtype mapped to ``ort_type``.
+
+    Raises ``TypeError`` when ``ort_type`` is missing from ``ORT_DTYPE_TO_NUMPY``.
+    """
+    if ort_type not in ORT_DTYPE_TO_NUMPY:
+        raise TypeError(f"Unsupported ONNX Runtime input type for '{name}': {ort_type}")
+    # copy=False returns the array itself when its dtype already matches.
+    return np.asarray(value).astype(ORT_DTYPE_TO_NUMPY[ort_type], copy=False)
+
+
+def build_ort_inputs(
+    session: ort.InferenceSession, encoded_inputs: dict[str, np.ndarray]
+) -> dict[str, np.ndarray]:
+    """Return a feed dict holding one cast array per input the model declares."""
+    feed = {}
+    for meta in session.get_inputs():
+        key = meta.name
+        # The model's declared inputs drive the loop, so a missing key is fatal.
+        if key not in encoded_inputs:
+            raise KeyError(f"Processor output does not contain '{key}'")
+        tensor = cast_for_ort(key, encoded_inputs[key], meta.type)
+        feed[key] = tensor
+        print(
+            f"Prepared input {key}: "
+            f"Type: np.ndarray[{tensor.dtype}], Shape: {tensor.shape}"
+        )
+
+    return feed
+
+
+def run_ort(
+    session: ort.InferenceSession, ort_inputs: dict[str, np.ndarray]
+) -> dict[str, np.ndarray]:
+    """Run the model once and return its outputs keyed by output name."""
+    names = [meta.name for meta in session.get_outputs()]
+    print(f"Model output names: {names}")
+
+    # strict=True fails loudly if ORT returns a different number of arrays.
+    arrays = session.run(names, ort_inputs)
+    named_outputs = dict(zip(names, arrays, strict=True))
+    for key, array in named_outputs.items():
+        print(
+            f"Raw output {key}: Type: np.ndarray[{array.dtype}], Shape: {array.shape}"
+        )
+    return named_outputs
+
+
+def post_process(
+    processor,
+    output_map: dict[str, np.ndarray],
+    input_ids: np.ndarray,
+    image: Image.Image,
+    box_threshold: float,
+    text_threshold: float,
+):
+    """Turn the raw ``logits``/``pred_boxes`` outputs into detections for ``image``.
+
+    Raises ``KeyError`` if either required output is absent from ``output_map``.
+    """
+    absent = sorted({"logits", "pred_boxes"}.difference(output_map))
+    if absent:
+        raise KeyError(f"Missing required model outputs: {absent}")
+    # Wrap the arrays in the Transformers output type the processor is given.
+    detection_output = GroundingDinoObjectDetectionOutput(
+        logits=torch.from_numpy(output_map["logits"]),
+        pred_boxes=torch.from_numpy(output_map["pred_boxes"]),
+        input_ids=torch.from_numpy(input_ids),
+    )
+    height_width = image.size[::-1]  # PIL size is (width, height)
+    detections = processor.post_process_grounded_object_detection(
+        detection_output,
+        input_ids=detection_output.input_ids,
+        threshold=box_threshold,
+        text_threshold=text_threshold,
+        target_sizes=[height_width],
+    )[0]
+    boxes_shape = tuple(detections["boxes"].shape)
+    print(f"Postprocessed boxes: Type: torch.Tensor, Shape: {boxes_shape}")
+    scores_shape = tuple(detections["scores"].shape)
+    print(f"Postprocessed scores: Type: torch.Tensor, Shape: {scores_shape}")
+    return detections
+
+
+def annotate_image(image: Image.Image, result, output_path: Path) -> None:
+    """Draw each detection on a copy of ``image`` and save it to ``output_path``."""
+    canvas = image.copy()  # the caller's image is left untouched
+    pen = ImageDraw.Draw(canvas)
+    detections = zip(
+        result["boxes"], result["scores"], result["text_labels"], strict=True
+    )
+    for box, score, label in detections:
+        left, top, right, bottom = map(float, box.tolist())
+        anchor = (left, top)
+        caption = f"{label}: {float(score):.3f}"
+        pen.rectangle((left, top, right, bottom), outline="red", width=3)
+        pen.rectangle(pen.textbbox(anchor, caption), fill="red")
+        pen.text(anchor, caption, fill="white")
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    canvas.save(output_path)
+    print(f"Saved annotated image: {output_path}")
+
+
+def main() -> None:
+    """Run the example end to end: load, infer, report and save detections."""
+    args = parse_args()
+    onnx_path = resolve_model_path(args.model_repo, args.onnx_file, args.model_path)
+    processor = AutoProcessor.from_pretrained(args.model_repo)
+    image = load_image(args.image)
+    session = create_session(onnx_path, args.provider)
+    # Request NumPy arrays: build_ort_inputs feeds them to ONNX Runtime.
+    encoded = processor(images=image, text=args.text, return_tensors="np")
+    feed = build_ort_inputs(session, encoded)
+    raw_outputs = run_ort(session, feed)
+    result = post_process(
+        processor=processor,
+        output_map=raw_outputs,
+        input_ids=feed["input_ids"],
+        image=image,
+        box_threshold=args.box_threshold,
+        text_threshold=args.text_threshold,
+    )
+    detections = zip(
+        result["boxes"], result["scores"], result["text_labels"], strict=True
+    )
+    for number, (box, score, label) in enumerate(detections, start=1):
+        corners = [round(float(coord), 2) for coord in box.tolist()]
+        print(
+            f"Detection {number}: Label: {label}, "
+            f"Score: {float(score):.3f}, Box: {corners}"
+        )
+
+    annotate_image(image, result, args.output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/models/grounding_dino/requirements.txt b/python/models/grounding_dino/requirements.txt
new file mode 100644
index 00000000..e758c7bd
--- /dev/null
+++ b/python/models/grounding_dino/requirements.txt
@@ -0,0 +1,6 @@
+huggingface-hub
+numpy
+pillow
+requests
+torch
+transformers