From 8c6764cfdf76d8daee9f1c009d93ab3c03b40e0d Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sun, 15 Mar 2026 11:06:52 -0700 Subject: [PATCH] feat: add CoreML depth estimation models and benchmark - Add models.json with CoreML model definitions for depth estimation - Add benchmark_coreml.py script for CoreML inference performance testing --- .../depth-estimation/models.json | 146 ++++++++++++++ .../scripts/benchmark_coreml.py | 178 ++++++++++++++++++ 2 files changed, 324 insertions(+) create mode 100644 skills/transformation/depth-estimation/models.json create mode 100644 skills/transformation/depth-estimation/scripts/benchmark_coreml.py diff --git a/skills/transformation/depth-estimation/models.json b/skills/transformation/depth-estimation/models.json new file mode 100644 index 0000000..27ee043 --- /dev/null +++ b/skills/transformation/depth-estimation/models.json @@ -0,0 +1,146 @@ +{ + "studio": { + "title": "3D Depth Vision Studio", + "subtitle": "Convert 2D video to 3D depth maps • Privacy-first scene understanding", + "icon": "layers" + }, + "models_dir": "~/.aegis-ai/models/feature-extraction", + "models": { + "depth-anything-v2-small": { + "name": "Depth Anything V2 Small", + "type": "depth_estimation", + "description": "Real-time monocular depth estimation — Apple Neural Engine optimized", + "input_size": [518, 392], + "platforms": { + "darwin": { + "repository": "apple/coreml-depth-anything-v2-small", + "format": "mlpackage", + "variants": { + "DepthAnythingV2SmallF16": { + "precision": "float16", + "size_mb": 49.8, + "description": "Float16 — optimized for Neural Engine" + }, + "DepthAnythingV2SmallF16INT8": { + "precision": "float16_int8", + "size_mb": 25.0, + "description": "Float16 + INT8 quantization — smallest" + }, + "DepthAnythingV2SmallF32": { + "precision": "float32", + "size_mb": 99.2, + "description": "Float32 — highest precision" + }, + "DepthAnythingV2SmallF16P6": { + "precision": "float16_p6", + "size_mb": 18.0, + "description": "Float16 
palettized 6-bit" + }, + "DepthAnythingV2SmallF16P8": { + "precision": "float16_p8", + "size_mb": 24.0, + "description": "Float16 palettized 8-bit" + }, + "DepthAnythingV2SmallF32INT8": { + "precision": "float32_int8", + "size_mb": 24.0, + "description": "Float32 + INT8 quantization" + }, + "DepthAnythingV2SmallF32P6": { + "precision": "float32_p6", + "size_mb": 18.0, + "description": "Float32 palettized 6-bit" + }, + "DepthAnythingV2SmallF32P8": { + "precision": "float32_p8", + "size_mb": 24.0, + "description": "Float32 palettized 8-bit" + } + } + }, + "linux": { + "repository": "depth-anything/Depth-Anything-V2-Small", + "format": "pth", + "variants": { + "depth_anything_v2_vits": { + "precision": "float32", + "size_mb": 99.0, + "description": "PyTorch ViT-S — CUDA/CPU" + } + } + }, + "win32": { + "repository": "depth-anything/Depth-Anything-V2-Small", + "format": "pth", + "variants": { + "depth_anything_v2_vits": { + "precision": "float32", + "size_mb": 99.0, + "description": "PyTorch ViT-S — CUDA/CPU" + } + } + } + } + }, + "depth-anything-v2-base": { + "name": "Depth Anything V2 Base", + "type": "depth_estimation", + "description": "Higher accuracy depth estimation — larger model", + "input_size": [518, 392], + "platforms": { + "linux": { + "repository": "depth-anything/Depth-Anything-V2-Base", + "format": "pth", + "variants": { + "depth_anything_v2_vitb": { + "precision": "float32", + "size_mb": 390.0, + "description": "PyTorch ViT-B — CUDA/CPU" + } + } + }, + "win32": { + "repository": "depth-anything/Depth-Anything-V2-Base", + "format": "pth", + "variants": { + "depth_anything_v2_vitb": { + "precision": "float32", + "size_mb": 390.0, + "description": "PyTorch ViT-B — CUDA/CPU" + } + } + } + } + }, + "depth-anything-v2-large": { + "name": "Depth Anything V2 Large", + "type": "depth_estimation", + "description": "Highest accuracy depth estimation — largest model", + "input_size": [518, 392], + "platforms": { + "linux": { + "repository": 
"depth-anything/Depth-Anything-V2-Large", + "format": "pth", + "variants": { + "depth_anything_v2_vitl": { + "precision": "float32", + "size_mb": 1280.0, + "description": "PyTorch ViT-L — CUDA recommended" + } + } + }, + "win32": { + "repository": "depth-anything/Depth-Anything-V2-Large", + "format": "pth", + "variants": { + "depth_anything_v2_vitl": { + "precision": "float32", + "size_mb": 1280.0, + "description": "PyTorch ViT-L — CUDA recommended" + } + } + } + } + } + } +} diff --git a/skills/transformation/depth-estimation/scripts/benchmark_coreml.py b/skills/transformation/depth-estimation/scripts/benchmark_coreml.py new file mode 100644 index 0000000..63649ed --- /dev/null +++ b/skills/transformation/depth-estimation/scripts/benchmark_coreml.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +""" +Standalone CoreML depth benchmark — spawned by Aegis IPC handler. + +Usage: + python3 benchmark_coreml.py --variant DepthAnythingV2SmallF16 --runs 10 --colormap viridis + +Outputs a single JSON line to stdout with benchmark results. 
#!/usr/bin/env python3
"""
Standalone CoreML depth benchmark — spawned by Aegis IPC handler.

Usage:
    python3 benchmark_coreml.py --variant DepthAnythingV2SmallF16 --runs 10 --colormap viridis

Outputs a single JSON line to stdout with benchmark results.
"""

import sys
import json
import time
import os
import argparse
import base64
import platform
import statistics
import tempfile
import urllib.request
from pathlib import Path


# Model storage root shared with the Aegis model manager (see models.json).
MODELS_DIR = Path.home() / ".aegis-ai" / "models" / "feature-extraction"
COREML_INPUT_SIZE = (518, 392)  # (width, height) expected by the CoreML model

# Friendly colormap name -> OpenCV cv2.COLORMAP_* integer constant.
COLORMAP_MAP = {
    "inferno": 1, "viridis": 16, "plasma": 13, "magma": 12,
    "jet": 2, "turbo": 18, "hot": 11, "cool": 8,
}

# CLI compute-unit flag -> coremltools.ComputeUnit attribute name.
# NOTE: "npu" maps to ALL because coremltools exposes no NE-only unit.
COMPUTE_UNIT_MAP = {
    "all": "ALL",
    "cpu": "CPU_ONLY",
    "gpu": "CPU_AND_GPU",
    "cpu_npu": "CPU_AND_NE",
    "npu": "ALL",
}


def _log(msg):
    """Emit a progress line to stderr (stdout is reserved for the JSON result)."""
    print(f"[DepthBenchmark] {msg}", file=sys.stderr, flush=True)


def download_test_image(url):
    """Download a test image from *url* and return it as a numpy BGR array.

    Falls back to a deterministic synthetic gradient when the download fails
    or cannot be decoded, so the benchmark still works offline.
    """
    import cv2
    import numpy as np

    _log(f"Downloading test image: {url}")
    try:
        # urlopen with an explicit timeout so a dead URL cannot hang the
        # spawned benchmark process (urlretrieve has no timeout parameter).
        with urllib.request.urlopen(url, timeout=30) as resp:
            data = resp.read()
        # Write to a private temp file instead of a fixed, predictable /tmp
        # path (avoids clobbering/symlink issues on shared machines).
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
            tmp.write(data)
            tmp_path = tmp.name
        try:
            img = cv2.imread(tmp_path)
        finally:
            os.unlink(tmp_path)
        if img is not None:
            return img
        _log("Downloaded file could not be decoded as an image")
    except Exception as e:
        _log(f"Download failed: {e}")

    # Fallback: deterministic 640x480 horizontal gradient. (Matches the log
    # message; random noise would make timings non-reproducible across runs.)
    _log("Using synthetic test image (640x480 gradient)")
    ramp = (np.arange(640) * 255 // 639).astype(np.uint8)
    return np.dstack([np.tile(ramp, (480, 1))] * 3)


def run_benchmark(args):
    """Load the requested CoreML variant, time inferences, print one JSON line.

    The JSON result (timing stats plus an optional base64 depth preview) goes
    to stdout; all progress logging goes to stderr via _log(). Exits with
    status 1 and an {"error": ...} JSON line if the model is missing.
    """
    import cv2
    import numpy as np
    import coremltools as ct
    from PIL import Image

    variant_id = args.variant
    model_path = MODELS_DIR / f"{variant_id}.mlpackage"

    if not model_path.exists():
        print(json.dumps({"error": f"Model not found: {model_path}"}))
        sys.exit(1)

    # Load model on the requested compute units (unknown flags fall back to ALL).
    _log(f"Loading CoreML model: {variant_id}")
    compute_unit_key = COMPUTE_UNIT_MAP.get(args.compute_units, "ALL")
    compute_unit = getattr(ct.ComputeUnit, compute_unit_key, ct.ComputeUnit.ALL)

    t0 = time.perf_counter()
    model = ct.models.MLModel(str(model_path), compute_units=compute_unit)
    load_time_ms = (time.perf_counter() - t0) * 1000
    _log(f"Model loaded in {load_time_ms:.0f}ms (compute_units={compute_unit_key})")

    # Prepare the (resized, RGB) PIL input once; the same frame is reused for
    # every iteration so only inference time is measured.
    test_image = download_test_image(args.test_image_url)
    original_h, original_w = test_image.shape[:2]
    input_w, input_h = COREML_INPUT_SIZE

    rgb = cv2.cvtColor(test_image, cv2.COLOR_BGR2RGB)
    resized = cv2.resize(rgb, (input_w, input_h), interpolation=cv2.INTER_LINEAR)
    pil_image = Image.fromarray(resized, mode="RGB")

    colormap_id = COLORMAP_MAP.get(args.colormap, 16)  # default: viridis

    # Warm-up run so one-time CoreML compilation is excluded from timings.
    _log("Warm-up inference...")
    model.predict({"image": pil_image})

    _log(f"Running {args.runs} benchmark iterations...")
    times = []
    last_depth_colored = None

    for i in range(args.runs):
        t0 = time.perf_counter()
        prediction = model.predict({"image": pil_image})
        times.append((time.perf_counter() - t0) * 1000)

        if i == 0:
            # Colorize the first depth map as an extraction preview.
            output_key = list(prediction.keys())[0]
            depth_map = np.array(prediction[output_key])
            if depth_map.ndim > 2:
                depth_map = np.squeeze(depth_map)
            # Normalize to [0, 1]; epsilon guards a constant depth map.
            depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min() + 1e-8)
            depth_uint8 = (depth_norm * 255).astype(np.uint8)
            last_depth_colored = cv2.applyColorMap(depth_uint8, colormap_id)
            last_depth_colored = cv2.resize(last_depth_colored, (original_w, original_h))

    avg_ms = statistics.mean(times)
    std_ms = statistics.stdev(times) if len(times) > 1 else 0

    result = {
        "model_id": "depth-anything-v2-small",
        "variant_id": variant_id,
        "num_runs": args.runs,
        "successful_runs": len(times),
        "avg_time_ms": round(avg_ms, 2),
        "min_time_ms": round(min(times), 2),
        "max_time_ms": round(max(times), 2),
        "std_time_ms": round(std_ms, 2),
        "fps": round(1000.0 / avg_ms, 2) if avg_ms > 0 else 0,
        "model_load_ms": round(load_time_ms, 2),
        "compute_units": args.compute_units,
    }

    # Attach the preview as a base64 JPEG so the IPC caller can display it.
    if last_depth_colored is not None:
        _, buf = cv2.imencode(".jpg", last_depth_colored, [cv2.IMWRITE_JPEG_QUALITY, 85])
        result["extraction_result"] = {
            "success": True,
            "feature_type": "depth_estimation",
            "feature_data": base64.b64encode(buf).decode("ascii"),
            "processing_time": round(times[0], 2),
            "metadata": {
                "model": variant_id,
                "colormap": args.colormap,
                "compute_units": args.compute_units,
                "input_size": list(COREML_INPUT_SIZE),
            },
        }

    _log(f"Benchmark complete: {avg_ms:.1f}ms avg ({result['fps']:.1f} FPS)")
    print(json.dumps(result), flush=True)


if __name__ == "__main__":
    # CoreML is macOS-only; fail fast with a machine-readable error elsewhere.
    if platform.system() != "Darwin":
        print(json.dumps({"error": "CoreML benchmark requires macOS"}))
        sys.exit(1)

    parser = argparse.ArgumentParser()
    parser.add_argument("--variant", default="DepthAnythingV2SmallF16")
    parser.add_argument("--runs", type=int, default=10)
    parser.add_argument("--colormap", default="viridis")
    parser.add_argument("--compute-units", default="all")
    parser.add_argument("--test-image-url", default="https://ultralytics.com/images/bus.jpg")
    args = parser.parse_args()

    run_benchmark(args)