From 8c6764cfdf76d8daee9f1c009d93ab3c03b40e0d Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sun, 15 Mar 2026 11:06:52 -0700 Subject: [PATCH] feat: add CoreML depth estimation models and benchmark - Add models.json with CoreML model definitions for depth estimation - Add benchmark_coreml.py script for CoreML inference performance testing --- .../depth-estimation/models.json | 146 ++++++++++++++ .../scripts/benchmark_coreml.py | 178 ++++++++++++++++++ 2 files changed, 324 insertions(+) create mode 100644 skills/transformation/depth-estimation/models.json create mode 100644 skills/transformation/depth-estimation/scripts/benchmark_coreml.py diff --git a/skills/transformation/depth-estimation/models.json b/skills/transformation/depth-estimation/models.json new file mode 100644 index 0000000..27ee043 --- /dev/null +++ b/skills/transformation/depth-estimation/models.json @@ -0,0 +1,146 @@ +{ + "studio": { + "title": "3D Depth Vision Studio", + "subtitle": "Convert 2D video to 3D depth maps • Privacy-first scene understanding", + "icon": "layers" + }, + "models_dir": "~/.aegis-ai/models/feature-extraction", + "models": { + "depth-anything-v2-small": { + "name": "Depth Anything V2 Small", + "type": "depth_estimation", + "description": "Real-time monocular depth estimation — Apple Neural Engine optimized", + "input_size": [518, 392], + "platforms": { + "darwin": { + "repository": "apple/coreml-depth-anything-v2-small", + "format": "mlpackage", + "variants": { + "DepthAnythingV2SmallF16": { + "precision": "float16", + "size_mb": 49.8, + "description": "Float16 — optimized for Neural Engine" + }, + "DepthAnythingV2SmallF16INT8": { + "precision": "float16_int8", + "size_mb": 25.0, + "description": "Float16 + INT8 quantization — smallest" + }, + "DepthAnythingV2SmallF32": { + "precision": "float32", + "size_mb": 99.2, + "description": "Float32 — highest precision" + }, + "DepthAnythingV2SmallF16P6": { + "precision": "float16_p6", + "size_mb": 18.0, + "description": "Float16 
palettized 6-bit" + }, + "DepthAnythingV2SmallF16P8": { + "precision": "float16_p8", + "size_mb": 24.0, + "description": "Float16 palettized 8-bit" + }, + "DepthAnythingV2SmallF32INT8": { + "precision": "float32_int8", + "size_mb": 24.0, + "description": "Float32 + INT8 quantization" + }, + "DepthAnythingV2SmallF32P6": { + "precision": "float32_p6", + "size_mb": 18.0, + "description": "Float32 palettized 6-bit" + }, + "DepthAnythingV2SmallF32P8": { + "precision": "float32_p8", + "size_mb": 24.0, + "description": "Float32 palettized 8-bit" + } + } + }, + "linux": { + "repository": "depth-anything/Depth-Anything-V2-Small", + "format": "pth", + "variants": { + "depth_anything_v2_vits": { + "precision": "float32", + "size_mb": 99.0, + "description": "PyTorch ViT-S — CUDA/CPU" + } + } + }, + "win32": { + "repository": "depth-anything/Depth-Anything-V2-Small", + "format": "pth", + "variants": { + "depth_anything_v2_vits": { + "precision": "float32", + "size_mb": 99.0, + "description": "PyTorch ViT-S — CUDA/CPU" + } + } + } + } + }, + "depth-anything-v2-base": { + "name": "Depth Anything V2 Base", + "type": "depth_estimation", + "description": "Higher accuracy depth estimation — larger model", + "input_size": [518, 392], + "platforms": { + "linux": { + "repository": "depth-anything/Depth-Anything-V2-Base", + "format": "pth", + "variants": { + "depth_anything_v2_vitb": { + "precision": "float32", + "size_mb": 390.0, + "description": "PyTorch ViT-B — CUDA/CPU" + } + } + }, + "win32": { + "repository": "depth-anything/Depth-Anything-V2-Base", + "format": "pth", + "variants": { + "depth_anything_v2_vitb": { + "precision": "float32", + "size_mb": 390.0, + "description": "PyTorch ViT-B — CUDA/CPU" + } + } + } + } + }, + "depth-anything-v2-large": { + "name": "Depth Anything V2 Large", + "type": "depth_estimation", + "description": "Highest accuracy depth estimation — largest model", + "input_size": [518, 392], + "platforms": { + "linux": { + "repository": 
"depth-anything/Depth-Anything-V2-Large", + "format": "pth", + "variants": { + "depth_anything_v2_vitl": { + "precision": "float32", + "size_mb": 1280.0, + "description": "PyTorch ViT-L — CUDA recommended" + } + } + }, + "win32": { + "repository": "depth-anything/Depth-Anything-V2-Large", + "format": "pth", + "variants": { + "depth_anything_v2_vitl": { + "precision": "float32", + "size_mb": 1280.0, + "description": "PyTorch ViT-L — CUDA recommended" + } + } + } + } + } + } +} diff --git a/skills/transformation/depth-estimation/scripts/benchmark_coreml.py b/skills/transformation/depth-estimation/scripts/benchmark_coreml.py new file mode 100644 index 0000000..63649ed --- /dev/null +++ b/skills/transformation/depth-estimation/scripts/benchmark_coreml.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +""" +Standalone CoreML depth benchmark — spawned by Aegis IPC handler. + +Usage: + python3 benchmark_coreml.py --variant DepthAnythingV2SmallF16 --runs 10 --colormap viridis + +Outputs a single JSON line to stdout with benchmark results. 
#!/usr/bin/env python3
"""
Standalone CoreML depth benchmark — spawned by Aegis IPC handler.

Usage:
    python3 benchmark_coreml.py --variant DepthAnythingV2SmallF16 --runs 10 --colormap viridis

Outputs a single JSON line to stdout with benchmark results.
"""

import sys
import json
import time
import os
import argparse
import base64
import platform
import statistics
import tempfile
import urllib.request
from pathlib import Path


# Model storage root shared with the Aegis model manager (see models.json).
MODELS_DIR = Path.home() / ".aegis-ai" / "models" / "feature-extraction"
COREML_INPUT_SIZE = (518, 392)  # (width, height) expected by the CoreML model

# Friendly colormap name -> OpenCV cv2.COLORMAP_* integer constant.
COLORMAP_MAP = {
    "inferno": 1, "viridis": 16, "plasma": 13, "magma": 12,
    "jet": 2, "turbo": 18, "hot": 11, "cool": 8,
}

# CLI compute-unit flag -> coremltools.ComputeUnit attribute name.
# NOTE: "npu" maps to ALL because coremltools exposes no NE-only unit.
COMPUTE_UNIT_MAP = {
    "all": "ALL",
    "cpu": "CPU_ONLY",
    "gpu": "CPU_AND_GPU",
    "cpu_npu": "CPU_AND_NE",
    "npu": "ALL",
}


def _log(msg):
    """Emit a progress line to stderr (stdout is reserved for the JSON result)."""
    print(f"[DepthBenchmark] {msg}", file=sys.stderr, flush=True)


def download_test_image(url):
    """Download a test image from *url* and return it as a numpy BGR array.

    Falls back to a deterministic synthetic gradient when the download fails
    or cannot be decoded, so the benchmark still works offline.
    """
    import cv2
    import numpy as np

    _log(f"Downloading test image: {url}")
    try:
        # urlopen with an explicit timeout so a dead URL cannot hang the
        # spawned benchmark process (urlretrieve has no timeout parameter).
        with urllib.request.urlopen(url, timeout=30) as resp:
            data = resp.read()
        # Write to a private temp file instead of a fixed, predictable /tmp
        # path (avoids clobbering/symlink issues on shared machines).
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
            tmp.write(data)
            tmp_path = tmp.name
        try:
            img = cv2.imread(tmp_path)
        finally:
            os.unlink(tmp_path)
        if img is not None:
            return img
        _log("Downloaded file could not be decoded as an image")
    except Exception as e:
        _log(f"Download failed: {e}")

    # Fallback: deterministic 640x480 horizontal gradient. (Matches the log
    # message; random noise would make timings non-reproducible across runs.)
    _log("Using synthetic test image (640x480 gradient)")
    ramp = (np.arange(640) * 255 // 639).astype(np.uint8)
    return np.dstack([np.tile(ramp, (480, 1))] * 3)


def run_benchmark(args):
    """Load the requested CoreML variant, time inferences, print one JSON line.

    The JSON result (timing stats plus an optional base64 depth preview) goes
    to stdout; all progress logging goes to stderr via _log(). Exits with
    status 1 and an {"error": ...} JSON line if the model is missing.
    """
    import cv2
    import numpy as np
    import coremltools as ct
    from PIL import Image

    variant_id = args.variant
    model_path = MODELS_DIR / f"{variant_id}.mlpackage"

    if not model_path.exists():
        print(json.dumps({"error": f"Model not found: {model_path}"}))
        sys.exit(1)

    # Load model on the requested compute units (unknown flags fall back to ALL).
    _log(f"Loading CoreML model: {variant_id}")
    compute_unit_key = COMPUTE_UNIT_MAP.get(args.compute_units, "ALL")
    compute_unit = getattr(ct.ComputeUnit, compute_unit_key, ct.ComputeUnit.ALL)

    t0 = time.perf_counter()
    model = ct.models.MLModel(str(model_path), compute_units=compute_unit)
    load_time_ms = (time.perf_counter() - t0) * 1000
    _log(f"Model loaded in {load_time_ms:.0f}ms (compute_units={compute_unit_key})")

    # Prepare the (resized, RGB) PIL input once; the same frame is reused for
    # every iteration so only inference time is measured.
    test_image = download_test_image(args.test_image_url)
    original_h, original_w = test_image.shape[:2]
    input_w, input_h = COREML_INPUT_SIZE

    rgb = cv2.cvtColor(test_image, cv2.COLOR_BGR2RGB)
    resized = cv2.resize(rgb, (input_w, input_h), interpolation=cv2.INTER_LINEAR)
    pil_image = Image.fromarray(resized, mode="RGB")

    colormap_id = COLORMAP_MAP.get(args.colormap, 16)  # default: viridis

    # Warm-up run so one-time CoreML compilation is excluded from timings.
    _log("Warm-up inference...")
    model.predict({"image": pil_image})

    _log(f"Running {args.runs} benchmark iterations...")
    times = []
    last_depth_colored = None

    for i in range(args.runs):
        t0 = time.perf_counter()
        prediction = model.predict({"image": pil_image})
        times.append((time.perf_counter() - t0) * 1000)

        if i == 0:
            # Colorize the first depth map as an extraction preview.
            output_key = list(prediction.keys())[0]
            depth_map = np.array(prediction[output_key])
            if depth_map.ndim > 2:
                depth_map = np.squeeze(depth_map)
            # Normalize to [0, 1]; epsilon guards a constant depth map.
            depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min() + 1e-8)
            depth_uint8 = (depth_norm * 255).astype(np.uint8)
            last_depth_colored = cv2.applyColorMap(depth_uint8, colormap_id)
            last_depth_colored = cv2.resize(last_depth_colored, (original_w, original_h))

    avg_ms = statistics.mean(times)
    std_ms = statistics.stdev(times) if len(times) > 1 else 0

    result = {
        "model_id": "depth-anything-v2-small",
        "variant_id": variant_id,
        "num_runs": args.runs,
        "successful_runs": len(times),
        "avg_time_ms": round(avg_ms, 2),
        "min_time_ms": round(min(times), 2),
        "max_time_ms": round(max(times), 2),
        "std_time_ms": round(std_ms, 2),
        "fps": round(1000.0 / avg_ms, 2) if avg_ms > 0 else 0,
        "model_load_ms": round(load_time_ms, 2),
        "compute_units": args.compute_units,
    }

    # Attach the preview as a base64 JPEG so the IPC caller can display it.
    if last_depth_colored is not None:
        _, buf = cv2.imencode(".jpg", last_depth_colored, [cv2.IMWRITE_JPEG_QUALITY, 85])
        result["extraction_result"] = {
            "success": True,
            "feature_type": "depth_estimation",
            "feature_data": base64.b64encode(buf).decode("ascii"),
            "processing_time": round(times[0], 2),
            "metadata": {
                "model": variant_id,
                "colormap": args.colormap,
                "compute_units": args.compute_units,
                "input_size": list(COREML_INPUT_SIZE),
            },
        }

    _log(f"Benchmark complete: {avg_ms:.1f}ms avg ({result['fps']:.1f} FPS)")
    print(json.dumps(result), flush=True)


if __name__ == "__main__":
    # CoreML is macOS-only; fail fast with a machine-readable error elsewhere.
    if platform.system() != "Darwin":
        print(json.dumps({"error": "CoreML benchmark requires macOS"}))
        sys.exit(1)

    parser = argparse.ArgumentParser()
    parser.add_argument("--variant", default="DepthAnythingV2SmallF16")
    parser.add_argument("--runs", type=int, default=10)
    parser.add_argument("--colormap", default="viridis")
    parser.add_argument("--compute-units", default="all")
    parser.add_argument("--test-image-url", default="https://ultralytics.com/images/bus.jpg")
    args = parser.parse_args()

    run_benchmark(args)