From d921a512c3954c12d44fd2ef8e53333c0a10d605 Mon Sep 17 00:00:00 2001 From: Pierre Tenedero Date: Wed, 8 Apr 2026 17:44:33 +0800 Subject: [PATCH] Add face detection feature on video input --- README.md | 10 + services/ws-server/src/main.rs | 21 + services/ws-server/static/app.js | 1079 ++++++++++++++++++++++++++ services/ws-server/static/index.html | 9 + 4 files changed, 1119 insertions(+) diff --git a/README.md b/README.md index 57531f3..7f8eedd 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,18 @@ mise run ws-e2e-chrome ## Run ws agent in browser +### HAR model setup + Download the onnx from https://modelnova.ai/models/details/human-activity-recognition , and save it as `services/ws-server/static/models/human_activity_recognition.onnx` +### Face detection setup + +Download the onnx from https://huggingface.co/amd/retinaface and save it in +`services/ws-server/static/models/` and rename the file to `video_cv.onnx`. + +### Build and run the agent + ```bash mise run build-ws-wasm-agent mise run ws-server @@ -35,6 +44,7 @@ which will normally be something like 192.168.1.x. Then on your phone, open Chrome and type in https://192.168.1.x:8433/ Click "Load HAR model" and then "Start sensors". +For webcam inference, click "Load video CV model" and then "Start video". 
## Grant diff --git a/services/ws-server/src/main.rs b/services/ws-server/src/main.rs index 4e78025..3538a1d 100644 --- a/services/ws-server/src/main.rs +++ b/services/ws-server/src/main.rs @@ -177,6 +177,27 @@ impl StreamHandler> for WebSocketActor { action, details, } => { + if capability == "video_cv" && action == "inference" { + let detected_class = details + .get("detected_class") + .and_then(|value| value.as_str()) + .unwrap_or("unknown"); + let confidence = details + .get("confidence") + .and_then(|value| value.as_f64()) + .unwrap_or_default(); + let processed_at = details + .get("processed_at") + .and_then(|value| value.as_str()) + .unwrap_or("unknown"); + info!( + "Video inference received from {}: class={} confidence={:.4} processed_at={}", + self.current_agent_id(), + detected_class, + confidence, + processed_at + ); + } info!( "Client event from {}: capability={} action={} details={}", self.current_agent_id(), diff --git a/services/ws-server/static/app.js b/services/ws-server/static/app.js index 3caa152..43409ac 100644 --- a/services/ws-server/static/app.js +++ b/services/ws-server/static/app.js @@ -24,12 +24,16 @@ const speechButton = document.getElementById("speech-button"); const nfcButton = document.getElementById("nfc-button"); const sensorsButton = document.getElementById("sensors-button"); const harButton = document.getElementById("har-button"); +const videoModelButton = document.getElementById("video-model-button"); +const videoOutputButton = document.getElementById("video-output-button"); const harExportButton = document.getElementById("har-export-button"); const agentStatusEl = document.getElementById("agent-status"); const agentIdEl = document.getElementById("agent-id"); const sensorOutputEl = document.getElementById("sensor-output"); const harOutputEl = document.getElementById("har-output"); +const videoOutputEl = document.getElementById("video-output"); const videoPreview = document.getElementById("video-preview"); +const videoOutputCanvas 
= document.getElementById("video-output-canvas"); let microphone = null; let videoCapture = null; let bluetoothDevice = null; @@ -46,11 +50,36 @@ let harInferencePending = false; let lastInferenceAt = 0; let harSamplerId = null; let lastHarClassLabel = null; +let videoCvSession = null; +let videoCvInputName = null; +let videoCvOutputName = null; +let videoCvLoopId = null; +let videoCvInferencePending = false; +let lastVideoInferenceAt = 0; +let lastVideoCvLabel = null; +let videoCvCanvas = null; +let videoCvContext = null; +let videoOverlayContext = videoOutputCanvas.getContext("2d"); +let videoOutputVisible = false; +let videoRenderFrameId = null; +let lastVideoInferenceSummary = null; let gravityEstimate = { x: 0, y: 0, z: 0 }; let sendClientEvent = () => {}; const HAR_SEQUENCE_LENGTH = 512; const HAR_FEATURE_COUNT = 9; const HAR_SAMPLE_INTERVAL_MS = 20; +const VIDEO_INFERENCE_INTERVAL_MS = 750; +const VIDEO_RENDER_SCORE_THRESHOLD = 0.35; +const VIDEO_MODEL_PATH = "/static/models/video_cv.onnx"; +const VIDEO_FALLBACK_INPUT_SIZE = 224; +const RETINAFACE_INPUT_HEIGHT = 608; +const RETINAFACE_INPUT_WIDTH = 640; +const RETINAFACE_CONFIDENCE_THRESHOLD = 0.75; +const RETINAFACE_NMS_THRESHOLD = 0.4; +const RETINAFACE_VARIANCES = [0.1, 0.2]; +const RETINAFACE_MIN_SIZES = [[16, 32], [64, 128], [256, 512]]; +const RETINAFACE_STEPS = [8, 16, 32]; +const RETINAFACE_MEAN_BGR = [104, 117, 123]; const STANDARD_GRAVITY = 9.80665; const GRAVITY_FILTER_ALPHA = 0.8; const HAR_CLASS_LABELS = [ @@ -224,6 +253,10 @@ const setHarOutput = (lines) => { harOutputEl.value = Array.isArray(lines) ? lines.join("\n") : String(lines); }; +const setVideoOutput = (lines) => { + videoOutputEl.value = Array.isArray(lines) ? lines.join("\n") : String(lines); +}; + const updateHarStatus = (extraLines = []) => { const lines = [ `model: ${harSession ? 
"loaded" : "not loaded"}`, @@ -237,6 +270,27 @@ const updateHarStatus = (extraLines = []) => { setHarOutput(lines.concat("", extraLines)); }; +const updateVideoStatus = (extraLines = []) => { + const inputMetadata = videoCvInputName + ? videoCvSession?.inputMetadata?.[videoCvInputName] + : null; + const outputMetadata = videoCvOutputName + ? videoCvSession?.outputMetadata?.[videoCvOutputName] + : null; + const lines = [ + `model: ${videoCvSession ? "loaded" : "not loaded"}`, + `video: ${videoCapture ? "active" : "inactive"}`, + `input: ${videoCvInputName ?? "n/a"}`, + `output: ${videoCvOutputName ?? "n/a"}`, + `input dims: ${JSON.stringify(inputMetadata?.dimensions ?? [])}`, + `output dims: ${JSON.stringify(outputMetadata?.dimensions ?? [])}`, + `loop: ${videoCvLoopId === null ? "idle" : "running"}`, + `display: ${videoOutputVisible ? "visible" : "hidden"}`, + `mode: ${lastVideoInferenceSummary?.mode ?? "unknown"}`, + ]; + setVideoOutput(lines.concat("", extraLines)); +}; + const getFeatureVector = () => { const totalAcceleration = motionState?.accelerationIncludingGravity ?? 
{ x: 0, y: 0, z: 0 }; const bodyAcceleration = { @@ -448,11 +502,981 @@ const requestSensorPermission = async (permissionTarget) => { return permissionTarget.requestPermission(); }; +const getTopK = (values, limit = 3) => { + return values + .map((value, index) => ({ value, index })) + .sort((left, right) => right.value - left.value) + .slice(0, limit); +}; + +const ensureVideoCvCanvas = () => { + if (!videoCvCanvas) { + videoCvCanvas = document.createElement("canvas"); + videoCvContext = videoCvCanvas.getContext("2d", { willReadFrequently: true }); + } + + if (!videoCvContext) { + throw new Error("Unable to create 2D canvas context for video preprocessing."); + } + + return videoCvContext; +}; + +const ensureVideoOverlayContext = () => { + if (!videoOverlayContext) { + videoOverlayContext = videoOutputCanvas.getContext("2d"); + } + + if (!videoOverlayContext) { + throw new Error("Unable to create video output canvas context."); + } + + return videoOverlayContext; +}; + +const isRetinaFaceSession = (session = videoCvSession) => { + if (!session) { + return false; + } + + const inputNames = Array.isArray(session.inputNames) ? session.inputNames : []; + const outputNames = Array.isArray(session.outputNames) ? session.outputNames : []; + const allNames = inputNames.concat(outputNames).map((name) => String(name).toLowerCase()); + if (allNames.some((name) => name.includes("retinaface"))) { + return true; + } + + return outputNames.length === 3 && inputNames.length === 1; +}; + +const selectVideoModelInputName = (session) => { + const inputNames = Array.isArray(session?.inputNames) ? session.inputNames : []; + if (!inputNames.length) { + return null; + } + + const ranked = inputNames + .map((name) => { + const metadata = session?.inputMetadata?.[name]; + const dimensions = Array.isArray(metadata?.dimensions) ? 
metadata.dimensions : []; + const normalizedName = String(name).toLowerCase(); + let score = 0; + + if (dimensions.length === 4) { + score += 100; + } else if (dimensions.length === 3) { + score += 40; + } + + if ( + normalizedName.includes("pixel") + || normalizedName.includes("image") + || normalizedName.includes("images") + || normalizedName.includes("input") + ) { + score += 25; + } + + if (normalizedName.includes("mask") || normalizedName.includes("token")) { + score -= 50; + } + + return { name, score }; + }) + .sort((left, right) => right.score - left.score); + + return ranked[0]?.name ?? inputNames[0]; +}; + +const selectVideoModelOutputName = (session) => { + const outputNames = Array.isArray(session?.outputNames) ? session.outputNames : []; + if (!outputNames.length) { + return null; + } + + const ranked = outputNames + .map((name) => { + const normalizedName = String(name).toLowerCase(); + let score = 0; + if (normalizedName.includes("box")) { + score += 100; + } + if (normalizedName.includes("logit") || normalizedName.includes("score")) { + score += 40; + } + return { name, score }; + }) + .sort((left, right) => right.score - left.score); + + return ranked[0]?.name ?? outputNames[0]; +}; + +const resolveVideoModelLayout = () => { + if (!videoCvSession || !videoCvInputName) { + throw new Error("Video CV model is not loaded."); + } + + if (isRetinaFaceSession(videoCvSession)) { + return { + dataType: "float32", + channels: 3, + width: RETINAFACE_INPUT_WIDTH, + height: RETINAFACE_INPUT_HEIGHT, + tensorDimensions: [1, RETINAFACE_INPUT_HEIGHT, RETINAFACE_INPUT_WIDTH, 3], + layout: "nhwc", + profile: "retinaface", + }; + } + + const metadata = videoCvSession.inputMetadata?.[videoCvInputName]; + const dataType = metadata?.type ?? "float32"; + if (dataType !== "float32" && dataType !== "uint8") { + throw new Error(`Unsupported video model input type: ${dataType}`); + } + + const rawDimensions = Array.isArray(metadata?.dimensions) + ? 
metadata.dimensions + : []; + const dimensions = rawDimensions.length === 4 + ? rawDimensions + : rawDimensions.length === 3 + ? [1, ...rawDimensions] + : [1, 3, VIDEO_FALLBACK_INPUT_SIZE, VIDEO_FALLBACK_INPUT_SIZE]; + + const resolved = dimensions.map((dimension, index) => { + if (typeof dimension === "number" && Number.isFinite(dimension) && dimension > 0) { + return dimension; + } + + if (index === 0) { + return 1; + } + + if (index === 1 && dimensions.length === 4) { + const inputName = String(videoCvInputName).toLowerCase(); + if (!inputName.includes("nhwc")) { + return 3; + } + } + + return VIDEO_FALLBACK_INPUT_SIZE; + }); + + const secondDimension = resolved[1]; + const lastDimension = resolved[3]; + const inputName = String(videoCvInputName).toLowerCase(); + const channelsFirst = inputName.includes("nhwc") + ? false + : secondDimension === 1 + || secondDimension === 3 + || ((lastDimension !== 1 && lastDimension !== 3) && !inputName.includes("image_embeddings")); + if (channelsFirst) { + const [, channels, height, width] = resolved; + if (channels !== 1 && channels !== 3) { + throw new Error(`Unsupported channel count for NCHW image input: ${channels}`); + } + + return { + dataType, + channels, + width, + height, + tensorDimensions: [1, channels, height, width], + layout: "nchw", + profile: "generic", + }; + } + + const [, height, width, channels] = resolved; + if (channels !== 1 && channels !== 3) { + throw new Error(`Unsupported channel count for NHWC image input: ${channels}`); + } + + return { + dataType, + channels, + width, + height, + tensorDimensions: [1, height, width, channels], + layout: "nhwc", + profile: "generic", + }; +}; + +const buildVideoInputTensor = () => { + if (!videoCapture || !videoCvSession || !videoCvInputName) { + throw new Error("Video capture or model session is unavailable."); + } + + if (!videoPreview.videoWidth || !videoPreview.videoHeight) { + throw new Error("Video stream is not ready yet."); + } + + const { + dataType, + 
channels, + width, + height, + tensorDimensions, + layout, + profile, + } = resolveVideoModelLayout(); + const context = ensureVideoCvCanvas(); + videoCvCanvas.width = width; + videoCvCanvas.height = height; + let resizeRatio = 1; + if (profile === "retinaface") { + const sourceWidth = videoPreview.videoWidth; + const sourceHeight = videoPreview.videoHeight; + const targetRatio = height / width; + if (sourceHeight / sourceWidth <= targetRatio) { + resizeRatio = width / sourceWidth; + } else { + resizeRatio = height / sourceHeight; + } + + const resizedWidth = Math.max(1, Math.min(width, Math.round(sourceWidth * resizeRatio))); + const resizedHeight = Math.max(1, Math.min(height, Math.round(sourceHeight * resizeRatio))); + context.clearRect(0, 0, width, height); + context.drawImage(videoPreview, 0, 0, resizedWidth, resizedHeight); + } else { + context.drawImage(videoPreview, 0, 0, width, height); + } + + const rgba = context.getImageData(0, 0, width, height).data; + const elementCount = width * height * channels; + const tensorData = dataType === "uint8" + ? new Uint8Array(elementCount) + : new Float32Array(elementCount); + + for (let pixelIndex = 0; pixelIndex < width * height; pixelIndex += 1) { + const rgbaIndex = pixelIndex * 4; + const red = rgba[rgbaIndex]; + const green = rgba[rgbaIndex + 1]; + const blue = rgba[rgbaIndex + 2]; + + if (profile === "retinaface") { + const tensorIndex = pixelIndex * channels; + tensorData[tensorIndex] = blue - RETINAFACE_MEAN_BGR[0]; + tensorData[tensorIndex + 1] = green - RETINAFACE_MEAN_BGR[1]; + tensorData[tensorIndex + 2] = red - RETINAFACE_MEAN_BGR[2]; + continue; + } + + if (channels === 1) { + const grayscale = Math.round(0.299 * red + 0.587 * green + 0.114 * blue); + tensorData[pixelIndex] = dataType === "uint8" ? 
grayscale : grayscale / 255; + continue; + } + + if (layout === "nchw") { + const planeSize = width * height; + if (dataType === "uint8") { + tensorData[pixelIndex] = red; + tensorData[pixelIndex + planeSize] = green; + tensorData[pixelIndex + 2 * planeSize] = blue; + } else { + tensorData[pixelIndex] = red / 255; + tensorData[pixelIndex + planeSize] = green / 255; + tensorData[pixelIndex + 2 * planeSize] = blue / 255; + } + continue; + } + + const tensorIndex = pixelIndex * channels; + if (dataType === "uint8") { + tensorData[tensorIndex] = red; + tensorData[tensorIndex + 1] = green; + tensorData[tensorIndex + 2] = blue; + } else { + tensorData[tensorIndex] = red / 255; + tensorData[tensorIndex + 1] = green / 255; + tensorData[tensorIndex + 2] = blue / 255; + } + } + + return { + tensor: new window.ort.Tensor(dataType, tensorData, tensorDimensions), + preprocess: { + profile, + inputWidth: width, + inputHeight: height, + resizeRatio, + sourceWidth: videoPreview.videoWidth, + sourceHeight: videoPreview.videoHeight, + }, + }; +}; + +const looksLikeBoxes = (tensor) => { + if (!tensor?.dims || !tensor?.data) { + return false; + } + + const dims = tensor.dims.filter((dimension) => Number.isFinite(dimension)); + const values = Array.from(tensor.data ?? []); + const lastDimension = dims[dims.length - 1]; + return values.length >= 4 && (lastDimension === 4 || lastDimension === 6 || lastDimension === 7); +}; + +const flattenFinite = (tensor) => { + return Array.from(tensor?.data ?? 
[]).map(Number).filter((value) => Number.isFinite(value)); +}; + +const normalizeBox = (boxValues, format = "xyxy") => { + if (boxValues.length < 4) { + return null; + } + + let x1; + let y1; + let x2; + let y2; + if (format === "cxcywh") { + const [centerX, centerY, width, height] = boxValues; + x1 = centerX - width / 2; + y1 = centerY - height / 2; + x2 = centerX + width / 2; + y2 = centerY + height / 2; + } else { + [x1, y1, x2, y2] = boxValues; + } + + if (x2 < x1) { + [x1, x2] = [x2, x1]; + } + if (y2 < y1) { + [y1, y2] = [y2, y1]; + } + + const normalized = [x1, y1, x2, y2].map((value) => ( + value > 1.5 ? value : Math.max(0, Math.min(1, value)) + )); + + return normalized; +}; + +const clamp = (value, min, max) => Math.max(min, Math.min(max, value)); + +const buildRetinaFacePriors = (imageHeight, imageWidth) => { + const priors = []; + RETINAFACE_STEPS.forEach((step, index) => { + const featureMapHeight = Math.ceil(imageHeight / step); + const featureMapWidth = Math.ceil(imageWidth / step); + const minSizes = RETINAFACE_MIN_SIZES[index]; + + for (let row = 0; row < featureMapHeight; row += 1) { + for (let column = 0; column < featureMapWidth; column += 1) { + minSizes.forEach((minSize) => { + priors.push([ + ((column + 0.5) * step) / imageWidth, + ((row + 0.5) * step) / imageHeight, + minSize / imageWidth, + minSize / imageHeight, + ]); + }); + } + } + }); + return priors; +}; + +const decodeRetinaFaceBox = (loc, prior) => { + const centerX = prior[0] + loc[0] * RETINAFACE_VARIANCES[0] * prior[2]; + const centerY = prior[1] + loc[1] * RETINAFACE_VARIANCES[0] * prior[3]; + const width = prior[2] * Math.exp(loc[2] * RETINAFACE_VARIANCES[1]); + const height = prior[3] * Math.exp(loc[3] * RETINAFACE_VARIANCES[1]); + return [ + centerX - width / 2, + centerY - height / 2, + centerX + width / 2, + centerY + height / 2, + ]; +}; + +const computeIoU = (left, right) => { + const x1 = Math.max(left.box[0], right.box[0]); + const y1 = Math.max(left.box[1], 
right.box[1]); + const x2 = Math.min(left.box[2], right.box[2]); + const y2 = Math.min(left.box[3], right.box[3]); + const width = Math.max(0, x2 - x1 + 1); + const height = Math.max(0, y2 - y1 + 1); + const intersection = width * height; + const leftArea = Math.max(0, left.box[2] - left.box[0] + 1) * Math.max(0, left.box[3] - left.box[1] + 1); + const rightArea = Math.max(0, right.box[2] - right.box[0] + 1) * Math.max(0, right.box[3] - right.box[1] + 1); + return intersection / Math.max(1e-6, leftArea + rightArea - intersection); +}; + +const applyNms = (detections, threshold) => { + const sorted = [...detections].sort((left, right) => right.score - left.score); + const kept = []; + + sorted.forEach((candidate) => { + if (kept.every((accepted) => computeIoU(candidate, accepted) <= threshold)) { + kept.push(candidate); + } + }); + + return kept; +}; + +const decodeRetinaFaceOutputs = (outputs, preprocess) => { + if (!preprocess || preprocess.profile !== "retinaface") { + return null; + } + + const outputNames = Array.isArray(videoCvSession?.outputNames) ? videoCvSession.outputNames : []; + if (outputNames.length < 3) { + return null; + } + + const locTensor = outputs[outputNames[0]]; + const confTensor = outputs[outputNames[1]]; + const landmTensor = outputs[outputNames[2]]; + if (!locTensor || !confTensor || !landmTensor) { + return null; + } + + const locValues = flattenFinite(locTensor); + const confValues = flattenFinite(confTensor); + const landmValues = flattenFinite(landmTensor); + const priorCount = locValues.length / 4; + if (priorCount <= 0 || confValues.length / 2 !== priorCount || landmValues.length / 10 !== priorCount) { + return null; + } + + const priors = buildRetinaFacePriors(preprocess.inputHeight, preprocess.inputWidth); + if (priors.length !== priorCount) { + return null; + } + + const detections = []; + for (let index = 0; index < priorCount; index += 1) { + const score = softmax(confValues.slice(index * 2, index * 2 + 2))[1] ?? 
0; + if (score < RETINAFACE_CONFIDENCE_THRESHOLD) { + continue; + } + + const decoded = decodeRetinaFaceBox( + locValues.slice(index * 4, index * 4 + 4), + priors[index], + ); + const scaledBox = [ + clamp((decoded[0] * preprocess.inputWidth) / preprocess.resizeRatio, 0, preprocess.sourceWidth), + clamp((decoded[1] * preprocess.inputHeight) / preprocess.resizeRatio, 0, preprocess.sourceHeight), + clamp((decoded[2] * preprocess.inputWidth) / preprocess.resizeRatio, 0, preprocess.sourceWidth), + clamp((decoded[3] * preprocess.inputHeight) / preprocess.resizeRatio, 0, preprocess.sourceHeight), + ]; + + detections.push({ + label: "face", + class_index: 0, + score, + box: scaledBox, + }); + } + + const filtered = applyNms(detections, RETINAFACE_NMS_THRESHOLD); + if (!filtered.length) { + return { + mode: "detection", + detections: [], + detected_class: "no_detection", + class_index: -1, + confidence: 0, + probabilities: [], + top_classes: [], + }; + } + + const best = filtered[0]; + return { + mode: "detection", + detections: filtered, + detected_class: best.label, + class_index: best.class_index, + confidence: best.score, + probabilities: filtered.map((entry) => entry.score), + top_classes: filtered.slice(0, 3).map((entry) => ({ + label: entry.label, + index: entry.class_index, + probability: entry.score, + })), + }; +}; + +const findDetectionTensor = (entries, patterns, predicate = () => true) => { + return entries.find(([name, tensor]) => { + const normalizedName = String(name).toLowerCase(); + return patterns.some((pattern) => pattern.test(normalizedName)) && predicate(tensor); + }) ?? null; +}; + +const decodeHuggingFaceDetectionOutputs = (entries) => { + const boxesEntry = findDetectionTensor( + entries, + [/pred_boxes/, /boxes?/, /bbox/], + (tensor) => (Array.isArray(tensor?.dims) ? 
tensor.dims[tensor.dims.length - 1] : null) === 4, + ); + const logitsEntry = findDetectionTensor( + entries, + [/logits/, /scores?/, /class/], + (tensor) => (Array.isArray(tensor?.dims) ? tensor.dims[tensor.dims.length - 1] : 0) > 1, + ); + + if (!boxesEntry || !logitsEntry) { + return null; + } + + const [boxesName, boxesTensor] = boxesEntry; + const [, logitsTensor] = logitsEntry; + const rawBoxes = flattenFinite(boxesTensor); + const rawLogits = flattenFinite(logitsTensor); + const boxCount = Math.floor(rawBoxes.length / 4); + const classCount = boxCount > 0 ? Math.floor(rawLogits.length / boxCount) : 0; + if (boxCount <= 0 || classCount <= 1) { + return null; + } + + const usesCenterBoxes = /pred_boxes/.test(String(boxesName).toLowerCase()); + const detections = []; + for (let index = 0; index < boxCount; index += 1) { + const box = rawBoxes.slice(index * 4, index * 4 + 4); + const logits = rawLogits.slice(index * classCount, index * classCount + classCount); + const candidateLogits = logits.length > 1 ? logits.slice(0, -1) : logits; + const probabilities = softmax(candidateLogits); + const best = getTopK(probabilities, 1)[0]; + if (!best || best.value < VIDEO_RENDER_SCORE_THRESHOLD) { + continue; + } + + const normalizedBox = normalizeBox(box, usesCenterBoxes ? 
"cxcywh" : "xyxy"); + if (!normalizedBox) { + continue; + } + + detections.push({ + label: `class_${best.index}`, + class_index: best.index, + score: best.value, + box: normalizedBox, + }); + } + + if (!detections.length) { + return { + mode: "detection", + detections: [], + detected_class: "no_detection", + class_index: -1, + confidence: 0, + probabilities: [], + top_classes: [], + }; + } + + detections.sort((left, right) => right.score - left.score); + const best = detections[0]; + return { + mode: "detection", + detections, + detected_class: best.label, + class_index: best.class_index, + confidence: best.score, + probabilities: detections.map((entry) => entry.score), + top_classes: detections.slice(0, 3).map((entry) => ({ + label: entry.label, + index: entry.class_index, + probability: entry.score, + })), + }; +}; + +const decodeDetectionOutputs = (outputs) => { + const entries = Object.entries(outputs); + const huggingFaceSummary = decodeHuggingFaceDetectionOutputs(entries); + if (huggingFaceSummary) { + return huggingFaceSummary; + } + + const boxesEntry = entries.find(([, tensor]) => looksLikeBoxes(tensor)); + + if (!boxesEntry) { + return null; + } + + const [boxesName, boxesTensor] = boxesEntry; + const boxDims = Array.isArray(boxesTensor.dims) ? boxesTensor.dims : []; + const rawBoxes = flattenFinite(boxesTensor); + const boxWidth = boxDims[boxDims.length - 1] ?? 4; + const detectionCount = Math.floor(rawBoxes.length / boxWidth); + if (detectionCount <= 0) { + return null; + } + + const scoresEntry = entries.find(([name, tensor]) => + name !== boxesName && flattenFinite(tensor).length >= detectionCount + ); + const classEntry = entries.find(([name, tensor]) => + name !== boxesName && name !== scoresEntry?.[0] && flattenFinite(tensor).length >= detectionCount + ); + const detections = []; + const scoreValues = scoresEntry ? flattenFinite(scoresEntry[1]) : []; + const classValues = classEntry ? 
flattenFinite(classEntry[1]) : []; + + for (let index = 0; index < detectionCount; index += 1) { + const start = index * boxWidth; + const row = rawBoxes.slice(start, start + boxWidth); + const normalizedBox = normalizeBox(row); + if (!normalizedBox) { + continue; + } + + let score = Number(scoreValues[index] ?? row[4] ?? row[5] ?? 1); + if (!Number.isFinite(score)) { + score = 1; + } + + let classIndex = classValues[index]; + if (!Number.isFinite(classIndex)) { + classIndex = row.length >= 6 ? row[5] : row.length >= 7 ? row[6] : index; + } + + if (score < VIDEO_RENDER_SCORE_THRESHOLD) { + continue; + } + + detections.push({ + label: `class_${Math.round(classIndex)}`, + class_index: Math.round(classIndex), + score, + box: normalizedBox, + }); + } + + if (!detections.length) { + return { + mode: "detection", + detections: [], + detected_class: "no_detection", + class_index: -1, + confidence: 0, + probabilities: [], + top_classes: [], + }; + } + + detections.sort((left, right) => right.score - left.score); + const best = detections[0]; + return { + mode: "detection", + detections, + detected_class: best.label, + class_index: best.class_index, + confidence: best.score, + probabilities: detections.map((entry) => entry.score), + top_classes: detections.slice(0, 3).map((entry) => ({ + label: entry.label, + index: entry.class_index, + probability: entry.score, + })), + }; +}; + +const decodeClassificationOutputs = (output) => { + const values = Array.from(output?.data ?? 
[]); + if (values.length === 0) { + throw new Error("Video model returned an empty output tensor."); + } + + if (values.length === 1) { + return { + mode: "classification", + detections: [], + detected_class: "scalar_output", + class_index: 0, + confidence: Number(values[0]), + probabilities: values, + top_classes: [{ label: "scalar_output", index: 0, probability: Number(values[0]) }], + }; + } + + const probabilities = softmax(values); + const ranked = getTopK(probabilities, 3); + const best = ranked[0]; + + return { + mode: "classification", + detections: [], + detected_class: `class_${best.index}`, + class_index: best.index, + confidence: best.value, + probabilities, + top_classes: ranked.map(({ index, value }) => ({ + label: `class_${index}`, + index, + probability: value, + logit: values[index], + })), + }; +}; + +const summarizeVideoOutput = (outputMap, preprocess = null) => { + const retinaFaceSummary = decodeRetinaFaceOutputs(outputMap, preprocess); + if (retinaFaceSummary) { + return retinaFaceSummary; + } + + const detectionSummary = decodeDetectionOutputs(outputMap); + if (detectionSummary) { + return detectionSummary; + } + + const primaryOutput = outputMap[videoCvOutputName]; + const primaryValues = Array.from(primaryOutput?.data ?? 
[]); + if (primaryValues.length > 0 && primaryValues.length <= 4096) { + return decodeClassificationOutputs(primaryOutput); + } + + return { + mode: "passthrough", + detections: [], + detected_class: "unrecognized_output", + class_index: -1, + confidence: 0, + probabilities: [], + top_classes: [], + }; +}; + +const drawOverlayText = (context, lines) => { + if (!lines.length) { + return; + } + + context.font = "18px ui-monospace, monospace"; + const lineHeight = 24; + const width = Math.max(...lines.map((line) => context.measureText(line).width), 0) + 20; + const height = lines.length * lineHeight + 12; + context.fillStyle = "rgba(24, 32, 40, 0.72)"; + context.fillRect(12, 12, width, height); + context.fillStyle = "#fffdfa"; + lines.forEach((line, index) => { + context.fillText(line, 22, 36 + index * lineHeight); + }); +}; + +const renderVideoOutputFrame = () => { + videoRenderFrameId = null; + + if (!videoOutputVisible || !videoCapture || !videoPreview.videoWidth || !videoPreview.videoHeight) { + return; + } + + const context = ensureVideoOverlayContext(); + const width = videoPreview.videoWidth; + const height = videoPreview.videoHeight; + if (videoOutputCanvas.width !== width || videoOutputCanvas.height !== height) { + videoOutputCanvas.width = width; + videoOutputCanvas.height = height; + } + + context.drawImage(videoPreview, 0, 0, width, height); + + if (lastVideoInferenceSummary?.mode === "detection") { + context.lineWidth = 3; + context.font = "16px ui-monospace, monospace"; + lastVideoInferenceSummary.detections.forEach((entry) => { + const [x1, y1, x2, y2] = entry.box; + const left = x1 <= 1 ? x1 * width : x1; + const top = y1 <= 1 ? y1 * height : y1; + const right = x2 <= 1 ? x2 * width : x2; + const bottom = y2 <= 1 ? 
y2 * height : y2; + const boxWidth = Math.max(1, right - left); + const boxHeight = Math.max(1, bottom - top); + + context.strokeStyle = "#ef8f35"; + context.strokeRect(left, top, boxWidth, boxHeight); + + const label = `${entry.label} ${(entry.score * 100).toFixed(1)}%`; + const textWidth = context.measureText(label).width + 10; + context.fillStyle = "#182028"; + context.fillRect(left, Math.max(0, top - 24), textWidth, 22); + context.fillStyle = "#fffdfa"; + context.fillText(label, left + 5, Math.max(16, top - 8)); + }); + } else if (lastVideoInferenceSummary?.mode === "classification") { + drawOverlayText(context, [ + `classification: ${lastVideoInferenceSummary.detected_class}`, + `confidence: ${(lastVideoInferenceSummary.confidence * 100).toFixed(1)}%`, + ]); + } else if (lastVideoInferenceSummary?.mode === "passthrough") { + drawOverlayText(context, [ + "output mode: passthrough", + "model output not recognized as detection or classification", + ]); + } + + videoRenderFrameId = window.requestAnimationFrame(renderVideoOutputFrame); +}; + +const syncVideoOutputView = () => { + videoOutputCanvas.hidden = !videoOutputVisible || !videoCapture; + videoOutputButton.textContent = videoOutputVisible ? 
"Hide video output" : "Show video output"; + + if (!videoOutputVisible || !videoCapture) { + if (videoRenderFrameId !== null) { + window.cancelAnimationFrame(videoRenderFrameId); + videoRenderFrameId = null; + } + updateVideoStatus(); + return; + } + + if (videoRenderFrameId === null) { + videoRenderFrameId = window.requestAnimationFrame(renderVideoOutputFrame); + } + updateVideoStatus(); +}; + +const stopVideoCvLoop = () => { + if (videoCvLoopId !== null) { + window.clearInterval(videoCvLoopId); + videoCvLoopId = null; + } + lastVideoCvLabel = null; + updateVideoStatus(); +}; + +const inferVideoPrediction = async () => { + if ( + !videoCapture + || !videoCvSession + || !videoCvInputName + || !videoCvOutputName + || videoCvInferencePending + ) { + return; + } + + const now = Date.now(); + if (now - lastVideoInferenceAt < VIDEO_INFERENCE_INTERVAL_MS) { + return; + } + + videoCvInferencePending = true; + lastVideoInferenceAt = now; + + try { + const { tensor: input, preprocess } = buildVideoInputTensor(); + const outputMap = await videoCvSession.run({ [videoCvInputName]: input }); + const output = outputMap[videoCvOutputName]; + const summary = summarizeVideoOutput(outputMap, preprocess); + const labelChanged = summary.detected_class !== lastVideoCvLabel; + lastVideoCvLabel = summary.detected_class; + lastVideoInferenceSummary = summary; + + updateVideoStatus([ + `output mode: ${summary.mode}`, + `prediction: ${summary.detected_class}`, + `confidence: ${summary.confidence.toFixed(4)}`, + ...( + summary.mode === "detection" + ? [ + `detections: ${summary.detections.length}`, + ...summary.detections.slice(0, 3).map( + (entry) => + `${entry.label}: score=${entry.score.toFixed(4)} box=${ + entry.box.map((value) => value.toFixed(3)).join(",") + }`, + ), + ] + : [ + "top classes:", + ...summary.top_classes.map( + (entry) => + `${entry.label}: p=${entry.probability.toFixed(4)} logit=${ + Number(entry.logit ?? 
entry.probability).toFixed(4) + }`, + ), + ] + ), + `frame: ${videoPreview.videoWidth}x${videoPreview.videoHeight}`, + `processed at: ${new Date().toLocaleTimeString()}`, + ]); + syncVideoOutputView(); + + sendClientEvent("video_cv", "inference", { + mode: summary.mode, + detected_class: summary.detected_class, + class_index: summary.class_index, + confidence: summary.confidence, + probabilities: summary.probabilities, + top_classes: summary.top_classes, + detections: summary.detections, + changed: labelChanged, + processed_at: new Date().toISOString(), + model_path: VIDEO_MODEL_PATH, + input_name: videoCvInputName, + output_name: videoCvOutputName, + input_dimensions: videoCvSession.inputMetadata?.[videoCvInputName]?.dimensions ?? [], + output_dimensions: Array.isArray(output?.dims) ? output.dims : [], + source_resolution: { + width: videoPreview.videoWidth, + height: videoPreview.videoHeight, + }, + }); + } catch (error) { + lastVideoInferenceSummary = { + mode: "passthrough", + detections: [], + detected_class: "inference_error", + class_index: -1, + confidence: 0, + probabilities: [], + top_classes: [], + }; + updateVideoStatus([ + `inference error: ${error instanceof Error ? error.message : String(error)}`, + ]); + console.error(error); + } finally { + videoCvInferencePending = false; + } +}; + +const syncVideoCvLoop = () => { + if (videoCapture && videoCvSession) { + if (videoCvLoopId === null) { + videoCvLoopId = window.setInterval(() => { + void inferVideoPrediction(); + }, VIDEO_INFERENCE_INTERVAL_MS); + } + updateVideoStatus([ + "browser-side webcam inference active", + "results are sent to the backend over the websocket.", + ]); + return; + } + + stopVideoCvLoop(); + lastVideoInferenceSummary = null; + updateVideoStatus([ + videoCvSession + ? "model loaded; start video capture to begin inference." 
+ : `model file: ${VIDEO_MODEL_PATH}`, + ]); +}; + renderSensorOutput(); updateHarStatus([ "local-only inference path", "model file: /static/models/human_activity_recognition.onnx", ]); +updateVideoStatus([ + `model file: ${VIDEO_MODEL_PATH}`, + "load the model, then start video capture to process frames in-browser.", +]); harExportButton.addEventListener("click", () => { try { @@ -570,6 +1594,8 @@ try { videoPreview.hidden = true; videoButton.textContent = "Start video"; delete window.videoCapture; + syncVideoCvLoop(); + syncVideoOutputView(); append("video stopped"); sendClientEvent("video", "stopped", { track_count: 0 }); return; @@ -581,6 +1607,8 @@ try { videoButton.textContent = "Stop video"; append(`video granted: ${videoCapture.trackCount()} video track(s)`); window.videoCapture = videoCapture; + syncVideoCvLoop(); + syncVideoOutputView(); sendClientEvent("video", "started", { track_count: videoCapture.trackCount(), }); @@ -882,6 +1910,57 @@ try { } }); + videoModelButton.addEventListener("click", async () => { + try { + if (!window.ort) { + throw new Error("onnxruntime-web did not load."); + } + + configureOnnxRuntimeWasm(); + + videoModelButton.disabled = true; + videoModelButton.textContent = "Loading video model..."; + updateVideoStatus(["loading model..."]); + + videoCvSession = await window.ort.InferenceSession.create( + VIDEO_MODEL_PATH, + { + executionProviders: ["wasm"], + }, + ); + + videoCvInputName = selectVideoModelInputName(videoCvSession); + videoCvOutputName = selectVideoModelOutputName(videoCvSession); + lastVideoCvLabel = null; + lastVideoInferenceSummary = null; + append( + `video cv model loaded: input=${videoCvInputName} output=${videoCvOutputName} input_dims=${ + JSON.stringify(videoCvSession.inputMetadata?.[videoCvInputName]?.dimensions ?? 
[]) + }`, + ); + syncVideoCvLoop(); + } catch (error) { + videoCvSession = null; + videoCvInputName = null; + videoCvOutputName = null; + stopVideoCvLoop(); + lastVideoInferenceSummary = null; + updateVideoStatus([ + `model load error: ${error instanceof Error ? error.message : String(error)}`, + ]); + append(`video cv error: ${error instanceof Error ? error.message : String(error)}`); + console.error(error); + } finally { + videoModelButton.disabled = false; + videoModelButton.textContent = videoCvSession ? "Reload video CV model" : "Load video CV model"; + } + }); + + videoOutputButton.addEventListener("click", () => { + videoOutputVisible = !videoOutputVisible; + syncVideoOutputView(); + }); + window.client = client; window.sendAlive = () => client.send_alive(); } catch (error) { diff --git a/services/ws-server/static/index.html b/services/ws-server/static/index.html index 630db76..2ccf512 100644 --- a/services/ws-server/static/index.html +++ b/services/ws-server/static/index.html @@ -138,14 +138,23 @@

WASM web agent

+        <button id="video-model-button" type="button">Load video CV model</button>
+        <button id="video-output-button" type="button">Show video output</button>

+      <pre id="video-output"></pre>
+      <canvas id="video-output-canvas" hidden></canvas>
Booting…