From ebcab0569a1a1a0795d5cea1e872f960cbf673b2 Mon Sep 17 00:00:00 2001
From: Pierre Tenedero
Date: Wed, 8 Apr 2026 17:44:33 +0800
Subject: [PATCH 1/2] Add face detection feature for video input
---
README.md | 10 +
services/ws-server/src/main.rs | 21 +
services/ws-server/static/app.js | 1079 ++++++++++++++++++++++++++
services/ws-server/static/index.html | 9 +
4 files changed, 1119 insertions(+)
diff --git a/README.md b/README.md
index 57531f3..7f8eedd 100644
--- a/README.md
+++ b/README.md
@@ -19,9 +19,18 @@ mise run ws-e2e-chrome
## Run ws agent in browser
+### HAR model setup
+
Download the ONNX model from https://modelnova.ai/models/details/human-activity-recognition,
and save it as `services/ws-server/static/models/human_activity_recognition.onnx`
+### Face detection setup
+
+Download the ONNX model from https://huggingface.co/amd/retinaface, save it in
+`services/ws-server/static/models/`, and rename the file to `video_cv.onnx`.
+
+### Build and run the agent
+
```bash
mise run build-ws-wasm-agent
mise run ws-server
@@ -35,6 +44,7 @@ which will normally be something like 192.168.1.x.
Then on your phone, open Chrome and navigate to https://192.168.1.x:8433/
Click "Load HAR model" and then "Start sensors".
+For webcam inference, click "Load video CV model" and then "Start video".
## Grant
diff --git a/services/ws-server/src/main.rs b/services/ws-server/src/main.rs
index 4e78025..3538a1d 100644
--- a/services/ws-server/src/main.rs
+++ b/services/ws-server/src/main.rs
@@ -177,6 +177,27 @@ impl StreamHandler> for WebSocketActor {
action,
details,
} => {
+ if capability == "video_cv" && action == "inference" {
+ let detected_class = details
+ .get("detected_class")
+ .and_then(|value| value.as_str())
+ .unwrap_or("unknown");
+ let confidence = details
+ .get("confidence")
+ .and_then(|value| value.as_f64())
+ .unwrap_or_default();
+ let processed_at = details
+ .get("processed_at")
+ .and_then(|value| value.as_str())
+ .unwrap_or("unknown");
+ info!(
+ "Video inference received from {}: class={} confidence={:.4} processed_at={}",
+ self.current_agent_id(),
+ detected_class,
+ confidence,
+ processed_at
+ );
+ }
info!(
"Client event from {}: capability={} action={} details={}",
self.current_agent_id(),
diff --git a/services/ws-server/static/app.js b/services/ws-server/static/app.js
index 3caa152..43409ac 100644
--- a/services/ws-server/static/app.js
+++ b/services/ws-server/static/app.js
@@ -24,12 +24,16 @@ const speechButton = document.getElementById("speech-button");
const nfcButton = document.getElementById("nfc-button");
const sensorsButton = document.getElementById("sensors-button");
const harButton = document.getElementById("har-button");
+const videoModelButton = document.getElementById("video-model-button");
+const videoOutputButton = document.getElementById("video-output-button");
const harExportButton = document.getElementById("har-export-button");
const agentStatusEl = document.getElementById("agent-status");
const agentIdEl = document.getElementById("agent-id");
const sensorOutputEl = document.getElementById("sensor-output");
const harOutputEl = document.getElementById("har-output");
+const videoOutputEl = document.getElementById("video-output");
const videoPreview = document.getElementById("video-preview");
+const videoOutputCanvas = document.getElementById("video-output-canvas");
let microphone = null;
let videoCapture = null;
let bluetoothDevice = null;
@@ -46,11 +50,36 @@ let harInferencePending = false;
let lastInferenceAt = 0;
let harSamplerId = null;
let lastHarClassLabel = null;
+let videoCvSession = null;
+let videoCvInputName = null;
+let videoCvOutputName = null;
+let videoCvLoopId = null;
+let videoCvInferencePending = false;
+let lastVideoInferenceAt = 0;
+let lastVideoCvLabel = null;
+let videoCvCanvas = null;
+let videoCvContext = null;
+let videoOverlayContext = videoOutputCanvas.getContext("2d");
+let videoOutputVisible = false;
+let videoRenderFrameId = null;
+let lastVideoInferenceSummary = null;
let gravityEstimate = { x: 0, y: 0, z: 0 };
let sendClientEvent = () => {};
const HAR_SEQUENCE_LENGTH = 512;
const HAR_FEATURE_COUNT = 9;
const HAR_SAMPLE_INTERVAL_MS = 20;
+const VIDEO_INFERENCE_INTERVAL_MS = 750;
+const VIDEO_RENDER_SCORE_THRESHOLD = 0.35;
+const VIDEO_MODEL_PATH = "/static/models/video_cv.onnx";
+const VIDEO_FALLBACK_INPUT_SIZE = 224;
+const RETINAFACE_INPUT_HEIGHT = 608;
+const RETINAFACE_INPUT_WIDTH = 640;
+const RETINAFACE_CONFIDENCE_THRESHOLD = 0.75;
+const RETINAFACE_NMS_THRESHOLD = 0.4;
+const RETINAFACE_VARIANCES = [0.1, 0.2];
+const RETINAFACE_MIN_SIZES = [[16, 32], [64, 128], [256, 512]];
+const RETINAFACE_STEPS = [8, 16, 32];
+const RETINAFACE_MEAN_BGR = [104, 117, 123];
const STANDARD_GRAVITY = 9.80665;
const GRAVITY_FILTER_ALPHA = 0.8;
const HAR_CLASS_LABELS = [
@@ -224,6 +253,10 @@ const setHarOutput = (lines) => {
harOutputEl.value = Array.isArray(lines) ? lines.join("\n") : String(lines);
};
+const setVideoOutput = (lines) => {
+ videoOutputEl.value = Array.isArray(lines) ? lines.join("\n") : String(lines);
+};
+
const updateHarStatus = (extraLines = []) => {
const lines = [
`model: ${harSession ? "loaded" : "not loaded"}`,
@@ -237,6 +270,27 @@ const updateHarStatus = (extraLines = []) => {
setHarOutput(lines.concat("", extraLines));
};
+const updateVideoStatus = (extraLines = []) => {
+ const inputMetadata = videoCvInputName
+ ? videoCvSession?.inputMetadata?.[videoCvInputName]
+ : null;
+ const outputMetadata = videoCvOutputName
+ ? videoCvSession?.outputMetadata?.[videoCvOutputName]
+ : null;
+ const lines = [
+ `model: ${videoCvSession ? "loaded" : "not loaded"}`,
+ `video: ${videoCapture ? "active" : "inactive"}`,
+ `input: ${videoCvInputName ?? "n/a"}`,
+ `output: ${videoCvOutputName ?? "n/a"}`,
+ `input dims: ${JSON.stringify(inputMetadata?.dimensions ?? [])}`,
+ `output dims: ${JSON.stringify(outputMetadata?.dimensions ?? [])}`,
+ `loop: ${videoCvLoopId === null ? "idle" : "running"}`,
+ `display: ${videoOutputVisible ? "visible" : "hidden"}`,
+ `mode: ${lastVideoInferenceSummary?.mode ?? "unknown"}`,
+ ];
+ setVideoOutput(lines.concat("", extraLines));
+};
+
const getFeatureVector = () => {
const totalAcceleration = motionState?.accelerationIncludingGravity ?? { x: 0, y: 0, z: 0 };
const bodyAcceleration = {
@@ -448,11 +502,981 @@ const requestSensorPermission = async (permissionTarget) => {
return permissionTarget.requestPermission();
};
+const getTopK = (values, limit = 3) => {
+ return values
+ .map((value, index) => ({ value, index }))
+ .sort((left, right) => right.value - left.value)
+ .slice(0, limit);
+};
+
+const ensureVideoCvCanvas = () => {
+ if (!videoCvCanvas) {
+ videoCvCanvas = document.createElement("canvas");
+ videoCvContext = videoCvCanvas.getContext("2d", { willReadFrequently: true });
+ }
+
+ if (!videoCvContext) {
+ throw new Error("Unable to create 2D canvas context for video preprocessing.");
+ }
+
+ return videoCvContext;
+};
+
+const ensureVideoOverlayContext = () => {
+ if (!videoOverlayContext) {
+ videoOverlayContext = videoOutputCanvas.getContext("2d");
+ }
+
+ if (!videoOverlayContext) {
+ throw new Error("Unable to create video output canvas context.");
+ }
+
+ return videoOverlayContext;
+};
+
+const isRetinaFaceSession = (session = videoCvSession) => {
+ if (!session) {
+ return false;
+ }
+
+ const inputNames = Array.isArray(session.inputNames) ? session.inputNames : [];
+ const outputNames = Array.isArray(session.outputNames) ? session.outputNames : [];
+ const allNames = inputNames.concat(outputNames).map((name) => String(name).toLowerCase());
+ if (allNames.some((name) => name.includes("retinaface"))) {
+ return true;
+ }
+
+ return outputNames.length === 3 && inputNames.length === 1;
+};
+
+const selectVideoModelInputName = (session) => {
+ const inputNames = Array.isArray(session?.inputNames) ? session.inputNames : [];
+ if (!inputNames.length) {
+ return null;
+ }
+
+ const ranked = inputNames
+ .map((name) => {
+ const metadata = session?.inputMetadata?.[name];
+ const dimensions = Array.isArray(metadata?.dimensions) ? metadata.dimensions : [];
+ const normalizedName = String(name).toLowerCase();
+ let score = 0;
+
+ if (dimensions.length === 4) {
+ score += 100;
+ } else if (dimensions.length === 3) {
+ score += 40;
+ }
+
+ if (
+ normalizedName.includes("pixel")
+ || normalizedName.includes("image")
+ || normalizedName.includes("images")
+ || normalizedName.includes("input")
+ ) {
+ score += 25;
+ }
+
+ if (normalizedName.includes("mask") || normalizedName.includes("token")) {
+ score -= 50;
+ }
+
+ return { name, score };
+ })
+ .sort((left, right) => right.score - left.score);
+
+ return ranked[0]?.name ?? inputNames[0];
+};
+
+const selectVideoModelOutputName = (session) => {
+ const outputNames = Array.isArray(session?.outputNames) ? session.outputNames : [];
+ if (!outputNames.length) {
+ return null;
+ }
+
+ const ranked = outputNames
+ .map((name) => {
+ const normalizedName = String(name).toLowerCase();
+ let score = 0;
+ if (normalizedName.includes("box")) {
+ score += 100;
+ }
+ if (normalizedName.includes("logit") || normalizedName.includes("score")) {
+ score += 40;
+ }
+ return { name, score };
+ })
+ .sort((left, right) => right.score - left.score);
+
+ return ranked[0]?.name ?? outputNames[0];
+};
+
+const resolveVideoModelLayout = () => {
+ if (!videoCvSession || !videoCvInputName) {
+ throw new Error("Video CV model is not loaded.");
+ }
+
+ if (isRetinaFaceSession(videoCvSession)) {
+ return {
+ dataType: "float32",
+ channels: 3,
+ width: RETINAFACE_INPUT_WIDTH,
+ height: RETINAFACE_INPUT_HEIGHT,
+ tensorDimensions: [1, RETINAFACE_INPUT_HEIGHT, RETINAFACE_INPUT_WIDTH, 3],
+ layout: "nhwc",
+ profile: "retinaface",
+ };
+ }
+
+ const metadata = videoCvSession.inputMetadata?.[videoCvInputName];
+ const dataType = metadata?.type ?? "float32";
+ if (dataType !== "float32" && dataType !== "uint8") {
+ throw new Error(`Unsupported video model input type: ${dataType}`);
+ }
+
+ const rawDimensions = Array.isArray(metadata?.dimensions)
+ ? metadata.dimensions
+ : [];
+ const dimensions = rawDimensions.length === 4
+ ? rawDimensions
+ : rawDimensions.length === 3
+ ? [1, ...rawDimensions]
+ : [1, 3, VIDEO_FALLBACK_INPUT_SIZE, VIDEO_FALLBACK_INPUT_SIZE];
+
+ const resolved = dimensions.map((dimension, index) => {
+ if (typeof dimension === "number" && Number.isFinite(dimension) && dimension > 0) {
+ return dimension;
+ }
+
+ if (index === 0) {
+ return 1;
+ }
+
+ if (index === 1 && dimensions.length === 4) {
+ const inputName = String(videoCvInputName).toLowerCase();
+ if (!inputName.includes("nhwc")) {
+ return 3;
+ }
+ }
+
+ return VIDEO_FALLBACK_INPUT_SIZE;
+ });
+
+ const secondDimension = resolved[1];
+ const lastDimension = resolved[3];
+ const inputName = String(videoCvInputName).toLowerCase();
+ const channelsFirst = inputName.includes("nhwc")
+ ? false
+ : secondDimension === 1
+ || secondDimension === 3
+ || ((lastDimension !== 1 && lastDimension !== 3) && !inputName.includes("image_embeddings"));
+ if (channelsFirst) {
+ const [, channels, height, width] = resolved;
+ if (channels !== 1 && channels !== 3) {
+ throw new Error(`Unsupported channel count for NCHW image input: ${channels}`);
+ }
+
+ return {
+ dataType,
+ channels,
+ width,
+ height,
+ tensorDimensions: [1, channels, height, width],
+ layout: "nchw",
+ profile: "generic",
+ };
+ }
+
+ const [, height, width, channels] = resolved;
+ if (channels !== 1 && channels !== 3) {
+ throw new Error(`Unsupported channel count for NHWC image input: ${channels}`);
+ }
+
+ return {
+ dataType,
+ channels,
+ width,
+ height,
+ tensorDimensions: [1, height, width, channels],
+ layout: "nhwc",
+ profile: "generic",
+ };
+};
+
+const buildVideoInputTensor = () => {
+ if (!videoCapture || !videoCvSession || !videoCvInputName) {
+ throw new Error("Video capture or model session is unavailable.");
+ }
+
+ if (!videoPreview.videoWidth || !videoPreview.videoHeight) {
+ throw new Error("Video stream is not ready yet.");
+ }
+
+ const {
+ dataType,
+ channels,
+ width,
+ height,
+ tensorDimensions,
+ layout,
+ profile,
+ } = resolveVideoModelLayout();
+ const context = ensureVideoCvCanvas();
+ videoCvCanvas.width = width;
+ videoCvCanvas.height = height;
+ let resizeRatio = 1;
+ if (profile === "retinaface") {
+ const sourceWidth = videoPreview.videoWidth;
+ const sourceHeight = videoPreview.videoHeight;
+ const targetRatio = height / width;
+ if (sourceHeight / sourceWidth <= targetRatio) {
+ resizeRatio = width / sourceWidth;
+ } else {
+ resizeRatio = height / sourceHeight;
+ }
+
+ const resizedWidth = Math.max(1, Math.min(width, Math.round(sourceWidth * resizeRatio)));
+ const resizedHeight = Math.max(1, Math.min(height, Math.round(sourceHeight * resizeRatio)));
+ context.clearRect(0, 0, width, height);
+ context.drawImage(videoPreview, 0, 0, resizedWidth, resizedHeight);
+ } else {
+ context.drawImage(videoPreview, 0, 0, width, height);
+ }
+
+ const rgba = context.getImageData(0, 0, width, height).data;
+ const elementCount = width * height * channels;
+ const tensorData = dataType === "uint8"
+ ? new Uint8Array(elementCount)
+ : new Float32Array(elementCount);
+
+ for (let pixelIndex = 0; pixelIndex < width * height; pixelIndex += 1) {
+ const rgbaIndex = pixelIndex * 4;
+ const red = rgba[rgbaIndex];
+ const green = rgba[rgbaIndex + 1];
+ const blue = rgba[rgbaIndex + 2];
+
+ if (profile === "retinaface") {
+ const tensorIndex = pixelIndex * channels;
+ tensorData[tensorIndex] = blue - RETINAFACE_MEAN_BGR[0];
+ tensorData[tensorIndex + 1] = green - RETINAFACE_MEAN_BGR[1];
+ tensorData[tensorIndex + 2] = red - RETINAFACE_MEAN_BGR[2];
+ continue;
+ }
+
+ if (channels === 1) {
+ const grayscale = Math.round(0.299 * red + 0.587 * green + 0.114 * blue);
+ tensorData[pixelIndex] = dataType === "uint8" ? grayscale : grayscale / 255;
+ continue;
+ }
+
+ if (layout === "nchw") {
+ const planeSize = width * height;
+ if (dataType === "uint8") {
+ tensorData[pixelIndex] = red;
+ tensorData[pixelIndex + planeSize] = green;
+ tensorData[pixelIndex + 2 * planeSize] = blue;
+ } else {
+ tensorData[pixelIndex] = red / 255;
+ tensorData[pixelIndex + planeSize] = green / 255;
+ tensorData[pixelIndex + 2 * planeSize] = blue / 255;
+ }
+ continue;
+ }
+
+ const tensorIndex = pixelIndex * channels;
+ if (dataType === "uint8") {
+ tensorData[tensorIndex] = red;
+ tensorData[tensorIndex + 1] = green;
+ tensorData[tensorIndex + 2] = blue;
+ } else {
+ tensorData[tensorIndex] = red / 255;
+ tensorData[tensorIndex + 1] = green / 255;
+ tensorData[tensorIndex + 2] = blue / 255;
+ }
+ }
+
+ return {
+ tensor: new window.ort.Tensor(dataType, tensorData, tensorDimensions),
+ preprocess: {
+ profile,
+ inputWidth: width,
+ inputHeight: height,
+ resizeRatio,
+ sourceWidth: videoPreview.videoWidth,
+ sourceHeight: videoPreview.videoHeight,
+ },
+ };
+};
+
+const looksLikeBoxes = (tensor) => {
+ if (!tensor?.dims || !tensor?.data) {
+ return false;
+ }
+
+ const dims = tensor.dims.filter((dimension) => Number.isFinite(dimension));
+ const values = Array.from(tensor.data ?? []);
+ const lastDimension = dims[dims.length - 1];
+ return values.length >= 4 && (lastDimension === 4 || lastDimension === 6 || lastDimension === 7);
+};
+
+const flattenFinite = (tensor) => {
+ return Array.from(tensor?.data ?? []).map(Number).filter((value) => Number.isFinite(value));
+};
+
+const normalizeBox = (boxValues, format = "xyxy") => {
+ if (boxValues.length < 4) {
+ return null;
+ }
+
+ let x1;
+ let y1;
+ let x2;
+ let y2;
+ if (format === "cxcywh") {
+ const [centerX, centerY, width, height] = boxValues;
+ x1 = centerX - width / 2;
+ y1 = centerY - height / 2;
+ x2 = centerX + width / 2;
+ y2 = centerY + height / 2;
+ } else {
+ [x1, y1, x2, y2] = boxValues;
+ }
+
+ if (x2 < x1) {
+ [x1, x2] = [x2, x1];
+ }
+ if (y2 < y1) {
+ [y1, y2] = [y2, y1];
+ }
+
+ const normalized = [x1, y1, x2, y2].map((value) => (
+ value > 1.5 ? value : Math.max(0, Math.min(1, value))
+ ));
+
+ return normalized;
+};
+
+const clamp = (value, min, max) => Math.max(min, Math.min(max, value));
+
+const buildRetinaFacePriors = (imageHeight, imageWidth) => {
+ const priors = [];
+ RETINAFACE_STEPS.forEach((step, index) => {
+ const featureMapHeight = Math.ceil(imageHeight / step);
+ const featureMapWidth = Math.ceil(imageWidth / step);
+ const minSizes = RETINAFACE_MIN_SIZES[index];
+
+ for (let row = 0; row < featureMapHeight; row += 1) {
+ for (let column = 0; column < featureMapWidth; column += 1) {
+ minSizes.forEach((minSize) => {
+ priors.push([
+ ((column + 0.5) * step) / imageWidth,
+ ((row + 0.5) * step) / imageHeight,
+ minSize / imageWidth,
+ minSize / imageHeight,
+ ]);
+ });
+ }
+ }
+ });
+ return priors;
+};
+
+const decodeRetinaFaceBox = (loc, prior) => {
+ const centerX = prior[0] + loc[0] * RETINAFACE_VARIANCES[0] * prior[2];
+ const centerY = prior[1] + loc[1] * RETINAFACE_VARIANCES[0] * prior[3];
+ const width = prior[2] * Math.exp(loc[2] * RETINAFACE_VARIANCES[1]);
+ const height = prior[3] * Math.exp(loc[3] * RETINAFACE_VARIANCES[1]);
+ return [
+ centerX - width / 2,
+ centerY - height / 2,
+ centerX + width / 2,
+ centerY + height / 2,
+ ];
+};
+
+const computeIoU = (left, right) => {
+ const x1 = Math.max(left.box[0], right.box[0]);
+ const y1 = Math.max(left.box[1], right.box[1]);
+ const x2 = Math.min(left.box[2], right.box[2]);
+ const y2 = Math.min(left.box[3], right.box[3]);
+ const width = Math.max(0, x2 - x1 + 1);
+ const height = Math.max(0, y2 - y1 + 1);
+ const intersection = width * height;
+ const leftArea = Math.max(0, left.box[2] - left.box[0] + 1) * Math.max(0, left.box[3] - left.box[1] + 1);
+ const rightArea = Math.max(0, right.box[2] - right.box[0] + 1) * Math.max(0, right.box[3] - right.box[1] + 1);
+ return intersection / Math.max(1e-6, leftArea + rightArea - intersection);
+};
+
+const applyNms = (detections, threshold) => {
+ const sorted = [...detections].sort((left, right) => right.score - left.score);
+ const kept = [];
+
+ sorted.forEach((candidate) => {
+ if (kept.every((accepted) => computeIoU(candidate, accepted) <= threshold)) {
+ kept.push(candidate);
+ }
+ });
+
+ return kept;
+};
+
+const decodeRetinaFaceOutputs = (outputs, preprocess) => {
+ if (!preprocess || preprocess.profile !== "retinaface") {
+ return null;
+ }
+
+ const outputNames = Array.isArray(videoCvSession?.outputNames) ? videoCvSession.outputNames : [];
+ if (outputNames.length < 3) {
+ return null;
+ }
+
+ const locTensor = outputs[outputNames[0]];
+ const confTensor = outputs[outputNames[1]];
+ const landmTensor = outputs[outputNames[2]];
+ if (!locTensor || !confTensor || !landmTensor) {
+ return null;
+ }
+
+ const locValues = flattenFinite(locTensor);
+ const confValues = flattenFinite(confTensor);
+ const landmValues = flattenFinite(landmTensor);
+ const priorCount = locValues.length / 4;
+ if (priorCount <= 0 || confValues.length / 2 !== priorCount || landmValues.length / 10 !== priorCount) {
+ return null;
+ }
+
+ const priors = buildRetinaFacePriors(preprocess.inputHeight, preprocess.inputWidth);
+ if (priors.length !== priorCount) {
+ return null;
+ }
+
+ const detections = [];
+ for (let index = 0; index < priorCount; index += 1) {
+ const score = softmax(confValues.slice(index * 2, index * 2 + 2))[1] ?? 0;
+ if (score < RETINAFACE_CONFIDENCE_THRESHOLD) {
+ continue;
+ }
+
+ const decoded = decodeRetinaFaceBox(
+ locValues.slice(index * 4, index * 4 + 4),
+ priors[index],
+ );
+ const scaledBox = [
+ clamp((decoded[0] * preprocess.inputWidth) / preprocess.resizeRatio, 0, preprocess.sourceWidth),
+ clamp((decoded[1] * preprocess.inputHeight) / preprocess.resizeRatio, 0, preprocess.sourceHeight),
+ clamp((decoded[2] * preprocess.inputWidth) / preprocess.resizeRatio, 0, preprocess.sourceWidth),
+ clamp((decoded[3] * preprocess.inputHeight) / preprocess.resizeRatio, 0, preprocess.sourceHeight),
+ ];
+
+ detections.push({
+ label: "face",
+ class_index: 0,
+ score,
+ box: scaledBox,
+ });
+ }
+
+ const filtered = applyNms(detections, RETINAFACE_NMS_THRESHOLD);
+ if (!filtered.length) {
+ return {
+ mode: "detection",
+ detections: [],
+ detected_class: "no_detection",
+ class_index: -1,
+ confidence: 0,
+ probabilities: [],
+ top_classes: [],
+ };
+ }
+
+ const best = filtered[0];
+ return {
+ mode: "detection",
+ detections: filtered,
+ detected_class: best.label,
+ class_index: best.class_index,
+ confidence: best.score,
+ probabilities: filtered.map((entry) => entry.score),
+ top_classes: filtered.slice(0, 3).map((entry) => ({
+ label: entry.label,
+ index: entry.class_index,
+ probability: entry.score,
+ })),
+ };
+};
+
+const findDetectionTensor = (entries, patterns, predicate = () => true) => {
+ return entries.find(([name, tensor]) => {
+ const normalizedName = String(name).toLowerCase();
+ return patterns.some((pattern) => pattern.test(normalizedName)) && predicate(tensor);
+ }) ?? null;
+};
+
+const decodeHuggingFaceDetectionOutputs = (entries) => {
+ const boxesEntry = findDetectionTensor(
+ entries,
+ [/pred_boxes/, /boxes?/, /bbox/],
+ (tensor) => (Array.isArray(tensor?.dims) ? tensor.dims[tensor.dims.length - 1] : null) === 4,
+ );
+ const logitsEntry = findDetectionTensor(
+ entries,
+ [/logits/, /scores?/, /class/],
+ (tensor) => (Array.isArray(tensor?.dims) ? tensor.dims[tensor.dims.length - 1] : 0) > 1,
+ );
+
+ if (!boxesEntry || !logitsEntry) {
+ return null;
+ }
+
+ const [boxesName, boxesTensor] = boxesEntry;
+ const [, logitsTensor] = logitsEntry;
+ const rawBoxes = flattenFinite(boxesTensor);
+ const rawLogits = flattenFinite(logitsTensor);
+ const boxCount = Math.floor(rawBoxes.length / 4);
+ const classCount = boxCount > 0 ? Math.floor(rawLogits.length / boxCount) : 0;
+ if (boxCount <= 0 || classCount <= 1) {
+ return null;
+ }
+
+ const usesCenterBoxes = /pred_boxes/.test(String(boxesName).toLowerCase());
+ const detections = [];
+ for (let index = 0; index < boxCount; index += 1) {
+ const box = rawBoxes.slice(index * 4, index * 4 + 4);
+ const logits = rawLogits.slice(index * classCount, index * classCount + classCount);
+ const candidateLogits = logits.length > 1 ? logits.slice(0, -1) : logits;
+ const probabilities = softmax(candidateLogits);
+ const best = getTopK(probabilities, 1)[0];
+ if (!best || best.value < VIDEO_RENDER_SCORE_THRESHOLD) {
+ continue;
+ }
+
+ const normalizedBox = normalizeBox(box, usesCenterBoxes ? "cxcywh" : "xyxy");
+ if (!normalizedBox) {
+ continue;
+ }
+
+ detections.push({
+ label: `class_${best.index}`,
+ class_index: best.index,
+ score: best.value,
+ box: normalizedBox,
+ });
+ }
+
+ if (!detections.length) {
+ return {
+ mode: "detection",
+ detections: [],
+ detected_class: "no_detection",
+ class_index: -1,
+ confidence: 0,
+ probabilities: [],
+ top_classes: [],
+ };
+ }
+
+ detections.sort((left, right) => right.score - left.score);
+ const best = detections[0];
+ return {
+ mode: "detection",
+ detections,
+ detected_class: best.label,
+ class_index: best.class_index,
+ confidence: best.score,
+ probabilities: detections.map((entry) => entry.score),
+ top_classes: detections.slice(0, 3).map((entry) => ({
+ label: entry.label,
+ index: entry.class_index,
+ probability: entry.score,
+ })),
+ };
+};
+
+const decodeDetectionOutputs = (outputs) => {
+ const entries = Object.entries(outputs);
+ const huggingFaceSummary = decodeHuggingFaceDetectionOutputs(entries);
+ if (huggingFaceSummary) {
+ return huggingFaceSummary;
+ }
+
+ const boxesEntry = entries.find(([, tensor]) => looksLikeBoxes(tensor));
+
+ if (!boxesEntry) {
+ return null;
+ }
+
+ const [boxesName, boxesTensor] = boxesEntry;
+ const boxDims = Array.isArray(boxesTensor.dims) ? boxesTensor.dims : [];
+ const rawBoxes = flattenFinite(boxesTensor);
+ const boxWidth = boxDims[boxDims.length - 1] ?? 4;
+ const detectionCount = Math.floor(rawBoxes.length / boxWidth);
+ if (detectionCount <= 0) {
+ return null;
+ }
+
+ const scoresEntry = entries.find(([name, tensor]) =>
+ name !== boxesName && flattenFinite(tensor).length >= detectionCount
+ );
+ const classEntry = entries.find(([name, tensor]) =>
+ name !== boxesName && name !== scoresEntry?.[0] && flattenFinite(tensor).length >= detectionCount
+ );
+ const detections = [];
+ const scoreValues = scoresEntry ? flattenFinite(scoresEntry[1]) : [];
+ const classValues = classEntry ? flattenFinite(classEntry[1]) : [];
+
+ for (let index = 0; index < detectionCount; index += 1) {
+ const start = index * boxWidth;
+ const row = rawBoxes.slice(start, start + boxWidth);
+ const normalizedBox = normalizeBox(row);
+ if (!normalizedBox) {
+ continue;
+ }
+
+ let score = Number(scoreValues[index] ?? row[4] ?? row[5] ?? 1);
+ if (!Number.isFinite(score)) {
+ score = 1;
+ }
+
+ let classIndex = classValues[index];
+ if (!Number.isFinite(classIndex)) {
+ classIndex = row.length >= 6 ? row[5] : row.length >= 7 ? row[6] : index;
+ }
+
+ if (score < VIDEO_RENDER_SCORE_THRESHOLD) {
+ continue;
+ }
+
+ detections.push({
+ label: `class_${Math.round(classIndex)}`,
+ class_index: Math.round(classIndex),
+ score,
+ box: normalizedBox,
+ });
+ }
+
+ if (!detections.length) {
+ return {
+ mode: "detection",
+ detections: [],
+ detected_class: "no_detection",
+ class_index: -1,
+ confidence: 0,
+ probabilities: [],
+ top_classes: [],
+ };
+ }
+
+ detections.sort((left, right) => right.score - left.score);
+ const best = detections[0];
+ return {
+ mode: "detection",
+ detections,
+ detected_class: best.label,
+ class_index: best.class_index,
+ confidence: best.score,
+ probabilities: detections.map((entry) => entry.score),
+ top_classes: detections.slice(0, 3).map((entry) => ({
+ label: entry.label,
+ index: entry.class_index,
+ probability: entry.score,
+ })),
+ };
+};
+
+const decodeClassificationOutputs = (output) => {
+ const values = Array.from(output?.data ?? []);
+ if (values.length === 0) {
+ throw new Error("Video model returned an empty output tensor.");
+ }
+
+ if (values.length === 1) {
+ return {
+ mode: "classification",
+ detections: [],
+ detected_class: "scalar_output",
+ class_index: 0,
+ confidence: Number(values[0]),
+ probabilities: values,
+ top_classes: [{ label: "scalar_output", index: 0, probability: Number(values[0]) }],
+ };
+ }
+
+ const probabilities = softmax(values);
+ const ranked = getTopK(probabilities, 3);
+ const best = ranked[0];
+
+ return {
+ mode: "classification",
+ detections: [],
+ detected_class: `class_${best.index}`,
+ class_index: best.index,
+ confidence: best.value,
+ probabilities,
+ top_classes: ranked.map(({ index, value }) => ({
+ label: `class_${index}`,
+ index,
+ probability: value,
+ logit: values[index],
+ })),
+ };
+};
+
+const summarizeVideoOutput = (outputMap, preprocess = null) => {
+ const retinaFaceSummary = decodeRetinaFaceOutputs(outputMap, preprocess);
+ if (retinaFaceSummary) {
+ return retinaFaceSummary;
+ }
+
+ const detectionSummary = decodeDetectionOutputs(outputMap);
+ if (detectionSummary) {
+ return detectionSummary;
+ }
+
+ const primaryOutput = outputMap[videoCvOutputName];
+ const primaryValues = Array.from(primaryOutput?.data ?? []);
+ if (primaryValues.length > 0 && primaryValues.length <= 4096) {
+ return decodeClassificationOutputs(primaryOutput);
+ }
+
+ return {
+ mode: "passthrough",
+ detections: [],
+ detected_class: "unrecognized_output",
+ class_index: -1,
+ confidence: 0,
+ probabilities: [],
+ top_classes: [],
+ };
+};
+
+const drawOverlayText = (context, lines) => {
+ if (!lines.length) {
+ return;
+ }
+
+ context.font = "18px ui-monospace, monospace";
+ const lineHeight = 24;
+ const width = Math.max(...lines.map((line) => context.measureText(line).width), 0) + 20;
+ const height = lines.length * lineHeight + 12;
+ context.fillStyle = "rgba(24, 32, 40, 0.72)";
+ context.fillRect(12, 12, width, height);
+ context.fillStyle = "#fffdfa";
+ lines.forEach((line, index) => {
+ context.fillText(line, 22, 36 + index * lineHeight);
+ });
+};
+
+const renderVideoOutputFrame = () => {
+ videoRenderFrameId = null;
+
+ if (!videoOutputVisible || !videoCapture || !videoPreview.videoWidth || !videoPreview.videoHeight) {
+ return;
+ }
+
+ const context = ensureVideoOverlayContext();
+ const width = videoPreview.videoWidth;
+ const height = videoPreview.videoHeight;
+ if (videoOutputCanvas.width !== width || videoOutputCanvas.height !== height) {
+ videoOutputCanvas.width = width;
+ videoOutputCanvas.height = height;
+ }
+
+ context.drawImage(videoPreview, 0, 0, width, height);
+
+ if (lastVideoInferenceSummary?.mode === "detection") {
+ context.lineWidth = 3;
+ context.font = "16px ui-monospace, monospace";
+ lastVideoInferenceSummary.detections.forEach((entry) => {
+ const [x1, y1, x2, y2] = entry.box;
+ const left = x1 <= 1 ? x1 * width : x1;
+ const top = y1 <= 1 ? y1 * height : y1;
+ const right = x2 <= 1 ? x2 * width : x2;
+ const bottom = y2 <= 1 ? y2 * height : y2;
+ const boxWidth = Math.max(1, right - left);
+ const boxHeight = Math.max(1, bottom - top);
+
+ context.strokeStyle = "#ef8f35";
+ context.strokeRect(left, top, boxWidth, boxHeight);
+
+ const label = `${entry.label} ${(entry.score * 100).toFixed(1)}%`;
+ const textWidth = context.measureText(label).width + 10;
+ context.fillStyle = "#182028";
+ context.fillRect(left, Math.max(0, top - 24), textWidth, 22);
+ context.fillStyle = "#fffdfa";
+ context.fillText(label, left + 5, Math.max(16, top - 8));
+ });
+ } else if (lastVideoInferenceSummary?.mode === "classification") {
+ drawOverlayText(context, [
+ `classification: ${lastVideoInferenceSummary.detected_class}`,
+ `confidence: ${(lastVideoInferenceSummary.confidence * 100).toFixed(1)}%`,
+ ]);
+ } else if (lastVideoInferenceSummary?.mode === "passthrough") {
+ drawOverlayText(context, [
+ "output mode: passthrough",
+ "model output not recognized as detection or classification",
+ ]);
+ }
+
+ videoRenderFrameId = window.requestAnimationFrame(renderVideoOutputFrame);
+};
+
+const syncVideoOutputView = () => {
+ videoOutputCanvas.hidden = !videoOutputVisible || !videoCapture;
+ videoOutputButton.textContent = videoOutputVisible ? "Hide video output" : "Show video output";
+
+ if (!videoOutputVisible || !videoCapture) {
+ if (videoRenderFrameId !== null) {
+ window.cancelAnimationFrame(videoRenderFrameId);
+ videoRenderFrameId = null;
+ }
+ updateVideoStatus();
+ return;
+ }
+
+ if (videoRenderFrameId === null) {
+ videoRenderFrameId = window.requestAnimationFrame(renderVideoOutputFrame);
+ }
+ updateVideoStatus();
+};
+
+const stopVideoCvLoop = () => {
+ if (videoCvLoopId !== null) {
+ window.clearInterval(videoCvLoopId);
+ videoCvLoopId = null;
+ }
+ lastVideoCvLabel = null;
+ updateVideoStatus();
+};
+
+const inferVideoPrediction = async () => {
+ if (
+ !videoCapture
+ || !videoCvSession
+ || !videoCvInputName
+ || !videoCvOutputName
+ || videoCvInferencePending
+ ) {
+ return;
+ }
+
+ const now = Date.now();
+ if (now - lastVideoInferenceAt < VIDEO_INFERENCE_INTERVAL_MS) {
+ return;
+ }
+
+ videoCvInferencePending = true;
+ lastVideoInferenceAt = now;
+
+ try {
+ const { tensor: input, preprocess } = buildVideoInputTensor();
+ const outputMap = await videoCvSession.run({ [videoCvInputName]: input });
+ const output = outputMap[videoCvOutputName];
+ const summary = summarizeVideoOutput(outputMap, preprocess);
+ const labelChanged = summary.detected_class !== lastVideoCvLabel;
+ lastVideoCvLabel = summary.detected_class;
+ lastVideoInferenceSummary = summary;
+
+ updateVideoStatus([
+ `output mode: ${summary.mode}`,
+ `prediction: ${summary.detected_class}`,
+ `confidence: ${summary.confidence.toFixed(4)}`,
+ ...(
+ summary.mode === "detection"
+ ? [
+ `detections: ${summary.detections.length}`,
+ ...summary.detections.slice(0, 3).map(
+ (entry) =>
+ `${entry.label}: score=${entry.score.toFixed(4)} box=${
+ entry.box.map((value) => value.toFixed(3)).join(",")
+ }`,
+ ),
+ ]
+ : [
+ "top classes:",
+ ...summary.top_classes.map(
+ (entry) =>
+ `${entry.label}: p=${entry.probability.toFixed(4)} logit=${
+ Number(entry.logit ?? entry.probability).toFixed(4)
+ }`,
+ ),
+ ]
+ ),
+ `frame: ${videoPreview.videoWidth}x${videoPreview.videoHeight}`,
+ `processed at: ${new Date().toLocaleTimeString()}`,
+ ]);
+ syncVideoOutputView();
+
+ sendClientEvent("video_cv", "inference", {
+ mode: summary.mode,
+ detected_class: summary.detected_class,
+ class_index: summary.class_index,
+ confidence: summary.confidence,
+ probabilities: summary.probabilities,
+ top_classes: summary.top_classes,
+ detections: summary.detections,
+ changed: labelChanged,
+ processed_at: new Date().toISOString(),
+ model_path: VIDEO_MODEL_PATH,
+ input_name: videoCvInputName,
+ output_name: videoCvOutputName,
+ input_dimensions: videoCvSession.inputMetadata?.[videoCvInputName]?.dimensions ?? [],
+ output_dimensions: Array.isArray(output?.dims) ? output.dims : [],
+ source_resolution: {
+ width: videoPreview.videoWidth,
+ height: videoPreview.videoHeight,
+ },
+ });
+ } catch (error) {
+ lastVideoInferenceSummary = {
+ mode: "passthrough",
+ detections: [],
+ detected_class: "inference_error",
+ class_index: -1,
+ confidence: 0,
+ probabilities: [],
+ top_classes: [],
+ };
+ updateVideoStatus([
+ `inference error: ${error instanceof Error ? error.message : String(error)}`,
+ ]);
+ console.error(error);
+ } finally {
+ videoCvInferencePending = false;
+ }
+};
+
+const syncVideoCvLoop = () => {
+ if (videoCapture && videoCvSession) {
+ if (videoCvLoopId === null) {
+ videoCvLoopId = window.setInterval(() => {
+ void inferVideoPrediction();
+ }, VIDEO_INFERENCE_INTERVAL_MS);
+ }
+ updateVideoStatus([
+ "browser-side webcam inference active",
+ "results are sent to the backend over the websocket.",
+ ]);
+ return;
+ }
+
+ stopVideoCvLoop();
+ lastVideoInferenceSummary = null;
+ updateVideoStatus([
+ videoCvSession
+ ? "model loaded; start video capture to begin inference."
+ : `model file: ${VIDEO_MODEL_PATH}`,
+ ]);
+};
+
renderSensorOutput();
updateHarStatus([
"local-only inference path",
"model file: /static/models/human_activity_recognition.onnx",
]);
+updateVideoStatus([
+ `model file: ${VIDEO_MODEL_PATH}`,
+ "load the model, then start video capture to process frames in-browser.",
+]);
harExportButton.addEventListener("click", () => {
try {
@@ -570,6 +1594,8 @@ try {
videoPreview.hidden = true;
videoButton.textContent = "Start video";
delete window.videoCapture;
+ syncVideoCvLoop();
+ syncVideoOutputView();
append("video stopped");
sendClientEvent("video", "stopped", { track_count: 0 });
return;
@@ -581,6 +1607,8 @@ try {
videoButton.textContent = "Stop video";
append(`video granted: ${videoCapture.trackCount()} video track(s)`);
window.videoCapture = videoCapture;
+ syncVideoCvLoop();
+ syncVideoOutputView();
sendClientEvent("video", "started", {
track_count: videoCapture.trackCount(),
});
@@ -882,6 +1910,57 @@ try {
}
});
+ videoModelButton.addEventListener("click", async () => {
+ try {
+ if (!window.ort) {
+ throw new Error("onnxruntime-web did not load.");
+ }
+
+ configureOnnxRuntimeWasm();
+
+ videoModelButton.disabled = true;
+ videoModelButton.textContent = "Loading video model...";
+ updateVideoStatus(["loading model..."]);
+
+ videoCvSession = await window.ort.InferenceSession.create(
+ VIDEO_MODEL_PATH,
+ {
+ executionProviders: ["wasm"],
+ },
+ );
+
+ videoCvInputName = selectVideoModelInputName(videoCvSession);
+ videoCvOutputName = selectVideoModelOutputName(videoCvSession);
+ lastVideoCvLabel = null;
+ lastVideoInferenceSummary = null;
+ append(
+ `video cv model loaded: input=${videoCvInputName} output=${videoCvOutputName} input_dims=${
+ JSON.stringify(videoCvSession.inputMetadata?.[videoCvInputName]?.dimensions ?? [])
+ }`,
+ );
+ syncVideoCvLoop();
+ } catch (error) {
+ videoCvSession = null;
+ videoCvInputName = null;
+ videoCvOutputName = null;
+ stopVideoCvLoop();
+ lastVideoInferenceSummary = null;
+ updateVideoStatus([
+ `model load error: ${error instanceof Error ? error.message : String(error)}`,
+ ]);
+ append(`video cv error: ${error instanceof Error ? error.message : String(error)}`);
+ console.error(error);
+ } finally {
+ videoModelButton.disabled = false;
+ videoModelButton.textContent = videoCvSession ? "Reload video CV model" : "Load video CV model";
+ }
+ });
+
+ videoOutputButton.addEventListener("click", () => {
+ videoOutputVisible = !videoOutputVisible;
+ syncVideoOutputView();
+ });
+
window.client = client;
window.sendAlive = () => client.send_alive();
} catch (error) {
diff --git a/services/ws-server/static/index.html b/services/ws-server/static/index.html
index 630db76..2ccf512 100644
--- a/services/ws-server/static/index.html
+++ b/services/ws-server/static/index.html
@@ -138,14 +138,23 @@ WASM web agent
+
+
+
+
Booting…
From e08c07163de1b8482d43b9e815903138de441dce Mon Sep 17 00:00:00 2001
From: Pierre Tenedero
Date: Thu, 9 Apr 2026 17:08:55 +0800
Subject: [PATCH 2/2] Create loadable face detection module
---
.mise.toml | 7 +-
Cargo.toml | 8 +-
README.md | 8 +-
services/ws-modules/face-detection/Cargo.toml | 31 +
services/ws-modules/face-detection/src/lib.rs | 849 ++++++++++++++++++
services/ws-server/static/app.js | 68 +-
services/ws-server/static/index.html | 8 +-
7 files changed, 935 insertions(+), 44 deletions(-)
create mode 100644 services/ws-modules/face-detection/Cargo.toml
create mode 100644 services/ws-modules/face-detection/src/lib.rs
diff --git a/.mise.toml b/.mise.toml
index 586e2c3..6d21ef5 100644
--- a/.mise.toml
+++ b/.mise.toml
@@ -90,8 +90,13 @@ description = "Build the har1 workflow WASM module"
dir = "services/ws-modules/har1"
run = "wasm-pack build . --target web"
+[tasks.build-ws-face-detection-module]
+description = "Build the face detection workflow WASM module"
+dir = "services/ws-modules/face-detection"
+run = "wasm-pack build . --target web"
+
[tasks.build]
-depends = ["build-ws-har1-module", "build-ws-wasm-agent"]
+depends = ["build-ws-face-detection-module", "build-ws-har1-module", "build-ws-wasm-agent"]
description = "Build all WebAssembly modules"
[tasks.test-ws-wasm-agent-firefox]
diff --git a/Cargo.toml b/Cargo.toml
index 026f1b9..7eeb046 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,11 @@
[workspace]
-members = ["libs/edge-toolkit", "services/ws-modules/har1", "services/ws-server", "services/ws-wasm-agent"]
+members = [
+ "libs/edge-toolkit",
+ "services/ws-modules/face-detection",
+ "services/ws-modules/har1",
+ "services/ws-server",
+ "services/ws-wasm-agent",
+]
resolver = "2"
[workspace.dependencies]
diff --git a/README.md b/README.md
index e52d4fd..c3f5ce7 100644
--- a/README.md
+++ b/README.md
@@ -26,14 +26,16 @@ and save it as `services/ws-server/static/models/human_activity_recognition.onnx
### Face detection setup
-Download the onnx from https://huggingface.co/amd/retinaface and save it in
-`services/ws-server/static/models/` and rename the file to `video_cv.onnx`.
+1. Download RetinaFace_int.onnx from https://huggingface.co/amd/retinaface/tree/main/weights
+2. Save it in `services/ws-server/static/models/`
+3. Rename the file to `video_cv.onnx`.
### Build and run the agent
```bash
mise run build-ws-wasm-agent
mise run build-ws-har1-module
+mise run build-ws-face-detection-module
mise run ws-server
```
@@ -46,7 +48,7 @@ Then on your phone, open Chrome and type in https://192.168.1.x:8433/
Click "har demo".
-For webcam inference, click "Load video CV model" and then "Start video".
+For webcam inference, click "face demo".
## Grant
diff --git a/services/ws-modules/face-detection/Cargo.toml b/services/ws-modules/face-detection/Cargo.toml
new file mode 100644
index 0000000..59725b4
--- /dev/null
+++ b/services/ws-modules/face-detection/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "et-ws-face-detection"
+version = "0.1.0"
+edition = "2024"
+
+[lib]
+crate-type = ["cdylib", "rlib"]
+
+[dependencies]
+et-ws-wasm-agent = { path = "../../ws-wasm-agent" }
+js-sys = "0.3"
+serde.workspace = true
+serde_json.workspace = true
+tracing.workspace = true
+tracing-wasm = "0.2"
+wasm-bindgen = "0.2"
+wasm-bindgen-futures = "0.4"
+web-sys = { version = "0.3", features = [
+ "BinaryType",
+ "Document",
+ "Event",
+ "EventTarget",
+ "MessageEvent",
+ "Storage",
+ "WebSocket",
+ "Window",
+ "console",
+] }
+
+[dev-dependencies]
+wasm-bindgen-test = "0.3"
diff --git a/services/ws-modules/face-detection/src/lib.rs b/services/ws-modules/face-detection/src/lib.rs
new file mode 100644
index 0000000..980d784
--- /dev/null
+++ b/services/ws-modules/face-detection/src/lib.rs
@@ -0,0 +1,849 @@
+use std::cell::{Cell, RefCell};
+use std::rc::Rc;
+
+use et_ws_wasm_agent::{VideoCapture, WsClient, WsClientConfig};
+use js_sys::{Array, Float32Array, Function, Promise, Reflect};
+use serde_json::json;
+use tracing::info;
+use wasm_bindgen::JsCast;
+use wasm_bindgen::prelude::*;
+use wasm_bindgen_futures::{JsFuture, spawn_local};
+
+const FACE_MODEL_PATH: &str = "/static/models/video_cv.onnx";
+const FACE_INPUT_WIDTH: usize = 640;
+const FACE_INPUT_HEIGHT: usize = 608;
+const FACE_INPUT_WIDTH_F64: f64 = FACE_INPUT_WIDTH as f64;
+const FACE_INPUT_HEIGHT_F64: f64 = FACE_INPUT_HEIGHT as f64;
+const FACE_INFERENCE_INTERVAL_MS: i32 = 750;
+const FACE_RENDER_INTERVAL_MS: i32 = 60;
+const RETINAFACE_CONFIDENCE_THRESHOLD: f64 = 0.75;
+const RETINAFACE_NMS_THRESHOLD: f64 = 0.4;
+const RETINAFACE_VARIANCES: [f64; 2] = [0.1, 0.2];
+const RETINAFACE_MIN_SIZES: [&[f64]; 3] = [&[16.0, 32.0], &[64.0, 128.0], &[256.0, 512.0]];
+const RETINAFACE_STEPS: [f64; 3] = [8.0, 16.0, 32.0];
+
+#[wasm_bindgen(inline_js = r##"
+export async function face_attach_stream(stream) {
+ const video = document.getElementById("face-video-preview");
+ if (!video) {
+ throw new Error("Missing #face-video-preview element");
+ }
+
+ video.srcObject = stream;
+ video.hidden = false;
+
+ if (!video.videoWidth || !video.videoHeight) {
+ await new Promise((resolve, reject) => {
+ const onLoaded = () => {
+ cleanup();
+ resolve();
+ };
+ const onError = () => {
+ cleanup();
+ reject(new Error("Video stream metadata did not load"));
+ };
+ const cleanup = () => {
+ video.removeEventListener("loadedmetadata", onLoaded);
+ video.removeEventListener("error", onError);
+ };
+ video.addEventListener("loadedmetadata", onLoaded, { once: true });
+ video.addEventListener("error", onError, { once: true });
+ });
+ }
+
+ const playResult = video.play?.();
+ if (playResult?.catch) {
+ try {
+ await playResult;
+ } catch {
+ // Browsers may reject autoplay even after a gesture; metadata is enough for capture.
+ }
+ }
+}
+
+export function face_detach_stream() {
+ const video = document.getElementById("face-video-preview");
+ const canvas = document.getElementById("face-video-output-canvas");
+ if (video) {
+ video.pause?.();
+ video.srcObject = null;
+ video.hidden = true;
+ }
+ if (canvas) {
+ canvas.hidden = true;
+ const context = canvas.getContext("2d");
+ context?.clearRect(0, 0, canvas.width, canvas.height);
+ }
+}
+
+export function face_set_status(message) {
+ const output = document.getElementById("face-output");
+ if (output) {
+ output.value = String(message);
+ }
+}
+
+export function face_log(message) {
+ const line = `[face-detection] ${message}`;
+ console.log(line);
+ const logEl = document.getElementById("log");
+ if (!logEl) {
+ return;
+ }
+ const current = logEl.textContent ?? "";
+ logEl.textContent = current ? `${current}\n${line}` : line;
+}
+
+export function face_capture_input_tensor() {
+ const video = document.getElementById("face-video-preview");
+ if (!video?.videoWidth || !video?.videoHeight) {
+ throw new Error("Video stream is not ready yet.");
+ }
+
+ const width = 640;
+ const height = 608;
+ const mean = [104, 117, 123];
+ const canvas = globalThis.__etFacePreprocessCanvas ?? document.createElement("canvas");
+ globalThis.__etFacePreprocessCanvas = canvas;
+ const context = canvas.getContext("2d", { willReadFrequently: true });
+ if (!context) {
+ throw new Error("Unable to create face preprocessing canvas context.");
+ }
+
+ canvas.width = width;
+ canvas.height = height;
+
+ const sourceWidth = video.videoWidth;
+ const sourceHeight = video.videoHeight;
+ const targetRatio = height / width;
+ let resizeRatio;
+ if (sourceHeight / sourceWidth <= targetRatio) {
+ resizeRatio = width / sourceWidth;
+ } else {
+ resizeRatio = height / sourceHeight;
+ }
+
+ const resizedWidth = Math.max(1, Math.min(width, Math.round(sourceWidth * resizeRatio)));
+ const resizedHeight = Math.max(1, Math.min(height, Math.round(sourceHeight * resizeRatio)));
+ context.clearRect(0, 0, width, height);
+ context.drawImage(video, 0, 0, resizedWidth, resizedHeight);
+
+ const rgba = context.getImageData(0, 0, width, height).data;
+ const tensorData = new Float32Array(width * height * 3);
+
+ for (let pixelIndex = 0; pixelIndex < width * height; pixelIndex += 1) {
+ const rgbaIndex = pixelIndex * 4;
+ const red = rgba[rgbaIndex];
+ const green = rgba[rgbaIndex + 1];
+ const blue = rgba[rgbaIndex + 2];
+ const tensorIndex = pixelIndex * 3;
+ tensorData[tensorIndex] = blue - mean[0];
+ tensorData[tensorIndex + 1] = green - mean[1];
+ tensorData[tensorIndex + 2] = red - mean[2];
+ }
+
+ return {
+ data: tensorData,
+ resizeRatio,
+ sourceWidth,
+ sourceHeight,
+ };
+}
+
+export function face_render(detections) {
+ const video = document.getElementById("face-video-preview");
+ const canvas = document.getElementById("face-video-output-canvas");
+ if (!video?.videoWidth || !video?.videoHeight || !canvas) {
+ return;
+ }
+
+ const context = canvas.getContext("2d");
+ if (!context) {
+ throw new Error("Unable to create face output canvas context.");
+ }
+
+ const width = video.videoWidth;
+ const height = video.videoHeight;
+ if (canvas.width !== width || canvas.height !== height) {
+ canvas.width = width;
+ canvas.height = height;
+ }
+
+ canvas.hidden = false;
+ context.drawImage(video, 0, 0, width, height);
+ context.lineWidth = 3;
+ context.font = "16px ui-monospace, monospace";
+
+ for (const entry of detections ?? []) {
+ const [x1, y1, x2, y2] = entry.box ?? [];
+ const left = Number(x1 ?? 0);
+ const top = Number(y1 ?? 0);
+ const right = Number(x2 ?? 0);
+ const bottom = Number(y2 ?? 0);
+ const boxWidth = Math.max(1, right - left);
+ const boxHeight = Math.max(1, bottom - top);
+ context.strokeStyle = "#ef8f35";
+ context.strokeRect(left, top, boxWidth, boxHeight);
+
+ const label = `${entry.label ?? "face"} ${((entry.score ?? 0) * 100).toFixed(1)}%`;
+ const textWidth = context.measureText(label).width + 10;
+ context.fillStyle = "#182028";
+ context.fillRect(left, Math.max(0, top - 24), textWidth, 22);
+ context.fillStyle = "#fffdfa";
+ context.fillText(label, left + 5, Math.max(16, top - 8));
+ }
+}
+"##)]
+extern "C" {
+ #[wasm_bindgen(catch)]
+    async fn face_attach_stream(stream: JsValue) -> Result<JsValue, JsValue>;
+ #[wasm_bindgen]
+ fn face_detach_stream();
+ #[wasm_bindgen]
+ fn face_set_status(message: &str);
+ #[wasm_bindgen]
+ fn face_log(message: &str);
+ #[wasm_bindgen(catch)]
+    fn face_capture_input_tensor() -> Result<JsValue, JsValue>;
+ #[wasm_bindgen(catch)]
+ fn face_render(detections: &JsValue) -> Result<(), JsValue>;
+}
+
+#[derive(Clone)]
+struct Detection {
+ label: String,
+ class_index: i32,
+ score: f64,
+ box_coords: [f64; 4],
+}
+
+#[derive(Clone)]
+struct DetectionSummary {
+    detections: Vec<Detection>,
+ confidence: f64,
+ processed_at: String,
+}
+
+struct FaceDetectionRuntime {
+ client: WsClient,
+ capture: VideoCapture,
+ inference_interval_id: i32,
+ render_interval_id: i32,
+    _inference_closure: Closure<dyn FnMut()>,
+    _render_closure: Closure<dyn FnMut()>,
+}
+
+thread_local! {
+ static FACE_RUNTIME: RefCell