From d921a512c3954c12d44fd2ef8e53333c0a10d605 Mon Sep 17 00:00:00 2001 From: Pierre Tenedero Date: Wed, 8 Apr 2026 17:44:33 +0800 Subject: [PATCH] Add face detection feature on video input --- README.md | 10 + services/ws-server/src/main.rs | 21 + services/ws-server/static/app.js | 1079 ++++++++++++++++++++++++++ services/ws-server/static/index.html | 9 + 4 files changed, 1119 insertions(+) diff --git a/README.md b/README.md index 57531f3..7f8eedd 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,18 @@ mise run ws-e2e-chrome ## Run ws agent in browser +### HAR model setup + Download the onnx from https://modelnova.ai/models/details/human-activity-recognition , and save it as `services/ws-server/static/models/human_activity_recognition.onnx` +### Face detection setup + +Download the onnx from https://huggingface.co/amd/retinaface and save it in +`services/ws-server/static/models/` and rename the file to `video_cv.onnx`. + +### Build and run the agent + ```bash mise run build-ws-wasm-agent mise run ws-server @@ -35,6 +44,7 @@ which will normally be something like 192.168.1.x. Then on your phone, open Chrome and type in https://192.168.1.x:8433/ Click "Load HAR model" and then "Start sensors". +For webcam inference, click "Load video CV model" and then "Start video". 
## Grant diff --git a/services/ws-server/src/main.rs b/services/ws-server/src/main.rs index 4e78025..3538a1d 100644 --- a/services/ws-server/src/main.rs +++ b/services/ws-server/src/main.rs @@ -177,6 +177,27 @@ impl StreamHandler> for WebSocketActor { action, details, } => { + if capability == "video_cv" && action == "inference" { + let detected_class = details + .get("detected_class") + .and_then(|value| value.as_str()) + .unwrap_or("unknown"); + let confidence = details + .get("confidence") + .and_then(|value| value.as_f64()) + .unwrap_or_default(); + let processed_at = details + .get("processed_at") + .and_then(|value| value.as_str()) + .unwrap_or("unknown"); + info!( + "Video inference received from {}: class={} confidence={:.4} processed_at={}", + self.current_agent_id(), + detected_class, + confidence, + processed_at + ); + } info!( "Client event from {}: capability={} action={} details={}", self.current_agent_id(), diff --git a/services/ws-server/static/app.js b/services/ws-server/static/app.js index 3caa152..43409ac 100644 --- a/services/ws-server/static/app.js +++ b/services/ws-server/static/app.js @@ -24,12 +24,16 @@ const speechButton = document.getElementById("speech-button"); const nfcButton = document.getElementById("nfc-button"); const sensorsButton = document.getElementById("sensors-button"); const harButton = document.getElementById("har-button"); +const videoModelButton = document.getElementById("video-model-button"); +const videoOutputButton = document.getElementById("video-output-button"); const harExportButton = document.getElementById("har-export-button"); const agentStatusEl = document.getElementById("agent-status"); const agentIdEl = document.getElementById("agent-id"); const sensorOutputEl = document.getElementById("sensor-output"); const harOutputEl = document.getElementById("har-output"); +const videoOutputEl = document.getElementById("video-output"); const videoPreview = document.getElementById("video-preview"); +const videoOutputCanvas 
= document.getElementById("video-output-canvas"); let microphone = null; let videoCapture = null; let bluetoothDevice = null; @@ -46,11 +50,36 @@ let harInferencePending = false; let lastInferenceAt = 0; let harSamplerId = null; let lastHarClassLabel = null; +let videoCvSession = null; +let videoCvInputName = null; +let videoCvOutputName = null; +let videoCvLoopId = null; +let videoCvInferencePending = false; +let lastVideoInferenceAt = 0; +let lastVideoCvLabel = null; +let videoCvCanvas = null; +let videoCvContext = null; +let videoOverlayContext = videoOutputCanvas.getContext("2d"); +let videoOutputVisible = false; +let videoRenderFrameId = null; +let lastVideoInferenceSummary = null; let gravityEstimate = { x: 0, y: 0, z: 0 }; let sendClientEvent = () => {}; const HAR_SEQUENCE_LENGTH = 512; const HAR_FEATURE_COUNT = 9; const HAR_SAMPLE_INTERVAL_MS = 20; +const VIDEO_INFERENCE_INTERVAL_MS = 750; +const VIDEO_RENDER_SCORE_THRESHOLD = 0.35; +const VIDEO_MODEL_PATH = "/static/models/video_cv.onnx"; +const VIDEO_FALLBACK_INPUT_SIZE = 224; +const RETINAFACE_INPUT_HEIGHT = 608; +const RETINAFACE_INPUT_WIDTH = 640; +const RETINAFACE_CONFIDENCE_THRESHOLD = 0.75; +const RETINAFACE_NMS_THRESHOLD = 0.4; +const RETINAFACE_VARIANCES = [0.1, 0.2]; +const RETINAFACE_MIN_SIZES = [[16, 32], [64, 128], [256, 512]]; +const RETINAFACE_STEPS = [8, 16, 32]; +const RETINAFACE_MEAN_BGR = [104, 117, 123]; const STANDARD_GRAVITY = 9.80665; const GRAVITY_FILTER_ALPHA = 0.8; const HAR_CLASS_LABELS = [ @@ -224,6 +253,10 @@ const setHarOutput = (lines) => { harOutputEl.value = Array.isArray(lines) ? lines.join("\n") : String(lines); }; +const setVideoOutput = (lines) => { + videoOutputEl.value = Array.isArray(lines) ? lines.join("\n") : String(lines); +}; + const updateHarStatus = (extraLines = []) => { const lines = [ `model: ${harSession ? 
"loaded" : "not loaded"}`, @@ -237,6 +270,27 @@ const updateHarStatus = (extraLines = []) => { setHarOutput(lines.concat("", extraLines)); }; +const updateVideoStatus = (extraLines = []) => { + const inputMetadata = videoCvInputName + ? videoCvSession?.inputMetadata?.[videoCvInputName] + : null; + const outputMetadata = videoCvOutputName + ? videoCvSession?.outputMetadata?.[videoCvOutputName] + : null; + const lines = [ + `model: ${videoCvSession ? "loaded" : "not loaded"}`, + `video: ${videoCapture ? "active" : "inactive"}`, + `input: ${videoCvInputName ?? "n/a"}`, + `output: ${videoCvOutputName ?? "n/a"}`, + `input dims: ${JSON.stringify(inputMetadata?.dimensions ?? [])}`, + `output dims: ${JSON.stringify(outputMetadata?.dimensions ?? [])}`, + `loop: ${videoCvLoopId === null ? "idle" : "running"}`, + `display: ${videoOutputVisible ? "visible" : "hidden"}`, + `mode: ${lastVideoInferenceSummary?.mode ?? "unknown"}`, + ]; + setVideoOutput(lines.concat("", extraLines)); +}; + const getFeatureVector = () => { const totalAcceleration = motionState?.accelerationIncludingGravity ?? 
{ x: 0, y: 0, z: 0 }; const bodyAcceleration = { @@ -448,11 +502,981 @@ const requestSensorPermission = async (permissionTarget) => { return permissionTarget.requestPermission(); }; +const getTopK = (values, limit = 3) => { + return values + .map((value, index) => ({ value, index })) + .sort((left, right) => right.value - left.value) + .slice(0, limit); +}; + +const ensureVideoCvCanvas = () => { + if (!videoCvCanvas) { + videoCvCanvas = document.createElement("canvas"); + videoCvContext = videoCvCanvas.getContext("2d", { willReadFrequently: true }); + } + + if (!videoCvContext) { + throw new Error("Unable to create 2D canvas context for video preprocessing."); + } + + return videoCvContext; +}; + +const ensureVideoOverlayContext = () => { + if (!videoOverlayContext) { + videoOverlayContext = videoOutputCanvas.getContext("2d"); + } + + if (!videoOverlayContext) { + throw new Error("Unable to create video output canvas context."); + } + + return videoOverlayContext; +}; + +const isRetinaFaceSession = (session = videoCvSession) => { + if (!session) { + return false; + } + + const inputNames = Array.isArray(session.inputNames) ? session.inputNames : []; + const outputNames = Array.isArray(session.outputNames) ? session.outputNames : []; + const allNames = inputNames.concat(outputNames).map((name) => String(name).toLowerCase()); + if (allNames.some((name) => name.includes("retinaface"))) { + return true; + } + + return outputNames.length === 3 && inputNames.length === 1; +}; + +const selectVideoModelInputName = (session) => { + const inputNames = Array.isArray(session?.inputNames) ? session.inputNames : []; + if (!inputNames.length) { + return null; + } + + const ranked = inputNames + .map((name) => { + const metadata = session?.inputMetadata?.[name]; + const dimensions = Array.isArray(metadata?.dimensions) ? 
metadata.dimensions : []; + const normalizedName = String(name).toLowerCase(); + let score = 0; + + if (dimensions.length === 4) { + score += 100; + } else if (dimensions.length === 3) { + score += 40; + } + + if ( + normalizedName.includes("pixel") + || normalizedName.includes("image") + || normalizedName.includes("images") + || normalizedName.includes("input") + ) { + score += 25; + } + + if (normalizedName.includes("mask") || normalizedName.includes("token")) { + score -= 50; + } + + return { name, score }; + }) + .sort((left, right) => right.score - left.score); + + return ranked[0]?.name ?? inputNames[0]; +}; + +const selectVideoModelOutputName = (session) => { + const outputNames = Array.isArray(session?.outputNames) ? session.outputNames : []; + if (!outputNames.length) { + return null; + } + + const ranked = outputNames + .map((name) => { + const normalizedName = String(name).toLowerCase(); + let score = 0; + if (normalizedName.includes("box")) { + score += 100; + } + if (normalizedName.includes("logit") || normalizedName.includes("score")) { + score += 40; + } + return { name, score }; + }) + .sort((left, right) => right.score - left.score); + + return ranked[0]?.name ?? outputNames[0]; +}; + +const resolveVideoModelLayout = () => { + if (!videoCvSession || !videoCvInputName) { + throw new Error("Video CV model is not loaded."); + } + + if (isRetinaFaceSession(videoCvSession)) { + return { + dataType: "float32", + channels: 3, + width: RETINAFACE_INPUT_WIDTH, + height: RETINAFACE_INPUT_HEIGHT, + tensorDimensions: [1, RETINAFACE_INPUT_HEIGHT, RETINAFACE_INPUT_WIDTH, 3], + layout: "nhwc", + profile: "retinaface", + }; + } + + const metadata = videoCvSession.inputMetadata?.[videoCvInputName]; + const dataType = metadata?.type ?? "float32"; + if (dataType !== "float32" && dataType !== "uint8") { + throw new Error(`Unsupported video model input type: ${dataType}`); + } + + const rawDimensions = Array.isArray(metadata?.dimensions) + ? 
metadata.dimensions + : []; + const dimensions = rawDimensions.length === 4 + ? rawDimensions + : rawDimensions.length === 3 + ? [1, ...rawDimensions] + : [1, 3, VIDEO_FALLBACK_INPUT_SIZE, VIDEO_FALLBACK_INPUT_SIZE]; + + const resolved = dimensions.map((dimension, index) => { + if (typeof dimension === "number" && Number.isFinite(dimension) && dimension > 0) { + return dimension; + } + + if (index === 0) { + return 1; + } + + if (index === 1 && dimensions.length === 4) { + const inputName = String(videoCvInputName).toLowerCase(); + if (!inputName.includes("nhwc")) { + return 3; + } + } + + return VIDEO_FALLBACK_INPUT_SIZE; + }); + + const secondDimension = resolved[1]; + const lastDimension = resolved[3]; + const inputName = String(videoCvInputName).toLowerCase(); + const channelsFirst = inputName.includes("nhwc") + ? false + : secondDimension === 1 + || secondDimension === 3 + || ((lastDimension !== 1 && lastDimension !== 3) && !inputName.includes("image_embeddings")); + if (channelsFirst) { + const [, channels, height, width] = resolved; + if (channels !== 1 && channels !== 3) { + throw new Error(`Unsupported channel count for NCHW image input: ${channels}`); + } + + return { + dataType, + channels, + width, + height, + tensorDimensions: [1, channels, height, width], + layout: "nchw", + profile: "generic", + }; + } + + const [, height, width, channels] = resolved; + if (channels !== 1 && channels !== 3) { + throw new Error(`Unsupported channel count for NHWC image input: ${channels}`); + } + + return { + dataType, + channels, + width, + height, + tensorDimensions: [1, height, width, channels], + layout: "nhwc", + profile: "generic", + }; +}; + +const buildVideoInputTensor = () => { + if (!videoCapture || !videoCvSession || !videoCvInputName) { + throw new Error("Video capture or model session is unavailable."); + } + + if (!videoPreview.videoWidth || !videoPreview.videoHeight) { + throw new Error("Video stream is not ready yet."); + } + + const { + dataType, + 
channels, + width, + height, + tensorDimensions, + layout, + profile, + } = resolveVideoModelLayout(); + const context = ensureVideoCvCanvas(); + videoCvCanvas.width = width; + videoCvCanvas.height = height; + let resizeRatio = 1; + if (profile === "retinaface") { + const sourceWidth = videoPreview.videoWidth; + const sourceHeight = videoPreview.videoHeight; + const targetRatio = height / width; + if (sourceHeight / sourceWidth <= targetRatio) { + resizeRatio = width / sourceWidth; + } else { + resizeRatio = height / sourceHeight; + } + + const resizedWidth = Math.max(1, Math.min(width, Math.round(sourceWidth * resizeRatio))); + const resizedHeight = Math.max(1, Math.min(height, Math.round(sourceHeight * resizeRatio))); + context.clearRect(0, 0, width, height); + context.drawImage(videoPreview, 0, 0, resizedWidth, resizedHeight); + } else { + context.drawImage(videoPreview, 0, 0, width, height); + } + + const rgba = context.getImageData(0, 0, width, height).data; + const elementCount = width * height * channels; + const tensorData = dataType === "uint8" + ? new Uint8Array(elementCount) + : new Float32Array(elementCount); + + for (let pixelIndex = 0; pixelIndex < width * height; pixelIndex += 1) { + const rgbaIndex = pixelIndex * 4; + const red = rgba[rgbaIndex]; + const green = rgba[rgbaIndex + 1]; + const blue = rgba[rgbaIndex + 2]; + + if (profile === "retinaface") { + const tensorIndex = pixelIndex * channels; + tensorData[tensorIndex] = blue - RETINAFACE_MEAN_BGR[0]; + tensorData[tensorIndex + 1] = green - RETINAFACE_MEAN_BGR[1]; + tensorData[tensorIndex + 2] = red - RETINAFACE_MEAN_BGR[2]; + continue; + } + + if (channels === 1) { + const grayscale = Math.round(0.299 * red + 0.587 * green + 0.114 * blue); + tensorData[pixelIndex] = dataType === "uint8" ? 
grayscale : grayscale / 255; + continue; + } + + if (layout === "nchw") { + const planeSize = width * height; + if (dataType === "uint8") { + tensorData[pixelIndex] = red; + tensorData[pixelIndex + planeSize] = green; + tensorData[pixelIndex + 2 * planeSize] = blue; + } else { + tensorData[pixelIndex] = red / 255; + tensorData[pixelIndex + planeSize] = green / 255; + tensorData[pixelIndex + 2 * planeSize] = blue / 255; + } + continue; + } + + const tensorIndex = pixelIndex * channels; + if (dataType === "uint8") { + tensorData[tensorIndex] = red; + tensorData[tensorIndex + 1] = green; + tensorData[tensorIndex + 2] = blue; + } else { + tensorData[tensorIndex] = red / 255; + tensorData[tensorIndex + 1] = green / 255; + tensorData[tensorIndex + 2] = blue / 255; + } + } + + return { + tensor: new window.ort.Tensor(dataType, tensorData, tensorDimensions), + preprocess: { + profile, + inputWidth: width, + inputHeight: height, + resizeRatio, + sourceWidth: videoPreview.videoWidth, + sourceHeight: videoPreview.videoHeight, + }, + }; +}; + +const looksLikeBoxes = (tensor) => { + if (!tensor?.dims || !tensor?.data) { + return false; + } + + const dims = tensor.dims.filter((dimension) => Number.isFinite(dimension)); + const values = Array.from(tensor.data ?? []); + const lastDimension = dims[dims.length - 1]; + return values.length >= 4 && (lastDimension === 4 || lastDimension === 6 || lastDimension === 7); +}; + +const flattenFinite = (tensor) => { + return Array.from(tensor?.data ?? 
[]).map(Number).filter((value) => Number.isFinite(value)); +}; + +const normalizeBox = (boxValues, format = "xyxy") => { + if (boxValues.length < 4) { + return null; + } + + let x1; + let y1; + let x2; + let y2; + if (format === "cxcywh") { + const [centerX, centerY, width, height] = boxValues; + x1 = centerX - width / 2; + y1 = centerY - height / 2; + x2 = centerX + width / 2; + y2 = centerY + height / 2; + } else { + [x1, y1, x2, y2] = boxValues; + } + + if (x2 < x1) { + [x1, x2] = [x2, x1]; + } + if (y2 < y1) { + [y1, y2] = [y2, y1]; + } + + const normalized = [x1, y1, x2, y2].map((value) => ( + value > 1.5 ? value : Math.max(0, Math.min(1, value)) + )); + + return normalized; +}; + +const clamp = (value, min, max) => Math.max(min, Math.min(max, value)); + +const buildRetinaFacePriors = (imageHeight, imageWidth) => { + const priors = []; + RETINAFACE_STEPS.forEach((step, index) => { + const featureMapHeight = Math.ceil(imageHeight / step); + const featureMapWidth = Math.ceil(imageWidth / step); + const minSizes = RETINAFACE_MIN_SIZES[index]; + + for (let row = 0; row < featureMapHeight; row += 1) { + for (let column = 0; column < featureMapWidth; column += 1) { + minSizes.forEach((minSize) => { + priors.push([ + ((column + 0.5) * step) / imageWidth, + ((row + 0.5) * step) / imageHeight, + minSize / imageWidth, + minSize / imageHeight, + ]); + }); + } + } + }); + return priors; +}; + +const decodeRetinaFaceBox = (loc, prior) => { + const centerX = prior[0] + loc[0] * RETINAFACE_VARIANCES[0] * prior[2]; + const centerY = prior[1] + loc[1] * RETINAFACE_VARIANCES[0] * prior[3]; + const width = prior[2] * Math.exp(loc[2] * RETINAFACE_VARIANCES[1]); + const height = prior[3] * Math.exp(loc[3] * RETINAFACE_VARIANCES[1]); + return [ + centerX - width / 2, + centerY - height / 2, + centerX + width / 2, + centerY + height / 2, + ]; +}; + +const computeIoU = (left, right) => { + const x1 = Math.max(left.box[0], right.box[0]); + const y1 = Math.max(left.box[1], 
right.box[1]); + const x2 = Math.min(left.box[2], right.box[2]); + const y2 = Math.min(left.box[3], right.box[3]); + const width = Math.max(0, x2 - x1 + 1); + const height = Math.max(0, y2 - y1 + 1); + const intersection = width * height; + const leftArea = Math.max(0, left.box[2] - left.box[0] + 1) * Math.max(0, left.box[3] - left.box[1] + 1); + const rightArea = Math.max(0, right.box[2] - right.box[0] + 1) * Math.max(0, right.box[3] - right.box[1] + 1); + return intersection / Math.max(1e-6, leftArea + rightArea - intersection); +}; + +const applyNms = (detections, threshold) => { + const sorted = [...detections].sort((left, right) => right.score - left.score); + const kept = []; + + sorted.forEach((candidate) => { + if (kept.every((accepted) => computeIoU(candidate, accepted) <= threshold)) { + kept.push(candidate); + } + }); + + return kept; +}; + +const decodeRetinaFaceOutputs = (outputs, preprocess) => { + if (!preprocess || preprocess.profile !== "retinaface") { + return null; + } + + const outputNames = Array.isArray(videoCvSession?.outputNames) ? videoCvSession.outputNames : []; + if (outputNames.length < 3) { + return null; + } + + const locTensor = outputs[outputNames[0]]; + const confTensor = outputs[outputNames[1]]; + const landmTensor = outputs[outputNames[2]]; + if (!locTensor || !confTensor || !landmTensor) { + return null; + } + + const locValues = flattenFinite(locTensor); + const confValues = flattenFinite(confTensor); + const landmValues = flattenFinite(landmTensor); + const priorCount = locValues.length / 4; + if (priorCount <= 0 || confValues.length / 2 !== priorCount || landmValues.length / 10 !== priorCount) { + return null; + } + + const priors = buildRetinaFacePriors(preprocess.inputHeight, preprocess.inputWidth); + if (priors.length !== priorCount) { + return null; + } + + const detections = []; + for (let index = 0; index < priorCount; index += 1) { + const score = softmax(confValues.slice(index * 2, index * 2 + 2))[1] ?? 
0; + if (score < RETINAFACE_CONFIDENCE_THRESHOLD) { + continue; + } + + const decoded = decodeRetinaFaceBox( + locValues.slice(index * 4, index * 4 + 4), + priors[index], + ); + const scaledBox = [ + clamp((decoded[0] * preprocess.inputWidth) / preprocess.resizeRatio, 0, preprocess.sourceWidth), + clamp((decoded[1] * preprocess.inputHeight) / preprocess.resizeRatio, 0, preprocess.sourceHeight), + clamp((decoded[2] * preprocess.inputWidth) / preprocess.resizeRatio, 0, preprocess.sourceWidth), + clamp((decoded[3] * preprocess.inputHeight) / preprocess.resizeRatio, 0, preprocess.sourceHeight), + ]; + + detections.push({ + label: "face", + class_index: 0, + score, + box: scaledBox, + }); + } + + const filtered = applyNms(detections, RETINAFACE_NMS_THRESHOLD); + if (!filtered.length) { + return { + mode: "detection", + detections: [], + detected_class: "no_detection", + class_index: -1, + confidence: 0, + probabilities: [], + top_classes: [], + }; + } + + const best = filtered[0]; + return { + mode: "detection", + detections: filtered, + detected_class: best.label, + class_index: best.class_index, + confidence: best.score, + probabilities: filtered.map((entry) => entry.score), + top_classes: filtered.slice(0, 3).map((entry) => ({ + label: entry.label, + index: entry.class_index, + probability: entry.score, + })), + }; +}; + +const findDetectionTensor = (entries, patterns, predicate = () => true) => { + return entries.find(([name, tensor]) => { + const normalizedName = String(name).toLowerCase(); + return patterns.some((pattern) => pattern.test(normalizedName)) && predicate(tensor); + }) ?? null; +}; + +const decodeHuggingFaceDetectionOutputs = (entries) => { + const boxesEntry = findDetectionTensor( + entries, + [/pred_boxes/, /boxes?/, /bbox/], + (tensor) => (Array.isArray(tensor?.dims) ? 
tensor.dims[tensor.dims.length - 1] : null) === 4, + ); + const logitsEntry = findDetectionTensor( + entries, + [/logits/, /scores?/, /class/], + (tensor) => (Array.isArray(tensor?.dims) ? tensor.dims[tensor.dims.length - 1] : 0) > 1, + ); + + if (!boxesEntry || !logitsEntry) { + return null; + } + + const [boxesName, boxesTensor] = boxesEntry; + const [, logitsTensor] = logitsEntry; + const rawBoxes = flattenFinite(boxesTensor); + const rawLogits = flattenFinite(logitsTensor); + const boxCount = Math.floor(rawBoxes.length / 4); + const classCount = boxCount > 0 ? Math.floor(rawLogits.length / boxCount) : 0; + if (boxCount <= 0 || classCount <= 1) { + return null; + } + + const usesCenterBoxes = /pred_boxes/.test(String(boxesName).toLowerCase()); + const detections = []; + for (let index = 0; index < boxCount; index += 1) { + const box = rawBoxes.slice(index * 4, index * 4 + 4); + const logits = rawLogits.slice(index * classCount, index * classCount + classCount); + const candidateLogits = logits.length > 1 ? logits.slice(0, -1) : logits; + const probabilities = softmax(candidateLogits); + const best = getTopK(probabilities, 1)[0]; + if (!best || best.value < VIDEO_RENDER_SCORE_THRESHOLD) { + continue; + } + + const normalizedBox = normalizeBox(box, usesCenterBoxes ? 
"cxcywh" : "xyxy"); + if (!normalizedBox) { + continue; + } + + detections.push({ + label: `class_${best.index}`, + class_index: best.index, + score: best.value, + box: normalizedBox, + }); + } + + if (!detections.length) { + return { + mode: "detection", + detections: [], + detected_class: "no_detection", + class_index: -1, + confidence: 0, + probabilities: [], + top_classes: [], + }; + } + + detections.sort((left, right) => right.score - left.score); + const best = detections[0]; + return { + mode: "detection", + detections, + detected_class: best.label, + class_index: best.class_index, + confidence: best.score, + probabilities: detections.map((entry) => entry.score), + top_classes: detections.slice(0, 3).map((entry) => ({ + label: entry.label, + index: entry.class_index, + probability: entry.score, + })), + }; +}; + +const decodeDetectionOutputs = (outputs) => { + const entries = Object.entries(outputs); + const huggingFaceSummary = decodeHuggingFaceDetectionOutputs(entries); + if (huggingFaceSummary) { + return huggingFaceSummary; + } + + const boxesEntry = entries.find(([, tensor]) => looksLikeBoxes(tensor)); + + if (!boxesEntry) { + return null; + } + + const [boxesName, boxesTensor] = boxesEntry; + const boxDims = Array.isArray(boxesTensor.dims) ? boxesTensor.dims : []; + const rawBoxes = flattenFinite(boxesTensor); + const boxWidth = boxDims[boxDims.length - 1] ?? 4; + const detectionCount = Math.floor(rawBoxes.length / boxWidth); + if (detectionCount <= 0) { + return null; + } + + const scoresEntry = entries.find(([name, tensor]) => + name !== boxesName && flattenFinite(tensor).length >= detectionCount + ); + const classEntry = entries.find(([name, tensor]) => + name !== boxesName && name !== scoresEntry?.[0] && flattenFinite(tensor).length >= detectionCount + ); + const detections = []; + const scoreValues = scoresEntry ? flattenFinite(scoresEntry[1]) : []; + const classValues = classEntry ? 
flattenFinite(classEntry[1]) : []; + + for (let index = 0; index < detectionCount; index += 1) { + const start = index * boxWidth; + const row = rawBoxes.slice(start, start + boxWidth); + const normalizedBox = normalizeBox(row); + if (!normalizedBox) { + continue; + } + + let score = Number(scoreValues[index] ?? row[4] ?? row[5] ?? 1); + if (!Number.isFinite(score)) { + score = 1; + } + + let classIndex = classValues[index]; + if (!Number.isFinite(classIndex)) { + classIndex = row.length >= 6 ? row[5] : row.length >= 7 ? row[6] : index; + } + + if (score < VIDEO_RENDER_SCORE_THRESHOLD) { + continue; + } + + detections.push({ + label: `class_${Math.round(classIndex)}`, + class_index: Math.round(classIndex), + score, + box: normalizedBox, + }); + } + + if (!detections.length) { + return { + mode: "detection", + detections: [], + detected_class: "no_detection", + class_index: -1, + confidence: 0, + probabilities: [], + top_classes: [], + }; + } + + detections.sort((left, right) => right.score - left.score); + const best = detections[0]; + return { + mode: "detection", + detections, + detected_class: best.label, + class_index: best.class_index, + confidence: best.score, + probabilities: detections.map((entry) => entry.score), + top_classes: detections.slice(0, 3).map((entry) => ({ + label: entry.label, + index: entry.class_index, + probability: entry.score, + })), + }; +}; + +const decodeClassificationOutputs = (output) => { + const values = Array.from(output?.data ?? 
[]); + if (values.length === 0) { + throw new Error("Video model returned an empty output tensor."); + } + + if (values.length === 1) { + return { + mode: "classification", + detections: [], + detected_class: "scalar_output", + class_index: 0, + confidence: Number(values[0]), + probabilities: values, + top_classes: [{ label: "scalar_output", index: 0, probability: Number(values[0]) }], + }; + } + + const probabilities = softmax(values); + const ranked = getTopK(probabilities, 3); + const best = ranked[0]; + + return { + mode: "classification", + detections: [], + detected_class: `class_${best.index}`, + class_index: best.index, + confidence: best.value, + probabilities, + top_classes: ranked.map(({ index, value }) => ({ + label: `class_${index}`, + index, + probability: value, + logit: values[index], + })), + }; +}; + +const summarizeVideoOutput = (outputMap, preprocess = null) => { + const retinaFaceSummary = decodeRetinaFaceOutputs(outputMap, preprocess); + if (retinaFaceSummary) { + return retinaFaceSummary; + } + + const detectionSummary = decodeDetectionOutputs(outputMap); + if (detectionSummary) { + return detectionSummary; + } + + const primaryOutput = outputMap[videoCvOutputName]; + const primaryValues = Array.from(primaryOutput?.data ?? 
[]); + if (primaryValues.length > 0 && primaryValues.length <= 4096) { + return decodeClassificationOutputs(primaryOutput); + } + + return { + mode: "passthrough", + detections: [], + detected_class: "unrecognized_output", + class_index: -1, + confidence: 0, + probabilities: [], + top_classes: [], + }; +}; + +const drawOverlayText = (context, lines) => { + if (!lines.length) { + return; + } + + context.font = "18px ui-monospace, monospace"; + const lineHeight = 24; + const width = Math.max(...lines.map((line) => context.measureText(line).width), 0) + 20; + const height = lines.length * lineHeight + 12; + context.fillStyle = "rgba(24, 32, 40, 0.72)"; + context.fillRect(12, 12, width, height); + context.fillStyle = "#fffdfa"; + lines.forEach((line, index) => { + context.fillText(line, 22, 36 + index * lineHeight); + }); +}; + +const renderVideoOutputFrame = () => { + videoRenderFrameId = null; + + if (!videoOutputVisible || !videoCapture || !videoPreview.videoWidth || !videoPreview.videoHeight) { + return; + } + + const context = ensureVideoOverlayContext(); + const width = videoPreview.videoWidth; + const height = videoPreview.videoHeight; + if (videoOutputCanvas.width !== width || videoOutputCanvas.height !== height) { + videoOutputCanvas.width = width; + videoOutputCanvas.height = height; + } + + context.drawImage(videoPreview, 0, 0, width, height); + + if (lastVideoInferenceSummary?.mode === "detection") { + context.lineWidth = 3; + context.font = "16px ui-monospace, monospace"; + lastVideoInferenceSummary.detections.forEach((entry) => { + const [x1, y1, x2, y2] = entry.box; + const left = x1 <= 1 ? x1 * width : x1; + const top = y1 <= 1 ? y1 * height : y1; + const right = x2 <= 1 ? x2 * width : x2; + const bottom = y2 <= 1 ? 
y2 * height : y2; + const boxWidth = Math.max(1, right - left); + const boxHeight = Math.max(1, bottom - top); + + context.strokeStyle = "#ef8f35"; + context.strokeRect(left, top, boxWidth, boxHeight); + + const label = `${entry.label} ${(entry.score * 100).toFixed(1)}%`; + const textWidth = context.measureText(label).width + 10; + context.fillStyle = "#182028"; + context.fillRect(left, Math.max(0, top - 24), textWidth, 22); + context.fillStyle = "#fffdfa"; + context.fillText(label, left + 5, Math.max(16, top - 8)); + }); + } else if (lastVideoInferenceSummary?.mode === "classification") { + drawOverlayText(context, [ + `classification: ${lastVideoInferenceSummary.detected_class}`, + `confidence: ${(lastVideoInferenceSummary.confidence * 100).toFixed(1)}%`, + ]); + } else if (lastVideoInferenceSummary?.mode === "passthrough") { + drawOverlayText(context, [ + "output mode: passthrough", + "model output not recognized as detection or classification", + ]); + } + + videoRenderFrameId = window.requestAnimationFrame(renderVideoOutputFrame); +}; + +const syncVideoOutputView = () => { + videoOutputCanvas.hidden = !videoOutputVisible || !videoCapture; + videoOutputButton.textContent = videoOutputVisible ? 
"Hide video output" : "Show video output"; + + if (!videoOutputVisible || !videoCapture) { + if (videoRenderFrameId !== null) { + window.cancelAnimationFrame(videoRenderFrameId); + videoRenderFrameId = null; + } + updateVideoStatus(); + return; + } + + if (videoRenderFrameId === null) { + videoRenderFrameId = window.requestAnimationFrame(renderVideoOutputFrame); + } + updateVideoStatus(); +}; + +const stopVideoCvLoop = () => { + if (videoCvLoopId !== null) { + window.clearInterval(videoCvLoopId); + videoCvLoopId = null; + } + lastVideoCvLabel = null; + updateVideoStatus(); +}; + +const inferVideoPrediction = async () => { + if ( + !videoCapture + || !videoCvSession + || !videoCvInputName + || !videoCvOutputName + || videoCvInferencePending + ) { + return; + } + + const now = Date.now(); + if (now - lastVideoInferenceAt < VIDEO_INFERENCE_INTERVAL_MS) { + return; + } + + videoCvInferencePending = true; + lastVideoInferenceAt = now; + + try { + const { tensor: input, preprocess } = buildVideoInputTensor(); + const outputMap = await videoCvSession.run({ [videoCvInputName]: input }); + const output = outputMap[videoCvOutputName]; + const summary = summarizeVideoOutput(outputMap, preprocess); + const labelChanged = summary.detected_class !== lastVideoCvLabel; + lastVideoCvLabel = summary.detected_class; + lastVideoInferenceSummary = summary; + + updateVideoStatus([ + `output mode: ${summary.mode}`, + `prediction: ${summary.detected_class}`, + `confidence: ${summary.confidence.toFixed(4)}`, + ...( + summary.mode === "detection" + ? [ + `detections: ${summary.detections.length}`, + ...summary.detections.slice(0, 3).map( + (entry) => + `${entry.label}: score=${entry.score.toFixed(4)} box=${ + entry.box.map((value) => value.toFixed(3)).join(",") + }`, + ), + ] + : [ + "top classes:", + ...summary.top_classes.map( + (entry) => + `${entry.label}: p=${entry.probability.toFixed(4)} logit=${ + Number(entry.logit ?? 
entry.probability).toFixed(4) + }`, + ), + ] + ), + `frame: ${videoPreview.videoWidth}x${videoPreview.videoHeight}`, + `processed at: ${new Date().toLocaleTimeString()}`, + ]); + syncVideoOutputView(); + + sendClientEvent("video_cv", "inference", { + mode: summary.mode, + detected_class: summary.detected_class, + class_index: summary.class_index, + confidence: summary.confidence, + probabilities: summary.probabilities, + top_classes: summary.top_classes, + detections: summary.detections, + changed: labelChanged, + processed_at: new Date().toISOString(), + model_path: VIDEO_MODEL_PATH, + input_name: videoCvInputName, + output_name: videoCvOutputName, + input_dimensions: videoCvSession.inputMetadata?.[videoCvInputName]?.dimensions ?? [], + output_dimensions: Array.isArray(output?.dims) ? output.dims : [], + source_resolution: { + width: videoPreview.videoWidth, + height: videoPreview.videoHeight, + }, + }); + } catch (error) { + lastVideoInferenceSummary = { + mode: "passthrough", + detections: [], + detected_class: "inference_error", + class_index: -1, + confidence: 0, + probabilities: [], + top_classes: [], + }; + updateVideoStatus([ + `inference error: ${error instanceof Error ? error.message : String(error)}`, + ]); + console.error(error); + } finally { + videoCvInferencePending = false; + } +}; + +const syncVideoCvLoop = () => { + if (videoCapture && videoCvSession) { + if (videoCvLoopId === null) { + videoCvLoopId = window.setInterval(() => { + void inferVideoPrediction(); + }, VIDEO_INFERENCE_INTERVAL_MS); + } + updateVideoStatus([ + "browser-side webcam inference active", + "results are sent to the backend over the websocket.", + ]); + return; + } + + stopVideoCvLoop(); + lastVideoInferenceSummary = null; + updateVideoStatus([ + videoCvSession + ? "model loaded; start video capture to begin inference." 
+ : `model file: ${VIDEO_MODEL_PATH}`, + ]); +}; + renderSensorOutput(); updateHarStatus([ "local-only inference path", "model file: /static/models/human_activity_recognition.onnx", ]); +updateVideoStatus([ + `model file: ${VIDEO_MODEL_PATH}`, + "load the model, then start video capture to process frames in-browser.", +]); harExportButton.addEventListener("click", () => { try { @@ -570,6 +1594,8 @@ try { videoPreview.hidden = true; videoButton.textContent = "Start video"; delete window.videoCapture; + syncVideoCvLoop(); + syncVideoOutputView(); append("video stopped"); sendClientEvent("video", "stopped", { track_count: 0 }); return; @@ -581,6 +1607,8 @@ try { videoButton.textContent = "Stop video"; append(`video granted: ${videoCapture.trackCount()} video track(s)`); window.videoCapture = videoCapture; + syncVideoCvLoop(); + syncVideoOutputView(); sendClientEvent("video", "started", { track_count: videoCapture.trackCount(), }); @@ -882,6 +1910,57 @@ try { } }); + videoModelButton.addEventListener("click", async () => { + try { + if (!window.ort) { + throw new Error("onnxruntime-web did not load."); + } + + configureOnnxRuntimeWasm(); + + videoModelButton.disabled = true; + videoModelButton.textContent = "Loading video model..."; + updateVideoStatus(["loading model..."]); + + videoCvSession = await window.ort.InferenceSession.create( + VIDEO_MODEL_PATH, + { + executionProviders: ["wasm"], + }, + ); + + videoCvInputName = selectVideoModelInputName(videoCvSession); + videoCvOutputName = selectVideoModelOutputName(videoCvSession); + lastVideoCvLabel = null; + lastVideoInferenceSummary = null; + append( + `video cv model loaded: input=${videoCvInputName} output=${videoCvOutputName} input_dims=${ + JSON.stringify(videoCvSession.inputMetadata?.[videoCvInputName]?.dimensions ?? 
[]) + }`, + ); + syncVideoCvLoop(); + } catch (error) { + videoCvSession = null; + videoCvInputName = null; + videoCvOutputName = null; + stopVideoCvLoop(); + lastVideoInferenceSummary = null; + updateVideoStatus([ + `model load error: ${error instanceof Error ? error.message : String(error)}`, + ]); + append(`video cv error: ${error instanceof Error ? error.message : String(error)}`); + console.error(error); + } finally { + videoModelButton.disabled = false; + videoModelButton.textContent = videoCvSession ? "Reload video CV model" : "Load video CV model"; + } + }); + + videoOutputButton.addEventListener("click", () => { + videoOutputVisible = !videoOutputVisible; + syncVideoOutputView(); + }); + window.client = client; window.sendAlive = () => client.send_alive(); } catch (error) { diff --git a/services/ws-server/static/index.html b/services/ws-server/static/index.html index 630db76..2ccf512 100644 --- a/services/ws-server/static/index.html +++ b/services/ws-server/static/index.html @@ -138,14 +138,23 @@

WASM web agent

+        <button id="video-model-button" type="button">Load video CV model</button>
+        <button id="video-output-button" type="button">Show video output</button>

+      <pre id="video-output"></pre>
+      <canvas id="video-output-canvas" hidden></canvas>
Booting…