FluidInference · Alex-Wengg · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
@@ -240,6 +240,35 @@ Peak memory usage (process-wide): 1.503 GB
 Model is nearly identical to the base model in terms of quality, performance wise we see an up to ~3.5x improvement compared to the silero Pytorch VAD model with the 256ms batch model (8 chunks of 32ms)
 
 ![VAD/speed.png](VAD/speed.png)
+
+### FSMN-VAD (`fsmn-vad-segment`)
+
+CoreML FSMN-VAD (FunASR, ~5.2M), an alternative to silero-vad. Model: [FluidInference/fsmn-vad-coreml](https://huggingface.co/FluidInference/fsmn-vad-coreml). 2-stage: fbank80+LFR preprocessor (fp32/CPU) → FSMN scorer (fp16/ANE, enumerated buckets) → host decision (port of FunASR `FsmnVADStreaming`). Hardware: Apple M5 Pro.
+
+Evaluated on the **mini50** labeled set via the standard `vad-benchmark` harness (per-clip speech/non-speech), same metric as the silero baseline:
+
+| Backend | Accuracy | Precision | Recall | F1 | RTFx |
+|---------|----------|-----------|--------|----|------|
+| silero (baseline) | 82.0% | 73.5% | 100% | 84.7% | 1408× |
+| **FSMN-VAD** | **98.0%** | **96.2%** | 100% | **98.0%** | 640× |
+
+FSMN-VAD is far more precise (96.2% vs 73.5%) at the same 100% recall — many fewer false speech detections — at ~640× real-time. Fidelity vs FunASR's own segments: frame F1 97.4%, boundaries within ~50 ms (`vad_bench.py` in the conversion repo).
+
+Full [FluidInference/musan](https://huggingface.co/datasets/FluidInference/musan) noise set (774 noise clips) — noise rejection / specificity (correctly classified non-speech):
+
+| Backend | Noise rejected (specificity) | False-positive rate | RTFx |
+|---------|------------------------------|---------------------|------|
+| silero | 69.8% | 30.2% | 1341× |
+| **FSMN-VAD** | **81.9%** | **18.1%** | 571× |
+
+On the full MUSAN noise set FSMN-VAD rejects 12 pp more noise as non-speech (18% vs 30% false positives) — consistently more precise than silero on both the balanced (mini50) and noise-heavy (full MUSAN) evaluations.
+
+Long audio is processed in ~30 s chunks (the FSMN's dilated conv needs fixed shapes; RangeDim is rejected by the ANE/BNNS compiler).
+
+```bash
+swift run -c release fluidaudiocli vad-benchmark --dataset mini50 --backend fsmn
+swift run -c release fluidaudiocli fsmn-vad-segment audio.wav
+```
+
 ![VAD/correlation.png](VAD/correlation.png)
 
 Dataset: https://github.com/Lab41/VOiCES-subset

diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift
@@ -12,6 +12,8 @@ public enum Repo: String, CaseIterable, Sendable {
     /// 3-stage: fp32 CPU preprocessor (waveform→560-d LFR feats) + fp16 ANE
     /// encoder+CTC (+ fp32 fallback) + host greedy-CTC decode. See ASR/SenseVoice.
     case senseVoiceSmall = "FluidInference/sensevoice-small-coreml"
+    /// FSMN-VAD voice activity detection (FunASR). See VAD/Fsmn.
+    case fsmnVad = "FluidInference/fsmn-vad-coreml"
     /// Paraformer-large (zh) — non-autoregressive ASR: SANM encoder + CIF
     /// predictor (host-side integrate-and-fire) + parallel decoder. See ASR/Paraformer.
     case paraformerLargeZh = "FluidInference/paraformer-large-zh-coreml"
@@ -76,6 +78,8 @@ public enum Repo: String, CaseIterable, Sendable {
             return "parakeet-ctc-0.6b-zh-cn-coreml"
         case .senseVoiceSmall:
             return "sensevoice-small-coreml"
+        case .fsmnVad:
+            return "fsmn-vad-coreml"
         case .paraformerLargeZh:
             return "paraformer-large-zh-coreml"
         case .parakeetJa:
@@ -442,6 +446,23 @@ public enum ModelNames {
         ]
     }
 
+    /// File and model names for FSMN-VAD (2 CoreML stages + host decision).
+    ///   Preprocessor (fp32/CPU): waveform -> 400-d features (fbank80 + LFR m=5,n=1)
+    ///   FsmnVad (fp16/ANE): features -> [1,T,248] frame scores (col 0 = silence prob)
+    /// Plus `vad_config.json` (auto-fetched as a root file).
+    public enum FsmnVad {
+        /// Base name (no extension) of the feature-extraction stage.
+        public static let preprocessor = "FsmnVadPreprocessor"
+        /// Base name (no extension) of the frame-scoring stage.
+        public static let scorer = "FsmnVad"
+
+        /// Compiled-model bundle name of the preprocessor stage.
+        public static let preprocessorFile = "\(preprocessor).mlmodelc"
+        /// Compiled-model bundle name of the scorer stage.
+        public static let scorerFile = "\(scorer).mlmodelc"
+
+        /// Every compiled bundle listed as required for this model family.
+        public static let requiredModels: Set<String> = [preprocessorFile, scorerFile]
+    }
+
     /// Paraformer-large (zh) model names. 4 CoreML stages + host CIF:
     ///   Preprocessor (fp32/CPU): waveform -> 560-d LFR features
     ///   Encoder (fp16/ANE): SANM encoder (enumerated buckets)
@@ -1098,6 +1119,8 @@ public enum ModelNames {
             return ModelNames.CTCZhCn.requiredModels
         case .senseVoiceSmall:
             return ModelNames.SenseVoice.requiredModels
+        case .fsmnVad:
+            return ModelNames.FsmnVad.requiredModels
         case .paraformerLargeZh:
             return ModelNames.ParaformerZh.requiredModels
         case .parakeetJa:

diff --git a/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift b/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift
@@ -0,0 +1,163 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// A detected speech segment, in milliseconds.
+///
+/// `startMs` and `endMs` are offsets from the beginning of the analysed audio.
+public struct FsmnVadSegment: Sendable, Equatable {
+    /// Offset of the first speech frame, in milliseconds.
+    public let startMs: Int
+    /// Offset at which the segment ends, in milliseconds.
+    public let endMs: Int
+
+    /// Creates a segment from explicit bounds.
+    ///
+    /// Declared explicitly because the synthesized memberwise initializer of a
+    /// public struct is only `internal`, which would prevent code outside this
+    /// module (tests, mocks, post-processing) from building segments.
+    /// - Parameters:
+    ///   - startMs: Segment start, in milliseconds.
+    ///   - endMs: Segment end, in milliseconds.
+    public init(startMs: Int, endMs: Int) {
+        self.startMs = startMs
+        self.endMs = endMs
+    }
+}
+
+/// FSMN-VAD voice activity detection: audio -> speech segments.
+///
+/// Pipeline: waveform -> [Preprocessor fp32/CPU] -> 400-d features
+///   -> [FSMN fp16/ANE, enumerated buckets] -> per-frame scores (col 0 = silence prob)
+///   -> host decision (window-detector hysteresis + silence->endpoint) -> [start_ms, end_ms].
+///
+/// Audio longer than the largest bucket is processed in ~30 s chunks; the per-frame
+/// silence probabilities are concatenated and the decision runs once over all frames.
+/// Returned segments are in chronological order and do not overlap.
+public actor FsmnVadManager {
+
+    // Enumerated scorer buckets (post-LFR frames; matches the converted model).
+    private static let largestBucket = 3072
+    private static let buckets = [512, 1024, 2048, largestBucket]
+    private static let featureDim = 400
+    private static let waveformScale: Float = 32_768.0
+    private static let samplesPerFrame = 160  // one 10 ms frame at 16 kHz
+
+    // Decision params (derived from FunASR vad_opts; 10 ms frames).
+    private static let silenceThreshold: Float = 0.2  // GetFrameState: speech if silence_prob <= 0.2
+    private static let windowFrames = 20  // window_size_ms 200 / 10
+    private static let silToSpeech = 15  // sil_to_speech_time 150 / 10
+    private static let speechToSil = 15  // speech_to_sil_time 150 / 10
+    private static let maxEndSilenceFrames = 80  // max_end_silence_time 800 / 10
+    private static let lookbackFrames = 20  // lookback_time_start_point 200 / 10
+    private static let lookaheadFrames = 10  // lookahead_time_end_point 100 / 10
+    private static let maxSegmentFrames = 6000  // max_single_segment_time 60000 / 10
+    private static let frameMs = 10
+
+    private let models: FsmnVadModels
+    private static let logger = AppLogger(category: "FsmnVadManager")
+
+    /// Creates a manager around already-loaded models.
+    public init(models: FsmnVadModels) {
+        self.models = models
+    }
+
+    /// Downloads (if needed) and loads the models, then returns a ready manager.
+    /// - Parameter progressHandler: Optional download progress callback.
+    /// - Throws: Whatever `FsmnVadModels.downloadAndLoad` throws.
+    public static func load(progressHandler: DownloadUtils.ProgressHandler? = nil) async throws -> FsmnVadManager {
+        FsmnVadManager(models: try await FsmnVadModels.downloadAndLoad(progressHandler: progressHandler))
+    }
+
+    /// Detects speech segments in an audio file.
+    ///
+    /// The file is resampled to 16 kHz before detection.
+    /// - Parameter audioURL: Location of the audio file.
+    /// - Returns: Speech segments in milliseconds, in chronological order.
+    public func detect(audioURL: URL) throws -> [FsmnVadSegment] {
+        let audio = try autoreleasepool { () -> [Float] in
+            let converter = AudioConverter(sampleRate: 16_000)
+            return try converter.resampleAudioFile(audioURL)
+        }
+        return try detect(audio: audio)
+    }
+
+    /// Detects speech segments in raw samples.
+    /// - Parameter audio: Mono samples; the frame timing assumes 16 kHz. Empty input yields no segments.
+    /// - Returns: Speech segments in milliseconds, in chronological order.
+    public func detect(audio: [Float]) throws -> [FsmnVadSegment] {
+        let silence = try silenceProbabilities(audio: audio)
+        return decide(silence: silence)
+    }
+
+    // MARK: - Scoring (chunked)
+
+    /// Per-frame silence probability over the whole audio (concatenated across chunks).
+    private func silenceProbabilities(audio: [Float]) throws -> [Float] {
+        // ~30 s chunks: the largest bucket minus a small frame margin, converted to samples.
+        let chunkSamples = (Self.largestBucket - Self.windowFrames) * Self.samplesPerFrame
+        var sil: [Float] = []
+        sil.reserveCapacity(audio.count / Self.samplesPerFrame + 1)
+        var offset = 0
+        while offset < audio.count {
+            let end = min(offset + chunkSamples, audio.count)
+            let chunk = Array(audio[offset..<end])
+            // Drain CoreML's autoreleased MLMultiArrays per chunk so memory stays
+            // bounded on long audio; otherwise they accumulate across the whole file.
+            let chunkSil = try autoreleasepool { try chunkSilence(chunk) }
+            sil.append(contentsOf: chunkSil)
+            offset = end
+        }
+        return sil
+    }
+
+    /// Runs preprocessor + scorer on one chunk and returns its per-frame silence probability.
+    /// - Throws: `ASRError.processingFailed` when a stage returns no output or more
+    ///   frames than the largest scorer bucket can hold; CoreML errors are rethrown.
+    private func chunkSilence(_ audio: [Float]) throws -> [Float] {
+        let n = audio.count
+        let wav = try MLMultiArray(shape: [1, n as NSNumber], dataType: .float32)
+        let wp = wav.dataPointer.assumingMemoryBound(to: Float32.self)
+        for i in 0..<n { wp[i] = audio[i] * Self.waveformScale }
+        let feats = try models.preprocessor.prediction(
+            from: MLDictionaryFeatureProvider(dictionary: ["waveform": MLFeatureValue(multiArray: wav)]))
+        guard let f = feats.featureValue(for: "features")?.multiArrayValue else {
+            throw ASRError.processingFailed("FSMN-VAD preprocessor produced no `features`")
+        }
+        let t = f.shape[1].intValue
+        if t == 0 { return [] }
+        let bucket = Self.buckets.first(where: { $0 >= t }) ?? Self.largestBucket
+        // The copy below writes t * featureDim floats into a buffer sized for `bucket`
+        // frames; refuse oversized output instead of writing past the allocation.
+        guard t <= bucket else {
+            throw ASRError.processingFailed(
+                "FSMN-VAD preprocessor produced \(t) frames, more than the largest bucket (\(bucket))")
+        }
+        let speech = try MLMultiArray(shape: [1, bucket as NSNumber, Self.featureDim as NSNumber], dataType: .float32)
+        let sp = speech.dataPointer.assumingMemoryBound(to: Float32.self)
+        // Zero-fill so frames beyond `t` are padding.
+        memset(sp, 0, bucket * Self.featureDim * MemoryLayout<Float32>.size)
+        let count = t * Self.featureDim
+        if f.dataType == .float32 {
+            memcpy(sp, f.dataPointer, count * MemoryLayout<Float32>.size)
+        } else {
+            for i in 0..<count { sp[i] = f[i].floatValue }
+        }
+        let out = try models.scorer.prediction(
+            from: MLDictionaryFeatureProvider(dictionary: ["feats": MLFeatureValue(multiArray: speech)]))
+        guard let scores = out.featureValue(for: "scores")?.multiArrayValue else {
+            throw ASRError.processingFailed("FSMN-VAD scorer produced no `scores`")
+        }
+        var sil = [Float](repeating: 0, count: t)
+        if scores.dataType == .float32 {
+            // Step by the stride the array reports for the frame axis rather than the
+            // logical row width, so a padded (non-contiguous) output is read correctly.
+            let frameStride = scores.strides[1].intValue
+            let p = scores.dataPointer.assumingMemoryBound(to: Float32.self)
+            for frame in 0..<t { sil[frame] = p[frame * frameStride] }  // col 0 = silence prob
+        } else {
+            for frame in 0..<t { sil[frame] = scores[[0, frame as NSNumber, 0]].floatValue }
+        }
+        return sil
+    }
+
+    // MARK: - Decision (port of FunASR FsmnVADStreaming)
+
+    /// Turns per-frame silence probabilities into speech segments.
+    ///
+    /// A sliding window of `windowFrames` speech/non-speech votes drives a two-state
+    /// hysteresis; a segment ends after `maxEndSilenceFrames` consecutive non-speech
+    /// frames or once it reaches `maxSegmentFrames`.
+    /// - Parameter silence: One silence probability per 10 ms frame.
+    /// - Returns: Segments in chronological order; each starts at or after the previous one's end.
+    private func decide(silence: [Float]) -> [FsmnVadSegment] {
+        let T = silence.count
+        var win = [Int](repeating: 0, count: Self.windowFrames)
+        var pos = 0
+        var winSum = 0
+        var preSpeech = false
+        var inSeg = false
+        var segStart = 0
+        var lastEnd = 0  // end frame of the most recently closed segment
+        var contSil = 0
+        var segs: [FsmnVadSegment] = []
+
+        func close(at frame: Int) {
+            segs.append(FsmnVadSegment(startMs: segStart * Self.frameMs, endMs: frame * Self.frameMs))
+            lastEnd = frame
+            inSeg = false
+        }
+
+        for t in 0..<T {
+            let cur = silence[t] <= Self.silenceThreshold ? 1 : 0
+            // Ring buffer: replace the oldest vote and keep the running sum in step.
+            winSum -= win[pos]
+            winSum += cur
+            win[pos] = cur
+            pos = (pos + 1) % Self.windowFrames
+            if !preSpeech && winSum >= Self.silToSpeech {
+                preSpeech = true
+                if !inSeg {
+                    inSeg = true
+                    // Back-date the start by the detection latency plus lookback, but never
+                    // before the previous segment's end: after a forced max-length split the
+                    // window is still full of speech and an unclamped start would overlap it.
+                    segStart = max(lastEnd, t - Self.silToSpeech - Self.lookbackFrames)
+                    contSil = 0
+                }
+            } else if preSpeech && winSum <= Self.speechToSil {
+                preSpeech = false
+            }
+            if inSeg && !preSpeech { contSil += 1 } else { contSil = 0 }
+            if inSeg && contSil >= Self.maxEndSilenceFrames {
+                close(at: t - Self.maxEndSilenceFrames + Self.lookaheadFrames)
+            } else if inSeg && (t - segStart) >= Self.maxSegmentFrames {
+                close(at: t)
+                preSpeech = false
+            }
+        }
+        // Audio ended mid-speech: close the open segment at the final frame.
+        if inSeg { close(at: T) }
+        return segs
+    }
+}
diff --git a/Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift b/Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift
@@ -0,0 +1,88 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// The two loaded CoreML stages that make up FSMN-VAD.
+///
+/// Both come from `FluidInference/fsmn-vad-coreml`:
+///   - `preprocessor` (fp32, CPU): waveform -> [1, T, 400] features (fbank80 + LFR m=5,n=1)
+///   - `scorer` (fp16, ANE): features -> [1, T, 248] frame scores (col 0 = silence prob)
+public struct FsmnVadModels: Sendable {
+
+    public let preprocessor: MLModel
+    public let scorer: MLModel
+
+    private static let logger = AppLogger(category: "FsmnVadModels")
+
+    /// Wraps two already-loaded models.
+    public init(preprocessor: MLModel, scorer: MLModel) {
+        self.preprocessor = preprocessor
+        self.scorer = scorer
+    }
+
+    /// Ensures the models are on disk, then loads them.
+    /// - Parameter progressHandler: Optional download progress callback.
+    public static func downloadAndLoad(
+        progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> FsmnVadModels {
+        let directory = try await download(progressHandler: progressHandler)
+        return try load(from: directory)
+    }
+
+    /// Downloads the model repository unless it is already present.
+    /// - Parameters:
+    ///   - force: When `true`, any existing model directory is removed (best effort)
+    ///     and the repository is downloaded again.
+    ///   - progressHandler: Optional download progress callback.
+    /// - Returns: The directory expected to contain the models.
+    public static func download(
+        force: Bool = false, progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> URL {
+        let root = modelsRootDirectory()
+        let target = root.appendingPathComponent(Repo.fsmnVad.folderName, isDirectory: true)
+        if force {
+            // Best effort: a missing directory is not an error here.
+            try? FileManager.default.removeItem(at: target)
+        } else if modelsExist(at: target) {
+            logger.info("FSMN-VAD models already present at: \(target.path)")
+            return target
+        }
+        logger.info("Downloading FSMN-VAD models from HuggingFace...")
+        try await DownloadUtils.downloadRepo(.fsmnVad, to: root, progressHandler: progressHandler)
+        return target
+    }
+
+    /// Reports whether both compiled model bundles are present in `directory`.
+    public static func modelsExist(at directory: URL) -> Bool {
+        let fileManager = FileManager.default
+        for fileName in [ModelNames.FsmnVad.preprocessorFile, ModelNames.FsmnVad.scorerFile] {
+            let path = directory.appendingPathComponent(fileName).path
+            guard fileManager.fileExists(atPath: path) else { return false }
+        }
+        return true
+    }
+
+    /// Loads both stages from `directory`: preprocessor on CPU only, scorer on CPU + Neural Engine.
+    /// - Throws: `ASRError.processingFailed` when a model is missing; CoreML errors are rethrown.
+    public static func load(from directory: URL) throws -> FsmnVadModels {
+        func configuration(_ units: MLComputeUnits) -> MLModelConfiguration {
+            let config = MLModelConfiguration()
+            config.computeUnits = units
+            return config
+        }
+        let preprocessorModel = try loadModel(
+            named: ModelNames.FsmnVad.preprocessor, from: directory, configuration: configuration(.cpuOnly))
+        let scorerModel = try loadModel(
+            named: ModelNames.FsmnVad.scorer, from: directory, configuration: configuration(.cpuAndNeuralEngine))
+        logger.info("Loaded FSMN-VAD models")
+        return FsmnVadModels(preprocessor: preprocessorModel, scorer: scorerModel)
+    }
+
+    /// Loads one model, preferring a compiled `.mlmodelc` and otherwise compiling a `.mlpackage`.
+    private static func loadModel(
+        named name: String, from directory: URL, configuration: MLModelConfiguration
+    ) throws -> MLModel {
+        let fileManager = FileManager.default
+        let compiledURL = directory.appendingPathComponent("\(name).mlmodelc")
+        if fileManager.fileExists(atPath: compiledURL.path) {
+            return try MLModel(contentsOf: compiledURL, configuration: configuration)
+        }
+        let packageURL = directory.appendingPathComponent("\(name).mlpackage")
+        guard fileManager.fileExists(atPath: packageURL.path) else {
+            throw ASRError.processingFailed("FSMN-VAD model not found: \(name)")
+        }
+        let freshlyCompiledURL = try MLModel.compileModel(at: packageURL)
+        return try MLModel(contentsOf: freshlyCompiledURL, configuration: configuration)
+    }
+
+    /// `<Application Support>/FluidAudio/Models`, or the same path under the temporary
+    /// directory when no Application Support directory is reported.
+    private static func modelsRootDirectory() -> URL {
+        let fileManager = FileManager.default
+        let base =
+            fileManager.urls(for: .applicationSupportDirectory, in: .userDomainMask).first
+            ?? fileManager.temporaryDirectory
+        return
+            base
+            .appendingPathComponent("FluidAudio", isDirectory: true)
+            .appendingPathComponent("Models", isDirectory: true)
+    }
+}
diff --git a/Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift b/Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift
@@ -0,0 +1,37 @@
+#if os(macOS)
+import AVFoundation
+import FluidAudio
+import Foundation
+
+/// `fsmn-vad-segment <audio>` — print detected speech segments [start_ms, end_ms].
+///
+/// The first argument that does not start with `-` is taken as the audio path.
+enum FsmnVadSegmentCommand {
+    private static let logger = AppLogger(category: "FsmnVadSegment")
+
+    /// Loads the FSMN-VAD models, runs detection on the given file and prints one
+    /// line per segment. Failures are logged rather than thrown.
+    static func run(arguments: [String]) async {
+        guard let audioPath = arguments.first(where: { !$0.hasPrefix("-") }) else {
+            print("Usage: fluidaudio fsmn-vad-segment <audio-file>")
+            return
+        }
+        let audioURL = URL(fileURLWithPath: audioPath)
+        if !FileManager.default.fileExists(atPath: audioURL.path) {
+            logger.error("Error: Audio file not found: \(audioPath)")
+            return
+        }
+        do {
+            logger.info("Loading FSMN-VAD models...")
+            let manager = try await FsmnVadManager.load()
+            // Time detection only; model loading is excluded.
+            let startedAt = Date()
+            let segments = try await manager.detect(audioURL: audioURL)
+            let elapsed = String(format: "%.2f", Date().timeIntervalSince(startedAt))
+            logger.info("Detected \(segments.count) speech segment(s) in \(elapsed)s")
+            for segment in segments {
+                let seconds = Double(segment.endMs - segment.startMs) / 1000.0
+                print("[\(segment.startMs), \(segment.endMs)]  (\(String(format: "%.2f", seconds)) s)")
+            }
+        } catch {
+            logger.error("VAD failed: \(error)")
+        }
+    }
+}
+#endif