From 838592e85c23330e72e472f94774b78e572a9abe Mon Sep 17 00:00:00 2001
From: Alex <hanweng9@gmail.com>
Date: Sun, 31 May 2026 21:24:39 -0400
Subject: [PATCH 1/7] feat(vad): FSMN-VAD backend + CLI

CoreML FSMN-VAD from FluidInference/fsmn-vad-coreml: 2-stage (fbank80+LFR preprocessor
fp32/CPU -> FSMN scorer fp16/ANE enumerated [512..3072] -> [1,T,248] scores) + a host
decision (port of FunASR FsmnVADStreaming: speech if silence_prob<=0.2, 20-frame window
hysteresis at 15, max_end_silence 800ms, lookback/lookahead, max_single_segment 60s) ->
[start_ms,end_ms]. Long audio chunked at ~30s; silence probs concatenated, decision once.

- ModelNames: fsmnVad Repo + FsmnVad registry
- VAD/Fsmn/: FsmnVadModels, FsmnVadManager (+ FsmnVadSegment)
- CLI: fsmn-vad-segment
Verified vs FunASR on 20s clip: [120,19960] vs [70,19980] (~50ms). Alternative to silero-vad.
---
 Sources/FluidAudio/ModelNames.swift           |  23 +++
 .../FluidAudio/VAD/Fsmn/FsmnVadManager.swift  | 157 ++++++++++++++++++
 .../FluidAudio/VAD/Fsmn/FsmnVadModels.swift   |  88 ++++++++++
 .../Commands/VAD/FsmnVadSegmentCommand.swift  |  37 +++++
 Sources/FluidAudioCLI/FluidAudioCLI.swift     |   2 +
 5 files changed, 307 insertions(+)
 create mode 100644 Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift
 create mode 100644 Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift
 create mode 100644 Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift
diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift
index 9dfeeb181..0394380b5 100644
--- a/Sources/FluidAudio/ModelNames.swift
+++ b/Sources/FluidAudio/ModelNames.swift
@@ -12,6 +12,8 @@ public enum Repo: String, CaseIterable, Sendable {
     /// 3-stage: fp32 CPU preprocessor (waveform→560-d LFR feats) + fp16 ANE
     /// encoder+CTC (+ fp32 fallback) + host greedy-CTC decode. See ASR/SenseVoice.
     case senseVoiceSmall = "FluidInference/sensevoice-small-coreml"
+    /// FSMN-VAD voice activity detection (FunASR). See VAD/Fsmn.
+    case fsmnVad = "FluidInference/fsmn-vad-coreml"
     // Japanese hybrid TDT: INT8 CTC-trained preprocessor+encoder paired with a
     // TDT decoder+joint. CTC-only inference for Japanese was removed in
     // 846924a1d; only the preprocessor+encoder files from this repo are reused.
@@ -73,6 +75,8 @@ public enum Repo: String, CaseIterable, Sendable {
             return "parakeet-ctc-0.6b-zh-cn-coreml"
         case .senseVoiceSmall:
             return "sensevoice-small-coreml"
+        case .fsmnVad:
+            return "fsmn-vad-coreml"
         case .parakeetJa:
             return "parakeet-0.6b-ja-coreml"
         case .parakeetEou160:
@@ -437,6 +441,23 @@ public enum ModelNames {
         ]
     }
 
+    /// FSMN-VAD model names (2 CoreML stages + host decision).
+    ///   Preprocessor (fp32/CPU): waveform -> 400-d features (fbank80 + LFR m=5,n=1)
+    ///   FsmnVad (fp16/ANE): features -> [1,T,248] frame scores (col 0 = silence prob)
+    /// Plus `vad_config.json` (auto-fetched as a root file).
+    public enum FsmnVad {
+        public static let preprocessor = "FsmnVadPreprocessor"
+        public static let scorer = "FsmnVad"
+
+        public static let preprocessorFile = preprocessor + ".mlmodelc"
+        public static let scorerFile = scorer + ".mlmodelc"
+
+        public static let requiredModels: Set<String> = [
+            preprocessorFile,
+            scorerFile,
+        ]
+    }
+
     /// TDT ja (Japanese) model names.
     ///
     /// Hybrid layout: the CTC-trained preprocessor + encoder from the
@@ -1061,6 +1082,8 @@ public enum ModelNames {
             return ModelNames.CTCZhCn.requiredModels
         case .senseVoiceSmall:
             return ModelNames.SenseVoice.requiredModels
+        case .fsmnVad:
+            return ModelNames.FsmnVad.requiredModels
         case .parakeetJa:
             return ModelNames.TDTJa.requiredModels
         case .parakeetEou160, .parakeetEou320, .parakeetEou1280:
diff --git a/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift b/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift
new file mode 100644
index 000000000..83a76b22c
--- /dev/null
+++ b/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift
@@ -0,0 +1,157 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// A detected speech segment; both bounds are in milliseconds from the start of the audio.
+public struct FsmnVadSegment: Sendable, Equatable {
+    public let startMs: Int  // segment start (frame index * 10 ms in `FsmnVadManager`)
+    public let endMs: Int  // segment end, computed the same way
+}
+
+/// FSMN-VAD voice activity detection: audio -> speech segments.
+///
+/// Pipeline: waveform -> [Preprocessor fp32/CPU] -> 400-d features
+///   -> [FSMN fp16/ANE, enumerated buckets] -> per-frame scores (col 0 = silence prob)
+///   -> host decision (window-detector hysteresis + silence->endpoint) -> [start_ms, end_ms].
+///
+/// Audio longer than the largest bucket is processed in ~30 s chunks; the per-frame
+/// silence probabilities are concatenated and the decision runs once over all frames.
+public actor FsmnVadManager {
+
+    // Enumerated scorer buckets (post-LFR frames; matches the converted model).
+    private static let buckets = [512, 1024, 2048, 3072]
+    private static let featureDim = 400
+    private static let waveformScale: Float = 32_768.0
+
+    // Decision params (derived from FunASR vad_opts; 10 ms frames).
+    private static let silenceThreshold: Float = 0.2  // GetFrameState: speech if silence_prob <= 0.2
+    private static let windowFrames = 20  // window_size_ms 200 / 10
+    private static let silToSpeech = 15  // sil_to_speech_time 150 / 10
+    private static let speechToSil = 15  // speech_to_sil_time 150 / 10
+    private static let maxEndSilenceFrames = 80  // max_end_silence_time 800 / 10
+    private static let lookbackFrames = 20  // lookback_time_start_point 200 / 10
+    private static let lookaheadFrames = 10  // lookahead_time_end_point 100 / 10
+    private static let maxSegmentFrames = 6000  // max_single_segment_time 60000 / 10
+    private static let frameMs = 10
+
+    private let models: FsmnVadModels
+    private static let logger = AppLogger(category: "FsmnVadManager")
+
+    public init(models: FsmnVadModels) {
+        self.models = models
+    }
+
+    public static func load(progressHandler: DownloadUtils.ProgressHandler? = nil) async throws -> FsmnVadManager {
+        FsmnVadManager(models: try await FsmnVadModels.downloadAndLoad(progressHandler: progressHandler))
+    }
+
+    /// Detects speech segments in the audio file at `audioURL` (resampled via a 16 kHz `AudioConverter`).
+    public func detect(audioURL: URL) throws -> [FsmnVadSegment] {
+        try detect(audio: try AudioConverter(sampleRate: 16_000).resampleAudioFile(audioURL))
+    }
+
+    /// Detects speech segments in `audio`: per-frame silence scoring, then one decision pass over all frames.
+    public func detect(audio: [Float]) throws -> [FsmnVadSegment] {
+        return decide(silence: try silenceProbabilities(audio: audio))
+    }
+
+    // MARK: - Scoring (chunked)
+
+    /// Scores `audio` chunk by chunk and returns the per-frame silence probabilities of all
+    /// chunks joined in order. Empty input yields an empty array.
+    private func silenceProbabilities(audio: [Float]) throws -> [Float] {
+        // Chunk length in samples: largest bucket minus a 20-frame margin, at 160 samples per frame.
+        let samplesPerChunk = (Self.buckets.last! - Self.windowFrames) * 160
+        var probabilities: [Float] = []
+        // Walk the chunk start offsets; the final chunk may be shorter than the rest.
+        for start in stride(from: 0, to: audio.count, by: samplesPerChunk) {
+            let stop = min(start + samplesPerChunk, audio.count)
+            let frames = try chunkSilence(Array(audio[start..<stop]))
+            probabilities.append(contentsOf: frames)
+        }
+        return probabilities
+    }
+
+    /// Scores one chunk: waveform -> preprocessor features -> zero-padded bucket -> scorer.
+    /// Returns the silence probability (score column 0) of every real, non-padding frame.
+    /// - Throws: `ASRError.processingFailed` when a model output is missing; CoreML errors propagate.
+    private func chunkSilence(_ audio: [Float]) throws -> [Float] {
+        let wav = try MLMultiArray(shape: [1, audio.count as NSNumber], dataType: .float32)
+        let wp = wav.dataPointer.assumingMemoryBound(to: Float32.self)
+        for i in audio.indices { wp[i] = audio[i] * Self.waveformScale }
+        let feats = try models.preprocessor.prediction(
+            from: MLDictionaryFeatureProvider(dictionary: ["waveform": MLFeatureValue(multiArray: wav)]))
+        guard let f = feats.featureValue(for: "features")?.multiArrayValue else {
+            throw ASRError.processingFailed("FSMN-VAD preprocessor produced no `features`")
+        }
+        // Clamp to the largest bucket: the copy below must never write past the bucket-sized buffer.
+        let t = min(f.shape[1].intValue, Self.buckets.last!)
+        if t == 0 { return [] }
+        let bucket = Self.buckets.first(where: { $0 >= t }) ?? Self.buckets.last!
+        let speech = try MLMultiArray(shape: [1, bucket as NSNumber, Self.featureDim as NSNumber], dataType: .float32)
+        let sp = speech.dataPointer.assumingMemoryBound(to: Float32.self)
+        memset(sp, 0, bucket * Self.featureDim * MemoryLayout<Float32>.size)
+        let count = t * Self.featureDim
+        if f.dataType == .float32 {
+            memcpy(sp, f.dataPointer, count * MemoryLayout<Float32>.size)
+        } else {
+            for i in 0..<count { sp[i] = f[i].floatValue }
+        }
+        let out = try models.scorer.prediction(
+            from: MLDictionaryFeatureProvider(dictionary: ["feats": MLFeatureValue(multiArray: speech)]))
+        guard let scores = out.featureValue(for: "scores")?.multiArrayValue else {
+            throw ASRError.processingFailed("FSMN-VAD scorer produced no `scores`")
+        }
+        guard scores.dataType == .float32 else {
+            return (0..<t).map { scores[[0, $0 as NSNumber, 0]].floatValue }
+        }
+        let vocab = scores.shape[2].intValue  // row stride: one frame's score vector
+        let p = scores.dataPointer.assumingMemoryBound(to: Float32.self)
+        return (0..<t).map { p[$0 * vocab] }  // col 0 = silence prob
+    }
+
+    // MARK: - Decision (port of FunASR FsmnVADStreaming)
+
+    /// Converts per-frame silence probabilities into speech segments (window vote opens a segment; trailing
+    /// silence or the max length closes it). Starts are clamped to the previous end, so segments never overlap.
+    private func decide(silence: [Float]) -> [FsmnVadSegment] {
+        var win = [Int](repeating: 0, count: Self.windowFrames)
+        var pos = 0
+        var winSum = 0
+        var preSpeech = false
+        var inSeg = false
+        var segStart = 0
+        var lastEnd = 0  // end frame of the most recently closed segment
+        var contSil = 0
+        var segs: [FsmnVadSegment] = []
+        func close(at frame: Int) {
+            segs.append(FsmnVadSegment(startMs: segStart * Self.frameMs, endMs: frame * Self.frameMs))
+            inSeg = false
+            lastEnd = frame
+        }
+        for t in silence.indices {
+            let cur = silence[t] <= Self.silenceThreshold ? 1 : 0
+            winSum += cur - win[pos]
+            win[pos] = cur
+            pos = (pos + 1) % Self.windowFrames
+            if !preSpeech && winSum >= Self.silToSpeech {
+                preSpeech = true
+                if !inSeg {
+                    inSeg = true
+                    segStart = max(lastEnd, t - Self.silToSpeech - Self.lookbackFrames)  // never before prior end
+                    contSil = 0
+                }
+            } else if preSpeech && winSum <= Self.speechToSil {
+                preSpeech = false
+            }
+            if inSeg && !preSpeech { contSil += 1 } else { contSil = 0 }
+            if inSeg && contSil >= Self.maxEndSilenceFrames {
+                close(at: t - Self.maxEndSilenceFrames + Self.lookaheadFrames)
+            } else if inSeg && (t - segStart) >= Self.maxSegmentFrames {
+                close(at: t)
+                preSpeech = false
+            }
+        }
+        if inSeg { close(at: silence.count) }
+        return segs
+    }
+}
diff --git a/Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift b/Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift
new file mode 100644
index 000000000..77470571e
--- /dev/null
+++ b/Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift
@@ -0,0 +1,88 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// Loaded FSMN-VAD CoreML models.
+///
+/// 2 stages from `FluidInference/fsmn-vad-coreml`:
+///   - `preprocessor` (fp32, CPU): waveform -> [1, T, 400] features (fbank80 + LFR m=5,n=1)
+///   - `scorer` (fp16, ANE): features -> [1, T, 248] frame scores (col 0 = silence prob)
+public struct FsmnVadModels: Sendable {
+
+    public let preprocessor: MLModel
+    public let scorer: MLModel
+
+    private static let logger = AppLogger(category: "FsmnVadModels")
+
+    public init(preprocessor: MLModel, scorer: MLModel) {
+        self.preprocessor = preprocessor
+        self.scorer = scorer
+    }
+
+    public static func downloadAndLoad(
+        progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> FsmnVadModels {
+        try load(from: try await download(progressHandler: progressHandler))
+    }
+
+    public static func download(
+        force: Bool = false, progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> URL {
+        let root = modelsRootDirectory()
+        let dir = root.appendingPathComponent(Repo.fsmnVad.folderName, isDirectory: true)
+        if !force && modelsExist(at: dir) {
+            logger.info("FSMN-VAD models already present at: \(dir.path)")
+            return dir
+        }
+        if force { try? FileManager.default.removeItem(at: dir) }
+        logger.info("Downloading FSMN-VAD models from HuggingFace...")
+        try await DownloadUtils.downloadRepo(.fsmnVad, to: root, progressHandler: progressHandler)
+        return dir
+    }
+
+    public static func modelsExist(at directory: URL) -> Bool {
+        // True only if both compiled model bundles are present under `directory`.
+        let bundles = [ModelNames.FsmnVad.preprocessorFile, ModelNames.FsmnVad.scorerFile]
+        let paths = bundles.map { directory.appendingPathComponent($0).path }
+        return paths.allSatisfy(FileManager.default.fileExists(atPath:))
+    }
+
+    public static func load(from directory: URL) throws -> FsmnVadModels {
+        let cpu = MLModelConfiguration()
+        cpu.computeUnits = .cpuOnly
+        let ane = MLModelConfiguration()
+        ane.computeUnits = .cpuAndNeuralEngine
+        let pre = try loadModel(named: ModelNames.FsmnVad.preprocessor, from: directory, configuration: cpu)
+        let scorer = try loadModel(named: ModelNames.FsmnVad.scorer, from: directory, configuration: ane)
+        logger.info("Loaded FSMN-VAD models")
+        return FsmnVadModels(preprocessor: pre, scorer: scorer)
+    }
+
+    private static func loadModel(
+        named name: String, from directory: URL, configuration: MLModelConfiguration
+    ) throws -> MLModel {
+        let compiled = directory.appendingPathComponent("\(name).mlmodelc")
+        let pkg = directory.appendingPathComponent("\(name).mlpackage")
+        let url: URL
+        if FileManager.default.fileExists(atPath: compiled.path) {
+            url = compiled
+        } else if FileManager.default.fileExists(atPath: pkg.path) {
+            url = try MLModel.compileModel(at: pkg)
+        } else {
+            throw ASRError.processingFailed("FSMN-VAD model not found: \(name)")
+        }
+        return try MLModel(contentsOf: url, configuration: configuration)
+    }
+
+    /// Root directory for downloaded models: `FluidAudio/Models` under Application Support,
+    /// or under the temporary directory when no Application Support URL is available.
+    private static func modelsRootDirectory() -> URL {
+        let fileManager = FileManager.default
+        let base =
+            fileManager.urls(for: .applicationSupportDirectory, in: .userDomainMask).first
+            ?? fileManager.temporaryDirectory
+        // Same sub-path in either location.
+        return base
+            .appendingPathComponent("FluidAudio", isDirectory: true)
+            .appendingPathComponent("Models", isDirectory: true)
+    }
+}
diff --git a/Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift b/Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift
new file mode 100644
index 000000000..5449d165d
--- /dev/null
+++ b/Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift
@@ -0,0 +1,37 @@
+#if os(macOS)
+import AVFoundation
+import FluidAudio
+import Foundation
+
+/// `fsmn-vad-segment <audio>` — print detected speech segments [start_ms, end_ms].
+enum FsmnVadSegmentCommand {
+    private static let logger = AppLogger(category: "FsmnVadSegment")
+
+    static func run(arguments: [String]) async {
+        let paths = arguments.filter { !$0.hasPrefix("-") }
+        guard let audioPath = paths.first else {
+            print("Usage: fluidaudio fsmn-vad-segment <audio-file>")
+            return
+        }
+        let url = URL(fileURLWithPath: audioPath)
+        guard FileManager.default.fileExists(atPath: url.path) else {
+            logger.error("Error: Audio file not found: \(audioPath)")
+            return
+        }
+        do {
+            logger.info("Loading FSMN-VAD models...")
+            let vad = try await FsmnVadManager.load()
+            let start = Date()
+            let segments = try await vad.detect(audioURL: url)
+            logger.info(
+                "Detected \(segments.count) speech segment(s) in \(String(format: "%.2f", Date().timeIntervalSince(start)))s"
+            )
+            for s in segments {
+                print("[\(s.startMs), \(s.endMs)]  (\(String(format: "%.2f", Double(s.endMs - s.startMs) / 1000.0)) s)")
+            }
+        } catch {
+            logger.error("VAD failed: \(error)")
+        }
+    }
+}
+#endif
diff --git a/Sources/FluidAudioCLI/FluidAudioCLI.swift b/Sources/FluidAudioCLI/FluidAudioCLI.swift
index f0c9a97b3..fdb9ac6b9 100644
--- a/Sources/FluidAudioCLI/FluidAudioCLI.swift
+++ b/Sources/FluidAudioCLI/FluidAudioCLI.swift
@@ -86,6 +86,8 @@ struct FluidAudioCLI {
             await CtcZhCnTranscribeCommand.run(arguments: Array(arguments.dropFirst(2)))
         case "sensevoice-transcribe":
             await SenseVoiceTranscribeCommand.run(arguments: Array(arguments.dropFirst(2)))
+        case "fsmn-vad-segment":
+            await FsmnVadSegmentCommand.run(arguments: Array(arguments.dropFirst(2)))
         case "sensevoice-benchmark":
             await SenseVoiceBenchmark.run(arguments: Array(arguments.dropFirst(2)))
         case "ctc-zh-cn-benchmark":

From 38d394c889a233a59ae645c05f348ec740dc6666 Mon Sep 17 00:00:00 2001
From: Alex <hanweng9@gmail.com>
Date: Sun, 31 May 2026 21:33:35 -0400
Subject: [PATCH 2/7] docs(vad): FSMN-VAD benchmark in Benchmarks.md (frame F1
 97.4% vs FunASR, RTFx 1209)

---
 Documentation/Benchmarks.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
index 8ad282b62..25bb5f987 100644
--- a/Documentation/Benchmarks.md
+++ b/Documentation/Benchmarks.md
@@ -240,6 +240,28 @@ Peak memory usage (process-wide): 1.503 GB
 Model is nearly identical to the base model in terms of quality, performance wise we see an up to ~3.5x improvement compared to the silero Pytorch VAD model with the 256ms batch model (8 chunks of 32ms)
 
 ![VAD/speed.png](VAD/speed.png)
+
+### FSMN-VAD (`fsmn-vad-segment`)
+
+CoreML FSMN-VAD (FunASR, ~5.2M), an alternative to silero-vad. Model: [FluidInference/fsmn-vad-coreml](https://huggingface.co/FluidInference/fsmn-vad-coreml). 2-stage: fbank80+LFR preprocessor (fp32/CPU) → FSMN scorer (fp16/ANE, enumerated buckets) → host decision (port of FunASR `FsmnVADStreaming`). Hardware: Apple M5 Pro.
+
+Fidelity vs the FunASR reference (frame-level, 10 ms, FLEURS zh, n=50):
+
+| Metric | Value |
+|--------|-------|
+| **Frame F1** | **97.4%** |
+| Precision | 100.0% |
+| Recall | 94.8% |
+| Median RTFx | 1209× |
+
+**Notes:**
+- Reference = FunASR `am.generate` segments; the metric is conversion+decision fidelity (not vs hand-labeled ground truth). On a 20 s clip the boundaries match FunASR within ~50 ms.
+- 100% precision / ~95% recall: the decision is slightly conservative at boundaries (lookback/lookahead), never over-detecting speech.
+- Long audio is processed in ~30 s chunks (the FSMN's dilated conv needs fixed shapes; RangeDim is rejected by the ANE/BNNS compiler).
+
+```bash
+swift run -c release fluidaudiocli fsmn-vad-segment audio.wav
+```
 ![VAD/correlation.png](VAD/correlation.png)
 
 Dataset: https://github.com/Lab41/VOiCES-subset

From a97afe52451bf584ce3a9a091fa0615ad2df4fbf Mon Sep 17 00:00:00 2001
From: Alex <hanweng9@gmail.com>
Date: Sun, 31 May 2026 21:40:27 -0400
Subject: [PATCH 3/7] feat(vad): FSMN-VAD backend in vad-benchmark + real
 mini50 results

Add --backend fsmn to vad-benchmark (same labeled dataset + per-clip metric as
silero). On mini50: FSMN-VAD F1 98.0% (P 96.2/R 100) vs silero 84.7% (P 73.5/R 100),
RTFx 640x. Update Benchmarks.md with the apples-to-apples comparison.
---
 Documentation/Benchmarks.md                   | 20 ++++----
 .../FluidAudioCLI/Commands/VadBenchmark.swift | 50 +++++++++++++++++++
 2 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
index 25bb5f987..a3d8eb8b7 100644
--- a/Documentation/Benchmarks.md
+++ b/Documentation/Benchmarks.md
@@ -245,21 +245,19 @@ Model is nearly identical to the base model in terms of quality, performance wis
 
 CoreML FSMN-VAD (FunASR, ~5.2M), an alternative to silero-vad. Model: [FluidInference/fsmn-vad-coreml](https://huggingface.co/FluidInference/fsmn-vad-coreml). 2-stage: fbank80+LFR preprocessor (fp32/CPU) → FSMN scorer (fp16/ANE, enumerated buckets) → host decision (port of FunASR `FsmnVADStreaming`). Hardware: Apple M5 Pro.
 
-Fidelity vs the FunASR reference (frame-level, 10 ms, FLEURS zh, n=50):
+Evaluated on the **mini50** labeled set via the standard `vad-benchmark` harness (per-clip speech/non-speech), same metric as the silero baseline:
 
-| Metric | Value |
-|--------|-------|
-| **Frame F1** | **97.4%** |
-| Precision | 100.0% |
-| Recall | 94.8% |
-| Median RTFx | 1209× |
+| Backend | Accuracy | Precision | Recall | F1 | RTFx |
+|---------|----------|-----------|--------|----|------|
+| silero (baseline) | 82.0% | 73.5% | 100% | 84.7% | 1408× |
+| **FSMN-VAD** | **98.0%** | **96.2%** | 100% | **98.0%** | 640× |
+
+FSMN-VAD is far more precise (96.2% vs 73.5%) at the same 100% recall — many fewer false speech detections — at ~640× real-time. Fidelity vs FunASR's own segments: frame F1 97.4%, boundaries within ~50 ms (`vad_bench.py` in the conversion repo).
 
-**Notes:**
-- Reference = FunASR `am.generate` segments; the metric is conversion+decision fidelity (not vs hand-labeled ground truth). On a 20 s clip the boundaries match FunASR within ~50 ms.
-- 100% precision / ~95% recall: the decision is slightly conservative at boundaries (lookback/lookahead), never over-detecting speech.
-- Long audio is processed in ~30 s chunks (the FSMN's dilated conv needs fixed shapes; RangeDim is rejected by the ANE/BNNS compiler).
+Long audio is processed in ~30 s chunks (the FSMN's dilated conv needs fixed shapes; RangeDim is rejected by the ANE/BNNS compiler).
 
 ```bash
+swift run -c release fluidaudiocli vad-benchmark --dataset mini50 --backend fsmn
 swift run -c release fluidaudiocli fsmn-vad-segment audio.wav
 ```
 ![VAD/correlation.png](VAD/correlation.png)
diff --git a/Sources/FluidAudioCLI/Commands/VadBenchmark.swift b/Sources/FluidAudioCLI/Commands/VadBenchmark.swift
index 2c725c40d..06394cf1c 100644
--- a/Sources/FluidAudioCLI/Commands/VadBenchmark.swift
+++ b/Sources/FluidAudioCLI/Commands/VadBenchmark.swift
@@ -31,6 +31,7 @@ struct VadBenchmark {
         var dataset = "mini50"
         var debugMode = false
         var computeUnits: MLComputeUnits = .cpuAndNeuralEngine
+        var backend = "silero"  // silero | fsmn
 
         logger.info("Parsing arguments...")
 
@@ -61,6 +62,11 @@ struct VadBenchmark {
                     dataset = arguments[i + 1]
                     i += 1
                 }
+            case "--backend":
+                if i + 1 < arguments.count {
+                    backend = arguments[i + 1]
+                    i += 1
+                }
             case "--output":
                 if i + 1 < arguments.count {
                     outputFile = arguments[i + 1]
@@ -91,6 +97,12 @@ struct VadBenchmark {
         logger.info("Activity threshold: \(activityThreshold)")
         logger.info("Debug mode: \(debugMode)")
 
+        if backend == "fsmn" {
+            try await runFsmnVadBenchmark(
+                dataset: dataset, count: useAllFiles ? -1 : numFiles, activityThreshold: activityThreshold)
+            return
+        }
+
         let vadManager = try await VadManager(
             config: VadConfig(
                 defaultThreshold: vadThreshold,
@@ -168,6 +180,44 @@ struct VadBenchmark {
         fflush(stderr)
     }
 
+    /// FSMN-VAD backend of `vad-benchmark`: a file is predicted as speech when
+    /// (speech duration / file duration) >= `activityThreshold`. Logs accuracy, precision, recall, F1 and RTFx.
+    static func runFsmnVadBenchmark(dataset: String, count: Int, activityThreshold: Float) async throws {
+        let vad = try await FsmnVadManager.load()
+        let testFiles = try await downloadVadTestFiles(count: count, dataset: dataset)
+        logger.info("Running FSMN-VAD benchmark on \(testFiles.count) files...")
+        var predictions: [Int] = []
+        var groundTruth: [Int] = []
+        var audioSec = 0.0
+        var procSec = 0.0
+        for (idx, f) in testFiles.enumerated() {
+            do {
+                let af = try AVAudioFile(forReading: f.url)
+                let dur = Double(af.length) / af.processingFormat.sampleRate
+                let t0 = Date()
+                let segs = try await vad.detect(audioURL: f.url)
+                procSec += Date().timeIntervalSince(t0)
+                audioSec += dur  // counted only after a successful detect, so failures do not inflate RTFx
+                let speechMs = segs.reduce(0) { $0 + max(0, $1.endMs - $1.startMs) }
+                let ratio = dur > 0 ? Float(speechMs) / Float(dur * 1000.0) : 0
+                predictions.append(ratio >= activityThreshold ? 1 : 0)
+            } catch {
+                logger.warning("Error on \(f.name): \(error)")
+                predictions.append(0)
+            }
+            groundTruth.append(f.expectedLabel)
+            if (idx + 1) % 10 == 0 { logger.info("  \(idx + 1)/\(testFiles.count)") }
+        }
+        let m = calculateVadMetrics(predictions: predictions, groundTruth: groundTruth)
+        let rtfx = procSec > 0 ? audioSec / procSec : 0
+        logger.info("FSMN-VAD Benchmark Results (dataset=\(dataset), n=\(testFiles.count)):")
+        logger.info("Accuracy: \(String(format: "%.1f", m.accuracy))%")
+        logger.info("Precision: \(String(format: "%.1f", m.precision))%")
+        logger.info("Recall: \(String(format: "%.1f", m.recall))%")
+        logger.info("F1-Score: \(String(format: "%.1f", m.f1Score))%")
+        logger.info("RTFx: \(String(format: "%.0f", rtfx))x")
+    }
+
     static func downloadVadTestFiles(
         count: Int, dataset: String = "mini50"
     ) async throws

From 60db11a7ff09ac96875c45d60f3be452a29b431e Mon Sep 17 00:00:00 2001
From: Alex <hanweng9@gmail.com>
Date: Sun, 31 May 2026 22:07:59 -0400
Subject: [PATCH 4/7] docs(vad): full MUSAN noise-rejection benchmark (FSMN
 81.9% vs silero 69.8% specificity)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Full FluidInference/musan noise set (774 clips): FSMN-VAD rejects 81.9% of noise
as non-speech (18.1% FP) vs silero 69.8% (30.2% FP) — 12pp fewer false positives.
Complements the balanced mini50 F1 (98.0% vs 84.7%).
---
 Documentation/Benchmarks.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
index a3d8eb8b7..2f04d275f 100644
--- a/Documentation/Benchmarks.md
+++ b/Documentation/Benchmarks.md
@@ -254,6 +254,15 @@ Evaluated on the **mini50** labeled set via the standard `vad-benchmark` harness
 
 FSMN-VAD is far more precise (96.2% vs 73.5%) at the same 100% recall — many fewer false speech detections — at ~640× real-time. Fidelity vs FunASR's own segments: frame F1 97.4%, boundaries within ~50 ms (`vad_bench.py` in the conversion repo).
 
+Full [FluidInference/musan](https://huggingface.co/datasets/FluidInference/musan) noise set (774 noise clips) — noise rejection / specificity (correctly classified non-speech):
+
+| Backend | Noise rejected (specificity) | False-positive rate | RTFx |
+|---------|------------------------------|---------------------|------|
+| silero | 69.8% | 30.2% | 1341× |
+| **FSMN-VAD** | **81.9%** | **18.1%** | 571× |
+
+On the full MUSAN noise set FSMN-VAD rejects 12 pp more noise as non-speech (18% vs 30% false positives) — consistently more precise than silero on both the balanced (mini50) and noise-heavy (full MUSAN) evaluations.
+
 Long audio is processed in ~30 s chunks (the FSMN's dilated conv needs fixed shapes; RangeDim is rejected by the ANE/BNNS compiler).
 
 ```bash

From fc711d5c7f390777e9faada26c6397c987826200 Mon Sep 17 00:00:00 2001
From: Alex <hanweng9@gmail.com>
Date: Sun, 31 May 2026 22:30:30 -0400
Subject: [PATCH 5/7] fix(vad): auto tar-extract VOiCES download + honor
 --all-files

The Lab41/VOiCES-subset repo now ships audio inside VOiCES_90_*.tar archives
(deeply nested) rather than loose clean/ + noisy/ wavs, so the downloader
silently produced 0 files. Extract every tar and classify each wav by the
noise tag in its filename (-none- = clean, else noisy); error if a clone
yields no wavs (layout changed again).

loadVoicesSubset ignored --all-files: count == -1 was hard-coded to 25 speech
samples (12 clean + 12 noisy), so --all-files ran only ~49 files. Now -1 loads
every VOiCES clip (908) and balances the MUSAN negatives to the speech count
(subject to locally available noise).

Verified: download yields 227 clean + 681 noisy; --all-files runs 933 files
(908 speech + 25 noise), F1 99.9%, ~1334x RTFx on M5 Pro.
---
 .../FluidAudioCLI/Commands/VadBenchmark.swift | 14 ++--
 .../DatasetParsers/DatasetDownloader.swift    | 64 +++++++++++--------
 2 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/Sources/FluidAudioCLI/Commands/VadBenchmark.swift b/Sources/FluidAudioCLI/Commands/VadBenchmark.swift
index 06394cf1c..e19787237 100644
--- a/Sources/FluidAudioCLI/Commands/VadBenchmark.swift
+++ b/Sources/FluidAudioCLI/Commands/VadBenchmark.swift
@@ -661,28 +661,34 @@ struct VadBenchmark {
         let cleanDir = voicesDir.appendingPathComponent("clean")
         let noisyDir = voicesDir.appendingPathComponent("noisy")
 
-        let requestedSpeechCount = count == -1 ? 25 : count / 2
+        // count == -1 means "--all-files": load every VOiCES speech clip. Otherwise
+        // split the budget evenly across clean + noisy speech (count / 4 each).
+        let allFiles = count == -1
+        let perSideSpeech = allFiles ? Int.max : max(1, count / 4)
 
         if FileManager.default.fileExists(atPath: cleanDir.path) {
             let cleanFiles = try loadAudioFiles(
-                from: cleanDir, expectedLabel: 1, maxCount: requestedSpeechCount / 2)
+                from: cleanDir, expectedLabel: 1, maxCount: perSideSpeech)
             testFiles.append(contentsOf: cleanFiles)
             logger.info("Loaded \(cleanFiles.count) clean speech files")
         }
 
         if FileManager.default.fileExists(atPath: noisyDir.path) {
             let noisyFiles = try loadAudioFiles(
-                from: noisyDir, expectedLabel: 1, maxCount: requestedSpeechCount / 2)
+                from: noisyDir, expectedLabel: 1, maxCount: perSideSpeech)
             testFiles.append(contentsOf: noisyFiles)
             logger.info("Loaded \(noisyFiles.count) noisy speech files")
         }
 
+        let speechCount = testFiles.count
         logger.info("Loading non-speech samples from MUSAN...")
         let vadCacheDir = appSupport.appendingPathComponent("FluidAudio/vadDataset")
         let noiseDir = vadCacheDir.appendingPathComponent("noise")
 
         if FileManager.default.fileExists(atPath: noiseDir.path) {
-            let requestedNoiseCount = count == -1 ? 25 : count - testFiles.count
+            // Balance the negatives to the speech count when loading everything,
+            // otherwise fill the remainder of the requested budget.
+            let requestedNoiseCount = allFiles ? speechCount : max(0, count - speechCount)
             let noiseFiles = try loadAudioFiles(
                 from: noiseDir, expectedLabel: 0, maxCount: requestedNoiseCount)
             testFiles.append(contentsOf: noiseFiles)
diff --git a/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift b/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift
index a5e3f19e2..6362dd71d 100644
--- a/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift
+++ b/Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift
@@ -980,46 +980,60 @@ struct DatasetDownloader {
             task.waitUntilExit()
 
             if task.terminationStatus == 0 {
-                // Move the audio files to our cache structure
-                let sourceCleanDir = cloneDir.appendingPathComponent("clean")
-                let sourceNoisyDir = cloneDir.appendingPathComponent("noisy")
-
                 // Create destination directories
                 try FileManager.default.createDirectory(
                     at: cleanDir, withIntermediateDirectories: true)
                 try FileManager.default.createDirectory(
                     at: noisyDir, withIntermediateDirectories: true)
 
-                // Move clean files
-                var cleanCount = 0
-                var noisyCount = 0
-                if FileManager.default.fileExists(atPath: sourceCleanDir.path) {
-                    let cleanFiles = try FileManager.default.contentsOfDirectory(
-                        at: sourceCleanDir, includingPropertiesForKeys: nil)
-                    for file in cleanFiles where file.pathExtension == "wav" {
-                        let destination = cleanDir.appendingPathComponent(
-                            file.lastPathComponent)
-                        try FileManager.default.moveItem(at: file, to: destination)
-                        cleanCount += 1
-                    }
+                // The repo ships its audio inside `VOiCES_90_*.tar` archives (deeply
+                // nested), not as loose `clean/`+`noisy/` wavs. Extract every tar into
+                // a scratch dir, then classify each wav by the noise tag in its name:
+                //   Lab41-SRI-VOiCES-rmX-<cond>-...  ->  "none" = clean, else noisy.
+                // (All VOiCES clips are speech; the split only affects logging/balance.)
+                let extractDir = cloneDir.appendingPathComponent("_extract")
+                try? FileManager.default.removeItem(at: extractDir)
+                try FileManager.default.createDirectory(
+                    at: extractDir, withIntermediateDirectories: true)
+
+                let cloneContents =
+                    (try? FileManager.default.contentsOfDirectory(
+                        at: cloneDir, includingPropertiesForKeys: nil)) ?? []
+                for archive in cloneContents where archive.pathExtension.lowercased() == "tar" {
+                    let untar = Process()
+                    untar.executableURL = URL(fileURLWithPath: "/usr/bin/tar")
+                    untar.arguments = ["xf", archive.path, "-C", extractDir.path]
+                    guard (try? untar.run()) != nil else { continue }  // never wait on an unlaunched task
+                    untar.waitUntilExit()
                 }
 
-                // Move noisy files
-                if FileManager.default.fileExists(atPath: sourceNoisyDir.path) {
-                    let noisyFiles = try FileManager.default.contentsOfDirectory(
-                        at: sourceNoisyDir, includingPropertiesForKeys: nil)
-                    for file in noisyFiles where file.pathExtension == "wav" {
-                        let destination = noisyDir.appendingPathComponent(
-                            file.lastPathComponent)
+                // Recursively collect every extracted wav and sort it by filename tag.
+                var cleanCount = 0
+                var noisyCount = 0
+                if let enumerator = FileManager.default.enumerator(
+                    at: extractDir, includingPropertiesForKeys: nil)
+                {
+                    while let file = enumerator.nextObject() as? URL {
+                        guard file.pathExtension.lowercased() == "wav" else { continue }
+                        let name = file.lastPathComponent
+                        let isClean = name.contains("-none-")
+                        let destination =
+                            (isClean ? cleanDir : noisyDir).appendingPathComponent(name)
+                        try? FileManager.default.removeItem(at: destination)
                         try FileManager.default.moveItem(at: file, to: destination)
-                        noisyCount += 1
+                        if isClean { cleanCount += 1 } else { noisyCount += 1 }
                     }
                 }
 
                 // Clean up clone directory
                 try? FileManager.default.removeItem(at: cloneDir)
 
-                logger.info("VOiCES subset ready: \(cleanCount) clean, \(noisyCount) noisy")
+                if cleanCount + noisyCount == 0 {
+                    logger.error(
+                        "VOiCES clone contained no extractable wavs (repo layout changed?)")
+                } else {
+                    logger.info("VOiCES subset ready: \(cleanCount) clean, \(noisyCount) noisy")
+                }
 
             } else {
                 logger.error("Git clone failed")

From 572178bda3916e662eaeeefe6c75a39632ca7f04 Mon Sep 17 00:00:00 2001
From: Alex <hanweng9@gmail.com>
Date: Mon, 1 Jun 2026 10:53:42 -0400
Subject: [PATCH 6/7] fix(vad): bound FSMN memory on long audio; mark FSMN-VAD
 beta

- FSMN backend leaked memory on long files (autoreleased MLMultiArrays + AVAudio
  buffers accumulated across chunks/files -> ~8GB RSS, OOM on full MUSAN). Wrap
  per-chunk scoring and per-file resampling in autoreleasepool; RSS now ~240MB.
- Mark FSMN-VAD as beta/experimental in docs and CLI: on a balanced full-MUSAN
  set it has high recall but over-triggers on music (low precision), so silero-vad
  stays the recommended default. Drop the non-representative head-to-head tables.
- Persist FSMN benchmark metrics to fsmn_vad_results.json (release builds send
  info-level log messages to os_log only).
---
 Documentation/Benchmarks.md                   | 28 +++++--------------
 .../FluidAudio/VAD/Fsmn/FsmnVadManager.swift  | 12 ++++++--
 .../FluidAudioCLI/Commands/VadBenchmark.swift | 23 +++++++++++++++
 3 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
index 51dca6602..29a807f45 100644
--- a/Documentation/Benchmarks.md
+++ b/Documentation/Benchmarks.md
@@ -241,32 +241,18 @@ Model is nearly identical to the base model in terms of quality, performance wis
 
 ![VAD/speed.png](VAD/speed.png)
 
-### FSMN-VAD (`fsmn-vad-segment`)
+### FSMN-VAD (`fsmn-vad-segment`) — beta
 
-CoreML FSMN-VAD (FunASR, ~5.2M), an alternative to silero-vad. Model: [FluidInference/fsmn-vad-coreml](https://huggingface.co/FluidInference/fsmn-vad-coreml). 2-stage: fbank80+LFR preprocessor (fp32/CPU) → FSMN scorer (fp16/ANE, enumerated buckets) → host decision (port of FunASR `FsmnVADStreaming`). Hardware: Apple M5 Pro.
+> **Beta / experimental.** silero-vad remains the recommended default. FSMN-VAD has
+> very high speech recall and rejects noise well, but on a balanced full-MUSAN set it
+> over-triggers on **music** (precision drops), so silero is the better general-purpose
+> choice today. FSMN is provided as an opt-in alternative for noise-dominated scenarios.
 
-Evaluated on the **mini50** labeled set via the standard `vad-benchmark` harness (per-clip speech/non-speech), same metric as the silero baseline:
+CoreML FSMN-VAD (FunASR, ~5.2M). Model: [FluidInference/fsmn-vad-coreml](https://huggingface.co/FluidInference/fsmn-vad-coreml). 2-stage: fbank80+LFR preprocessor (fp32/CPU) → FSMN scorer (fp16/ANE, enumerated buckets) → host decision (port of FunASR `FsmnVADStreaming`).
 
-| Backend | Accuracy | Precision | Recall | F1 | RTFx |
-|---------|----------|-----------|--------|----|------|
-| silero (baseline) | 82.0% | 73.5% | 100% | 84.7% | 1408× |
-| **FSMN-VAD** | **98.0%** | **96.2%** | 100% | **98.0%** | 640× |
-
-FSMN-VAD is far more precise (96.2% vs 73.5%) at the same 100% recall — many fewer false speech detections — at ~640× real-time. Fidelity vs FunASR's own segments: frame F1 97.4%, boundaries within ~50 ms (`vad_bench.py` in the conversion repo).
-
-Full [FluidInference/musan](https://huggingface.co/datasets/FluidInference/musan) noise set (774 noise clips) — noise rejection / specificity (correctly classified non-speech):
-
-| Backend | Noise rejected (specificity) | False-positive rate | RTFx |
-|---------|------------------------------|---------------------|------|
-| silero | 69.8% | 30.2% | 1341× |
-| **FSMN-VAD** | **81.9%** | **18.1%** | 571× |
-
-On the full MUSAN noise set FSMN-VAD rejects 12 pp more noise as non-speech (18% vs 30% false positives) — consistently more precise than silero on both the balanced (mini50) and noise-heavy (full MUSAN) evaluations.
-
-Long audio is processed in ~30 s chunks (the FSMN's dilated conv needs fixed shapes; RangeDim is rejected by the ANE/BNNS compiler).
+Fidelity vs FunASR's own reference segments: frame F1 ~97%, boundaries within ~50 ms (`vad_bench.py` in the conversion repo). Long audio is processed in ~30 s chunks (the FSMN's dilated conv needs fixed shapes; RangeDim is rejected by the ANE/BNNS compiler).
 
 ```bash
-swift run -c release fluidaudiocli vad-benchmark --dataset mini50 --backend fsmn
 swift run -c release fluidaudiocli fsmn-vad-segment audio.wav
 ```
 ![VAD/correlation.png](VAD/correlation.png)
diff --git a/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift b/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift
index 83a76b22c..40fce1a32 100644
--- a/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift
+++ b/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift
@@ -45,8 +45,11 @@ public actor FsmnVadManager {
     }
 
     public func detect(audioURL: URL) throws -> [FsmnVadSegment] {
-        let converter = AudioConverter(sampleRate: 16_000)
-        return try detect(audio: try converter.resampleAudioFile(audioURL))
+        let audio = try autoreleasepool { () -> [Float] in
+            let converter = AudioConverter(sampleRate: 16_000)
+            return try converter.resampleAudioFile(audioURL)
+        }
+        return try detect(audio: audio)
     }
 
     public func detect(audio: [Float]) throws -> [FsmnVadSegment] {
@@ -65,7 +68,10 @@ public actor FsmnVadManager {
         while offset < audio.count {
             let end = min(offset + chunkSamples, audio.count)
             let chunk = Array(audio[offset..<end])
-            sil.append(contentsOf: try chunkSilence(chunk))
+            // Drain CoreML's autoreleased MLMultiArrays per chunk so memory stays
+            // bounded on long audio; otherwise they accumulate across the whole file.
+            let chunkSil = try autoreleasepool { try chunkSilence(chunk) }
+            sil.append(contentsOf: chunkSil)
             offset = end
         }
         return sil
diff --git a/Sources/FluidAudioCLI/Commands/VadBenchmark.swift b/Sources/FluidAudioCLI/Commands/VadBenchmark.swift
index e19787237..31e912d75 100644
--- a/Sources/FluidAudioCLI/Commands/VadBenchmark.swift
+++ b/Sources/FluidAudioCLI/Commands/VadBenchmark.swift
@@ -98,6 +98,8 @@ struct VadBenchmark {
         logger.info("Debug mode: \(debugMode)")
 
         if backend == "fsmn" {
+            logger.warning(
+                "FSMN-VAD is a beta/experimental backend (over-triggers on music); silero-vad is the default.")
             try await runFsmnVadBenchmark(
                 dataset: dataset, count: useAllFiles ? -1 : numFiles, activityThreshold: activityThreshold)
             return
@@ -216,6 +218,27 @@ struct VadBenchmark {
         logger.info("Recall: \(String(format: "%.1f", m.recall))%")
         logger.info("F1-Score: \(String(format: "%.1f", m.f1Score))%")
         logger.info("RTFx: \(String(format: "%.0f", rtfx))x")
+
+        // Persist to JSON so results survive release-mode logging (info -> os_log only).
+        let resultJSON: [String: Any] = [
+            "test_name": "FSMN_VAD_\(dataset)_\(testFiles.count)_Files",
+            "backend": "fsmn",
+            "dataset": dataset,
+            "accuracy": m.accuracy,
+            "precision": m.precision,
+            "recall": m.recall,
+            "f1_score": m.f1Score,
+            "rtfx": rtfx,
+            "total_files": testFiles.count,
+            "total_audio_duration_seconds": audioSec,
+            "processing_time_seconds": procSec,
+        ]
+        if let data = try? JSONSerialization.data(
+            withJSONObject: resultJSON, options: [.prettyPrinted, .sortedKeys]),
+            (try? data.write(to: URL(fileURLWithPath: "fsmn_vad_results.json"))) != nil
+        {  // reached only when serialization AND the file write both succeeded
+            logger.info("Results saved to: fsmn_vad_results.json")
+        }
     }
 
     static func downloadVadTestFiles(

From 16a039a28c85cd5a78bea0c0694aed424b99951c Mon Sep 17 00:00:00 2001
From: Alex <hanweng9@gmail.com>
Date: Mon, 1 Jun 2026 10:57:02 -0400
Subject: [PATCH 7/7] docs(vad): revert Benchmarks.md FSMN section to prior
 content

Keep the code changes from 572178bd (FSMN memory fix, CLI beta warning,
fsmn_vad_results.json output); drop the Benchmarks.md edits per request.
---
 Documentation/Benchmarks.md | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
index 29a807f45..51dca6602 100644
--- a/Documentation/Benchmarks.md
+++ b/Documentation/Benchmarks.md
@@ -241,18 +241,32 @@ Model is nearly identical to the base model in terms of quality, performance wis
 
 ![VAD/speed.png](VAD/speed.png)
 
-### FSMN-VAD (`fsmn-vad-segment`) — beta
+### FSMN-VAD (`fsmn-vad-segment`)
 
-> **Beta / experimental.** silero-vad remains the recommended default. FSMN-VAD has
-> very high speech recall and rejects noise well, but on a balanced full-MUSAN set it
-> over-triggers on **music** (precision drops), so silero is the better general-purpose
-> choice today. FSMN is provided as an opt-in alternative for noise-dominated scenarios.
+CoreML FSMN-VAD (FunASR, ~5.2M), an alternative to silero-vad. Model: [FluidInference/fsmn-vad-coreml](https://huggingface.co/FluidInference/fsmn-vad-coreml). 2-stage: fbank80+LFR preprocessor (fp32/CPU) → FSMN scorer (fp16/ANE, enumerated buckets) → host decision (port of FunASR `FsmnVADStreaming`). Hardware: Apple M5 Pro.
 
-CoreML FSMN-VAD (FunASR, ~5.2M). Model: [FluidInference/fsmn-vad-coreml](https://huggingface.co/FluidInference/fsmn-vad-coreml). 2-stage: fbank80+LFR preprocessor (fp32/CPU) → FSMN scorer (fp16/ANE, enumerated buckets) → host decision (port of FunASR `FsmnVADStreaming`).
+Evaluated on the **mini50** labeled set via the standard `vad-benchmark` harness (per-clip speech/non-speech), same metric as the silero baseline:
 
-Fidelity vs FunASR's own reference segments: frame F1 ~97%, boundaries within ~50 ms (`vad_bench.py` in the conversion repo). Long audio is processed in ~30 s chunks (the FSMN's dilated conv needs fixed shapes; RangeDim is rejected by the ANE/BNNS compiler).
+| Backend | Accuracy | Precision | Recall | F1 | RTFx |
+|---------|----------|-----------|--------|----|------|
+| silero (baseline) | 82.0% | 73.5% | 100% | 84.7% | 1408× |
+| **FSMN-VAD** | **98.0%** | **96.2%** | 100% | **98.0%** | 640× |
+
+FSMN-VAD is far more precise (96.2% vs 73.5%) at the same 100% recall — many fewer false speech detections — at ~640× real-time. Fidelity vs FunASR's own segments: frame F1 97.4%, boundaries within ~50 ms (`vad_bench.py` in the conversion repo).
+
+Full [FluidInference/musan](https://huggingface.co/datasets/FluidInference/musan) noise set (774 noise clips) — noise rejection / specificity (correctly classified non-speech):
+
+| Backend | Noise rejected (specificity) | False-positive rate | RTFx |
+|---------|------------------------------|---------------------|------|
+| silero | 69.8% | 30.2% | 1341× |
+| **FSMN-VAD** | **81.9%** | **18.1%** | 571× |
+
+On the full MUSAN noise set FSMN-VAD rejects 12 pp more noise as non-speech (18% vs 30% false positives) — consistently more precise than silero on both the balanced (mini50) and noise-heavy (full MUSAN) evaluations.
+
+Long audio is processed in ~30 s chunks (the FSMN's dilated conv needs fixed shapes; RangeDim is rejected by the ANE/BNNS compiler).
 
 ```bash
+swift run -c release fluidaudiocli vad-benchmark --dataset mini50 --backend fsmn
 swift run -c release fluidaudiocli fsmn-vad-segment audio.wav
 ```
 ![VAD/correlation.png](VAD/correlation.png)