Skip to content
29 changes: 29 additions & 0 deletions Documentation/Benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,35 @@ Peak memory usage (process-wide): 1.503 GB
The model is nearly identical to the base model in quality. Performance-wise, we see up to a ~3.5x improvement over the Silero PyTorch VAD model when using the 256 ms batch model (8 chunks of 32 ms).

![VAD/speed.png](VAD/speed.png)

### FSMN-VAD (`fsmn-vad-segment`)

CoreML FSMN-VAD (FunASR, ~5.2M), an alternative to silero-vad. Model: [FluidInference/fsmn-vad-coreml](https://huggingface.co/FluidInference/fsmn-vad-coreml). 2-stage: fbank80+LFR preprocessor (fp32/CPU) → FSMN scorer (fp16/ANE, enumerated buckets) → host decision (port of FunASR `FsmnVADStreaming`). Hardware: Apple M5 Pro.

Evaluated on the **mini50** labeled set via the standard `vad-benchmark` harness (per-clip speech/non-speech), same metric as the silero baseline:

| Backend | Accuracy | Precision | Recall | F1 | RTFx |
|---------|----------|-----------|--------|----|------|
| silero (baseline) | 82.0% | 73.5% | 100% | 84.7% | 1408× |
| **FSMN-VAD** | **98.0%** | **96.2%** | 100% | **98.0%** | 640× |

FSMN-VAD is far more precise (96.2% vs 73.5%) at the same 100% recall — i.e. far fewer false speech detections — while still running at ~640× real-time (versus 1408× for silero). Fidelity vs FunASR's own segments: frame F1 97.4%, boundaries within ~50 ms (`vad_bench.py` in the conversion repo).

Full [FluidInference/musan](https://huggingface.co/datasets/FluidInference/musan) noise set (774 noise clips) — noise rejection / specificity (correctly classified non-speech):

| Backend | Noise rejected (specificity) | False-positive rate | RTFx |
|---------|------------------------------|---------------------|------|
| silero | 69.8% | 30.2% | 1341× |
| **FSMN-VAD** | **81.9%** | **18.1%** | 571× |

On the full MUSAN noise set FSMN-VAD rejects 12 pp more noise as non-speech (18% vs 30% false positives) — consistently more precise than silero on both the balanced (mini50) and noise-heavy (full MUSAN) evaluations.

Long audio is processed in ~30 s chunks (the FSMN's dilated conv needs fixed shapes; RangeDim is rejected by the ANE/BNNS compiler).

```bash
swift run -c release fluidaudiocli vad-benchmark --dataset mini50 --backend fsmn
swift run -c release fluidaudiocli fsmn-vad-segment audio.wav
```
![VAD/correlation.png](VAD/correlation.png)

Dataset: https://github.com/Lab41/VOiCES-subset
Expand Down
23 changes: 23 additions & 0 deletions Sources/FluidAudio/ModelNames.swift
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ public enum Repo: String, CaseIterable, Sendable {
/// 3-stage: fp32 CPU preprocessor (waveform→560-d LFR feats) + fp16 ANE
/// encoder+CTC (+ fp32 fallback) + host greedy-CTC decode. See ASR/SenseVoice.
case senseVoiceSmall = "FluidInference/sensevoice-small-coreml"
/// FSMN-VAD voice activity detection (FunASR). See VAD/Fsmn.
case fsmnVad = "FluidInference/fsmn-vad-coreml"
/// Paraformer-large (zh) — non-autoregressive ASR: SANM encoder + CIF
/// predictor (host-side integrate-and-fire) + parallel decoder. See ASR/Paraformer.
case paraformerLargeZh = "FluidInference/paraformer-large-zh-coreml"
Expand Down Expand Up @@ -76,6 +78,8 @@ public enum Repo: String, CaseIterable, Sendable {
return "parakeet-ctc-0.6b-zh-cn-coreml"
case .senseVoiceSmall:
return "sensevoice-small-coreml"
case .fsmnVad:
return "fsmn-vad-coreml"
case .paraformerLargeZh:
return "paraformer-large-zh-coreml"
case .parakeetJa:
Expand Down Expand Up @@ -442,6 +446,23 @@ public enum ModelNames {
]
}

/// Model names for FSMN-VAD: two CoreML stages followed by a host-side decision.
///   Preprocessor (fp32/CPU): waveform -> 400-d features (fbank80 + LFR m=5,n=1)
///   FsmnVad (fp16/ANE): features -> [1,T,248] frame scores (col 0 = silence prob)
/// The repo also carries `vad_config.json` (auto-fetched as a root file).
public enum FsmnVad {
    /// Base name of the feature-extraction model.
    public static let preprocessor = "FsmnVadPreprocessor"
    /// Base name of the frame-scoring model.
    public static let scorer = "FsmnVad"

    /// Compiled-model file names: the base names with a `.mlmodelc` extension.
    public static let preprocessorFile = "\(preprocessor).mlmodelc"
    public static let scorerFile = "\(scorer).mlmodelc"

    /// The compiled model files listed as required for this repo.
    public static let requiredModels = Set<String>([preprocessorFile, scorerFile])
}

/// Paraformer-large (zh) model names. 4 CoreML stages + host CIF:
/// Preprocessor (fp32/CPU): waveform -> 560-d LFR features
/// Encoder (fp16/ANE): SANM encoder (enumerated buckets)
Expand Down Expand Up @@ -1098,6 +1119,8 @@ public enum ModelNames {
return ModelNames.CTCZhCn.requiredModels
case .senseVoiceSmall:
return ModelNames.SenseVoice.requiredModels
case .fsmnVad:
return ModelNames.FsmnVad.requiredModels
case .paraformerLargeZh:
return ModelNames.ParaformerZh.requiredModels
case .parakeetJa:
Expand Down
163 changes: 163 additions & 0 deletions Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
@preconcurrency import CoreML
import Foundation

/// A detected speech segment, in milliseconds.
public struct FsmnVadSegment: Sendable, Equatable {
    /// Segment start, in milliseconds.
    public let startMs: Int
    /// Segment end, in milliseconds.
    public let endMs: Int

    /// Creates a segment from explicit bounds.
    ///
    /// Declared explicitly because the memberwise initializer Swift synthesizes for a
    /// `public` struct is only `internal`; without this, code outside the module could
    /// read segments but never build one (for example as an expected value in a test).
    ///
    /// - Parameters:
    ///   - startMs: Segment start, in milliseconds.
    ///   - endMs: Segment end, in milliseconds.
    public init(startMs: Int, endMs: Int) {
        self.startMs = startMs
        self.endMs = endMs
    }
}

/// FSMN-VAD voice activity detection: audio -> speech segments.
///
/// Pipeline: waveform -> [Preprocessor fp32/CPU] -> 400-d features
/// -> [FSMN fp16/ANE, enumerated buckets] -> per-frame scores (col 0 = silence prob)
/// -> host decision (window-detector hysteresis + silence->endpoint) -> [start_ms, end_ms].
///
/// Audio longer than the largest bucket is processed in ~30 s chunks; the per-frame
/// silence probabilities are concatenated and the decision runs once over all frames.
public actor FsmnVadManager {

// Enumerated scorer buckets (post-LFR frames; matches the converted model).
// The scorer input is allocated at one of these lengths; the last entry also sizes the audio chunks.
private static let buckets = [512, 1024, 2048, 3072]
// Width of one feature frame; the scorer input is shaped [1, bucket, featureDim].
private static let featureDim = 400
// Every input sample is multiplied by this before it reaches the preprocessor
// (presumably maps [-1, 1] floats onto the int16 range — confirm against the conversion script).
private static let waveformScale: Float = 32_768.0

// Decision params (derived from FunASR vad_opts; 10 ms frames).
private static let silenceThreshold: Float = 0.2 // GetFrameState: speech if silence_prob <= 0.2
private static let windowFrames = 20 // window_size_ms 200 / 10
private static let silToSpeech = 15 // sil_to_speech_time 150 / 10
private static let speechToSil = 15 // speech_to_sil_time 150 / 10
private static let maxEndSilenceFrames = 80 // max_end_silence_time 800 / 10
private static let lookbackFrames = 20 // lookback_time_start_point 200 / 10
private static let lookaheadFrames = 10 // lookahead_time_end_point 100 / 10
private static let maxSegmentFrames = 6000 // max_single_segment_time 60000 / 10
// Multiplier that turns a frame index into milliseconds when segments are emitted.
private static let frameMs = 10

// The loaded preprocessor and scorer models used for every prediction.
private let models: FsmnVadModels
// NOTE(review): `logger` is not referenced by any method visible in this type.
private static let logger = AppLogger(category: "FsmnVadManager")

/// Creates a manager that runs detection with the given, already loaded models.
///
/// - Parameter models: The preprocessor and scorer models to use.
public init(models: FsmnVadModels) {
self.models = models
}

/// Obtains the FSMN-VAD models via `FsmnVadModels.downloadAndLoad` and wraps them in a manager.
///
/// - Parameter progressHandler: Optional handler forwarded to `FsmnVadModels.downloadAndLoad`.
/// - Returns: A manager backed by the loaded models.
/// - Throws: Any error raised by `FsmnVadModels.downloadAndLoad`.
public static func load(progressHandler: DownloadUtils.ProgressHandler? = nil) async throws -> FsmnVadManager {
    let loadedModels = try await FsmnVadModels.downloadAndLoad(progressHandler: progressHandler)
    return FsmnVadManager(models: loadedModels)
}

/// Detects speech segments in an audio file.
///
/// The file is read through `AudioConverter(sampleRate: 16_000)` and the resulting
/// samples are handed to `detect(audio:)`.
///
/// - Parameter audioURL: Location of the audio file to analyse.
/// - Returns: The detected speech segments.
/// - Throws: Any error from reading/resampling the file or from `detect(audio:)`.
public func detect(audioURL: URL) throws -> [FsmnVadSegment] {
    // Decode inside its own autorelease pool, before scoring begins.
    let samples: [Float] = try autoreleasepool {
        try AudioConverter(sampleRate: 16_000).resampleAudioFile(audioURL)
    }
    return try detect(audio: samples)
}

/// Detects speech segments in raw samples.
///
/// Scores every frame with `silenceProbabilities(audio:)` and turns the scores into
/// segments with `decide(silence:)`.
///
/// - Parameter audio: Audio samples (presumably 16 kHz mono floats, as produced by
///   `detect(audioURL:)` — confirm for other callers).
/// - Returns: The detected speech segments.
/// - Throws: Any error raised while scoring.
public func detect(audio: [Float]) throws -> [FsmnVadSegment] {
    try decide(silence: silenceProbabilities(audio: audio))
}

// MARK: - Scoring (chunked)

/// Per-frame silence probability over the whole audio (concatenated across chunks).
///
/// The audio is cut into consecutive, non-overlapping chunks; each chunk is scored by
/// `chunkSilence(_:)` and the per-chunk results are appended in order.
///
/// NOTE(review): chunks are scored independently and their frame counts are simply
/// concatenated. If the preprocessor emits fewer than `samples / 160` frames per chunk,
/// timestamps derived from the concatenated index would drift on long audio — confirm.
///
/// - Parameter audio: All samples to score.
/// - Returns: One silence probability per frame; empty for empty input.
/// - Throws: Any error raised by `chunkSilence(_:)`.
private func silenceProbabilities(audio: [Float]) throws -> [Float] {
    // ~30 s per chunk: the largest bucket minus `windowFrames`, at 160 samples per frame.
    let samplesPerChunk = (Self.buckets.last! - Self.windowFrames) * 160
    var probabilities: [Float] = []
    for start in stride(from: 0, to: audio.count, by: samplesPerChunk) {
        let stop = min(start + samplesPerChunk, audio.count)
        let piece = Array(audio[start..<stop])
        // Give every chunk its own pool so CoreML's autoreleased MLMultiArrays are
        // released chunk by chunk instead of piling up over the whole file.
        let scored = try autoreleasepool { try chunkSilence(piece) }
        probabilities.append(contentsOf: scored)
    }
    return probabilities
}

/// Scores one chunk: waveform -> preprocessor features -> scorer -> silence probability per frame.
///
/// Compared with a plain pointer copy, this version validates the model output shapes,
/// refuses chunks that do not fit the largest bucket (the previous fallback to the last
/// bucket let `memcpy` write past the end of the scorer input), and reads both model
/// outputs through their reported strides rather than assuming a packed layout.
///
/// - Parameter audio: Samples of a single chunk; each is multiplied by `waveformScale`.
/// - Returns: One value per feature frame, taken from column 0 of the scorer output.
///   Empty when the chunk is empty or the preprocessor yields no frames.
/// - Throws: `ASRError.processingFailed` when a model output is missing or has an
///   unexpected shape, or when the chunk has more frames than the largest bucket;
///   any error thrown by CoreML is propagated unchanged.
private func chunkSilence(_ audio: [Float]) throws -> [Float] {
    let n = audio.count
    if n == 0 { return [] }

    // Stage 1: scaled waveform -> features.
    let wav = try MLMultiArray(shape: [1, n as NSNumber], dataType: .float32)
    let wp = wav.dataPointer.assumingMemoryBound(to: Float32.self)
    for i in 0..<n { wp[i] = audio[i] * Self.waveformScale }
    let feats = try models.preprocessor.prediction(
        from: MLDictionaryFeatureProvider(dictionary: ["waveform": MLFeatureValue(multiArray: wav)]))
    guard let f = feats.featureValue(for: "features")?.multiArrayValue else {
        throw ASRError.processingFailed("FSMN-VAD preprocessor produced no `features`")
    }
    guard f.shape.count == 3, f.shape[2].intValue == Self.featureDim else {
        throw ASRError.processingFailed("FSMN-VAD preprocessor produced unexpected `features` shape \(f.shape)")
    }
    let t = f.shape[1].intValue
    if t == 0 { return [] }

    // Pick the smallest bucket that holds every frame. Falling back to the largest
    // bucket when none fits would overflow the buffer below, so fail loudly instead.
    guard let bucket = Self.buckets.first(where: { $0 >= t }) else {
        throw ASRError.processingFailed("FSMN-VAD chunk of \(t) frames exceeds the largest scorer bucket")
    }

    // Stage 2 input: zero-padded [1, bucket, featureDim] buffer.
    let speech = try MLMultiArray(shape: [1, bucket as NSNumber, Self.featureDim as NSNumber], dataType: .float32)
    let sp = speech.dataPointer.assumingMemoryBound(to: Float32.self)
    memset(sp, 0, bucket * Self.featureDim * MemoryLayout<Float32>.size)

    let rowStride = f.strides[1].intValue
    let colStride = f.strides[2].intValue
    if f.dataType == .float32 && rowStride == Self.featureDim && colStride == 1 {
        // Packed float32 rows: a single bulk copy is valid.
        memcpy(sp, f.dataPointer, t * Self.featureDim * MemoryLayout<Float32>.size)
    } else if f.dataType == .float32 {
        // Float32 with padded or permuted strides: gather element by element.
        let src = f.dataPointer.assumingMemoryBound(to: Float32.self)
        for frame in 0..<t {
            for d in 0..<Self.featureDim {
                sp[frame * Self.featureDim + d] = src[frame * rowStride + d * colStride]
            }
        }
    } else {
        // Any other element type: go through the (slow) NSNumber subscript.
        for frame in 0..<t {
            for d in 0..<Self.featureDim {
                sp[frame * Self.featureDim + d] = f[[0, frame as NSNumber, d as NSNumber]].floatValue
            }
        }
    }

    // Stage 2: features -> frame scores.
    let out = try models.scorer.prediction(
        from: MLDictionaryFeatureProvider(dictionary: ["feats": MLFeatureValue(multiArray: speech)]))
    guard let scores = out.featureValue(for: "scores")?.multiArrayValue else {
        throw ASRError.processingFailed("FSMN-VAD scorer produced no `scores`")
    }
    guard scores.shape.count == 3, scores.shape[1].intValue >= t else {
        throw ASRError.processingFailed("FSMN-VAD scorer produced unexpected `scores` shape \(scores.shape)")
    }

    // Only the first `t` frames are real; the rest of the bucket is padding.
    var sil = [Float](repeating: 0, count: t)
    if scores.dataType == .float32 {
        let frameStride = scores.strides[1].intValue
        let p = scores.dataPointer.assumingMemoryBound(to: Float32.self)
        for frame in 0..<t { sil[frame] = p[frame * frameStride] }  // col 0 = silence prob
    } else {
        for frame in 0..<t { sil[frame] = scores[[0, frame as NSNumber, 0]].floatValue }
    }
    return sil
}

// MARK: - Decision (port of FunASR FsmnVADStreaming)

/// Turns per-frame silence probabilities into speech segments.
///
/// A frame counts as speech when its silence probability is at most `silenceThreshold`.
/// A circular window of the last `windowFrames` frame states drives a two-state detector:
/// it switches to "speech" when the window holds at least `silToSpeech` speech frames and
/// back to "silence" when it holds at most `speechToSil`. A segment opens on the switch to
/// speech (backdated by `silToSpeech + lookbackFrames`) and closes either after
/// `maxEndSilenceFrames` consecutive non-speech frames or once it spans `maxSegmentFrames`.
///
/// A new segment never starts before the end of the previous one. Without that clamp, a
/// forced split at `maxSegmentFrames` — which leaves the window full of speech — reopens a
/// segment backdated by 35 frames, so two emitted segments would overlap.
///
/// - Parameter silence: One silence probability per frame, in time order.
/// - Returns: Segments in time order, with frame indices scaled by `frameMs`.
private func decide(silence: [Float]) -> [FsmnVadSegment] {
    let T = silence.count
    var win = [Int](repeating: 0, count: Self.windowFrames)
    var pos = 0
    var winSum = 0
    var preSpeech = false
    var inSeg = false
    var segStart = 0
    var lastEnd = 0  // end frame of the most recently closed segment
    var contSil = 0
    var segs: [FsmnVadSegment] = []

    func close(at frame: Int) {
        segs.append(FsmnVadSegment(startMs: segStart * Self.frameMs, endMs: frame * Self.frameMs))
        lastEnd = frame
        inSeg = false
    }

    for t in 0..<T {
        // Slide the window: drop the oldest state, add the current one.
        let cur = silence[t] <= Self.silenceThreshold ? 1 : 0
        winSum -= win[pos]
        winSum += cur
        win[pos] = cur
        pos = (pos + 1) % Self.windowFrames

        if !preSpeech && winSum >= Self.silToSpeech {
            preSpeech = true
            if !inSeg {
                inSeg = true
                // Backdate the start, but never into the previous segment.
                // `lastEnd` starts at 0, so this also keeps the start non-negative.
                segStart = max(lastEnd, t - Self.silToSpeech - Self.lookbackFrames)
                contSil = 0
            }
        } else if preSpeech && winSum <= Self.speechToSil {
            preSpeech = false
        }

        if inSeg && !preSpeech { contSil += 1 } else { contSil = 0 }
        if inSeg && contSil >= Self.maxEndSilenceFrames {
            // End where the trailing silence began, plus the lookahead margin.
            close(at: t - Self.maxEndSilenceFrames + Self.lookaheadFrames)
        } else if inSeg && (t - segStart) >= Self.maxSegmentFrames {
            // Forced split of an over-long segment.
            close(at: t)
            preSpeech = false
        }
    }
    // Audio ended while still inside a segment.
    if inSeg { close(at: T) }
    return segs
}
}
88 changes: 88 additions & 0 deletions Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
@preconcurrency import CoreML
import Foundation

/// Loaded FSMN-VAD CoreML models.
///
/// 2 stages from `FluidInference/fsmn-vad-coreml`:
/// - `preprocessor` (fp32, CPU): waveform -> [1, T, 400] features (fbank80 + LFR m=5,n=1)
/// - `scorer` (fp16, ANE): features -> [1, T, 248] frame scores (col 0 = silence prob)
public struct FsmnVadModels: Sendable {

/// Model loaded from `ModelNames.FsmnVad.preprocessor` (configured for `.cpuOnly` by `load(from:)`).
public let preprocessor: MLModel
/// Model loaded from `ModelNames.FsmnVad.scorer` (configured for `.cpuAndNeuralEngine` by `load(from:)`).
public let scorer: MLModel

// Shared logger for the download/load helpers of this type.
private static let logger = AppLogger(category: "FsmnVadModels")

/// Wraps two already loaded CoreML models.
///
/// - Parameters:
///   - preprocessor: The feature-extraction model.
///   - scorer: The frame-scoring model.
public init(preprocessor: MLModel, scorer: MLModel) {
self.preprocessor = preprocessor
self.scorer = scorer
}

/// Ensures the model files are on disk (see `download`) and then loads them (see `load(from:)`).
///
/// - Parameter progressHandler: Optional handler forwarded to `download`.
/// - Returns: The loaded models.
/// - Throws: Any error from `download` or `load(from:)`.
public static func downloadAndLoad(
    progressHandler: DownloadUtils.ProgressHandler? = nil
) async throws -> FsmnVadModels {
    let directory = try await download(progressHandler: progressHandler)
    return try load(from: directory)
}

/// Returns the local model directory, downloading the repo first when necessary.
///
/// - Parameters:
///   - force: When `true`, the existing directory is removed (best effort) and the repo
///     is downloaded again even if the models are already present.
///   - progressHandler: Optional handler forwarded to `DownloadUtils.downloadRepo`.
/// - Returns: The directory for `Repo.fsmnVad` under the models root.
/// - Throws: Any error raised by `DownloadUtils.downloadRepo`.
public static func download(
    force: Bool = false, progressHandler: DownloadUtils.ProgressHandler? = nil
) async throws -> URL {
    let modelsRoot = modelsRootDirectory()
    let target = modelsRoot.appendingPathComponent(Repo.fsmnVad.folderName, isDirectory: true)
    if force {
        // A failed removal is ignored on purpose; the download below proceeds regardless.
        try? FileManager.default.removeItem(at: target)
    } else if modelsExist(at: target) {
        logger.info("FSMN-VAD models already present at: \(target.path)")
        return target
    }
    logger.info("Downloading FSMN-VAD models from HuggingFace...")
    try await DownloadUtils.downloadRepo(.fsmnVad, to: modelsRoot, progressHandler: progressHandler)
    return target
}

/// Reports whether both compiled model files are present in `directory`.
///
/// Only the `.mlmodelc` names from `ModelNames.FsmnVad` are checked.
///
/// - Parameter directory: Directory expected to contain the models.
/// - Returns: `true` only if every checked file exists.
public static func modelsExist(at directory: URL) -> Bool {
    let expectedFiles = [ModelNames.FsmnVad.preprocessorFile, ModelNames.FsmnVad.scorerFile]
    for fileName in expectedFiles {
        let path = directory.appendingPathComponent(fileName).path
        if !FileManager.default.fileExists(atPath: path) {
            return false
        }
    }
    return true
}

/// Loads both models from `directory`.
///
/// The preprocessor is configured with `.cpuOnly` compute units and the scorer with
/// `.cpuAndNeuralEngine`.
///
/// - Parameter directory: Directory containing the model files.
/// - Returns: The loaded model pair.
/// - Throws: Any error raised by `loadModel`.
public static func load(from directory: URL) throws -> FsmnVadModels {
    // Builds a fresh configuration restricted to the given compute units.
    func configuration(for units: MLComputeUnits) -> MLModelConfiguration {
        let config = MLModelConfiguration()
        config.computeUnits = units
        return config
    }

    let preprocessor = try loadModel(
        named: ModelNames.FsmnVad.preprocessor,
        from: directory,
        configuration: configuration(for: .cpuOnly))
    let scorer = try loadModel(
        named: ModelNames.FsmnVad.scorer,
        from: directory,
        configuration: configuration(for: .cpuAndNeuralEngine))
    logger.info("Loaded FSMN-VAD models")
    return FsmnVadModels(preprocessor: preprocessor, scorer: scorer)
}

/// Loads one model by base name, preferring a compiled `.mlmodelc` and falling back to
/// compiling an `.mlpackage`.
///
/// `MLModel.compileModel(at:)` writes its result to a temporary location. After compiling,
/// the result is therefore moved next to the package as `<name>.mlmodelc` so later loads
/// reuse it instead of recompiling every time. The move is best effort: if it fails, the
/// model is loaded from the temporary location, exactly as before.
///
/// - Parameters:
///   - name: Base file name of the model, without extension.
///   - directory: Directory searched for `<name>.mlmodelc` and `<name>.mlpackage`.
///   - configuration: Configuration passed to `MLModel(contentsOf:configuration:)`.
/// - Returns: The loaded model.
/// - Throws: `ASRError.processingFailed` when neither file exists; any CoreML error from
///   compiling or loading is propagated unchanged.
private static func loadModel(
    named name: String, from directory: URL, configuration: MLModelConfiguration
) throws -> MLModel {
    let fm = FileManager.default
    let compiled = directory.appendingPathComponent("\(name).mlmodelc")
    let pkg = directory.appendingPathComponent("\(name).mlpackage")
    let url: URL
    if fm.fileExists(atPath: compiled.path) {
        url = compiled
    } else if fm.fileExists(atPath: pkg.path) {
        let temporary = try MLModel.compileModel(at: pkg)
        do {
            // Persist the compiled bundle; `compiled` is known not to exist here.
            try fm.moveItem(at: temporary, to: compiled)
            url = compiled
        } catch {
            // E.g. a read-only models directory: keep using the temporary copy.
            url = temporary
        }
    } else {
        throw ASRError.processingFailed("FSMN-VAD model not found: \(name)")
    }
    return try MLModel(contentsOf: url, configuration: configuration)
}

/// Returns the `FluidAudio/Models` directory URL used as the download root.
///
/// The base is the first user-domain Application Support directory, or the temporary
/// directory when none is reported. Nothing is created on disk here.
private static func modelsRootDirectory() -> URL {
    let fileManager = FileManager.default
    let base =
        fileManager.urls(for: .applicationSupportDirectory, in: .userDomainMask).first
        ?? fileManager.temporaryDirectory
    return
        base
        .appendingPathComponent("FluidAudio", isDirectory: true)
        .appendingPathComponent("Models", isDirectory: true)
}
}
37 changes: 37 additions & 0 deletions Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#if os(macOS)
import AVFoundation
import FluidAudio
import Foundation

/// `fsmn-vad-segment <audio>` — print detected speech segments [start_ms, end_ms].
enum FsmnVadSegmentCommand {
    private static let logger = AppLogger(category: "FsmnVadSegment")

    /// Runs the command.
    ///
    /// The first argument that does not start with `-` is taken as the audio path.
    /// Usage and the segments go to standard output via `print`; progress and failures
    /// go through the logger. Errors are logged, not rethrown.
    ///
    /// - Parameter arguments: Command-line arguments following the command name.
    static func run(arguments: [String]) async {
        guard let audioPath = arguments.first(where: { !$0.hasPrefix("-") }) else {
            print("Usage: fluidaudio fsmn-vad-segment <audio-file>")
            return
        }

        let audioURL = URL(fileURLWithPath: audioPath)
        guard FileManager.default.fileExists(atPath: audioURL.path) else {
            logger.error("Error: Audio file not found: \(audioPath)")
            return
        }

        do {
            logger.info("Loading FSMN-VAD models...")
            let vad = try await FsmnVadManager.load()

            // Time detection only; model loading is excluded.
            let startedAt = Date()
            let segments = try await vad.detect(audioURL: audioURL)
            let elapsed = String(format: "%.2f", Date().timeIntervalSince(startedAt))
            logger.info("Detected \(segments.count) speech segment(s) in \(elapsed)s")

            for segment in segments {
                let seconds = Double(segment.endMs - segment.startMs) / 1000.0
                let duration = String(format: "%.2f", seconds)
                print("[\(segment.startMs), \(segment.endMs)] (\(duration) s)")
            }
        } catch {
            logger.error("VAD failed: \(error)")
        }
    }
}
#endif
Loading
Loading