From 838592e85c23330e72e472f94774b78e572a9abe Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 31 May 2026 21:24:39 -0400 Subject: [PATCH 1/7] feat(vad): FSMN-VAD backend + CLI CoreML FSMN-VAD from FluidInference/fsmn-vad-coreml: 2-stage (fbank80+LFR preprocessor fp32/CPU -> FSMN scorer fp16/ANE enumerated [512..3072] -> [1,T,248] scores) + a host decision (port of FunASR FsmnVADStreaming: speech if silence_prob<=0.2, 20-frame window hysteresis at 15, max_end_silence 800ms, lookback/lookahead, max_single_segment 60s) -> [start_ms,end_ms]. Long audio chunked at ~30s; silence probs concatenated, decision once. - ModelNames: fsmnVad Repo + FsmnVad registry - VAD/Fsmn/: FsmnVadModels, FsmnVadManager (+ FsmnVadSegment) - CLI: fsmn-vad-segment Verified vs FunASR on 20s clip: [120,19960] vs [70,19980] (~50ms). Alternative to silero-vad. --- Sources/FluidAudio/ModelNames.swift | 23 +++ .../FluidAudio/VAD/Fsmn/FsmnVadManager.swift | 157 ++++++++++++++++++ .../FluidAudio/VAD/Fsmn/FsmnVadModels.swift | 88 ++++++++++ .../Commands/VAD/FsmnVadSegmentCommand.swift | 37 +++++ Sources/FluidAudioCLI/FluidAudioCLI.swift | 2 + 5 files changed, 307 insertions(+) create mode 100644 Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift create mode 100644 Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift create mode 100644 Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 9dfeeb181..0394380b5 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -12,6 +12,8 @@ public enum Repo: String, CaseIterable, Sendable { /// 3-stage: fp32 CPU preprocessor (waveform→560-d LFR feats) + fp16 ANE /// encoder+CTC (+ fp32 fallback) + host greedy-CTC decode. See ASR/SenseVoice. case senseVoiceSmall = "FluidInference/sensevoice-small-coreml" + /// FSMN-VAD voice activity detection (FunASR). See VAD/Fsmn. + case fsmnVad = "FluidInference/fsmn-vad-coreml" // Japanese hybrid TDT: INT8 CTC-trained preprocessor+encoder paired with a // TDT decoder+joint. CTC-only inference for Japanese was removed in // 846924a1d; only the preprocessor+encoder files from this repo are reused. @@ -73,6 +75,8 @@ public enum Repo: String, CaseIterable, Sendable { return "parakeet-ctc-0.6b-zh-cn-coreml" case .senseVoiceSmall: return "sensevoice-small-coreml" + case .fsmnVad: + return "fsmn-vad-coreml" case .parakeetJa: return "parakeet-0.6b-ja-coreml" case .parakeetEou160: @@ -437,6 +441,23 @@ public enum ModelNames { ] } + /// FSMN-VAD model names (2 CoreML stages + host decision). + /// Preprocessor (fp32/CPU): waveform -> 400-d features (fbank80 + LFR m=5,n=1) + /// FsmnVad (fp16/ANE): features -> [1,T,248] frame scores (col 0 = silence prob) + /// Plus `vad_config.json` (auto-fetched as a root file). + public enum FsmnVad { + public static let preprocessor = "FsmnVadPreprocessor" + public static let scorer = "FsmnVad" + + public static let preprocessorFile = preprocessor + ".mlmodelc" + public static let scorerFile = scorer + ".mlmodelc" + + public static let requiredModels: Set = [ + preprocessorFile, + scorerFile, + ] + } + /// TDT ja (Japanese) model names. /// /// Hybrid layout: the CTC-trained preprocessor + encoder from the @@ -1061,6 +1082,8 @@ public enum ModelNames { return ModelNames.CTCZhCn.requiredModels case .senseVoiceSmall: return ModelNames.SenseVoice.requiredModels + case .fsmnVad: + return ModelNames.FsmnVad.requiredModels case .parakeetJa: return ModelNames.TDTJa.requiredModels case .parakeetEou160, .parakeetEou320, .parakeetEou1280: diff --git a/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift b/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift new file mode 100644 index 000000000..83a76b22c --- /dev/null +++ b/Sources/FluidAudio/VAD/Fsmn/FsmnVadManager.swift @@ -0,0 +1,157 @@ +@preconcurrency import CoreML +import Foundation + +/// A detected speech segment, in milliseconds. +public struct FsmnVadSegment: Sendable, Equatable { + public let startMs: Int + public let endMs: Int +} + +/// FSMN-VAD voice activity detection: audio -> speech segments. +/// +/// Pipeline: waveform -> [Preprocessor fp32/CPU] -> 400-d features +/// -> [FSMN fp16/ANE, enumerated buckets] -> per-frame scores (col 0 = silence prob) +/// -> host decision (window-detector hysteresis + silence->endpoint) -> [start_ms, end_ms]. +/// +/// Audio longer than the largest bucket is processed in ~30 s chunks; the per-frame +/// silence probabilities are concatenated and the decision runs once over all frames. +public actor FsmnVadManager { + + // Enumerated scorer buckets (post-LFR frames; matches the converted model). + private static let buckets = [512, 1024, 2048, 3072] + private static let featureDim = 400 + private static let waveformScale: Float = 32_768.0 + + // Decision params (derived from FunASR vad_opts; 10 ms frames). + private static let silenceThreshold: Float = 0.2 // GetFrameState: speech if silence_prob <= 0.2 + private static let windowFrames = 20 // window_size_ms 200 / 10 + private static let silToSpeech = 15 // sil_to_speech_time 150 / 10 + private static let speechToSil = 15 // speech_to_sil_time 150 / 10 + private static let maxEndSilenceFrames = 80 // max_end_silence_time 800 / 10 + private static let lookbackFrames = 20 // lookback_time_start_point 200 / 10 + private static let lookaheadFrames = 10 // lookahead_time_end_point 100 / 10 + private static let maxSegmentFrames = 6000 // max_single_segment_time 60000 / 10 + private static let frameMs = 10 + + private let models: FsmnVadModels + private static let logger = AppLogger(category: "FsmnVadManager") + + public init(models: FsmnVadModels) { + self.models = models + } + + public static func load(progressHandler: DownloadUtils.ProgressHandler? = nil) async throws -> FsmnVadManager { + FsmnVadManager(models: try await FsmnVadModels.downloadAndLoad(progressHandler: progressHandler)) + } + + public func detect(audioURL: URL) throws -> [FsmnVadSegment] { + let converter = AudioConverter(sampleRate: 16_000) + return try detect(audio: try converter.resampleAudioFile(audioURL)) + } + + public func detect(audio: [Float]) throws -> [FsmnVadSegment] { + let silence = try silenceProbabilities(audio: audio) + return decide(silence: silence) + } + + // MARK: - Scoring (chunked) + + /// Per-frame silence probability over the whole audio (concatenated across chunks). + private func silenceProbabilities(audio: [Float]) throws -> [Float] { + // ~30 s chunks (largest bucket); samples ≈ frames * 160. + let chunkSamples = (Self.buckets.last! - Self.windowFrames) * 160 + var sil: [Float] = [] + var offset = 0 + while offset < audio.count { + let end = min(offset + chunkSamples, audio.count) + let chunk = Array(audio[offset.. [Float] { + let n = audio.count + let wav = try MLMultiArray(shape: [1, n as NSNumber], dataType: .float32) + let wp = wav.dataPointer.assumingMemoryBound(to: Float32.self) + for i in 0..= t }) ?? Self.buckets.last! + let speech = try MLMultiArray(shape: [1, bucket as NSNumber, Self.featureDim as NSNumber], dataType: .float32) + let sp = speech.dataPointer.assumingMemoryBound(to: Float32.self) + memset(sp, 0, bucket * Self.featureDim * MemoryLayout.size) + let count = t * Self.featureDim + if f.dataType == .float32 { + memcpy(sp, f.dataPointer, count * MemoryLayout.size) + } else { + for i in 0.. [FsmnVadSegment] { + let T = silence.count + var win = [Int](repeating: 0, count: Self.windowFrames) + var pos = 0 + var winSum = 0 + var preSpeech = false + var inSeg = false + var segStart = 0 + var contSil = 0 + var segs: [FsmnVadSegment] = [] + + func close(at frame: Int) { + segs.append(FsmnVadSegment(startMs: segStart * Self.frameMs, endMs: frame * Self.frameMs)) + inSeg = false + } + + for t in 0..= Self.silToSpeech { + preSpeech = true + if !inSeg { + inSeg = true + segStart = max(0, t - Self.silToSpeech - Self.lookbackFrames) + contSil = 0 + } + } else if preSpeech && winSum <= Self.speechToSil { + preSpeech = false + } + if inSeg && !preSpeech { contSil += 1 } else { contSil = 0 } + if inSeg && contSil >= Self.maxEndSilenceFrames { + close(at: t - Self.maxEndSilenceFrames + Self.lookaheadFrames) + } else if inSeg && (t - segStart) >= Self.maxSegmentFrames { + close(at: t) + preSpeech = false + } + } + if inSeg { close(at: T) } + return segs + } +} diff --git a/Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift b/Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift new file mode 100644 index 000000000..77470571e --- /dev/null +++ b/Sources/FluidAudio/VAD/Fsmn/FsmnVadModels.swift @@ -0,0 +1,88 @@ +@preconcurrency import CoreML +import Foundation + +/// Loaded FSMN-VAD CoreML models. +/// +/// 2 stages from `FluidInference/fsmn-vad-coreml`: +/// - `preprocessor` (fp32, CPU): waveform -> [1, T, 400] features (fbank80 + LFR m=5,n=1) +/// - `scorer` (fp16, ANE): features -> [1, T, 248] frame scores (col 0 = silence prob) +public struct FsmnVadModels: Sendable { + + public let preprocessor: MLModel + public let scorer: MLModel + + private static let logger = AppLogger(category: "FsmnVadModels") + + public init(preprocessor: MLModel, scorer: MLModel) { + self.preprocessor = preprocessor + self.scorer = scorer + } + + public static func downloadAndLoad( + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> FsmnVadModels { + try load(from: try await download(progressHandler: progressHandler)) + } + + public static func download( + force: Bool = false, progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> URL { + let root = modelsRootDirectory() + let dir = root.appendingPathComponent(Repo.fsmnVad.folderName, isDirectory: true) + if !force && modelsExist(at: dir) { + logger.info("FSMN-VAD models already present at: \(dir.path)") + return dir + } + if force { try? FileManager.default.removeItem(at: dir) } + logger.info("Downloading FSMN-VAD models from HuggingFace...") + try await DownloadUtils.downloadRepo(.fsmnVad, to: root, progressHandler: progressHandler) + return dir + } + + public static func modelsExist(at directory: URL) -> Bool { + let fm = FileManager.default + return [ModelNames.FsmnVad.preprocessorFile, ModelNames.FsmnVad.scorerFile].allSatisfy { + fm.fileExists(atPath: directory.appendingPathComponent($0).path) + } + } + + public static func load(from directory: URL) throws -> FsmnVadModels { + let cpu = MLModelConfiguration() + cpu.computeUnits = .cpuOnly + let ane = MLModelConfiguration() + ane.computeUnits = .cpuAndNeuralEngine + let pre = try loadModel(named: ModelNames.FsmnVad.preprocessor, from: directory, configuration: cpu) + let scorer = try loadModel(named: ModelNames.FsmnVad.scorer, from: directory, configuration: ane) + logger.info("Loaded FSMN-VAD models") + return FsmnVadModels(preprocessor: pre, scorer: scorer) + } + + private static func loadModel( + named name: String, from directory: URL, configuration: MLModelConfiguration + ) throws -> MLModel { + let compiled = directory.appendingPathComponent("\(name).mlmodelc") + let pkg = directory.appendingPathComponent("\(name).mlpackage") + let url: URL + if FileManager.default.fileExists(atPath: compiled.path) { + url = compiled + } else if FileManager.default.fileExists(atPath: pkg.path) { + url = try MLModel.compileModel(at: pkg) + } else { + throw ASRError.processingFailed("FSMN-VAD model not found: \(name)") + } + return try MLModel(contentsOf: url, configuration: configuration) + } + + private static func modelsRootDirectory() -> URL { + let fm = FileManager.default + if let appSupport = fm.urls(for: .applicationSupportDirectory, in: .userDomainMask).first { + return + appSupport + .appendingPathComponent("FluidAudio", isDirectory: true) + .appendingPathComponent("Models", isDirectory: true) + } + return fm.temporaryDirectory + .appendingPathComponent("FluidAudio", isDirectory: true) + .appendingPathComponent("Models", isDirectory: true) + } +} diff --git a/Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift b/Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift new file mode 100644 index 000000000..5449d165d --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/VAD/FsmnVadSegmentCommand.swift @@ -0,0 +1,37 @@ +#if os(macOS) +import AVFoundation +import FluidAudio +import Foundation + +/// `fsmn-vad-segment