From 70fec89a2852f2b468b35bceb528640204425971 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 31 May 2026 21:11:43 -0400 Subject: [PATCH 1/2] feat(speaker): CAM++ speaker-embedding backend + CLI CoreML CAM++ from FluidInference/campplus-coreml: 2-stage (fbank80 preprocessor fp32/CPU -> CAM++ RangeDim -> 192-d L2-normalized embedding). Dynamic time dim (RangeDim, CPU/GPU; tiny model) avoids padding corrupting the stats-pooled embedding. - ModelNames: campPlus Repo + CampPlus registry - Speaker/: CampPlusModels, CampPlusEmbedder (audio -> embedding, cosine) - CLI: campplus-embed (one file -> embedding; two -> speaker-verification cosine) Verified: same-speaker cosine 0.74 vs different 0.35 on the example pairs. --- Sources/FluidAudio/ModelNames.swift | 23 +++++ .../FluidAudio/Speaker/CampPlusEmbedder.swift | 62 +++++++++++++ .../FluidAudio/Speaker/CampPlusModels.swift | 91 +++++++++++++++++++ .../Speaker/CampPlusEmbedCommand.swift | 33 +++++++ Sources/FluidAudioCLI/FluidAudioCLI.swift | 2 + 5 files changed, 211 insertions(+) create mode 100644 Sources/FluidAudio/Speaker/CampPlusEmbedder.swift create mode 100644 Sources/FluidAudio/Speaker/CampPlusModels.swift create mode 100644 Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 9dfeeb181..62fd369f3 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -12,6 +12,9 @@ public enum Repo: String, CaseIterable, Sendable { /// 3-stage: fp32 CPU preprocessor (waveform→560-d LFR feats) + fp16 ANE /// encoder+CTC (+ fp32 fallback) + host greedy-CTC decode. See ASR/SenseVoice. case senseVoiceSmall = "FluidInference/sensevoice-small-coreml" + /// CAM++ speaker-embedding model (fbank80 -> 192-d) for speaker verification / + /// diarization clustering. See Speaker/CampPlusEmbedder. 
+ case campPlus = "FluidInference/campplus-coreml" // Japanese hybrid TDT: INT8 CTC-trained preprocessor+encoder paired with a // TDT decoder+joint. CTC-only inference for Japanese was removed in // 846924a1d; only the preprocessor+encoder files from this repo are reused. @@ -73,6 +76,8 @@ public enum Repo: String, CaseIterable, Sendable { return "parakeet-ctc-0.6b-zh-cn-coreml" case .senseVoiceSmall: return "sensevoice-small-coreml" + case .campPlus: + return "campplus-coreml" case .parakeetJa: return "parakeet-0.6b-ja-coreml" case .parakeetEou160: @@ -437,6 +442,22 @@ public enum ModelNames { ] } + /// CAM++ speaker-embedding model names (2 CoreML stages). + /// Preprocessor (fp32/CPU): waveform -> 80-d fbank + /// CamPlusPlus (fp16, CPU/GPU; loaded with `.cpuAndGPU`, not ANE): fbank -> 192-d speaker embedding + public enum CampPlus { + public static let preprocessor = "CamPlusPreprocessor" + public static let model = "CamPlusPlus" + + public static let preprocessorFile = preprocessor + ".mlmodelc" + public static let modelFile = model + ".mlmodelc" + + public static let requiredModels: Set = [ + preprocessorFile, + modelFile, + ] + } + /// TDT ja (Japanese) model names. /// /// Hybrid layout: the CTC-trained preprocessor + encoder from the @@ -1061,6 +1082,8 @@ public enum ModelNames { return ModelNames.CTCZhCn.requiredModels case .senseVoiceSmall: return ModelNames.SenseVoice.requiredModels + case .campPlus: + return ModelNames.CampPlus.requiredModels case .parakeetJa: return ModelNames.TDTJa.requiredModels case .parakeetEou160, .parakeetEou320, .parakeetEou1280: diff --git a/Sources/FluidAudio/Speaker/CampPlusEmbedder.swift b/Sources/FluidAudio/Speaker/CampPlusEmbedder.swift new file mode 100644 index 000000000..3f526d0d7 --- /dev/null +++ b/Sources/FluidAudio/Speaker/CampPlusEmbedder.swift @@ -0,0 +1,62 @@ +@preconcurrency import CoreML +import Foundation + +/// CAM++ speaker-embedding extractor: audio -> 192-d L2-normalized embedding. 
+/// +/// Use cosine similarity between embeddings for speaker verification / diarization +/// clustering. Pipeline: waveform -> [Preprocessor fp32/CPU] -> fbank [1,T,80] +/// -> [CAM++ fp16, CPU/GPU] -> [1,192] -> L2 normalize. +public actor CampPlusEmbedder { + + public static let embeddingDim = 192 + private static let waveformScale: Float = 32_768.0 // kaldi int16 range + + private let models: CampPlusModels + private static let logger = AppLogger(category: "CampPlusEmbedder") + + public init(models: CampPlusModels) { + self.models = models + } + + public static func load(progressHandler: DownloadUtils.ProgressHandler? = nil) async throws -> CampPlusEmbedder { + CampPlusEmbedder(models: try await CampPlusModels.downloadAndLoad(progressHandler: progressHandler)) + } + + /// Audio file (passed through `AudioConverter(sampleRate: 16_000)` first) -> 192-d L2-normalized embedding. + public func embed(audioURL: URL) throws -> [Float] { + let converter = AudioConverter(sampleRate: 16_000) + return try embed(audio: try converter.resampleAudioFile(audioURL)) + } + + /// 16 kHz mono samples ([-1, 1]) -> 192-d L2-normalized embedding. + public func embed(audio: [Float]) throws -> [Float] { + let n = audio.count + let wav = try MLMultiArray(shape: [1, n as NSNumber], dataType: .float32) + let p = wav.dataPointer.assumingMemoryBound(to: Float32.self) + for i in 0.. Float { + zip(a, b).reduce(0) { $0 + $1.0 * $1.1 } + } +} diff --git a/Sources/FluidAudio/Speaker/CampPlusModels.swift b/Sources/FluidAudio/Speaker/CampPlusModels.swift new file mode 100644 index 000000000..a3e462ded --- /dev/null +++ b/Sources/FluidAudio/Speaker/CampPlusModels.swift @@ -0,0 +1,91 @@ +@preconcurrency import CoreML +import Foundation + +/// Loaded CAM++ CoreML models (speaker embedding). 
+/// +/// 2 stages from `FluidInference/campplus-coreml`: +/// - `preprocessor` (fp32, CPU): waveform -> [1, T, 80] fbank +/// - `model` (fp16, CPU/GPU; see `load(from:)`): fbank -> [1, 192] speaker embedding +public struct CampPlusModels: Sendable { + + public let preprocessor: MLModel + public let model: MLModel + + private static let logger = AppLogger(category: "CampPlusModels") + + public init(preprocessor: MLModel, model: MLModel) { + self.preprocessor = preprocessor + self.model = model + } + + public static func downloadAndLoad( + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> CampPlusModels { + try load(from: try await download(progressHandler: progressHandler)) + } + + public static func download( + force: Bool = false, progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> URL { + let modelsRoot = modelsRootDirectory() + let targetDir = modelsRoot.appendingPathComponent(Repo.campPlus.folderName, isDirectory: true) + if !force && modelsExist(at: targetDir) { + logger.info("CAM++ models already present at: \(targetDir.path)") + return targetDir + } + if force { try? FileManager.default.removeItem(at: targetDir) } + logger.info("Downloading CAM++ models from HuggingFace...") + try await DownloadUtils.downloadRepo(.campPlus, to: modelsRoot, progressHandler: progressHandler) + return targetDir + } + + public static func modelsExist(at directory: URL) -> Bool { + let fm = FileManager.default + return [ModelNames.CampPlus.preprocessorFile, ModelNames.CampPlus.modelFile].allSatisfy { + fm.fileExists(atPath: directory.appendingPathComponent($0).path) + } + } + + public static func load(from directory: URL) throws -> CampPlusModels { + let cpu = MLModelConfiguration() + cpu.computeUnits = .cpuOnly + // CAM++ uses a dynamic time dim (RangeDim) which the ANE compiler rejects; + // it's tiny (~7.2M), so run on CPU/GPU. Dynamic length avoids padding + // corrupting the statistics-pooled embedding. 
+ let gpu = MLModelConfiguration() + gpu.computeUnits = .cpuAndGPU + let pre = try loadModel(named: ModelNames.CampPlus.preprocessor, from: directory, configuration: cpu) + let model = try loadModel(named: ModelNames.CampPlus.model, from: directory, configuration: gpu) + logger.info("Loaded CAM++ speaker-embedding models") + return CampPlusModels(preprocessor: pre, model: model) + } + + private static func loadModel( + named name: String, from directory: URL, configuration: MLModelConfiguration + ) throws -> MLModel { + let compiled = directory.appendingPathComponent("\(name).mlmodelc") + let pkg = directory.appendingPathComponent("\(name).mlpackage") + let url: URL + if FileManager.default.fileExists(atPath: compiled.path) { + url = compiled + } else if FileManager.default.fileExists(atPath: pkg.path) { + url = try MLModel.compileModel(at: pkg) + } else { + throw ASRError.processingFailed("CAM++ model not found: \(name)") + } + return try MLModel(contentsOf: url, configuration: configuration) + } + + private static func modelsRootDirectory() -> URL { + let fm = FileManager.default + if let appSupport = fm.urls(for: .applicationSupportDirectory, in: .userDomainMask).first { + return + appSupport + .appendingPathComponent("FluidAudio", isDirectory: true) + .appendingPathComponent("Models", isDirectory: true) + } + return fm.temporaryDirectory + .appendingPathComponent("FluidAudio", isDirectory: true) + .appendingPathComponent("Models", isDirectory: true) + } +} diff --git a/Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift b/Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift new file mode 100644 index 000000000..db7eb2751 --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift @@ -0,0 +1,33 @@ +#if os(macOS) +import AVFoundation +import FluidAudio +import Foundation + +/// `campplus-embed <a.wav> [b.wav]` +/// One file → prints the embedding dimension and its first 5 values; two files → cosine similarity +/// (speaker 
verification). +enum CampPlusEmbedCommand { + private static let logger = AppLogger(category: "CampPlusEmbed") + + static func run(arguments: [String]) async { + let paths = arguments.filter { !$0.hasPrefix("-") } + guard let a = paths.first else { + print("Usage: fluidaudio campplus-embed [b.wav]") + return + } + do { + let embedder = try await CampPlusEmbedder.load() + let ea = try await embedder.embed(audioURL: URL(fileURLWithPath: a)) + if paths.count >= 2 { + let eb = try await embedder.embed(audioURL: URL(fileURLWithPath: paths[1])) + let cos = CampPlusEmbedder.cosine(ea, eb) + print(String(format: "cosine = %.4f (%@)", cos, cos >= 0.5 ? "same speaker" : "different")) + } else { + print("embedding: dim=\(ea.count), first 5 = \(ea.prefix(5).map { String(format: "%.3f", $0) })") + } + } catch { + logger.error("CAM++ embed failed: \(error)") + } + } +} +#endif diff --git a/Sources/FluidAudioCLI/FluidAudioCLI.swift b/Sources/FluidAudioCLI/FluidAudioCLI.swift index f0c9a97b3..df747d84e 100644 --- a/Sources/FluidAudioCLI/FluidAudioCLI.swift +++ b/Sources/FluidAudioCLI/FluidAudioCLI.swift @@ -86,6 +86,8 @@ struct FluidAudioCLI { await CtcZhCnTranscribeCommand.run(arguments: Array(arguments.dropFirst(2))) case "sensevoice-transcribe": await SenseVoiceTranscribeCommand.run(arguments: Array(arguments.dropFirst(2))) + case "campplus-embed": + await CampPlusEmbedCommand.run(arguments: Array(arguments.dropFirst(2))) case "sensevoice-benchmark": await SenseVoiceBenchmark.run(arguments: Array(arguments.dropFirst(2))) case "ctc-zh-cn-benchmark": From 00009b62c21d6a58dcd0289ecd05781f588284b5 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 31 May 2026 21:32:55 -0400 Subject: [PATCH 2/2] docs(speaker): CAM++ benchmark in Benchmarks.md (AISHELL EER 0.48%) --- Documentation/Benchmarks.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md index 8ad282b62..d282c0518 100644 --- a/Documentation/Benchmarks.md 
+++ b/Documentation/Benchmarks.md @@ -507,6 +507,25 @@ swift run -c release fluidaudiocli nemotron-benchmark --chunk 560 Both offline and online versions use the community-1 model (via FluidInference/speaker-diarization-coreml). +### CAM++ speaker embedding (`campplus-embed`) + +CoreML CAM++ (FunASR, ~7.2M parameters) speaker-embedding extractor. Model: [FluidInference/campplus-coreml](https://huggingface.co/FluidInference/campplus-coreml). 2-stage: fbank80 preprocessor (fp32/CPU) → CAM++ (RangeDim, CPU/GPU) → 192-d L2-normalized embedding. Hardware: Apple M5 Pro. + +| Metric | Value | +|--------|-------| +| **AISHELL-1 EER** | **0.48%** | +| Same-speaker cosine (mean) | 0.805 | +| Different-speaker cosine (mean) | 0.256 | +| Trial set | 20 speakers, 6000 same / 6000 different pairs | + +**Notes:** +- EER is measured on AISHELL-1 (clean, read Mandarin), which is easier than the official CN-Celeb benchmark (~6–7%); the result validates that the CoreML embedding discriminates speakers (CoreML↔torch embedding cosine 0.9997–0.99999). +- The speaker ID is parsed from the AISHELL `name` field (`BAC009S0764W...` → `S0764`). + +```bash +swift run -c release fluidaudiocli campplus-embed a.wav b.wav # cosine similarity +``` + ### Offline diarization pipeline For slightly ~1.2% worse DER we default to a higher step ratio segmentation duration than the baseline community-1 pipeline. This allows us to get nearly ~2x the speed (as expected because we're processing 1/2 of the embeddings). For highly critical use cases, one may should use step ratio = 0.1 and minSegmentDurationSeconds = 0.0