diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md index 77c6abe04..d980135ea 100644 --- a/Documentation/Benchmarks.md +++ b/Documentation/Benchmarks.md @@ -534,6 +534,25 @@ swift run -c release fluidaudiocli nemotron-benchmark --chunk 560 Both offline and online versions use the community-1 model (via FluidInference/speaker-diarization-coreml). +### CAM++ speaker embedding (`campplus-embed`) + +CoreML CAM++ (FunASR, ~7.2M) speaker-embedding extractor. Model: [FluidInference/campplus-coreml](https://huggingface.co/FluidInference/campplus-coreml). 2-stage: fbank80 preprocessor (fp32/CPU) → CAM++ (RangeDim, CPU/GPU) → 192-d L2-normalized embedding. Hardware: Apple M5 Pro. + +| Metric | Value | +|--------|-------| +| **AISHELL-1 EER** | **0.48%** | +| Same-speaker cosine (mean) | 0.805 | +| Different-speaker cosine (mean) | 0.256 | +| Trial set | 20 speakers, 6000 same / 6000 different pairs | + +**Notes:** +- EER is measured on AISHELL-1 (clean, read Mandarin), which is easier than the official CN-Celeb benchmark (~6–7%); the result validates that the CoreML embedding discriminates speakers (CoreML↔torch embedding cosine 0.9997–0.99999). +- The speaker ID is parsed from the AISHELL `name` field (`BAC009S0764W...` → `S0764`). + +```bash +swift run -c release fluidaudiocli campplus-embed a.wav b.wav # cosine similarity +``` + ### Offline diarization pipeline For slightly ~1.2% worse DER we default to a higher step ratio segmentation duration than the baseline community-1 pipeline. This allows us to get nearly ~2x the speed (as expected because we're processing 1/2 of the embeddings). 
For highly critical use cases, one may should use step ratio = 0.1 and minSegmentDurationSeconds = 0.0 diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index ca0014a61..a6f26dfdd 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -12,6 +12,9 @@ public enum Repo: String, CaseIterable, Sendable { /// 3-stage: fp32 CPU preprocessor (waveform→560-d LFR feats) + fp16 ANE /// encoder+CTC (+ fp32 fallback) + host greedy-CTC decode. See ASR/SenseVoice. case senseVoiceSmall = "FluidInference/sensevoice-small-coreml" + /// CAM++ speaker-embedding model (fbank80 -> 192-d) for speaker verification / + /// diarization clustering. See Speaker/CampPlusEmbedder. + case campPlus = "FluidInference/campplus-coreml" /// Paraformer-large (zh) — non-autoregressive ASR: SANM encoder + CIF /// predictor (host-side integrate-and-fire) + parallel decoder. See ASR/Paraformer. case paraformerLargeZh = "FluidInference/paraformer-large-zh-coreml" @@ -76,6 +79,8 @@ public enum Repo: String, CaseIterable, Sendable { return "parakeet-ctc-0.6b-zh-cn-coreml" case .senseVoiceSmall: return "sensevoice-small-coreml" + case .campPlus: + return "campplus-coreml" case .paraformerLargeZh: return "paraformer-large-zh-coreml" case .parakeetJa: @@ -442,6 +447,22 @@ public enum ModelNames { ] } + /// CAM++ speaker-embedding model names (2 CoreML stages). + /// Preprocessor (fp32/CPU): waveform -> 80-d fbank + /// CamPlusPlus (fp16, CPU/GPU — loaded with `.cpuAndGPU`, not ANE): fbank -> 192-d speaker embedding + public enum CampPlus { + public static let preprocessor = "CamPlusPreprocessor" + public static let model = "CamPlusPlus" + + public static let preprocessorFile = preprocessor + ".mlmodelc" + public static let modelFile = model + ".mlmodelc" + + public static let requiredModels: Set = [ + preprocessorFile, + modelFile, + ] + } + /// Paraformer-large (zh) model names. 
4 CoreML stages + host CIF: /// Preprocessor (fp32/CPU): waveform -> 560-d LFR features /// Encoder (fp16/ANE): SANM encoder (enumerated buckets) @@ -1098,6 +1119,8 @@ public enum ModelNames { return ModelNames.CTCZhCn.requiredModels case .senseVoiceSmall: return ModelNames.SenseVoice.requiredModels + case .campPlus: + return ModelNames.CampPlus.requiredModels case .paraformerLargeZh: return ModelNames.ParaformerZh.requiredModels case .parakeetJa: diff --git a/Sources/FluidAudio/Speaker/CampPlusEmbedder.swift b/Sources/FluidAudio/Speaker/CampPlusEmbedder.swift new file mode 100644 index 000000000..3f526d0d7 --- /dev/null +++ b/Sources/FluidAudio/Speaker/CampPlusEmbedder.swift @@ -0,0 +1,62 @@ +@preconcurrency import CoreML +import Foundation + +/// CAM++ speaker-embedding extractor: audio -> 192-d L2-normalized embedding. +/// +/// Use cosine similarity between embeddings for speaker verification / diarization +/// clustering. Pipeline: waveform -> [Preprocessor fp32/CPU] -> fbank [1,T,80] +/// -> [CAM++ fp16, CPU/GPU] -> [1,192] -> L2 normalize. +public actor CampPlusEmbedder { + + public static let embeddingDim = 192 + private static let waveformScale: Float = 32_768.0 // kaldi int16 range + + private let models: CampPlusModels + private static let logger = AppLogger(category: "CampPlusEmbedder") + + public init(models: CampPlusModels) { + self.models = models + } + + public static func load(progressHandler: DownloadUtils.ProgressHandler? = nil) async throws -> CampPlusEmbedder { + CampPlusEmbedder(models: try await CampPlusModels.downloadAndLoad(progressHandler: progressHandler)) + } + + /// Audio file (loaded through `AudioConverter(sampleRate: 16_000)`) -> 192-d L2-normalized embedding. + public func embed(audioURL: URL) throws -> [Float] { + let converter = AudioConverter(sampleRate: 16_000) + return try embed(audio: try converter.resampleAudioFile(audioURL)) + } + + /// 16 kHz mono samples ([-1, 1]) -> 192-d L2-normalized embedding. 
+ public func embed(audio: [Float]) throws -> [Float] { + let n = audio.count + let wav = try MLMultiArray(shape: [1, n as NSNumber], dataType: .float32) + let p = wav.dataPointer.assumingMemoryBound(to: Float32.self) + for i in 0.. Float { + zip(a, b).reduce(0) { $0 + $1.0 * $1.1 } + } +} diff --git a/Sources/FluidAudio/Speaker/CampPlusModels.swift b/Sources/FluidAudio/Speaker/CampPlusModels.swift new file mode 100644 index 000000000..a3e462ded --- /dev/null +++ b/Sources/FluidAudio/Speaker/CampPlusModels.swift @@ -0,0 +1,91 @@ +@preconcurrency import CoreML +import Foundation + +/// Loaded CAM++ CoreML models (speaker embedding). +/// +/// 2 stages from `FluidInference/campplus-coreml`: +/// - `preprocessor` (fp32, CPU): waveform -> [1, T, 80] fbank +/// - `model` (fp16, CPU/GPU; the ANE compiler rejects its dynamic time dim): fbank -> [1, 192] speaker embedding +public struct CampPlusModels: Sendable { + + public let preprocessor: MLModel + public let model: MLModel + + private static let logger = AppLogger(category: "CampPlusModels") + + public init(preprocessor: MLModel, model: MLModel) { + self.preprocessor = preprocessor + self.model = model + } + + public static func downloadAndLoad( + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> CampPlusModels { + try load(from: try await download(progressHandler: progressHandler)) + } + + public static func download( + force: Bool = false, progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> URL { + let modelsRoot = modelsRootDirectory() + let targetDir = modelsRoot.appendingPathComponent(Repo.campPlus.folderName, isDirectory: true) + if !force && modelsExist(at: targetDir) { + logger.info("CAM++ models already present at: \(targetDir.path)") + return targetDir + } + if force { try? 
FileManager.default.removeItem(at: targetDir) } + logger.info("Downloading CAM++ models from HuggingFace...") + try await DownloadUtils.downloadRepo(.campPlus, to: modelsRoot, progressHandler: progressHandler) + return targetDir + } + + public static func modelsExist(at directory: URL) -> Bool { + let fm = FileManager.default + return [ModelNames.CampPlus.preprocessorFile, ModelNames.CampPlus.modelFile].allSatisfy { + fm.fileExists(atPath: directory.appendingPathComponent($0).path) + } + } + + public static func load(from directory: URL) throws -> CampPlusModels { + let cpu = MLModelConfiguration() + cpu.computeUnits = .cpuOnly + // CAM++ uses a dynamic time dim (RangeDim) which the ANE compiler rejects; + // it's tiny (~7.2M), so run on CPU/GPU. Dynamic length avoids padding + // corrupting the statistics-pooled embedding. + let gpu = MLModelConfiguration() + gpu.computeUnits = .cpuAndGPU + let pre = try loadModel(named: ModelNames.CampPlus.preprocessor, from: directory, configuration: cpu) + let model = try loadModel(named: ModelNames.CampPlus.model, from: directory, configuration: gpu) + logger.info("Loaded CAM++ speaker-embedding models") + return CampPlusModels(preprocessor: pre, model: model) + } + + private static func loadModel( + named name: String, from directory: URL, configuration: MLModelConfiguration + ) throws -> MLModel { + let compiled = directory.appendingPathComponent("\(name).mlmodelc") + let pkg = directory.appendingPathComponent("\(name).mlpackage") + let url: URL + if FileManager.default.fileExists(atPath: compiled.path) { + url = compiled + } else if FileManager.default.fileExists(atPath: pkg.path) { + url = try MLModel.compileModel(at: pkg) + } else { + throw ASRError.processingFailed("CAM++ model not found: \(name)") + } + return try MLModel(contentsOf: url, configuration: configuration) + } + + private static func modelsRootDirectory() -> URL { + let fm = FileManager.default + if let appSupport = fm.urls(for: .applicationSupportDirectory, 
in: .userDomainMask).first { + return + appSupport + .appendingPathComponent("FluidAudio", isDirectory: true) + .appendingPathComponent("Models", isDirectory: true) + } + return fm.temporaryDirectory + .appendingPathComponent("FluidAudio", isDirectory: true) + .appendingPathComponent("Models", isDirectory: true) + } +} diff --git a/Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift b/Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift new file mode 100644 index 000000000..db7eb2751 --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift @@ -0,0 +1,33 @@ +#if os(macOS) +import AVFoundation +import FluidAudio +import Foundation + +/// `campplus-embed a.wav [b.wav]` +/// One file → prints the embedding dimension and its first 5 values; two files → cosine similarity +/// with a "same speaker" / "different" verdict at a 0.5 threshold (speaker verification). +enum CampPlusEmbedCommand { + private static let logger = AppLogger(category: "CampPlusEmbed") + + static func run(arguments: [String]) async { + let paths = arguments.filter { !$0.hasPrefix("-") } + guard let a = paths.first else { + print("Usage: fluidaudio campplus-embed [b.wav]") + return + } + do { + let embedder = try await CampPlusEmbedder.load() + let ea = try await embedder.embed(audioURL: URL(fileURLWithPath: a)) + if paths.count >= 2 { + let eb = try await embedder.embed(audioURL: URL(fileURLWithPath: paths[1])) + let cos = CampPlusEmbedder.cosine(ea, eb) + print(String(format: "cosine = %.4f (%@)", cos, cos >= 0.5 ? 
"same speaker" : "different")) + } else { + print("embedding: dim=\(ea.count), first 5 = \(ea.prefix(5).map { String(format: "%.3f", $0) })") + } + } catch { + logger.error("CAM++ embed failed: \(error)") + } + } +} +#endif diff --git a/Sources/FluidAudioCLI/FluidAudioCLI.swift b/Sources/FluidAudioCLI/FluidAudioCLI.swift index aaeb00856..963eb62b8 100644 --- a/Sources/FluidAudioCLI/FluidAudioCLI.swift +++ b/Sources/FluidAudioCLI/FluidAudioCLI.swift @@ -86,6 +86,8 @@ struct FluidAudioCLI { await CtcZhCnTranscribeCommand.run(arguments: Array(arguments.dropFirst(2))) case "sensevoice-transcribe": await SenseVoiceTranscribeCommand.run(arguments: Array(arguments.dropFirst(2))) + case "campplus-embed": + await CampPlusEmbedCommand.run(arguments: Array(arguments.dropFirst(2))) case "sensevoice-benchmark": await SenseVoiceBenchmark.run(arguments: Array(arguments.dropFirst(2))) case "paraformer-transcribe":