From 70fec89a2852f2b468b35bceb528640204425971 Mon Sep 17 00:00:00 2001
From: Alex <hanweng9@gmail.com>
Date: Sun, 31 May 2026 21:11:43 -0400
Subject: [PATCH 1/2] feat(speaker): CAM++ speaker-embedding backend + CLI

CoreML CAM++ from FluidInference/campplus-coreml: 2-stage (fbank80 preprocessor
fp32/CPU -> CAM++ RangeDim -> 192-d L2-normalized embedding). Dynamic time dim
(RangeDim, CPU/GPU; tiny model) avoids padding corrupting the stats-pooled embedding.

- ModelNames: campPlus Repo + CampPlus registry
- Speaker/: CampPlusModels, CampPlusEmbedder (audio -> embedding, cosine)
- CLI: campplus-embed (one file -> embedding; two -> speaker-verification cosine)
Verified: same-speaker cosine 0.74 vs different 0.35 on the example pairs.
---
 Sources/FluidAudio/ModelNames.swift           | 23 +++++
 .../FluidAudio/Speaker/CampPlusEmbedder.swift | 62 +++++++++++++
 .../FluidAudio/Speaker/CampPlusModels.swift   | 91 +++++++++++++++++++
 .../Speaker/CampPlusEmbedCommand.swift        | 33 +++++++
 Sources/FluidAudioCLI/FluidAudioCLI.swift     |  2 +
 5 files changed, 211 insertions(+)
 create mode 100644 Sources/FluidAudio/Speaker/CampPlusEmbedder.swift
 create mode 100644 Sources/FluidAudio/Speaker/CampPlusModels.swift
 create mode 100644 Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift
diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift
index 9dfeeb181..62fd369f3 100644
--- a/Sources/FluidAudio/ModelNames.swift
+++ b/Sources/FluidAudio/ModelNames.swift
@@ -12,6 +12,9 @@ public enum Repo: String, CaseIterable, Sendable {
     /// 3-stage: fp32 CPU preprocessor (waveform→560-d LFR feats) + fp16 ANE
     /// encoder+CTC (+ fp32 fallback) + host greedy-CTC decode. See ASR/SenseVoice.
     case senseVoiceSmall = "FluidInference/sensevoice-small-coreml"
+    /// CAM++ speaker-embedding model (fbank80 -> 192-d) for speaker verification /
+    /// diarization clustering. See Speaker/CampPlusEmbedder.
+    case campPlus = "FluidInference/campplus-coreml"
     // Japanese hybrid TDT: INT8 CTC-trained preprocessor+encoder paired with a
     // TDT decoder+joint. CTC-only inference for Japanese was removed in
     // 846924a1d; only the preprocessor+encoder files from this repo are reused.
@@ -73,6 +76,8 @@ public enum Repo: String, CaseIterable, Sendable {
             return "parakeet-ctc-0.6b-zh-cn-coreml"
         case .senseVoiceSmall:
             return "sensevoice-small-coreml"
+        case .campPlus:
+            return "campplus-coreml"
         case .parakeetJa:
             return "parakeet-0.6b-ja-coreml"
         case .parakeetEou160:
@@ -437,6 +442,22 @@ public enum ModelNames {
         ]
     }
 
+    /// CAM++ speaker-embedding model names (2 CoreML stages).
+    ///   Preprocessor (fp32/CPU): waveform -> 80-d fbank
+    ///   CamPlusPlus (fp16, CPU/GPU; RangeDim time axis rules out ANE): fbank -> 192-d speaker embedding
+    public enum CampPlus {
+        public static let preprocessor = "CamPlusPreprocessor"
+        public static let model = "CamPlusPlus"
+
+        public static let preprocessorFile = preprocessor + ".mlmodelc"
+        public static let modelFile = model + ".mlmodelc"
+
+        public static let requiredModels: Set<String> = [
+            preprocessorFile,
+            modelFile,
+        ]
+    }
+
     /// TDT ja (Japanese) model names.
     ///
     /// Hybrid layout: the CTC-trained preprocessor + encoder from the
@@ -1061,6 +1082,8 @@ public enum ModelNames {
             return ModelNames.CTCZhCn.requiredModels
         case .senseVoiceSmall:
             return ModelNames.SenseVoice.requiredModels
+        case .campPlus:
+            return ModelNames.CampPlus.requiredModels
         case .parakeetJa:
             return ModelNames.TDTJa.requiredModels
         case .parakeetEou160, .parakeetEou320, .parakeetEou1280:
diff --git a/Sources/FluidAudio/Speaker/CampPlusEmbedder.swift b/Sources/FluidAudio/Speaker/CampPlusEmbedder.swift
new file mode 100644
index 000000000..3f526d0d7
--- /dev/null
+++ b/Sources/FluidAudio/Speaker/CampPlusEmbedder.swift
@@ -0,0 +1,62 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// CAM++ speaker-embedding extractor: audio -> 192-d L2-normalized embedding.
+///
+/// Use cosine similarity between embeddings for speaker verification / diarization
+/// clustering. Pipeline: waveform -> [Preprocessor fp32/CPU] -> fbank [1,T,80]
+/// -> [CAM++ fp16, CPU/GPU (dynamic T, not ANE)] -> [1,192] -> L2 normalize.
+public actor CampPlusEmbedder {
+
+    public static let embeddingDim = 192
+    private static let waveformScale: Float = 32_768.0  // kaldi int16 range
+
+    private let models: CampPlusModels
+    private static let logger = AppLogger(category: "CampPlusEmbedder")
+
+    public init(models: CampPlusModels) {
+        self.models = models
+    }
+
+    public static func load(progressHandler: DownloadUtils.ProgressHandler? = nil) async throws -> CampPlusEmbedder {
+        CampPlusEmbedder(models: try await CampPlusModels.downloadAndLoad(progressHandler: progressHandler))
+    }
+
+    /// Audio file (run through a 16 kHz `AudioConverter` first) -> 192-d L2-normalized embedding.
+    public func embed(audioURL: URL) throws -> [Float] {
+        let converter = AudioConverter(sampleRate: 16_000)
+        return try embed(audio: try converter.resampleAudioFile(audioURL))
+    }
+
+    /// 16 kHz mono samples ([-1, 1]) -> 192-d L2-normalized embedding.
+    public func embed(audio: [Float]) throws -> [Float] {
+        let n = audio.count
+        let wav = try MLMultiArray(shape: [1, n as NSNumber], dataType: .float32)
+        let p = wav.dataPointer.assumingMemoryBound(to: Float32.self)
+        for i in 0..<n { p[i] = audio[i] * Self.waveformScale }
+        let feats = try models.preprocessor.prediction(
+            from: MLDictionaryFeatureProvider(dictionary: ["waveform": MLFeatureValue(multiArray: wav)]))
+        guard let fbank = feats.featureValue(for: "features")?.multiArrayValue else {
+            throw ASRError.processingFailed("CAM++ preprocessor produced no `features`")
+        }
+        let out = try models.model.prediction(
+            from: MLDictionaryFeatureProvider(dictionary: ["feats": MLFeatureValue(multiArray: fbank)]))
+        guard let emb = out.featureValue(for: "embedding")?.multiArrayValue else {
+            throw ASRError.processingFailed("CAM++ produced no `embedding`")
+        }
+        var v = [Float](repeating: 0, count: emb.count)
+        if emb.dataType == .float32 {
+            let ep = emb.dataPointer.assumingMemoryBound(to: Float32.self)
+            for i in 0..<emb.count { v[i] = ep[i] }
+        } else {
+            for i in 0..<emb.count { v[i] = emb[i].floatValue }
+        }
+        let norm = max(sqrt(v.reduce(0) { $0 + $1 * $1 }), 1e-9)
+        return v.map { $0 / norm }
+    }
+
+    /// Dot product of two embeddings; equals cosine similarity only when both are L2-normalized.
+    public nonisolated static func cosine(_ a: [Float], _ b: [Float]) -> Float {
+        zip(a, b).reduce(0) { $0 + $1.0 * $1.1 }
+    }
+}
diff --git a/Sources/FluidAudio/Speaker/CampPlusModels.swift b/Sources/FluidAudio/Speaker/CampPlusModels.swift
new file mode 100644
index 000000000..a3e462ded
--- /dev/null
+++ b/Sources/FluidAudio/Speaker/CampPlusModels.swift
@@ -0,0 +1,91 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// Loaded CAM++ CoreML models (speaker embedding).
+///
+/// 2 stages from `FluidInference/campplus-coreml`:
+///   - `preprocessor` (fp32, CPU): waveform -> [1, T, 80] fbank
+///   - `model` (fp16, CPU/GPU): fbank -> [1, 192] speaker embedding
+public struct CampPlusModels: Sendable {
+
+    public let preprocessor: MLModel
+    public let model: MLModel
+
+    private static let logger = AppLogger(category: "CampPlusModels")
+
+    public init(preprocessor: MLModel, model: MLModel) {
+        self.preprocessor = preprocessor
+        self.model = model
+    }
+
+    public static func downloadAndLoad(
+        progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> CampPlusModels {
+        try load(from: try await download(progressHandler: progressHandler))
+    }
+
+    public static func download(
+        force: Bool = false, progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> URL {
+        let modelsRoot = modelsRootDirectory()
+        let targetDir = modelsRoot.appendingPathComponent(Repo.campPlus.folderName, isDirectory: true)
+        if !force && modelsExist(at: targetDir) {
+            logger.info("CAM++ models already present at: \(targetDir.path)")
+            return targetDir
+        }
+        if force { try? FileManager.default.removeItem(at: targetDir) }
+        logger.info("Downloading CAM++ models from HuggingFace...")
+        try await DownloadUtils.downloadRepo(.campPlus, to: modelsRoot, progressHandler: progressHandler)
+        return targetDir
+    }
+
+    public static func modelsExist(at directory: URL) -> Bool {
+        let fm = FileManager.default
+        return [ModelNames.CampPlus.preprocessorFile, ModelNames.CampPlus.modelFile].allSatisfy {
+            fm.fileExists(atPath: directory.appendingPathComponent($0).path)
+        }
+    }
+
+    public static func load(from directory: URL) throws -> CampPlusModels {
+        let cpu = MLModelConfiguration()
+        cpu.computeUnits = .cpuOnly
+        // CAM++ uses a dynamic time dim (RangeDim) which the ANE compiler rejects;
+        // it's tiny (~7.2M), so run on CPU/GPU. Dynamic length avoids padding
+        // corrupting the statistics-pooled embedding.
+        let gpu = MLModelConfiguration()
+        gpu.computeUnits = .cpuAndGPU
+        let pre = try loadModel(named: ModelNames.CampPlus.preprocessor, from: directory, configuration: cpu)
+        let model = try loadModel(named: ModelNames.CampPlus.model, from: directory, configuration: gpu)
+        logger.info("Loaded CAM++ speaker-embedding models")
+        return CampPlusModels(preprocessor: pre, model: model)
+    }
+
+    private static func loadModel(
+        named name: String, from directory: URL, configuration: MLModelConfiguration
+    ) throws -> MLModel {
+        let compiled = directory.appendingPathComponent("\(name).mlmodelc")
+        let pkg = directory.appendingPathComponent("\(name).mlpackage")
+        let url: URL
+        if FileManager.default.fileExists(atPath: compiled.path) {
+            url = compiled
+        } else if FileManager.default.fileExists(atPath: pkg.path) {
+            url = try MLModel.compileModel(at: pkg)
+        } else {
+            throw ASRError.processingFailed("CAM++ model not found: \(name)")
+        }
+        return try MLModel(contentsOf: url, configuration: configuration)
+    }
+
+    private static func modelsRootDirectory() -> URL {
+        let fm = FileManager.default
+        if let appSupport = fm.urls(for: .applicationSupportDirectory, in: .userDomainMask).first {
+            return
+                appSupport
+                .appendingPathComponent("FluidAudio", isDirectory: true)
+                .appendingPathComponent("Models", isDirectory: true)
+        }
+        return fm.temporaryDirectory
+            .appendingPathComponent("FluidAudio", isDirectory: true)
+            .appendingPathComponent("Models", isDirectory: true)
+    }
+}
diff --git a/Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift b/Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift
new file mode 100644
index 000000000..db7eb2751
--- /dev/null
+++ b/Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift
@@ -0,0 +1,33 @@
+#if os(macOS)
+import AVFoundation
+import FluidAudio
+import Foundation
+
+/// `campplus-embed <a.wav> [b.wav]`
+/// One file → prints the embedding dimension and its first 5 values; two files → cosine
+/// similarity plus a same/different verdict at a 0.5 threshold (speaker verification).
+enum CampPlusEmbedCommand {
+    private static let logger = AppLogger(category: "CampPlusEmbed")
+
+    static func run(arguments: [String]) async {
+        let paths = arguments.filter { !$0.hasPrefix("-") }
+        guard let a = paths.first else {
+            print("Usage: fluidaudio campplus-embed <a.wav> [b.wav]")
+            return
+        }
+        do {
+            let embedder = try await CampPlusEmbedder.load()
+            let ea = try await embedder.embed(audioURL: URL(fileURLWithPath: a))
+            if paths.count >= 2 {
+                let eb = try await embedder.embed(audioURL: URL(fileURLWithPath: paths[1]))
+                let cos = CampPlusEmbedder.cosine(ea, eb)
+                print(String(format: "cosine = %.4f  (%@)", cos, cos >= 0.5 ? "same speaker" : "different"))
+            } else {
+                print("embedding: dim=\(ea.count), first 5 = \(ea.prefix(5).map { String(format: "%.3f", $0) })")
+            }
+        } catch {
+            logger.error("CAM++ embed failed: \(error)")
+        }
+    }
+}
+#endif
diff --git a/Sources/FluidAudioCLI/FluidAudioCLI.swift b/Sources/FluidAudioCLI/FluidAudioCLI.swift
index f0c9a97b3..df747d84e 100644
--- a/Sources/FluidAudioCLI/FluidAudioCLI.swift
+++ b/Sources/FluidAudioCLI/FluidAudioCLI.swift
@@ -86,6 +86,8 @@ struct FluidAudioCLI {
             await CtcZhCnTranscribeCommand.run(arguments: Array(arguments.dropFirst(2)))
         case "sensevoice-transcribe":
             await SenseVoiceTranscribeCommand.run(arguments: Array(arguments.dropFirst(2)))
+        case "campplus-embed":
+            await CampPlusEmbedCommand.run(arguments: Array(arguments.dropFirst(2)))
         case "sensevoice-benchmark":
             await SenseVoiceBenchmark.run(arguments: Array(arguments.dropFirst(2)))
         case "ctc-zh-cn-benchmark":

From 00009b62c21d6a58dcd0289ecd05781f588284b5 Mon Sep 17 00:00:00 2001
From: Alex <hanweng9@gmail.com>
Date: Sun, 31 May 2026 21:32:55 -0400
Subject: [PATCH 2/2] docs(speaker): CAM++ benchmark in Benchmarks.md (AISHELL
 EER 0.48%)

---
 Documentation/Benchmarks.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
index 8ad282b62..d282c0518 100644
--- a/Documentation/Benchmarks.md
+++ b/Documentation/Benchmarks.md
@@ -507,6 +507,25 @@ swift run -c release fluidaudiocli nemotron-benchmark --chunk 560
 
 Both offline and online versions use the community-1 model (via FluidInference/speaker-diarization-coreml).
 
+### CAM++ speaker embedding (`campplus-embed`)
+
+CoreML CAM++ (FunASR, ~7.2M) speaker-embedding extractor. Model: [FluidInference/campplus-coreml](https://huggingface.co/FluidInference/campplus-coreml). 2-stage: fbank80 preprocessor (fp32/CPU) → CAM++ (RangeDim, CPU/GPU) → 192-d L2-normalized embedding. Hardware: Apple M5 Pro.
+
+| Metric | Value |
+|--------|-------|
+| **AISHELL-1 EER** | **0.48%** |
+| Same-speaker cosine (mean) | 0.805 |
+| Different-speaker cosine (mean) | 0.256 |
+| Trial set | 20 speakers, 6000 same / 6000 different pairs |
+
+**Notes:**
+- EER on AISHELL-1 (clean, read Mandarin) — easier than the official CN-Celeb benchmark (~6–7%); this validates the CoreML embedding discriminates speakers (CoreML↔torch embedding cosine 0.9997–0.99999).
+- Speaker id parsed from the AISHELL `name` field (`BAC009S0764W...` → `S0764`).
+
+```bash
+swift run -c release fluidaudiocli campplus-embed a.wav b.wav   # cosine similarity
+```
+
 ### Offline diarization pipeline
 
 For slightly ~1.2% worse DER we default to a higher step ratio segmentation duration than the baseline community-1 pipeline. This allows us to get nearly ~2x the speed (as expected because we're processing 1/2 of the embeddings). For highly critical use cases, one may should use step ratio = 0.1 and minSegmentDurationSeconds = 0.0