Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions Documentation/Benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,25 @@ swift run -c release fluidaudiocli nemotron-benchmark --chunk 560

Both offline and online versions use the community-1 model (via FluidInference/speaker-diarization-coreml).

### CAM++ speaker embedding (`campplus-embed`)

CoreML CAM++ (FunASR, ~7.2M parameters) speaker-embedding extractor. Model: [FluidInference/campplus-coreml](https://huggingface.co/FluidInference/campplus-coreml). Two-stage pipeline: fbank80 preprocessor (fp32, CPU) → CAM++ (RangeDim, CPU/GPU) → 192-d L2-normalized embedding. Hardware: Apple M5 Pro.

| Metric | Value |
|--------|-------|
| **AISHELL-1 EER** | **0.48%** |
| Same-speaker cosine (mean) | 0.805 |
| Different-speaker cosine (mean) | 0.256 |
| Trial set | 20 speakers, 6000 same / 6000 different pairs |

**Notes:**
- EER on AISHELL-1 (clean, read Mandarin) — easier than the official CN-Celeb benchmark (~6–7%); this validates the CoreML embedding discriminates speakers (CoreML↔torch embedding cosine 0.9997–0.99999).
- Speaker id parsed from the AISHELL `name` field (`BAC009S0764W...` → `S0764`).

```bash
swift run -c release fluidaudiocli campplus-embed a.wav b.wav # cosine similarity
```

### Offline diarization pipeline

At the cost of a slightly (~1.2%) worse DER, we default to a higher step ratio and minimum segment duration than the baseline community-1 pipeline. This gives us nearly 2x the speed (as expected, because we process half as many embeddings). For highly critical use cases, use step ratio = 0.1 and minSegmentDurationSeconds = 0.0.
Expand Down
23 changes: 23 additions & 0 deletions Sources/FluidAudio/ModelNames.swift
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ public enum Repo: String, CaseIterable, Sendable {
/// 3-stage: fp32 CPU preprocessor (waveform→560-d LFR feats) + fp16 ANE
/// encoder+CTC (+ fp32 fallback) + host greedy-CTC decode. See ASR/SenseVoice.
case senseVoiceSmall = "FluidInference/sensevoice-small-coreml"
/// CAM++ speaker-embedding model (fbank80 -> 192-d) for speaker verification /
/// diarization clustering. See Speaker/CampPlusEmbedder.
case campPlus = "FluidInference/campplus-coreml"
/// Paraformer-large (zh) — non-autoregressive ASR: SANM encoder + CIF
/// predictor (host-side integrate-and-fire) + parallel decoder. See ASR/Paraformer.
case paraformerLargeZh = "FluidInference/paraformer-large-zh-coreml"
Expand Down Expand Up @@ -76,6 +79,8 @@ public enum Repo: String, CaseIterable, Sendable {
return "parakeet-ctc-0.6b-zh-cn-coreml"
case .senseVoiceSmall:
return "sensevoice-small-coreml"
case .campPlus:
return "campplus-coreml"
case .paraformerLargeZh:
return "paraformer-large-zh-coreml"
case .parakeetJa:
Expand Down Expand Up @@ -442,6 +447,22 @@ public enum ModelNames {
]
}

/// CAM++ speaker-embedding model names (2 CoreML stages).
///   Preprocessor (fp32/CPU): waveform -> 80-d fbank
///   CamPlusPlus (fp16, CPU/GPU): fbank -> 192-d speaker embedding
///
/// The CamPlusPlus stage is loaded with `.cpuAndGPU` (see `CampPlusModels.load`):
/// its dynamic time dimension (RangeDim) is rejected by the ANE compiler.
public enum CampPlus {
    /// Base name (no extension) of the fbank preprocessor model.
    public static let preprocessor = "CamPlusPreprocessor"
    /// Base name (no extension) of the CAM++ embedding model.
    public static let model = "CamPlusPlus"

    /// Compiled-model file names as stored in the model repository.
    public static let preprocessorFile = preprocessor + ".mlmodelc"
    public static let modelFile = model + ".mlmodelc"

    /// Every file that must be present for the CAM++ pipeline to load.
    public static let requiredModels: Set<String> = [
        preprocessorFile,
        modelFile,
    ]
}

/// Paraformer-large (zh) model names. 4 CoreML stages + host CIF:
/// Preprocessor (fp32/CPU): waveform -> 560-d LFR features
/// Encoder (fp16/ANE): SANM encoder (enumerated buckets)
Expand Down Expand Up @@ -1098,6 +1119,8 @@ public enum ModelNames {
return ModelNames.CTCZhCn.requiredModels
case .senseVoiceSmall:
return ModelNames.SenseVoice.requiredModels
case .campPlus:
return ModelNames.CampPlus.requiredModels
case .paraformerLargeZh:
return ModelNames.ParaformerZh.requiredModels
case .parakeetJa:
Expand Down
62 changes: 62 additions & 0 deletions Sources/FluidAudio/Speaker/CampPlusEmbedder.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
@preconcurrency import CoreML
import Foundation

/// CAM++ speaker-embedding extractor: audio -> 192-d L2-normalized embedding.
///
/// Use cosine similarity between embeddings for speaker verification / diarization
/// clustering. Pipeline: waveform -> [Preprocessor fp32/CPU] -> fbank [1,T,80]
/// -> [CAM++ fp16, CPU/GPU] -> [1,192] -> L2 normalize.
///
/// The compute units listed above are the ones `CampPlusModels.load` configures;
/// models injected through `init(models:)` run with whatever configuration they
/// were loaded with.
public actor CampPlusEmbedder {

    /// Nominal embedding dimensionality. Not enforced by `embed`; the returned
    /// vector has as many elements as the model's `embedding` output.
    public static let embeddingDim = 192
    /// Factor applied to every input sample before the preprocessor runs.
    private static let waveformScale: Float = 32_768.0  // kaldi int16 range

    private let models: CampPlusModels
    private static let logger = AppLogger(category: "CampPlusEmbedder")

    /// Wraps an already-loaded pair of CAM++ models.
    public init(models: CampPlusModels) {
        self.models = models
    }

    /// Downloads the CAM++ models if needed, loads them, and returns a ready embedder.
    ///
    /// - Parameter progressHandler: Forwarded to `CampPlusModels.downloadAndLoad`.
    /// - Throws: Whatever `CampPlusModels.downloadAndLoad` throws.
    public static func load(progressHandler: DownloadUtils.ProgressHandler? = nil) async throws -> CampPlusEmbedder {
        let models = try await CampPlusModels.downloadAndLoad(progressHandler: progressHandler)
        return CampPlusEmbedder(models: models)
    }

    /// Audio file -> 192-d L2-normalized embedding.
    ///
    /// The file is read through an `AudioConverter` configured for 16 kHz and the
    /// resulting samples are passed to `embed(audio:)`.
    ///
    /// - Throws: Errors from the audio conversion, or from `embed(audio:)`.
    public func embed(audioURL: URL) throws -> [Float] {
        let converter = AudioConverter(sampleRate: 16_000)
        let samples = try converter.resampleAudioFile(audioURL)
        return try embed(audio: samples)
    }

    /// 16 kHz mono samples ([-1, 1]) -> 192-d L2-normalized embedding.
    ///
    /// - Parameter audio: Non-empty waveform samples.
    /// - Returns: The model's `embedding` output divided by its L2 norm.
    /// - Throws: `ASRError.processingFailed` if `audio` is empty or a model stage
    ///   does not produce its expected output; CoreML errors from either prediction.
    public func embed(audio: [Float]) throws -> [Float] {
        // Fail fast with a clear message instead of handing CoreML an empty [1, 0] tensor.
        guard !audio.isEmpty else {
            throw ASRError.processingFailed("CAM++ received empty audio")
        }

        // Stage 0: copy the samples into a [1, N] float32 tensor, scaled to the int16 range.
        let sampleCount = audio.count
        let waveform = try MLMultiArray(shape: [1, sampleCount as NSNumber], dataType: .float32)
        let waveformPointer = waveform.dataPointer.assumingMemoryBound(to: Float32.self)
        for (index, sample) in audio.enumerated() {
            waveformPointer[index] = sample * Self.waveformScale
        }

        // Stage 1: waveform -> fbank features.
        let preprocessorInput = try MLDictionaryFeatureProvider(
            dictionary: ["waveform": MLFeatureValue(multiArray: waveform)])
        let preprocessorOutput = try models.preprocessor.prediction(from: preprocessorInput)
        guard let fbank = preprocessorOutput.featureValue(for: "features")?.multiArrayValue else {
            throw ASRError.processingFailed("CAM++ preprocessor produced no `features`")
        }

        // Stage 2: fbank features -> raw speaker embedding.
        let modelInput = try MLDictionaryFeatureProvider(
            dictionary: ["feats": MLFeatureValue(multiArray: fbank)])
        let modelOutput = try models.model.prediction(from: modelInput)
        guard let rawEmbedding = modelOutput.featureValue(for: "embedding")?.multiArrayValue else {
            throw ASRError.processingFailed("CAM++ produced no `embedding`")
        }

        // Copy out as Float. float32 outputs are read directly from the buffer;
        // any other element type goes through the NSNumber subscript.
        let elementCount = rawEmbedding.count
        var embedding = [Float](repeating: 0, count: elementCount)
        if rawEmbedding.dataType == .float32 {
            let source = rawEmbedding.dataPointer.assumingMemoryBound(to: Float32.self)
            for index in 0..<elementCount { embedding[index] = source[index] }
        } else {
            for index in 0..<elementCount { embedding[index] = rawEmbedding[index].floatValue }
        }

        // L2-normalize; the floor avoids dividing by zero for an all-zero output.
        let sumOfSquares = embedding.reduce(0) { $0 + $1 * $1 }
        let norm = max(sqrt(sumOfSquares), 1e-9)
        return embedding.map { $0 / norm }
    }

    /// Cosine similarity of two L2-normalized embeddings.
    ///
    /// Computed as a plain dot product, so it equals the cosine only when both
    /// inputs are unit-length. If the arrays differ in length, the trailing
    /// elements of the longer one are ignored.
    public nonisolated static func cosine(_ a: [Float], _ b: [Float]) -> Float {
        zip(a, b).reduce(0) { $0 + $1.0 * $1.1 }
    }
}
91 changes: 91 additions & 0 deletions Sources/FluidAudio/Speaker/CampPlusModels.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
@preconcurrency import CoreML
import Foundation

/// Loaded CAM++ CoreML models (speaker embedding).
///
/// 2 stages from `FluidInference/campplus-coreml`:
/// - `preprocessor` (fp32, CPU): waveform -> [1, T, 80] fbank
/// - `model` (fp16, CPU/GPU): fbank -> [1, 192] speaker embedding
public struct CampPlusModels: Sendable {

    public let preprocessor: MLModel
    public let model: MLModel

    private static let logger = AppLogger(category: "CampPlusModels")

    /// Wraps two already-loaded models; no validation is performed.
    public init(preprocessor: MLModel, model: MLModel) {
        self.preprocessor = preprocessor
        self.model = model
    }

    /// Ensures the models are on disk (downloading if needed) and loads them.
    ///
    /// - Throws: Errors from `download` or `load(from:)`.
    public static func downloadAndLoad(
        progressHandler: DownloadUtils.ProgressHandler? = nil
    ) async throws -> CampPlusModels {
        let directory = try await download(progressHandler: progressHandler)
        return try load(from: directory)
    }

    /// Returns the local CAM++ model directory, downloading the repo when required.
    ///
    /// - Parameters:
    ///   - force: When `true`, any existing local copy is removed and the repo is
    ///     downloaded again.
    ///   - progressHandler: Forwarded to `DownloadUtils.downloadRepo`.
    /// - Returns: `<models root>/<Repo.campPlus.folderName>`.
    /// - Throws: Errors from `DownloadUtils.downloadRepo`.
    public static func download(
        force: Bool = false, progressHandler: DownloadUtils.ProgressHandler? = nil
    ) async throws -> URL {
        let modelsRoot = modelsRootDirectory()
        let targetDir = modelsRoot.appendingPathComponent(Repo.campPlus.folderName, isDirectory: true)
        if !force && modelsExist(at: targetDir) {
            logger.info("CAM++ models already present at: \(targetDir.path)")
            return targetDir
        }
        // Best effort: the directory may not exist yet, so a removal failure is ignored.
        if force { try? FileManager.default.removeItem(at: targetDir) }
        logger.info("Downloading CAM++ models from HuggingFace...")
        try await DownloadUtils.downloadRepo(.campPlus, to: modelsRoot, progressHandler: progressHandler)
        return targetDir
    }

    /// Whether every file in `ModelNames.CampPlus.requiredModels` exists in `directory`.
    ///
    /// Only the compiled `.mlmodelc` names are checked; a directory holding only
    /// `.mlpackage` sources reports `false` even though `load(from:)` can compile them.
    public static func modelsExist(at directory: URL) -> Bool {
        let fm = FileManager.default
        return ModelNames.CampPlus.requiredModels.allSatisfy {
            fm.fileExists(atPath: directory.appendingPathComponent($0).path)
        }
    }

    /// Loads both stages from `directory`.
    ///
    /// - Throws: `ASRError.processingFailed` when a model is missing, or CoreML
    ///   errors from compiling / loading.
    public static func load(from directory: URL) throws -> CampPlusModels {
        let cpu = MLModelConfiguration()
        cpu.computeUnits = .cpuOnly
        // CAM++ uses a dynamic time dim (RangeDim) which the ANE compiler rejects;
        // it's tiny (~7.2M), so run on CPU/GPU. Dynamic length avoids padding
        // corrupting the statistics-pooled embedding.
        let gpu = MLModelConfiguration()
        gpu.computeUnits = .cpuAndGPU
        let pre = try loadModel(named: ModelNames.CampPlus.preprocessor, from: directory, configuration: cpu)
        let model = try loadModel(named: ModelNames.CampPlus.model, from: directory, configuration: gpu)
        logger.info("Loaded CAM++ speaker-embedding models")
        return CampPlusModels(preprocessor: pre, model: model)
    }

    /// Loads `<name>.mlmodelc` if present, otherwise compiles `<name>.mlpackage` and loads that.
    ///
    /// The compiled result of the `.mlpackage` path is not copied into `directory`,
    /// so that path recompiles on every call.
    private static func loadModel(
        named name: String, from directory: URL, configuration: MLModelConfiguration
    ) throws -> MLModel {
        let compiled = directory.appendingPathComponent("\(name).mlmodelc")
        let pkg = directory.appendingPathComponent("\(name).mlpackage")
        let url: URL
        if FileManager.default.fileExists(atPath: compiled.path) {
            url = compiled
        } else if FileManager.default.fileExists(atPath: pkg.path) {
            url = try MLModel.compileModel(at: pkg)
        } else {
            throw ASRError.processingFailed("CAM++ model not found: \(name)")
        }
        return try MLModel(contentsOf: url, configuration: configuration)
    }

    /// `<Application Support>/FluidAudio/Models`, falling back to the temporary
    /// directory when no Application Support URL is available.
    private static func modelsRootDirectory() -> URL {
        let fm = FileManager.default
        let base = fm.urls(for: .applicationSupportDirectory, in: .userDomainMask).first ?? fm.temporaryDirectory
        return
            base
            .appendingPathComponent("FluidAudio", isDirectory: true)
            .appendingPathComponent("Models", isDirectory: true)
    }
}
33 changes: 33 additions & 0 deletions Sources/FluidAudioCLI/Commands/Speaker/CampPlusEmbedCommand.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#if os(macOS)
import AVFoundation
import FluidAudio
import Foundation

/// `campplus-embed <a.wav> [b.wav]`
///
/// With a single path, prints the embedding's dimension and its first five values.
/// With two or more paths, embeds the first two files and prints their cosine
/// similarity together with a same/different verdict (cut-off 0.5).
/// Arguments beginning with `-` are ignored.
enum CampPlusEmbedCommand {
    private static let logger = AppLogger(category: "CampPlusEmbed")

    /// Entry point for the subcommand. Failures are logged rather than thrown.
    static func run(arguments: [String]) async {
        // Keep only positional arguments; anything flag-shaped is dropped.
        let positional = arguments.filter { argument in
            !argument.hasPrefix("-")
        }
        guard let firstPath = positional.first else {
            print("Usage: fluidaudio campplus-embed <a.wav> [b.wav]")
            return
        }
        let secondPath: String? = positional.count >= 2 ? positional[1] : nil

        do {
            let embedder = try await CampPlusEmbedder.load()
            let firstEmbedding = try await embedder.embed(audioURL: URL(fileURLWithPath: firstPath))

            guard let otherPath = secondPath else {
                // Single-file mode: show a short preview of the vector.
                let preview = firstEmbedding.prefix(5).map { String(format: "%.3f", $0) }
                print("embedding: dim=\(firstEmbedding.count), first 5 = \(preview)")
                return
            }

            // Two-file mode: speaker verification by cosine similarity.
            let secondEmbedding = try await embedder.embed(audioURL: URL(fileURLWithPath: otherPath))
            let similarity = CampPlusEmbedder.cosine(firstEmbedding, secondEmbedding)
            let verdict = similarity >= 0.5 ? "same speaker" : "different"
            print(String(format: "cosine = %.4f (%@)", similarity, verdict))
        } catch {
            logger.error("CAM++ embed failed: \(error)")
        }
    }
}
#endif
2 changes: 2 additions & 0 deletions Sources/FluidAudioCLI/FluidAudioCLI.swift
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ struct FluidAudioCLI {
await CtcZhCnTranscribeCommand.run(arguments: Array(arguments.dropFirst(2)))
case "sensevoice-transcribe":
await SenseVoiceTranscribeCommand.run(arguments: Array(arguments.dropFirst(2)))
case "campplus-embed":
await CampPlusEmbedCommand.run(arguments: Array(arguments.dropFirst(2)))
case "sensevoice-benchmark":
await SenseVoiceBenchmark.run(arguments: Array(arguments.dropFirst(2)))
case "paraformer-transcribe":
Expand Down
Loading