From c435b9cc18708075fc91c93f6aaa22fa0c436196 Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Wed, 17 Jun 2026 01:09:18 -0400
Subject: [PATCH 1/2] feat(asr/canary): Canary-1B-v2 AED engine + CTC-spotter
 custom vocab

Add NVIDIA Canary-1B-v2 (attention encoder-decoder) as a selectable ASR
engine, converted to CoreML (int4 on ANE, iOS18). Pipeline: fp32/CPU mel
preprocessor -> FastConformer encoder -> autoregressive transformer decoder
+ 1024->16384 projection, greedy decode to EOS.

- CanaryManager: actor, 15s window, reads decoder seq length from the model
- CanaryModels: download/load from FluidInference/canary-1b-v2-coreml (int4/fp16/int8)
- CanaryKeywordBooster: reuses the CTC keyword spotter to add custom-vocabulary
  support to canary (fuzzy replace + timestamp-guided insertion)
- CLI: canary-transcribe, canary-earnings-benchmark (OpenBench-comparable P/R/F1)
- ModelNames: Repo.canary1bV2 + ModelNames.Canary + CanaryPrecision
- Tests: CanaryConfigTests

Earnings22-keywords (full 772, same scorer as OpenBench):
  canary+vocab WER 16.5%, keyword F1 0.95 (beats Argmax parakeet-v3 0.89)
---
 .../FluidAudio/ASR/Canary/CanaryConfig.swift  |  63 ++++
 .../ASR/Canary/CanaryKeywordBooster.swift     | 169 +++++++++++
 .../FluidAudio/ASR/Canary/CanaryManager.swift | 274 ++++++++++++++++++
 .../FluidAudio/ASR/Canary/CanaryModels.swift  | 126 ++++++++
 Sources/FluidAudio/ModelNames.swift           |  43 +++
 .../ASR/CanaryEarningsBenchmark.swift         | 196 +++++++++++++
 .../ASR/CanaryTranscribeCommand.swift         | 208 +++++++++++++
 Sources/FluidAudioCLI/FluidAudioCLI.swift     |   4 +
 .../ASR/Canary/CanaryConfigTests.swift        |  63 ++++
 9 files changed, 1146 insertions(+)
 create mode 100644 Sources/FluidAudio/ASR/Canary/CanaryConfig.swift
 create mode 100644 Sources/FluidAudio/ASR/Canary/CanaryKeywordBooster.swift
 create mode 100644 Sources/FluidAudio/ASR/Canary/CanaryManager.swift
 create mode 100644 Sources/FluidAudio/ASR/Canary/CanaryModels.swift
 create mode 100644 Sources/FluidAudioCLI/Commands/ASR/CanaryEarningsBenchmark.swift
 create mode 100644 Sources/FluidAudioCLI/Commands/ASR/CanaryTranscribeCommand.swift
 create mode 100644 Tests/FluidAudioTests/ASR/Canary/CanaryConfigTests.swift

diff --git a/Sources/FluidAudio/ASR/Canary/CanaryConfig.swift b/Sources/FluidAudio/ASR/Canary/CanaryConfig.swift
new file mode 100644
index 00000000..9c173ea1
--- /dev/null
+++ b/Sources/FluidAudio/ASR/Canary/CanaryConfig.swift
@@ -0,0 +1,63 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// Canary encoder/decoder weight precision.
+///
+/// `int4` (per-block-32 symmetric) runs on the Neural Engine and is the
+/// smallest build (~573 MB) — but int4 weight payloads require iOS18 / macOS 15.
+/// `fp16` is the iOS17 parity fallback (exact match to PyTorch). `int8`
+/// (per-channel) decodes correctly only on CPU — it crashes the GPU/ANE MPSGraph
+/// backend — so it is a CPU/size-only option.
+public enum CanaryPrecision: String, Sendable, CaseIterable {
+    case int4
+    case fp16
+    case int8
+
+    var encoderName: String {
+        switch self {
+        case .int4: return ModelNames.Canary.encoderInt4
+        case .fp16: return ModelNames.Canary.encoder
+        case .int8: return ModelNames.Canary.encoderInt8
+        }
+    }
+
+    var decoderName: String {
+        switch self {
+        case .int4: return ModelNames.Canary.decoderInt4
+        case .fp16: return ModelNames.Canary.decoder
+        case .int8: return ModelNames.Canary.decoderInt8
+        }
+    }
+
+    /// int8 only decodes correctly on CPU; int4/fp16 run on the Neural Engine.
+    var computeUnits: MLComputeUnits {
+        self == .int8 ? .cpuOnly : .cpuAndNeuralEngine
+    }
+}
+
+/// Fixed-shape contract for the canary-1b-v2 CoreML pipeline (15 s window).
+public enum CanaryConfig {
+    public static let sampleRate = 16000
+    /// 15 s window — the preprocessor input is fixed at this sample count.
+    public static let maxSamples = 240_000
+    public static let melDim = 128
+    public static let melFrames = 1501
+    public static let encoderHidden = 1024
+    public static let encoderFrames = 188
+    /// Decoder is exported at a fixed `[1, maxDecoderSteps]`. 128 covers a 15 s
+    /// window (max observed ~108 tokens incl. prompt) and is ~1.5× faster than 256.
+    /// `CanaryManager` reads the real length from the loaded model, so this is just
+    /// the contract/fallback value.
+    public static let maxDecoderSteps = 128
+    public static let vocabSize = 16384
+
+    // Special token ids (the model's real decoder ids — see vocab.json).
+    public static let eosId = 3  // <|endoftext|>
+    public static let padId = 2  // <pad>
+    public static let bosId = 4  // <|startoftranscript|>
+
+    /// canary2 prompt for English transcribe + punctuation/capitalization:
+    /// ▁ <|startofcontext|> <|startoftranscript|> <|emo:undefined|> <|en|> <|en|>
+    /// <|pnc|> <|noitn|> <|notimestamp|> <|nodiarize|>
+    public static let promptEnTranscribePnc: [Int32] = [16053, 7, 4, 16, 64, 64, 5, 9, 11, 13]
+}
diff --git a/Sources/FluidAudio/ASR/Canary/CanaryKeywordBooster.swift b/Sources/FluidAudio/ASR/Canary/CanaryKeywordBooster.swift
new file mode 100644
index 00000000..eee6ac3e
--- /dev/null
+++ b/Sources/FluidAudio/ASR/Canary/CanaryKeywordBooster.swift
@@ -0,0 +1,169 @@
+import Foundation
+
+/// Applies custom-vocabulary keyword boosting to a Canary (AED) transcript using
+/// the existing CTC keyword spotter — the same detector the parakeet "ctc custom
+/// vocab" path uses.
+///
+/// Canary decodes autoregressively and emits no per-frame timestamps, so the
+/// timestamp-constrained CTC rescorer (`VocabularyRescorer.ctcTokenRescore`)
+/// cannot be applied directly. Instead this reuses the engine-independent
+/// `CtcKeywordSpotter` to detect dictionary terms from the audio, then injects
+/// each detected term into Canary's transcript by fuzzy string match: a span that
+/// is close-but-not-exact to a detected term (i.e. Canary mis-spelled the domain
+/// word) is replaced with the canonical term.
+public struct CanaryKeywordBooster: Sendable {
+
+    public struct Result: Sendable {
+        public let text: String
+        /// Distinct terms the CTC spotter detected in the audio.
+        public let detected: [String]
+        /// Terms actually substituted into the transcript.
+        public let applied: [String]
+    }
+
+    private let spotter: CtcKeywordSpotter
+    private let tokenizer: CtcTokenizer
+    /// CTC detection score floor (log-prob; higher = stronger). Matches the
+    /// permissive detection threshold the earnings benchmark uses.
+    private let minScore: Float
+    /// Replace a transcript span only when its similarity to the term is at least
+    /// this (close enough to be the same word mis-transcribed).
+    private let minSimilarity: Float
+    /// …and below this (above it the span is already essentially the term).
+    private let maxSimilarity: Float
+    /// When a detected term has no fuzzy-matchable span (canary missed it entirely),
+    /// insert it at the position implied by the CTC detection time.
+    private let insertOnMiss: Bool
+    /// Only insert (vs replace) when the detection score clears this stronger floor —
+    /// protects precision against weak detections being force-inserted.
+    private let insertScoreFloor: Float
+
+    private static let logger = AppLogger(category: "CanaryKeywordBooster")
+
+    public init(
+        spotter: CtcKeywordSpotter,
+        tokenizer: CtcTokenizer,
+        minScore: Float = -15.0,
+        minSimilarity: Float = 0.60,
+        maxSimilarity: Float = 0.97,
+        insertOnMiss: Bool = true,
+        insertScoreFloor: Float = -6.0
+    ) {
+        self.spotter = spotter
+        self.tokenizer = tokenizer
+        self.minScore = minScore
+        self.minSimilarity = minSimilarity
+        self.maxSimilarity = maxSimilarity
+        self.insertOnMiss = insertOnMiss
+        self.insertScoreFloor = insertScoreFloor
+    }
+
+    /// Load the CTC spotter + tokenizer (parakeet-tdt_ctc-110m) and build a booster; tuning knobs match `init`.
+    public static func load(
+        minScore: Float = -15.0,
+        minSimilarity: Float = 0.60, maxSimilarity: Float = 0.97,
+        insertOnMiss: Bool = true, insertScoreFloor: Float = -6.0
+    ) async throws -> CanaryKeywordBooster {
+        let models = try await CtcModels.downloadAndLoad()
+        let tokenizer = try await CtcTokenizer.load()
+        return CanaryKeywordBooster(
+            spotter: CtcKeywordSpotter(models: models), tokenizer: tokenizer, minScore: minScore,
+            minSimilarity: minSimilarity, maxSimilarity: maxSimilarity,
+            insertOnMiss: insertOnMiss, insertScoreFloor: insertScoreFloor)
+    }
+
+    /// Ensure every term carries CTC token IDs (the spotter scores by them).
+    private func tokenized(_ vocabulary: CustomVocabularyContext) -> CustomVocabularyContext {
+        let terms = vocabulary.terms.map { term -> CustomVocabularyTerm in
+            if let ids = term.ctcTokenIds, !ids.isEmpty { return term }
+            let ids = tokenizer.encode(term.text)
+            return CustomVocabularyTerm(
+                text: term.text, weight: term.weight, aliases: term.aliases, tokenIds: term.tokenIds,
+                ctcTokenIds: ids)
+        }
+        return CustomVocabularyContext(
+            terms: terms, alpha: vocabulary.alpha, minCtcScore: vocabulary.minCtcScore,
+            minSimilarity: vocabulary.minSimilarity, minCombinedConfidence: vocabulary.minCombinedConfidence,
+            minTermLength: vocabulary.minTermLength)
+    }
+
+    /// Inject CTC-spotted custom-vocabulary terms into `transcript`.
+    /// - Returns: the boosted text plus the detected terms and the terms actually applied.
+    /// - Throws: rethrows any error from the CTC keyword spotter.
+    public func boost(
+        transcript: String, audioSamples: [Float], vocabulary: CustomVocabularyContext
+    ) async throws -> Result {
+        let vocab = tokenized(vocabulary)
+        let spot = try await spotter.spotKeywordsWithLogProbs(
+            audioSamples: audioSamples, customVocabulary: vocab, minScore: minScore)
+
+        // Best CTC detection (score + start time) per detected term.
+        var detByTerm: [String: (term: CustomVocabularyTerm, score: Float, startTime: TimeInterval)] = [:]
+        for d in spot.detections where d.score >= minScore {
+            let key = d.term.textLowercased
+            if let cur = detByTerm[key], cur.score >= d.score { continue }
+            detByTerm[key] = (d.term, d.score, d.startTime)
+        }
+        let detected = detByTerm.values.map { $0.term.text }.sorted()
+        guard !detByTerm.isEmpty else {
+            return Result(text: transcript, detected: detected, applied: [])
+        }
+        let duration = max(0.001, Double(audioSamples.count) / Double(CanaryConfig.sampleRate))
+
+        // Longer phrases before shorter (so a single word cannot steal a multi-word
+        // match); among equal word counts, strongest detection first.
+        let ordered = detByTerm.values.sorted { lhs, rhs in
+            let (lw, rw) = (lhs.term.text.split(separator: " ").count, rhs.term.text.split(separator: " ").count)
+            return lw != rw ? lw > rw : lhs.score > rhs.score
+        }
+
+        var words = transcript.split(separator: " ").map(String.init)
+        var applied: [String] = []
+
+        for entry in ordered {
+            let term = entry.term
+            let termLower = term.textLowercased
+            // Already present (case-insensitive substring) → nothing to fix.
+            if words.joined(separator: " ").lowercased().contains(termLower) { continue }
+
+            let termWords = term.text.split(separator: " ").map(String.init)
+            let span = max(1, termWords.count)
+
+            // 1) Fuzzy replace: a close-but-wrong span is canary mis-spelling the term.
+            var bestIdx = -1
+            var bestSim: Float = 0
+            if words.count >= span {
+                for i in 0...(words.count - span) {
+                    let window = normalize(words[i..<(i + span)].joined(separator: " "))
+                    let sim = VocabularyRescorer.stringSimilarity(window, termLower)
+                    if sim > bestSim { (bestSim, bestIdx) = (sim, i) }
+                }
+            }
+
+            // At/above `maxSimilarity` the best span already is the term (e.g. differs only in
+            // punctuation): leave it, and do not fall through to insertion (would duplicate it).
+            if bestIdx >= 0, bestSim >= maxSimilarity { continue }
+            if bestIdx >= 0, bestSim >= minSimilarity {
+                words.replaceSubrange(bestIdx..<(bestIdx + span), with: termWords)
+                applied.append(term.text)
+                continue
+            }
+
+            // 2) Timestamp-guided insertion: canary missed the word entirely (no fuzzy span).
+            // The CTC detection still localizes it in time, so insert it at the proportional
+            // word position. Gated by a stronger score floor to protect precision.
+            if insertOnMiss, entry.score >= insertScoreFloor, !words.isEmpty {
+                let frac = min(1.0, max(0.0, entry.startTime / duration))
+                let pos = min(words.count, Int((frac * Double(words.count)).rounded()))
+                words.insert(contentsOf: termWords, at: pos)
+                applied.append(term.text)
+            }
+        }
+
+        return Result(text: words.joined(separator: " "), detected: detected, applied: applied)
+    }
+
+    private func normalize(_ s: String) -> String {
+        s.lowercased().filter { !$0.isPunctuation }
+    }
+}
diff --git a/Sources/FluidAudio/ASR/Canary/CanaryManager.swift b/Sources/FluidAudio/ASR/Canary/CanaryManager.swift
new file mode 100644
index 00000000..f21b436f
--- /dev/null
+++ b/Sources/FluidAudio/ASR/Canary/CanaryManager.swift
@@ -0,0 +1,274 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// Manager for NVIDIA Canary-1B-v2 transcription (attention encoder-decoder).
+///
+/// Pipeline: waveform → [Preprocessor fp32/CPU] mel → [Encoder int4/ANE] →
+/// transpose to [1, T, D] → greedy autoregressive loop ([Decoder] → last hidden
+/// → [Projection] → argmax until EOS) → SentencePiece detokenize.
+///
+/// The decoder carries no KV cache: each step re-runs the full `[1, S]` token
+/// sequence (S is read from the loaded decoder model). The 15 s window is fixed;
+/// audio longer than 15 s is truncated (chunking is a future addition).
+public actor CanaryManager {
+
+    private let models: CanaryModels
+    private let prompt: [Int32]
+    private static let logger = AppLogger(category: "CanaryManager")
+
+    public init(models: CanaryModels, prompt: [Int32] = CanaryConfig.promptEnTranscribePnc) {
+        self.models = models
+        self.prompt = prompt
+    }
+
+    /// Load models from the default cache (downloading if needed), then build a manager.
+    public static func load(
+        precision: CanaryPrecision = .int4,
+        progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> CanaryManager {
+        let models = try await CanaryModels.downloadAndLoad(precision: precision, progressHandler: progressHandler)
+        return CanaryManager(models: models)
+    }
+
+    /// Transcribe an audio file, resampling it to 16 kHz via `AudioConverter` first.
+    public func transcribe(audioURL: URL) throws -> String {
+        let converter = AudioConverter(sampleRate: Double(CanaryConfig.sampleRate))
+        let samples = try converter.resampleAudioFile(audioURL)
+        return try transcribe(audio: samples)
+    }
+
+    /// Transcribe 16 kHz mono float samples (in [-1, 1]).
+    public func transcribe(audio: [Float]) throws -> String {
+        let (mel, melLength) = try runPreprocessor(audio: audio)
+        let (encoder, encoderLength) = try runEncoder(mel: mel, melLength: melLength)
+        let (embeddings, encoderMask) = try makeDecoderContext(encoder: encoder, encoderLength: encoderLength)
+        let tokens = try greedyDecode(embeddings: embeddings, encoderMask: encoderMask)
+        return detokenize(tokens)
+    }
+
+    // MARK: - Pipeline
+
+    /// waveform → mel [1, 128, 1501] + its valid length. Audio is zero-padded/truncated to the 15 s window.
+    private func runPreprocessor(audio: [Float]) throws -> (MLMultiArray, MLMultiArray) {
+        let maxN = CanaryConfig.maxSamples
+        let validN = min(audio.count, maxN)
+        if audio.count > maxN {
+            Self.logger.warning("Audio \(audio.count) samples > 15 s window; truncating to \(maxN)")
+        }
+
+        let signal = try MLMultiArray(shape: [1, maxN as NSNumber], dataType: .float32)
+        let sptr = signal.dataPointer.assumingMemoryBound(to: Float32.self)
+        memset(sptr, 0, maxN * MemoryLayout<Float32>.size)
+        audio.prefix(validN).withUnsafeBufferPointer { src in
+            if let base = src.baseAddress { sptr.update(from: base, count: validN) }  // may be nil when empty
+        }
+
+        let length = try MLMultiArray(shape: [1], dataType: .int32)
+        length[0] = NSNumber(value: validN)
+
+        let input = try MLDictionaryFeatureProvider(dictionary: [
+            "audio_signal": MLFeatureValue(multiArray: signal),
+            "audio_length": MLFeatureValue(multiArray: length),
+        ])
+        let out = try models.preprocessor.prediction(from: input)
+        guard let mel = out.featureValue(for: "processed")?.multiArrayValue,
+            let melLen = out.featureValue(for: "processed_length")?.multiArrayValue
+        else {
+            throw ASRError.processingFailed("Canary preprocessor produced no `processed`")
+        }
+        return (mel, melLen)
+    }
+
+    /// mel → encoder [1, 1024, 188].
+    private func runEncoder(mel: MLMultiArray, melLength: MLMultiArray) throws -> (MLMultiArray, Int) {
+        let featLen = try MLMultiArray(shape: [1], dataType: .int32)
+        featLen[0] = NSNumber(value: melLength[0].intValue)
+
+        let input = try MLDictionaryFeatureProvider(dictionary: [
+            "features": MLFeatureValue(multiArray: mel),
+            "features_length": MLFeatureValue(multiArray: featLen),
+        ])
+        let out = try models.encoder.prediction(from: input)
+        guard let enc = out.featureValue(for: "encoder")?.multiArrayValue else {
+            throw ASRError.processingFailed("Canary encoder produced no `encoder`")
+        }
+        let encLen = out.featureValue(for: "encoder_length")?.multiArrayValue?[0].intValue ?? CanaryConfig.encoderFrames
+        return (enc, encLen)
+    }
+
+    /// encoder [1, D, T] → encoder_embeddings [1, T, D] + encoder_mask [1, T].
+    ///
+    /// CoreML pads the encoder's last dim to a 64-element boundary (T=188 →
+    /// stride 192), so the transpose must use the array's real strides, not a
+    /// dense linear read.
+    private func makeDecoderContext(encoder: MLMultiArray, encoderLength: Int) throws -> (MLMultiArray, MLMultiArray) {
+        let d = CanaryConfig.encoderHidden
+        let t = CanaryConfig.encoderFrames
+        let emb = try MLMultiArray(shape: [1, t as NSNumber, d as NSNumber], dataType: .float32)
+        let eptr = emb.dataPointer.assumingMemoryBound(to: Float32.self)
+        let strides = encoder.strides.map { $0.intValue }
+        let sD = strides[1]
+        let sT = strides[2]
+        let read = floatReader(encoder)
+        for ti in 0..<t {
+            let dst = ti * d
+            let tBase = ti * sT
+            for di in 0..<d {
+                eptr[dst + di] = read(di * sD + tBase)
+            }
+        }
+
+        let mask = try MLMultiArray(shape: [1, t as NSNumber], dataType: .float32)
+        let mptr = mask.dataPointer.assumingMemoryBound(to: Float32.self)
+        let valid = min(max(encoderLength, 1), t)
+        for i in 0..<t { mptr[i] = i < valid ? 1.0 : 0.0 }
+        return (emb, mask)
+    }
+
+    /// Greedy autoregressive decode: returns generated token ids (prompt stripped, EOS excluded).
+    /// - Throws: `ASRError.processingFailed` on an empty prompt or a missing model output.
+    private func greedyDecode(embeddings: MLMultiArray, encoderMask: MLMultiArray) throws -> [Int] {
+        // Use the decoder's actual sequence length (the exported `[1, S]` shape),
+        // so a shorter decoder export (e.g. S=128) is picked up automatically.
+        let s =
+            models.decoder.modelDescription.inputDescriptionsByName["input_ids"]?
+            .multiArrayConstraint?.shape.last?.intValue ?? CanaryConfig.maxDecoderSteps
+
+        let promptLen = min(prompt.count, s)
+        // Each step reads the hidden state at `pos - 1`; with no prompt that offset is negative.
+        guard promptLen > 0 else {
+            throw ASRError.processingFailed("Canary decoder needs a non-empty prompt (prompt \(prompt.count), length \(s))")
+        }
+        let inputIds = try MLMultiArray(shape: [1, s as NSNumber], dataType: .int32)
+        let decoderMask = try MLMultiArray(shape: [1, s as NSNumber], dataType: .float32)
+        let idptr = inputIds.dataPointer.assumingMemoryBound(to: Int32.self)
+        let mkptr = decoderMask.dataPointer.assumingMemoryBound(to: Float32.self)
+        for i in 0..<s {
+            idptr[i] = i < promptLen ? prompt[i] : 0
+            mkptr[i] = i < promptLen ? 1 : 0
+        }
+        var pos = promptLen
+
+        let hidden = try MLMultiArray(shape: [1, CanaryConfig.encoderHidden as NSNumber], dataType: .float32)
+        let hptr = hidden.dataPointer.assumingMemoryBound(to: Float32.self)
+        let d = CanaryConfig.encoderHidden
+
+        var generated: [Int] = []
+        while pos < s {
+            let input = try MLDictionaryFeatureProvider(dictionary: [
+                "input_ids": MLFeatureValue(multiArray: inputIds),
+                "decoder_mask": MLFeatureValue(multiArray: decoderMask),
+                "encoder_embeddings": MLFeatureValue(multiArray: embeddings),
+                "encoder_mask": MLFeatureValue(multiArray: encoderMask),
+            ])
+            let out = try models.decoder.prediction(from: input)
+            guard let dec = out.featureValue(for: "decoder")?.multiArrayValue else {
+                throw ASRError.processingFailed("Canary decoder produced no `decoder`")
+            }
+
+            // hidden state at the last valid position (decoder output may be stride-padded)
+            let rowBase = (pos - 1) * dec.strides[1].intValue
+            let elemStride = dec.strides[2].intValue
+            let readDec = floatReader(dec)
+            for h in 0..<d { hptr[h] = readDec(rowBase + h * elemStride) }
+
+            let projInput = try MLDictionaryFeatureProvider(dictionary: [
+                "hidden": MLFeatureValue(multiArray: hidden)
+            ])
+            let projOut = try models.projection.prediction(from: projInput)
+            guard let logits = projOut.featureValue(for: "logits")?.multiArrayValue else {
+                throw ASRError.processingFailed("Canary projection produced no `logits`")
+            }
+
+            let next = argmax(logits)
+            if next == CanaryConfig.eosId { break }
+
+            generated.append(next)
+            idptr[pos] = Int32(next)
+            mkptr[pos] = 1
+            pos += 1
+        }
+        return generated
+    }
+
+    private func detokenize(_ tokens: [Int]) -> String {
+        models.tokenizer.decode(ids: tokens)
+            .replacingOccurrences(of: "<\\|[^|]*\\|>", with: "", options: .regularExpression)
+            .trimmingCharacters(in: .whitespaces)
+    }
+
+    // MARK: - MLMultiArray helpers
+
+    /// Returns a dtype-aware element reader for `arr` indexed by flat offset.
+    /// The closure captures a pointer derived from `arr.dataPointer`; it is only
+    /// valid while `arr` is alive (which it is for the duration of each use here).
+    private func floatReader(_ arr: MLMultiArray) -> (Int) -> Float {
+        switch arr.dataType {
+        case .float32:
+            let p = arr.dataPointer.assumingMemoryBound(to: Float32.self)
+            return { p[$0] }
+        case .float16:
+            let p = arr.dataPointer.assumingMemoryBound(to: UInt16.self)
+            return { float16BitsToFloat(p[$0]) }
+        default:
+            return { arr[$0].floatValue }
+        }
+    }
+
+    private func argmax(_ logits: MLMultiArray) -> Int {
+        let n = logits.count
+        var best = 0
+        var bestVal = -Float.greatestFiniteMagnitude
+        switch logits.dataType {
+        case .float32:
+            let p = logits.dataPointer.assumingMemoryBound(to: Float32.self)
+            for i in 0..<n where p[i] > bestVal {
+                bestVal = p[i]
+                best = i
+            }
+        case .float16:
+            let p = logits.dataPointer.assumingMemoryBound(to: UInt16.self)
+            for i in 0..<n {
+                let v = float16BitsToFloat(p[i])
+                if v > bestVal {
+                    bestVal = v
+                    best = i
+                }
+            }
+        default:
+            for i in 0..<n {
+                let v = logits[i].floatValue
+                if v > bestVal {
+                    bestVal = v
+                    best = i
+                }
+            }
+        }
+        return best
+    }
+}
+
+/// Decode an IEEE-754 half-precision bit pattern to Float (avoids a hard Float16 dependency).
+@inline(__always)
+private func float16BitsToFloat(_ h: UInt16) -> Float {
+    let sign = UInt32(h & 0x8000) << 16
+    let exp = UInt32(h & 0x7C00) >> 10
+    let mant = UInt32(h & 0x03FF)
+    if exp == 0 {
+        if mant == 0 { return Float(bitPattern: sign) }
+        // subnormal
+        var e: UInt32 = 127 - 15 + 1
+        var m = mant
+        while (m & 0x0400) == 0 {
+            m <<= 1
+            e -= 1
+        }
+        m &= 0x03FF
+        return Float(bitPattern: sign | (e << 23) | (m << 13))
+    }
+    if exp == 0x1F {
+        return Float(bitPattern: sign | 0x7F80_0000 | (mant << 13))
+    }
+    let e = exp - 15 + 127
+    return Float(bitPattern: sign | (e << 23) | (mant << 13))
+}
diff --git a/Sources/FluidAudio/ASR/Canary/CanaryModels.swift b/Sources/FluidAudio/ASR/Canary/CanaryModels.swift
new file mode 100644
index 00000000..4fc47f32
--- /dev/null
+++ b/Sources/FluidAudio/ASR/Canary/CanaryModels.swift
@@ -0,0 +1,126 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// Loaded canary-1b-v2 CoreML models + tokenizer.
+///
+/// 4 stages from `FluidInference/canary-1b-v2-coreml`:
+///   - `preprocessor` (fp32, CPU): waveform [1,240000] → mel [1,128,1501]
+///   - `encoder` (int4 ANE / fp16): mel → encoder [1,1024,188]
+///   - `decoder` (int4 ANE / fp16): autoregressive transformer → hidden [1,S,1024], S = exported decoder length
+///   - `projection` (fp16, ANE): hidden [1,1024] → logits [1,16384]
+///   - `tokenizer`: 16384 SentencePiece pieces (id → piece)
+public struct CanaryModels: Sendable {
+
+    public let preprocessor: MLModel
+    public let encoder: MLModel
+    public let decoder: MLModel
+    public let projection: MLModel
+    public let tokenizer: Tokenizer
+
+    private static let logger = AppLogger(category: "CanaryModels")
+
+    public init(
+        preprocessor: MLModel, encoder: MLModel, decoder: MLModel, projection: MLModel, tokenizer: Tokenizer
+    ) {
+        self.preprocessor = preprocessor
+        self.encoder = encoder
+        self.decoder = decoder
+        self.projection = projection
+        self.tokenizer = tokenizer
+    }
+
+    /// Download (if needed) and load all canary models.
+    public static func downloadAndLoad(
+        precision: CanaryPrecision = .int4,
+        progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> CanaryModels {
+        let directory = try await download(precision: precision, progressHandler: progressHandler)
+        return try load(from: directory, precision: precision)
+    }
+
+    /// Download the repo into the shared model cache; returns the model directory.
+    public static func download(
+        precision: CanaryPrecision = .int4,
+        force: Bool = false,
+        progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> URL {
+        let modelsRoot = modelsRootDirectory()
+        let targetDir = modelsRoot.appendingPathComponent(Repo.canary1bV2.folderName, isDirectory: true)
+
+        if !force && modelsExist(at: targetDir, precision: precision) {
+            logger.info("Canary models already present at: \(targetDir.path)")
+            return targetDir
+        }
+        if force { try? FileManager.default.removeItem(at: targetDir) }
+
+        logger.info("Downloading Canary models (\(precision.rawValue)) from HuggingFace...")
+        try await DownloadUtils.downloadRepo(
+            .canary1bV2, to: modelsRoot, variant: precision.rawValue, progressHandler: progressHandler)
+        logger.info("Successfully downloaded Canary models")
+        return targetDir
+    }
+
+    public static func modelsExist(at directory: URL, precision: CanaryPrecision = .int4) -> Bool {
+        let fm = FileManager.default
+        let required = ModelNames.Canary.requiredModels(precision: precision)
+        return required.allSatisfy { fm.fileExists(atPath: directory.appendingPathComponent($0).path) }
+    }
+
+    /// Load the four CoreML stages and the tokenizer from a directory that already holds them.
+    public static func load(from directory: URL, precision: CanaryPrecision = .int4) throws -> CanaryModels {
+        func makeConfiguration(_ units: MLComputeUnits) -> MLModelConfiguration {
+            let configuration = MLModelConfiguration()
+            configuration.computeUnits = units
+            return configuration
+        }
+        // The preprocessor runs fp32 on CPU (power-spectrum / log exceed fp16 range).
+        let preprocessorConfig = makeConfiguration(.cpuOnly)
+        // Encoder and decoder share the precision-dependent compute units.
+        let encoderDecoderConfig = makeConfiguration(precision.computeUnits)
+        let projectionConfig = makeConfiguration(.cpuAndNeuralEngine)
+
+        let vocabularyURL = directory.appendingPathComponent(ModelNames.Canary.vocabularyFile)
+        let loaded = try CanaryModels(
+            preprocessor: loadModel(
+                named: ModelNames.Canary.preprocessor, from: directory, configuration: preprocessorConfig),
+            encoder: loadModel(named: precision.encoderName, from: directory, configuration: encoderDecoderConfig),
+            decoder: loadModel(named: precision.decoderName, from: directory, configuration: encoderDecoderConfig),
+            projection: loadModel(
+                named: ModelNames.Canary.projection, from: directory, configuration: projectionConfig),
+            tokenizer: Tokenizer(vocabPath: vocabularyURL))
+
+        logger.info("Loaded Canary (encoder/decoder: \(precision.rawValue))")
+        return loaded
+    }
+
+    // MARK: - Private
+
+    private static func loadModel(
+        named name: String, from directory: URL, configuration: MLModelConfiguration
+    ) throws -> MLModel {
+        let compiledPath = directory.appendingPathComponent("\(name).mlmodelc")
+        let packagePath = directory.appendingPathComponent("\(name).mlpackage")
+        let modelURL: URL
+        if FileManager.default.fileExists(atPath: compiledPath.path) {
+            modelURL = compiledPath
+        } else if FileManager.default.fileExists(atPath: packagePath.path) {
+            modelURL = try MLModel.compileModel(at: packagePath)
+        } else {
+            throw ASRError.processingFailed("Canary model not found: \(name)")
+        }
+        return try MLModel(contentsOf: modelURL, configuration: configuration)
+    }
+
+    private static func modelsRootDirectory() -> URL {
+        let fm = FileManager.default
+        if let appSupport = fm.urls(for: .applicationSupportDirectory, in: .userDomainMask).first {
+            return
+                appSupport
+                .appendingPathComponent("FluidAudio", isDirectory: true)
+                .appendingPathComponent("Models", isDirectory: true)
+        }
+        return fm.temporaryDirectory
+            .appendingPathComponent("FluidAudio", isDirectory: true)
+            .appendingPathComponent("Models", isDirectory: true)
+    }
+}
diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift
index 63dc5920..71b7f422 100644
--- a/Sources/FluidAudio/ModelNames.swift
+++ b/Sources/FluidAudio/ModelNames.swift
@@ -54,6 +54,13 @@ public enum Repo: String, CaseIterable, Sendable {
     case multilingualG2p = "FluidInference/charsiu-g2p-byt5-coreml"
     case parakeetTdtCtc110m = "FluidInference/parakeet-tdt-ctc-110m-coreml"
     case cohereTranscribeCoreml = "FluidInference/cohere-transcribe-03-2026-coreml/q8"
+    /// Canary-1B-v2 (NVIDIA) — attention encoder-decoder (AED) ASR, 25 European
+    /// languages, 16384-token SentencePiece BPE. 4-stage CoreML pipeline:
+    /// fp32/CPU preprocessor (waveform→mel) + FastConformer encoder + autoregressive
+    /// Transformer decoder (full-sequence re-run per step) + 1024→16384 projection,
+    /// greedy until EOS (id 3). int4 encoder/decoder run on ANE (iOS18); fp16 is the
+    /// iOS17 parity fallback. See ASR/Canary.
+    case canary1bV2 = "FluidInference/canary-1b-v2-coreml"
     /// StyleTTS2 LibriTTS — `iteration_3/compiled/` is the only directory
     /// with `.mlmodelc` artifacts; the parent repo also ships `packages/`
     /// (`.mlpackage` source) and `swift/` (a debug harness) that the Swift
@@ -129,6 +136,8 @@ public enum Repo: String, CaseIterable, Sendable {
             return "parakeet-tdt-ctc-110m-coreml"
         case .cohereTranscribeCoreml:
             return "cohere-transcribe-03-2026-coreml/q8"
+        case .canary1bV2:
+            return "canary-1b-v2-coreml"
         case .styletts2:
             return "StyleTTS-2-coreml/iteration_3/compiled"
         case .supertonic3:
@@ -426,6 +435,37 @@ public enum ModelNames {
         public static let requiredModels: Set<String> = requiredModels()
     }
 
+    /// Canary-1B-v2 (AED) model names. 4 CoreML stages + host greedy loop:
+    ///   Preprocessor (fp32/CPU): waveform [1,240000] -> mel [1,128,1501]
+    ///   Encoder (int4 ANE / fp16): mel -> encoder [1,1024,188]
+    ///   Decoder (int4 ANE / fp16): autoregressive transformer hidden states
+    ///   Projection (fp16/ANE): hidden [1,1024] -> logits [1,16384]
+    /// Plus `vocab.json` (16384 SentencePiece pieces, id -> piece). int4 needs iOS18.
+    public enum Canary {
+        public static let preprocessor = "Preprocessor"  // base name, no extension (fp32, CPU per the header above)
+        public static let projection = "Projection"  // base name, no extension (fp16, ANE per the header above)
+        public static let encoder = "Encoder"  // fp16, ANE, iOS17
+        public static let encoderInt4 = "EncoderInt4"  // int4, ANE, iOS18 (default)
+        public static let encoderInt8 = "EncoderInt8"  // int8, CPU-only
+        public static let decoder = "Decoder"  // fp16
+        public static let decoderInt4 = "DecoderInt4"  // int4 (default)
+        public static let decoderInt8 = "DecoderInt8"  // int8, CPU-only
+
+        public static let preprocessorFile = preprocessor + ".mlmodelc"  // compiled-bundle file name
+        public static let projectionFile = projection + ".mlmodelc"  // compiled-bundle file name
+        public static let vocabularyFile = "vocab.json"  // id -> piece table (see header above)
+
+        public static func requiredModels(precision: CanaryPrecision = .int4) -> Set<String> {  // file names for one precision
+            [
+                preprocessorFile,
+                projectionFile,
+                precision.encoderName + ".mlmodelc",  // precision-specific encoder bundle
+                precision.decoderName + ".mlmodelc",  // precision-specific decoder bundle
+                vocabularyFile,
+            ]
+        }
+    }
+
     /// Paraformer-large (zh) model names. 4 CoreML stages + host CIF:
     ///   Preprocessor (fp32/CPU): waveform -> 560-d LFR features
     ///   Encoder (fp16/ANE): SANM encoder (enumerated buckets)
@@ -1283,6 +1323,9 @@ public enum ModelNames {
             return ModelNames.MultilingualG2P.requiredModels
         case .cohereTranscribeCoreml:
             return ModelNames.CohereTranscribe.requiredModels
+        case .canary1bV2:
+            return ModelNames.Canary.requiredModels(
+                precision: CanaryPrecision(rawValue: variant ?? "") ?? .int4)
         case .styletts2:
             // Sentinel variants:
             //   "all"     → 14 bundles (8 defaults + 6 buckets)
diff --git a/Sources/FluidAudioCLI/Commands/ASR/CanaryEarningsBenchmark.swift b/Sources/FluidAudioCLI/Commands/ASR/CanaryEarningsBenchmark.swift
new file mode 100644
index 00000000..d312ad9b
--- /dev/null
+++ b/Sources/FluidAudioCLI/Commands/ASR/CanaryEarningsBenchmark.swift
@@ -0,0 +1,196 @@
+#if os(macOS)
+import AVFoundation
+import FluidAudio
+import Foundation
+
+/// `canary-earnings-benchmark [--max-files N] [--no-vocab] [--min-similarity F]`
+///
+/// Canary-1B-v2 base transcription + CTC-spotter custom-vocabulary boosting on the
+/// Earnings22-kws dataset. Mirrors the metrics of `ctc-earnings-benchmark`
+/// (macro WER + dictionary recall) so the two engines can be compared directly.
+enum CanaryEarningsBenchmark {
+    private static let logger = AppLogger(category: "CanaryEarningsBenchmark")
+
+    /// Parses CLI flags, transcribes every `.wav` in the dataset dir (optionally boosting with the
+    /// sibling `.dictionary.txt` vocabulary) and prints macro WER, RTFx and keyword P/R/F1.
+    static func run(arguments: [String]) async {
+        var maxFiles = Int.max
+        var useVocab = true
+        var minSimilarity: Float = 0.60
+        var insertOnMiss = true
+        var insertScore: Float = -6.0
+        var dataDir: String?
+
+        var i = 0
+        while i < arguments.count {
+            switch arguments[i] {
+            case "--max-files":
+                i += 1
+                if i < arguments.count { maxFiles = max(0, Int(arguments[i]) ?? .max) }  // prefix(_:) traps on negatives
+            case "--no-vocab": useVocab = false
+            case "--min-similarity":
+                i += 1
+                if i < arguments.count { minSimilarity = Float(arguments[i]) ?? minSimilarity }
+            case "--no-insert": insertOnMiss = false
+            case "--insert-score":
+                i += 1
+                if i < arguments.count { insertScore = Float(arguments[i]) ?? insertScore }
+            case "--data-dir":
+                i += 1
+                if i < arguments.count { dataDir = arguments[i] }
+            case "--help", "-h":
+                print(
+                    """
+                    Usage: fluidaudio canary-earnings-benchmark [options]
+                      --max-files N        limit files
+                      --no-vocab           canary only (no keyword boosting) baseline
+                      --min-similarity F   fuzzy-replace threshold (default 0.60)
+                      --no-insert          disable insert-on-miss (fuzzy replace only)
+                      --insert-score F     insertion score floor (default -6.0)
+                      --data-dir PATH      earnings22-kws test-dataset dir
+                    """)
+                return
+            default: break
+            }
+            i += 1
+        }
+
+        let root =
+            dataDir.map { URL(fileURLWithPath: $0) }
+            ?? URL(fileURLWithPath: NSHomeDirectory())
+            .appendingPathComponent("Library/Application Support/FluidAudio/earnings22-kws/test-dataset")
+        guard FileManager.default.fileExists(atPath: root.path) else {
+            logger.error("earnings22-kws not found at \(root.path); run: fluidaudio download --dataset earnings22-kws")
+            return
+        }
+
+        do {
+            logger.info("Loading canary + CTC keyword booster...")
+            let canary = try await CanaryManager.load(precision: .int4)
+            let booster =
+                useVocab
+                ? try await CanaryKeywordBooster.load(
+                    minSimilarity: minSimilarity, insertOnMiss: insertOnMiss, insertScoreFloor: insertScore)
+                : nil
+            let converter = AudioConverter(sampleRate: 16000)
+
+            let wavs =
+                (try? FileManager.default.contentsOfDirectory(at: root, includingPropertiesForKeys: nil))?
+                .filter { $0.pathExtension.lowercased() == "wav" }
+                .sorted { $0.lastPathComponent < $1.lastPathComponent } ?? []
+            let files = Array(wavs.prefix(maxFiles))
+            guard !files.isEmpty else {
+                logger.error("no wavs under \(root.path)")
+                return
+            }
+            logger.info("Canary earnings benchmark on \(files.count) files (vocab: \(useVocab))...")
+
+            var werBaseSum = 0.0
+            var werBoostSum = 0.0
+            var scored = 0
+            var dictFound = 0
+            var dictTotal = 0
+            // OpenBench-style keyword metric (same whole-word definition as ctc-earnings-benchmark)
+            var (tp, fp, fn) = (0, 0, 0)
+            var termsApplied = 0
+            var audioTotal = 0.0
+            var computeTotal = 0.0
+
+            for (idx, wav) in files.enumerated() {
+                let stem = wav.deletingPathExtension()
+                let refURL = stem.appendingPathExtension("text").appendingPathExtension("txt")
+                let dictURL = stem.appendingPathExtension("dictionary").appendingPathExtension("txt")
+                guard
+                    let ref = (try? String(contentsOf: refURL, encoding: .utf8))?.trimmingCharacters(
+                        in: .whitespacesAndNewlines), !ref.isEmpty
+                else { continue }
+
+                do {
+                    let samples = try converter.resampleAudioFile(wav)
+                    let start = Date()
+                    let baseText = try await canary.transcribe(audio: samples)
+
+                    var finalText = baseText
+                    var dictTerms: [String] = []
+                    if useVocab, let booster, FileManager.default.fileExists(atPath: dictURL.path) {
+                        let vocab = try CustomVocabularyContext.loadFromSimpleFormat(from: dictURL)
+                        dictTerms = vocab.terms.map { $0.text }
+                        let r = try await booster.boost(transcript: baseText, audioSamples: samples, vocabulary: vocab)
+                        finalText = r.text
+                        termsApplied += r.applied.count
+                    }
+                    computeTotal += Date().timeIntervalSince(start)
+                    audioTotal += Double(samples.count) / 16000.0
+
+                    let wBase = WERCalculator.calculateWERMetrics(hypothesis: baseText, reference: ref).wer
+                    let wBoost = WERCalculator.calculateWERMetrics(hypothesis: finalText, reference: ref).wer
+                    werBaseSum += wBase
+                    werBoostSum += wBoost
+                    scored += 1
+
+                    if !dictTerms.isEmpty {
+                        // Strict, engine-agnostic keyword scoring (whole-word match in normalized text),
+                        // as in ctc-earnings-benchmark / OpenBench: TP = ref & hyp, FP = hyp only, FN = ref only.
+                        let refLower = TextNormalizer.normalize(ref).lowercased()
+                        let hypLower = TextNormalizer.normalize(finalText).lowercased()
+                        let hayLower = finalText.lowercased()
+                        for term in dictTerms {
+                            dictTotal += 1
+                            if hayLower.contains(term.lowercased()) { dictFound += 1 }  // loose recall (substring)
+                            let tl = term.lowercased()
+                            let inRef = containsWholeWord(tl, in: refLower)
+                            let inHyp = containsWholeWord(tl, in: hypLower)
+                            switch (inRef, inHyp) {
+                            case (true, true): tp += 1
+                            case (false, true): fp += 1
+                            case (true, false): fn += 1
+                            case (false, false): break  // term absent from both: not scored
+                            }
+                        }
+                    }
+
+                    if (idx + 1) % 25 == 0 || idx + 1 == files.count {
+                        logger.info("  \(idx + 1)/\(files.count) done")
+                    }
+                } catch {
+                    logger.error("  failed \(wav.lastPathComponent): \(error)")
+                }
+            }
+
+            let rtfx = computeTotal > 0 ? audioTotal / computeTotal : 0
+            let werBase = scored > 0 ? werBaseSum / Double(scored) * 100 : -1
+            let werBoost = scored > 0 ? werBoostSum / Double(scored) * 100 : -1
+            let recall = dictTotal > 0 ? Double(dictFound) / Double(dictTotal) * 100 : -1
+            print("")
+            print("===== Canary earnings benchmark =====")
+            print("Files scored     : \(scored)")
+            print("RTFx             : \(String(format: "%.2f", rtfx))x")
+            print("WER canary-only  : \(String(format: "%.2f", werBase))%")
+            if useVocab {
+                let precision = (tp + fp) > 0 ? Double(tp) / Double(tp + fp) : 0
+                let kwRecall = (tp + fn) > 0 ? Double(tp) / Double(tp + fn) : 0
+                let f1 = (precision + kwRecall) > 0 ? 2 * precision * kwRecall / (precision + kwRecall) : 0
+                print("WER canary+vocab : \(String(format: "%.2f", werBoost))%")
+                print("Terms applied    : \(termsApplied)")
+                print("Dict recall(loose): \(dictFound)/\(dictTotal) (\(String(format: "%.1f", recall))%)")
+                print(
+                    "Keyword Recall   : \(String(format: "%.3f", kwRecall)) (TP=\(tp), FN=\(fn))")
+                print(
+                    "Keyword Precision: \(String(format: "%.3f", precision)) (TP=\(tp), FP=\(fp))")
+                print("Keyword F1       : \(String(format: "%.3f", f1))")
+            }
+            print("=====================================")
+        } catch {
+            logger.error("Benchmark failed: \(error)")
+        }
+    }
+
+    /// Whole-word (\\b…\\b) match of `word` inside `text`. Matching itself is case-sensitive;
+    /// callers lowercase both arguments, keeping the keyword definition aligned with the CTC benchmark.
+    private static func containsWholeWord(_ word: String, in text: String) -> Bool {
+        let escapedWord = NSRegularExpression.escapedPattern(for: word)
+        guard let matcher = try? NSRegularExpression(pattern: "\\b\(escapedWord)\\b") else { return text.contains(word) }
+        return matcher.rangeOfFirstMatch(in: text, range: NSRange(text.startIndex..., in: text)).location != NSNotFound
+    }
+}
+#endif
diff --git a/Sources/FluidAudioCLI/Commands/ASR/CanaryTranscribeCommand.swift b/Sources/FluidAudioCLI/Commands/ASR/CanaryTranscribeCommand.swift
new file mode 100644
index 00000000..085ca97e
--- /dev/null
+++ b/Sources/FluidAudioCLI/Commands/ASR/CanaryTranscribeCommand.swift
@@ -0,0 +1,208 @@
+#if os(macOS)
+import AVFoundation
+import FluidAudio
+import Foundation
+
+/// `canary-transcribe <audio> [--fp16|--int8] [--reference "..."] [--verbose]`
+/// `canary-transcribe --benchmark <librispeech-dir> [--max-files N] [--fp16|--int8]`
+///
+/// Canary-1B-v2 attention encoder-decoder ASR. Default precision int4 (ANE).
+enum CanaryTranscribeCommand {
+    private static let logger = AppLogger(category: "CanaryTranscribe")
+
+    static func run(arguments: [String]) async {  // entry point for `canary-transcribe`: parse flags, load, transcribe
+        var audioPath: String?
+        var benchmarkDir: String?
+        var reference: String?
+        var precision: CanaryPrecision = .int4
+        var maxFiles = Int.max
+        var maxDuration = Double.greatestFiniteMagnitude
+        var verbose = false
+
+        var i = 0
+        while i < arguments.count {
+            switch arguments[i] {
+            case "--fp16": precision = .fp16
+            case "--int8": precision = .int8
+            case "--int4": precision = .int4
+            case "--reference":
+                i += 1
+                if i < arguments.count { reference = arguments[i] }
+            case "--benchmark":
+                i += 1
+                if i < arguments.count { benchmarkDir = arguments[i] }
+            case "--max-files":
+                i += 1
+                if i < arguments.count { maxFiles = max(0, Int(arguments[i]) ?? .max) }  // prefix(_:) traps on negatives
+            case "--max-duration":
+                i += 1
+                if i < arguments.count { maxDuration = Double(arguments[i]) ?? .greatestFiniteMagnitude }
+            case "--verbose", "-v": verbose = true
+            case "--help", "-h":
+                printUsage()
+                return
+            default: if audioPath == nil { audioPath = arguments[i] }  // first unrecognized argument is the audio path
+            }
+            i += 1
+        }
+
+        do {
+            logger.info("Loading Canary models (\(precision.rawValue))...")
+            let loadStart = Date()
+            let manager = try await CanaryManager.load(precision: precision)
+            if verbose { logger.info("Loaded in \(String(format: "%.1f", Date().timeIntervalSince(loadStart)))s") }
+
+            if let dir = benchmarkDir {  // --benchmark takes precedence over a single-file path
+                await runBenchmark(
+                    manager: manager, dir: dir, maxFiles: maxFiles, maxDuration: maxDuration, precision: precision)
+                return
+            }
+
+            guard let audioPath else {
+                logger.error("Error: No audio file specified")
+                printUsage()
+                return
+            }
+            let audioURL = URL(fileURLWithPath: audioPath)
+            guard FileManager.default.fileExists(atPath: audioURL.path) else {
+                logger.error("Error: Audio file not found: \(audioPath)")
+                return
+            }
+
+            let duration = audioDuration(audioURL)
+            let start = Date()
+            let text = try await manager.transcribe(audioURL: audioURL)
+            let elapsed = Date().timeIntervalSince(start)
+            let rtfx = duration > 0 ? duration / elapsed : 0  // a NaN or zero duration reports RTFx 0
+
+            print(text)
+            logger.info(
+                "Time \(String(format: "%.2f", elapsed))s | audio \(String(format: "%.2f", duration))s | RTFx \(String(format: "%.2f", rtfx))x"
+            )
+            if let reference {
+                let m = WERCalculator.calculateWERMetrics(hypothesis: text, reference: reference)
+                logger.info(
+                    "WER \(String(format: "%.2f", m.wer * 100))% (\(m.substitutions)S \(m.deletions)D \(m.insertions)I / \(m.totalWords)w)"
+                )
+            }
+        } catch {
+            logger.error("Transcription failed: \(error)")
+        }
+    }
+
+    // MARK: - Benchmark (LibriSpeech-style directory)
+
+    private static func runBenchmark(
+        manager: CanaryManager, dir: String, maxFiles: Int, maxDuration: Double, precision: CanaryPrecision
+    ) async {
+        let root = URL(fileURLWithPath: dir)
+        let refs = loadLibriSpeechReferences(root: root)  // utterance id -> reference transcript
+        var audios = findAudio(root: root).sorted { $0.lastPathComponent < $1.lastPathComponent }
+        var skippedLong = 0
+        if maxDuration < .greatestFiniteMagnitude {  // only probe durations when a limit was requested
+            let before = audios.count
+            audios = audios.filter { audioDuration($0) <= maxDuration }
+            skippedLong = before - audios.count
+        }
+        guard !audios.isEmpty else {
+            logger.error("No .wav/.flac files to benchmark under \(dir) (\(skippedLong) skipped by --max-duration)")
+            return
+        }
+        let files = Array(audios.prefix(max(0, maxFiles)))  // clamp: prefix(_:) traps on a negative length
+        logger.info(
+            "Benchmarking Canary (\(precision.rawValue)) on \(files.count) files (refs: \(refs.count), skipped \(skippedLong) > \(maxDuration)s)..."
+        )
+
+        var totalAudio = 0.0
+        var totalCompute = 0.0
+        var totalWords = 0
+        var totalErrors = 0
+        var scored = 0
+
+        for (idx, url) in files.enumerated() {
+            do {
+                let duration = audioDuration(url)
+                let start = Date()
+                let hyp = try await manager.transcribe(audioURL: url)
+                let elapsed = Date().timeIntervalSince(start)
+                totalAudio += duration  // only files that transcribed successfully count toward RTFx
+                totalCompute += elapsed
+
+                let key = url.deletingPathExtension().lastPathComponent  // file stem doubles as the utterance id
+                if let ref = refs[key] {
+                    let m = WERCalculator.calculateWERMetrics(hypothesis: hyp, reference: ref)
+                    totalErrors += Int((m.wer * Double(m.totalWords)).rounded())
+                    totalWords += m.totalWords
+                    scored += 1
+                }
+                if (idx + 1) % 10 == 0 || idx + 1 == files.count {
+                    logger.info("  \(idx + 1)/\(files.count) done")
+                }
+            } catch {
+                logger.error("  failed \(url.lastPathComponent): \(error)")
+            }
+        }
+
+        let rtfx = totalCompute > 0 ? totalAudio / totalCompute : 0
+        let wer = totalWords > 0 ? Double(totalErrors) / Double(totalWords) * 100 : -1  // -1 = nothing scored
+        print("")
+        print("===== Canary \(precision.rawValue) benchmark =====")
+        print("Files          : \(files.count) (\(scored) scored)")
+        print("Audio total    : \(String(format: "%.1f", totalAudio))s")
+        print("Compute total  : \(String(format: "%.1f", totalCompute))s")
+        print("RTFx           : \(String(format: "%.2f", rtfx))x")
+        if wer >= 0 { print("WER            : \(String(format: "%.2f", wer))% (\(totalWords) words)") }
+        print("================================")
+    }
+
+    /// Collects every `.wav` / `.flac` URL the directory enumerator yields under `root` (extension match ignores case).
+    private static func findAudio(root: URL) -> [URL] {
+        let audioExtensions: Set<String> = ["wav", "flac"]
+        let walker = FileManager.default.enumerator(at: root, includingPropertiesForKeys: nil)
+        // A nil enumerator yields an empty result.
+        let entries = walker?.compactMap { $0 as? URL } ?? []
+        return entries.filter { audioExtensions.contains($0.pathExtension.lowercased()) }
+    }
+
+    /// Maps utterance id → transcript from every `*.trans.txt` under `root` (line format: `<utt-id> TRANSCRIPT`).
+    private static func loadLibriSpeechReferences(root: URL) -> [String: String] {
+        guard let walker = FileManager.default.enumerator(at: root, includingPropertiesForKeys: nil) else { return [:] }
+        var references: [String: String] = [:]
+        for case let fileURL as URL in walker {
+            guard fileURL.lastPathComponent.hasSuffix(".trans.txt") else { continue }
+            guard let contents = try? String(contentsOf: fileURL, encoding: .utf8) else { continue }
+            for fields in contents.split(separator: "\n").map({ $0.split(separator: " ", maxSplits: 1) }) where fields.count == 2 {
+                references[String(fields[0])] = String(fields[1])
+            }
+        }
+        return references
+    }
+
+    /// Duration in seconds of the media at `url`, read from an `AVURLAsset`.
+    private static func audioDuration(_ url: URL) -> Double {
+        CMTimeGetSeconds(AVURLAsset(url: url).duration)
+    }
+
+    private static func printUsage() {  // keep this text in sync with the flags parsed in `run`
+        print(
+            """
+            Usage:
+              fluidaudio canary-transcribe <audio-file> [options]
+              fluidaudio canary-transcribe --benchmark <librispeech-dir> [options]
+
+            Canary-1B-v2 attention encoder-decoder ASR (25 European languages, 15 s window).
+
+            Options:
+              --int4         int4 encoder/decoder (ANE, ~573 MB, iOS18) — default
+              --fp16         fp16 encoder/decoder (ANE, exact parity, iOS17)
+              --int8         int8 encoder/decoder (CPU only)
+              --reference T  reference transcript for single-file WER
+              --benchmark D  run over a LibriSpeech-style dir (uses *.trans.txt refs)
+              --max-files N  limit benchmark file count
+              --max-duration S  benchmark only: skip files longer than S seconds
+              --verbose,-v   print model load time
+              --help,-h      show this help
+            """)
+    }
+}
+#endif
diff --git a/Sources/FluidAudioCLI/FluidAudioCLI.swift b/Sources/FluidAudioCLI/FluidAudioCLI.swift
index 2504661e..205be8ba 100644
--- a/Sources/FluidAudioCLI/FluidAudioCLI.swift
+++ b/Sources/FluidAudioCLI/FluidAudioCLI.swift
@@ -96,6 +96,10 @@ struct FluidAudioCLI {
             await CohereTranscribeCommand.run(arguments: Array(arguments.dropFirst(2)))
         case "cohere-benchmark":
             await CohereBenchmark.run(arguments: Array(arguments.dropFirst(2)))
+        case "canary-transcribe":
+            await CanaryTranscribeCommand.run(arguments: Array(arguments.dropFirst(2)))
+        case "canary-earnings-benchmark":
+            await CanaryEarningsBenchmark.run(arguments: Array(arguments.dropFirst(2)))
         case "help", "--help", "-h":
             printUsage()
         default:
diff --git a/Tests/FluidAudioTests/ASR/Canary/CanaryConfigTests.swift b/Tests/FluidAudioTests/ASR/Canary/CanaryConfigTests.swift
new file mode 100644
index 00000000..96bb6547
--- /dev/null
+++ b/Tests/FluidAudioTests/ASR/Canary/CanaryConfigTests.swift
@@ -0,0 +1,63 @@
+import Foundation
+import XCTest
+
+@testable import FluidAudio
+
+final class CanaryConfigTests: XCTestCase {  // pins Canary repo registration, model file names and config constants
+
+    // MARK: - Repo registration
+
+    func testRepoRegistered() {  // raw value, short name, folder name and remote path of the repo case
+        XCTAssertEqual(Repo.canary1bV2.rawValue, "FluidInference/canary-1b-v2-coreml")
+        XCTAssertEqual(Repo.canary1bV2.name, "canary-1b-v2-coreml")
+        XCTAssertEqual(Repo.canary1bV2.folderName, "canary-1b-v2")
+        XCTAssertTrue(Repo.canary1bV2.remotePath.contains("FluidInference/"))
+    }
+
+    func testRequiredModelsByPrecision() {  // the variant string selects the encoder/decoder files
+        let int4 = ModelNames.getRequiredModelNames(for: .canary1bV2, variant: "int4")
+        XCTAssertTrue(int4.contains("EncoderInt4.mlmodelc"))
+        XCTAssertTrue(int4.contains("DecoderInt4.mlmodelc"))
+        XCTAssertTrue(int4.contains("Preprocessor.mlmodelc"))
+        XCTAssertTrue(int4.contains("Projection.mlmodelc"))
+        XCTAssertTrue(int4.contains("vocab.json"))
+
+        let fp16 = ModelNames.getRequiredModelNames(for: .canary1bV2, variant: "fp16")
+        XCTAssertTrue(fp16.contains("Encoder.mlmodelc"))
+        XCTAssertTrue(fp16.contains("Decoder.mlmodelc"))
+
+        // default (nil variant) falls back to int4
+        let def = ModelNames.getRequiredModelNames(for: .canary1bV2, variant: nil)
+        XCTAssertEqual(def, int4)
+    }
+
+    // MARK: - Precision → model name / compute units
+
+    func testPrecisionModelNames() {  // each precision maps onto the matching ModelNames.Canary constant
+        XCTAssertEqual(CanaryPrecision.int4.encoderName, ModelNames.Canary.encoderInt4)
+        XCTAssertEqual(CanaryPrecision.int4.decoderName, ModelNames.Canary.decoderInt4)
+        XCTAssertEqual(CanaryPrecision.fp16.encoderName, ModelNames.Canary.encoder)
+        XCTAssertEqual(CanaryPrecision.int8.encoderName, ModelNames.Canary.encoderInt8)
+    }
+
+    func testPrecisionComputeUnits() {
+        // int8 crashes the GPU/ANE MPSGraph backend → CPU only; int4/fp16 run on ANE.
+        XCTAssertEqual(CanaryPrecision.int8.computeUnits, .cpuOnly)
+        XCTAssertEqual(CanaryPrecision.int4.computeUnits, .cpuAndNeuralEngine)
+        XCTAssertEqual(CanaryPrecision.fp16.computeUnits, .cpuAndNeuralEngine)
+    }
+
+    // MARK: - Config contract
+
+    func testConfigContract() {  // guards the fixed constants against accidental edits
+        XCTAssertEqual(CanaryConfig.sampleRate, 16000)
+        XCTAssertEqual(CanaryConfig.maxSamples, 240_000)  // 15 s
+        XCTAssertEqual(CanaryConfig.encoderHidden, 1024)
+        XCTAssertEqual(CanaryConfig.vocabSize, 16384)
+        XCTAssertEqual(CanaryConfig.eosId, 3)  // <|endoftext|>
+        XCTAssertEqual(CanaryConfig.padId, 2)
+        XCTAssertEqual(CanaryConfig.bosId, 4)
+        // canary2 English transcribe+pnc prompt
+        XCTAssertEqual(CanaryConfig.promptEnTranscribePnc, [16053, 7, 4, 16, 64, 64, 5, 9, 11, 13])
+    }
+}

From 4f7b5d232ede7521dc89588cc3117379da426596 Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Wed, 17 Jun 2026 21:20:33 -0400
Subject: [PATCH 2/2] feat(asr/canary): long-form chunking for audio >15s

Split audio longer than the 15s window into overlapping 15s windows
(hop = 15s - 3s overlap), decode each independently, and stitch adjacent
windows at the seam via token-level longest-common-substring
(mergeTokenStreams, mirroring CoherePipeline). Audio <=15s is unchanged
(single-window). No model change - each window still sees the fixed 15s
contract and the decoder is reset per window.

Unblocks >15s datasets (e.g. FDA) that the fixed-window decoder previously
truncated. Adds CanaryChunkMergeTests for the seam stitcher.
---
 .../FluidAudio/ASR/Canary/CanaryConfig.swift  |  5 ++
 .../FluidAudio/ASR/Canary/CanaryManager.swift | 85 ++++++++++++++++++-
 .../ASR/Canary/CanaryChunkMergeTests.swift    | 63 ++++++++++++++
 3 files changed, 151 insertions(+), 2 deletions(-)
 create mode 100644 Tests/FluidAudioTests/ASR/Canary/CanaryChunkMergeTests.swift

diff --git a/Sources/FluidAudio/ASR/Canary/CanaryConfig.swift b/Sources/FluidAudio/ASR/Canary/CanaryConfig.swift
index 9c173ea1..9996429d 100644
--- a/Sources/FluidAudio/ASR/Canary/CanaryConfig.swift
+++ b/Sources/FluidAudio/ASR/Canary/CanaryConfig.swift
@@ -40,6 +40,11 @@ public enum CanaryConfig {
     public static let sampleRate = 16000
     /// 15 s window — the preprocessor input is fixed at this sample count.
     public static let maxSamples = 240_000
+    /// Overlap between adjacent windows when chunking audio longer than 15 s.
+    /// 3 s (~19 tokens) gives the seam LCS-merge enough shared context to align
+    /// reliably while wasting little recompute. Hop = maxSamples − this.
+    public static let chunkOverlapSeconds = 3.0
+    public static let chunkOverlapSamples = 48_000
     public static let melDim = 128
     public static let melFrames = 1501
     public static let encoderHidden = 1024
diff --git a/Sources/FluidAudio/ASR/Canary/CanaryManager.swift b/Sources/FluidAudio/ASR/Canary/CanaryManager.swift
index f21b436f..386228e3 100644
--- a/Sources/FluidAudio/ASR/Canary/CanaryManager.swift
+++ b/Sources/FluidAudio/ASR/Canary/CanaryManager.swift
@@ -38,12 +38,93 @@ public actor CanaryManager {
     }
 
     /// Transcribe 16 kHz mono float samples (in [-1, 1]).
+    ///
+    /// Audio within the 15 s window is decoded in one pass. Longer audio is split
+    /// into overlapping 15 s windows (hop = 15 s − `chunkOverlapSeconds`), decoded
+    /// independently, and stitched at the seams via token-level
+    /// longest-common-substring (`mergeTokenStreams`). No model change — each
+    /// window still sees the fixed 15 s contract and the decoder is reset per window.
     public func transcribe(audio: [Float]) throws -> String {
+        let maxN = CanaryConfig.maxSamples
+        if audio.count <= maxN {
+            return detokenize(try transcribeWindow(audio: audio))
+        }
+
+        let hop = maxN - CanaryConfig.chunkOverlapSamples  // samples advanced between window starts
+        var merged: [Int] = []
+        var start = 0
+        var chunkIndex = 0
+        while start < audio.count {
+            let end = min(start + maxN, audio.count)
+            // Don't decode a final tail that is pure overlap — the previous window already covered it.
+            // NOTE(review): with `start += hop` after a non-final window this looks unreachable — confirm.
+            if chunkIndex > 0, (end - start) <= (maxN - hop) { break }
+
+            let tokens = try transcribeWindow(audio: Array(audio[start..<end]))
+            merged = Self.mergeTokenStreams(prefix: merged, suffix: tokens)  // first window: empty prefix returns tokens
+
+            chunkIndex += 1
+            if end >= audio.count { break }  // this window reached the end of the audio
+            start += hop
+        }
+        return detokenize(merged)
+    }
+
+    /// Run the 4-stage pipeline over a single ≤15 s window; returns generated
+    /// token ids (prompt stripped, EOS excluded).
+    private func transcribeWindow(audio: [Float]) throws -> [Int] {
         let (mel, melLength) = try runPreprocessor(audio: audio)
         let (encoder, encoderLength) = try runEncoder(mel: mel, melLength: melLength)
         let (embeddings, encoderMask) = try makeDecoderContext(encoder: encoder, encoderLength: encoderLength)
-        let tokens = try greedyDecode(embeddings: embeddings, encoderMask: encoderMask)
-        return detokenize(tokens)
+        return try greedyDecode(embeddings: embeddings, encoderMask: encoderMask)  // ids only; `transcribe` detokenizes
+    }
+
+    /// Merge two adjacent window token streams using longest-common-substring.
+    ///
+    /// Both windows transcribe `chunkOverlapSeconds` of identical audio at their
+    /// seam, so their token ids share a common substring near the prefix's tail /
+    /// the suffix's head. Search a bounded window (`windowTokens` at the boundary)
+    /// for the longest common substring of length ≥ `minMatch`. On a hit, keep the
+    /// prefix through the match, drop its tokens after the match and the suffix's head
+    /// through the match; on a miss, concatenate plainly (duplicate rather than lose).
+    static func mergeTokenStreams(
+        prefix: [Int],
+        suffix: [Int],
+        windowTokens: Int = 32,
+        minMatch: Int = 4
+    ) -> [Int] {
+        if prefix.isEmpty { return suffix }
+        if suffix.isEmpty { return prefix }
+
+        let pTail = Array(prefix.suffix(windowTokens))
+        let sHead = Array(suffix.prefix(windowTokens))
+        let m = pTail.count
+        let n = sHead.count
+        if m == 0 || n == 0 { return prefix + suffix }
+
+        // Classic LCS-substring DP (O(m·n), m,n ≤ windowTokens); match ends are exclusive indices.
+        var dp = [Int](repeating: 0, count: n + 1)
+        var bestLen = 0
+        var (bestPEnd, bestSEnd) = (0, 0)  // where the best match ends in pTail / sHead
+        for i in 1...m {
+            var prev = 0
+            for j in 1...n {
+                let temp = dp[j]
+                if pTail[i - 1] == sHead[j - 1] {
+                    dp[j] = prev + 1
+                    if dp[j] > bestLen {
+                        bestLen = dp[j]
+                        (bestPEnd, bestSEnd) = (i, j)
+                    }
+                } else {
+                    dp[j] = 0
+                }
+                prev = temp
+            }
+        }
+
+        guard bestLen >= minMatch else { return prefix + suffix }
+        return Array(prefix.dropLast(m - bestPEnd)) + Array(suffix.dropFirst(bestSEnd))  // cut both sides at the match end
+    }
 
     // MARK: - Pipeline
diff --git a/Tests/FluidAudioTests/ASR/Canary/CanaryChunkMergeTests.swift b/Tests/FluidAudioTests/ASR/Canary/CanaryChunkMergeTests.swift
new file mode 100644
index 00000000..cb689242
--- /dev/null
+++ b/Tests/FluidAudioTests/ASR/Canary/CanaryChunkMergeTests.swift
@@ -0,0 +1,63 @@
+import Foundation
+import XCTest
+
+@testable import FluidAudio
+
+/// Unit tests for `CanaryManager.mergeTokenStreams`, the long-form chunk stitcher
+/// that joins the token streams of adjacent 15 s windows where they overlap.
+final class CanaryChunkMergeTests: XCTestCase {
+
+    func testEmptyPrefixReturnsSuffix() {
+        let merged = CanaryManager.mergeTokenStreams(prefix: [], suffix: [1, 2, 3])
+        XCTAssertEqual(merged, [1, 2, 3])
+    }
+
+    func testEmptySuffixReturnsPrefix() {
+        let merged = CanaryManager.mergeTokenStreams(prefix: [1, 2, 3], suffix: [])
+        XCTAssertEqual(merged, [1, 2, 3])
+    }
+
+    func testCleanSeamDropsDuplicatedOverlap() {
+        // The run [7, 8, 9, 10] closes the prefix and opens the suffix; keep one copy.
+        let merged = CanaryManager.mergeTokenStreams(
+            prefix: [1, 2, 3, 7, 8, 9, 10], suffix: [7, 8, 9, 10, 11, 12])
+        XCTAssertEqual(merged, [1, 2, 3, 7, 8, 9, 10, 11, 12])
+    }
+
+    func testPartialOverlapAlignsAtLongestMatch() {
+        // The shared run is [20, 21, 22, 23, 24]; the suffix's leading 99 is noise.
+        let merged = CanaryManager.mergeTokenStreams(
+            prefix: [1, 2, 20, 21, 22, 23, 24], suffix: [99, 20, 21, 22, 23, 24, 30, 31])
+        XCTAssertEqual(merged, [1, 2, 20, 21, 22, 23, 24, 30, 31])
+    }
+
+    func testNoMatchConcatenatesPlainly() {
+        // Nothing in common, so both streams are kept in full rather than losing content.
+        let merged = CanaryManager.mergeTokenStreams(
+            prefix: [1, 2, 3], suffix: [4, 5, 6])
+        XCTAssertEqual(merged, [1, 2, 3, 4, 5, 6])
+    }
+
+    func testShortMatchBelowThresholdIsNotTrusted() {
+        // Shared run [5, 6, 7] is 3 tokens, under minMatch = 4, so nothing is dropped.
+        let merged = CanaryManager.mergeTokenStreams(
+            prefix: [1, 5, 6, 7], suffix: [5, 6, 7, 8, 9], minMatch: 4)
+        XCTAssertEqual(merged, [1, 5, 6, 7, 5, 6, 7, 8, 9])
+    }
+
+    func testMatchAtThresholdIsTrusted() {
+        // Shared run [5, 6, 7, 8] is exactly minMatch = 4 tokens, so it is deduplicated.
+        let merged = CanaryManager.mergeTokenStreams(
+            prefix: [1, 5, 6, 7, 8], suffix: [5, 6, 7, 8, 9], minMatch: 4)
+        XCTAssertEqual(merged, [1, 5, 6, 7, 8, 9])
+    }
+
+    func testOverlapConfigContract() {
+        // 3 s of overlap at 16 kHz, consistent with the seconds constant.
+        let overlap = CanaryConfig.chunkOverlapSamples
+        let derived = Int(CanaryConfig.chunkOverlapSeconds * Double(CanaryConfig.sampleRate))
+        XCTAssertEqual(overlap, 48_000)
+        XCTAssertEqual(overlap, derived)
+        XCTAssertLessThan(overlap, CanaryConfig.maxSamples)
+    }
+}