FuJacob · FuJacob · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj
diff --git a/Cotabby/App/Coordinators/SettingsCoordinator.swift b/Cotabby/App/Coordinators/SettingsCoordinator.swift
@@ -20,6 +20,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate {
     private let modelDownloadManager: ModelDownloadManager
     private let huggingFaceSearchService: HuggingFaceSearchService
     private let performanceMetricsStore: PerformanceMetricsStore
+    private let qualityMetricsStore: SuggestionQualityMetricsStore
     private let systemMetricsStore: SystemMetricsStore
     private let onShowWelcome: () -> Void
     private let clearEmojiHistory: () -> Void
@@ -36,6 +37,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate {
         modelDownloadManager: ModelDownloadManager,
         huggingFaceSearchService: HuggingFaceSearchService,
         performanceMetricsStore: PerformanceMetricsStore,
+        qualityMetricsStore: SuggestionQualityMetricsStore,
         systemMetricsStore: SystemMetricsStore,
         onShowWelcome: @escaping () -> Void,
         clearEmojiHistory: @escaping () -> Void
@@ -49,6 +51,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate {
         self.modelDownloadManager = modelDownloadManager
         self.huggingFaceSearchService = huggingFaceSearchService
         self.performanceMetricsStore = performanceMetricsStore
+        self.qualityMetricsStore = qualityMetricsStore
         self.systemMetricsStore = systemMetricsStore
         self.onShowWelcome = onShowWelcome
         self.clearEmojiHistory = clearEmojiHistory
@@ -76,6 +79,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate {
                     modelDownloadManager: modelDownloadManager,
                     huggingFaceSearchService: huggingFaceSearchService,
                     performanceMetricsStore: performanceMetricsStore,
+                    qualityMetricsStore: qualityMetricsStore,
                     systemMetricsStore: systemMetricsStore,
                     onShowWelcome: onShowWelcome,
                     clearEmojiHistory: clearEmojiHistory

diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Acceptance.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Acceptance.swift
@@ -122,6 +122,7 @@ extension SuggestionCoordinator {
 
         deferAcceptanceBookkeeping { [weak self] in
             self?.recordAcceptedWords(from: acceptedChunk)
+            self?.recordSuggestionAcceptedIfFirstChunk(of: sessionForAcceptance)
         }
 
         cancelPredictionWork()
@@ -563,6 +564,14 @@ extension SuggestionCoordinator {
         }
     }
 
+    /// Counts the session's suggestion as accepted in the quality counters, at most once: only an
+    /// acceptance made while nothing has been consumed yet is recorded, so later chunks of the
+    /// same suggestion add nothing and acceptance rate stays accepted-suggestions over shown.
+    private func recordSuggestionAcceptedIfFirstChunk(of session: ActiveSuggestionSession) {
+        let isFirstChunk = session.consumedCharacterCount == 0
+        if isFirstChunk { qualityMetricsStore.recordAcceptedSuggestion() }
+    }
+
     /// Updates the global productivity counter from text accepted via Tab.
     func recordAcceptedWords(from acceptedChunk: String) {
         let acceptedWordCount = SuggestionSessionReconciler.acceptedWordCount(in: acceptedChunk)

diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
@@ -20,11 +20,18 @@ extension SuggestionCoordinator {
             return
         }
 
+        // The debounce window adapts to the last generation latency: snappier when the model is
+        // fast, calmer when it is slow (fewer doomed generations to cancel). The configured value
+        // is the fallback until a first latency exists.
+        let debounceMilliseconds = DebouncePolicy.milliseconds(
+            lastGenerationLatencyMilliseconds: latestLatencyMilliseconds,
+            fallback: settingsSnapshot.debounceMilliseconds
+        )
         // The debounce clock starts at the keystroke, not here. The host-publish poll has already
         // consumed real wall time waiting for the host to publish the keystroke to AX, and that
         // wait collapses bursts just as well as sleeping does. Stacking the full debounce on top
         // of the publish wait was pure added latency, so only the unconsumed remainder is slept.
-        let remainingDelay = max(0, settingsSnapshot.debounceMilliseconds - consumedDelayMilliseconds)
+        let remainingDelay = max(0, debounceMilliseconds - consumedDelayMilliseconds)
 
         // Task cancellation in Swift is cooperative, so we also use an explicit work id.
         // That gives us strict "latest request wins" semantics even if an old task wakes up late.
@@ -42,7 +49,7 @@ extension SuggestionCoordinator {
         logStage(
             "debouncing",
             workID: workID,
-            message: "Debouncing (\(settingsSnapshot.debounceMilliseconds)ms window) before generating."
+            message: "Debouncing (\(debounceMilliseconds)ms window, \(remainingDelay)ms remaining) before generating."
         )
     }
 
@@ -496,6 +503,10 @@ extension SuggestionCoordinator {
         guard liveContext.generation == result.generation else {
 
             latestRawModelOutput = SuggestionDebugLogger.debugPreview(result.rawText)
+            // Lifecycle discards are counted under their own reasons so `generated` always equals
+            // `shown` plus the suppression histogram; without this, every drop here silently
+            // inflated the generated count against the others.
+            qualityMetricsStore.recordSuppressed(reason: "discardedStaleContext")
             logStage(
                 "stale-drop",
                 workID: workID,
@@ -514,6 +525,11 @@ extension SuggestionCoordinator {
             clearSuggestion()
             hideOverlay(reason: "Overlay hidden because the model returned an empty continuation.")
             state = .idle
+            // The router already counted engine-attributed suppressions (normalizer, confidence
+            // floor); only the unattributed "model produced nothing" case needs a ledger entry.
+            if result.suppressionReason == nil {
+                qualityMetricsStore.recordSuppressed(reason: "emptyUnattributed")
+            }
             logStage(
                 "empty-result",
                 workID: workID,
@@ -529,6 +545,7 @@ extension SuggestionCoordinator {
             clearSuggestion(clearDiagnostics: true)
             hideOverlay(reason: "Overlay hidden because text is selected.")
             state = .idle
+            qualityMetricsStore.recordSuppressed(reason: "discardedSelection")
             logStage(
                 "selected-text",
                 workID: workID,
@@ -553,6 +570,7 @@ extension SuggestionCoordinator {
             clearSuggestion(clearDiagnostics: false)
             hideOverlay(reason: "Overlay hidden because the regeneration only echoed the just-accepted text before the host published it.")
             state = .idle
+            qualityMetricsStore.recordSuppressed(reason: "discardedAcceptEcho")
             logStage(
                 "stale-accept-echo",
                 workID: workID,
@@ -576,6 +594,8 @@ extension SuggestionCoordinator {
             clearSuggestion()
             hideOverlay(reason: "Overlay hidden because the completion failed the seam guard.")
             state = .idle
+            let seamReason = if case .seamMisspelling = seamVerdict { "seamMisspelling" } else { "seamJunkPunctuationRun" }
+            qualityMetricsStore.recordSuppressed(reason: seamReason)
             logStage(
                 "seam-suppressed",
                 workID: workID,
@@ -589,6 +609,9 @@ extension SuggestionCoordinator {
 
         latestLatencyMilliseconds = Int(result.latency * 1000)
         latestGenerationNumber = liveContext.generation
+        // One shown event per suggestion: this is the only place a fresh generation becomes
+        // visible (re-presentations after partial accepts reuse the same session).
+        qualityMetricsStore.recordShown()
         let session = interactionState.startSession(
             fullText: result.text,
             liveContext: liveContext,

diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator.swift b/Cotabby/App/Coordinators/SuggestionCoordinator.swift
@@ -54,6 +54,9 @@ final class SuggestionCoordinator: ObservableObject {
     /// `CotabbyAppEnvironment`) so the underlying `NSSpellChecker` document tag persists across the
     /// coordinator's lifetime instead of churning per keystroke.
     let spellChecker: CurrentWordSpellChecker
+    /// Always-on quality counters (shown / suppressed / accepted). The router counts generation
+    /// outcomes; the coordinator owns the display-time and acceptance events only it can see.
+    let qualityMetricsStore: SuggestionQualityMetricsStore
     /// Frequency-ranked correction source (SymSpell). Used first for the correction word, with
     /// `spellChecker` as the fallback while its index is still loading or when it has no suggestion.
     let symSpellCorrector: SymSpellCorrector
@@ -163,6 +166,7 @@ final class SuggestionCoordinator: ObservableObject {
         spellChecker: CurrentWordSpellChecker,
         symSpellCorrector: SymSpellCorrector,
         spellingLanguageResolver: SpellingLanguageResolver = SpellingLanguageResolver(),
+        qualityMetricsStore: SuggestionQualityMetricsStore,
         userDefaults: UserDefaults = .standard
     ) {
         let storedTotalTabAcceptedWordCount = userDefaults.integer(
@@ -184,6 +188,7 @@ final class SuggestionCoordinator: ObservableObject {
         self.spellChecker = spellChecker
         self.symSpellCorrector = symSpellCorrector
         self.spellingLanguageResolver = spellingLanguageResolver
+        self.qualityMetricsStore = qualityMetricsStore
         self.userDefaults = userDefaults
         settingsSnapshot = suggestionSettings.snapshot
         // These collaborators isolate "how overlay/logging works" from "when the coordinator

diff --git a/Cotabby/App/Core/CotabbyAppEnvironment.swift b/Cotabby/App/Core/CotabbyAppEnvironment.swift
@@ -34,6 +34,7 @@ final class CotabbyAppEnvironment {
     let welcomeCoordinator: WelcomeCoordinator
     let huggingFaceSearchService: HuggingFaceSearchService
     let performanceMetricsStore: PerformanceMetricsStore
+    let qualityMetricsStore: SuggestionQualityMetricsStore
     let settingsCoordinator: SettingsCoordinator
     let activationIndicatorController: ActivationIndicatorController
     let focusDebugOverlayController: FocusDebugOverlayController?
@@ -113,6 +114,9 @@ final class CotabbyAppEnvironment {
         )
         let huggingFaceSearchService = HuggingFaceSearchService()
         let performanceMetricsStore = PerformanceMetricsStore()
+        // Always-on quality counters (generated / shown / suppressed-by-reason / accepted).
+        // Counters only, no content, so unlike latency tracking there is no opt-in gate.
+        let qualityMetricsStore = SuggestionQualityMetricsStore()
         // Live CPU/RAM graph backing for the Performance pane. Holds no state until the pane asks it
         // to start sampling, so constructing it eagerly here costs nothing.
         let systemMetricsStore = SystemMetricsStore()
@@ -157,6 +161,7 @@ final class CotabbyAppEnvironment {
             foundationModelEngine: foundationModelEngine,
             llamaEngine: LlamaSuggestionEngine(runtimeManager: runtimeManager),
             performanceMetricsStore: performanceMetricsStore,
+            qualityMetricsStore: qualityMetricsStore,
             llamaModelNameProvider: { [weak runtimeManager] in
                 runtimeManager?.currentModelFilename
             }
@@ -176,6 +181,7 @@ final class CotabbyAppEnvironment {
             modelDownloadManager: modelDownloadManager,
             huggingFaceSearchService: huggingFaceSearchService,
             performanceMetricsStore: performanceMetricsStore,
+            qualityMetricsStore: qualityMetricsStore,
             systemMetricsStore: systemMetricsStore,
             onShowWelcome: { [weak welcomeCoordinator] in
                 welcomeCoordinator?.showWelcome()
@@ -213,7 +219,8 @@ final class CotabbyAppEnvironment {
             configuration: configuration,
             spellChecker: spellChecker,
             symSpellCorrector: symSpellCorrector,
-            spellingLanguageResolver: SpellingLanguageResolver()
+            spellingLanguageResolver: SpellingLanguageResolver(),
+            qualityMetricsStore: qualityMetricsStore
         )
 
         // The emoji picker is a sibling to the suggestion coordinator. It reuses the input monitor,
@@ -276,6 +283,7 @@ final class CotabbyAppEnvironment {
         self.welcomeCoordinator = welcomeCoordinator
         self.huggingFaceSearchService = huggingFaceSearchService
         self.performanceMetricsStore = performanceMetricsStore
+        self.qualityMetricsStore = qualityMetricsStore
         self.settingsCoordinator = settingsCoordinator
         self.activationIndicatorController = activationIndicatorController
         self.focusDebugOverlayController = FocusDebugOverlayController.isEnabled

diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift
@@ -197,6 +197,28 @@ struct LlamaGenerationOptions: Equatable, Sendable {
     /// degenerate instant stops (e.g. a lone leading period). Lives here so length presets can tune
     /// the floor without reaching into `DecodeStopPolicy`; the default preserves prior behavior.
     var sentenceStopMinimumTokens: Int = 2
+
+    /// Stop decoding the moment the raw distribution's most-likely next token is end-of-generation,
+    /// even when the stochastic sampler drew something else. The model's top choice being "stop"
+    /// is the strongest anti-rambling signal available per token, and the engine computes it while
+    /// the logits row is hot, so honoring it costs nothing here.
+    var stopAtArgmaxEOG: Bool = true
+}
+
+/// A generation's text bundled with the confidence signals used for suppression accounting, so a
+/// confidence-suppressed completion is not mistaken for "the model produced nothing".
+struct LlamaGenerationOutput: Equatable, Sendable {
+    let text: String
+    /// Average log-probability per generated token, or nil when confidence gating was off (the
+    /// engine then skips the per-token logprob work entirely) or no tokens were generated.
+    let averageLogprob: Double?
+    /// Whether the completion was withheld because `averageLogprob` was below the floor.
+    let suppressedByLowConfidence: Bool
+
+    /// Wraps plain text that carries no confidence signal and was not suppressed.
+    static func text(_ text: String) -> Self {
+        .init(text: text, averageLogprob: nil, suppressedByLowConfidence: false)
+    }
 }
 
 /// The concrete runtime assets selected during bootstrap after checking available model files.

diff --git a/Cotabby/Models/SuggestionModels.swift b/Cotabby/Models/SuggestionModels.swift
@@ -449,6 +449,25 @@ struct SuggestionResult: Equatable, Sendable {
     let rawText: String
     let text: String
     let latency: TimeInterval
+    /// Raw value of the `CompletionSuppressionReason` that emptied `text`, when one applies.
+    /// Carried as a string so the coordinator's quality accounting never needs the normalizer
+    /// type, and so engine-specific reasons can ride along without enum churn. The explicit
+    /// initializer default keeps existing call sites compiling unchanged.
+    let suppressionReason: String?
+
+    /// Memberwise initializer written out by hand so `suppressionReason` can default to `nil`.
+    ///
+    /// - Parameter suppressionReason: Raw suppression reason, or `nil` when none applies.
+    init(
+        generation: UInt64, rawText: String, text: String, latency: TimeInterval,
+        suppressionReason: String? = nil
+    ) {
+        self.suppressionReason = suppressionReason
+        self.latency = latency
+        self.text = text
+        self.rawText = rawText
+        self.generation = generation
+    }
 }
 
 /// Represents one active inline-completion session after the model has produced a suggestion.

diff --git a/Cotabby/Models/SuggestionQualityMetricsStore.swift b/Cotabby/Models/SuggestionQualityMetricsStore.swift
@@ -0,0 +1,86 @@
+import Combine
+import Foundation
+
+/// Always-on, on-device tallies describing suggestion quality in real use: completions
+/// generated, completions shown, the reason each withheld completion was withheld, and how many
+/// shown suggestions were accepted.
+///
+/// Unlike the opt-in latency tracking in `PerformanceMetricsStore`, which records per-request
+/// rows, these are content-free lifetime counters, so they run unconditionally and are written
+/// to `UserDefaults` after every change to survive restarts. Acceptance rate (accepted / shown)
+/// is the nearest on-device proxy for ground truth, and the suppression histogram separates
+/// "the model produced nothing" from "a specific guard fired".
+@MainActor
+final class SuggestionQualityMetricsStore: ObservableObject {
+    /// The persisted counter set. Property names double as the JSON keys of the stored payload.
+    struct Counters: Codable, Equatable {
+        var generated = 0
+        var shown = 0
+        /// Suggestions accepted at least once; counted per suggestion rather than per Tab
+        /// press, so accepting one suggestion word by word still counts once.
+        var acceptedSuggestions = 0
+        /// Suppression histogram keyed by reason string (`CompletionSuppressionReason` raw
+        /// values plus coordinator-level reasons), so a new reason needs no schema change.
+        var suppressedByReason: [String: Int] = [:]
+        /// When the first counter event was recorded; nil until then.
+        var firstRecordedAt: Date?
+
+        /// Sum of every bucket in the suppression histogram.
+        var suppressedTotal: Int {
+            suppressedByReason.reduce(into: 0) { sum, bucket in sum += bucket.value }
+        }
+
+        /// Accepted suggestions divided by shown suggestions; nil while nothing has been shown.
+        var acceptanceRate: Double? {
+            shown > 0 ? Double(acceptedSuggestions) / Double(shown) : nil
+        }
+    }
+
+    private static let defaultsKey = "cotabbyQualityMetricsCounters"
+    private let userDefaults: UserDefaults
+
+    /// Current counters; observers are notified on each recorded event and on reset.
+    @Published private(set) var counters: Counters
+
+    /// Declared nonisolated on purpose: without it, deallocating this @MainActor class inside
+    /// app-hosted tests double-frees (the isolated-deinit runtime path over-releases).
+    nonisolated deinit {}
+
+    /// Restores counters persisted in `userDefaults`, starting from zero when nothing is stored
+    /// or the stored payload cannot be decoded.
+    init(userDefaults: UserDefaults = .standard) {
+        self.userDefaults = userDefaults
+        let restored = userDefaults.data(forKey: Self.defaultsKey)
+            .flatMap { try? JSONDecoder().decode(Counters.self, from: $0) }
+        counters = restored ?? Counters()
+    }
+
+    /// Counts one generated completion.
+    func recordGenerated() { mutate { $0.generated += 1 } }
+
+    /// Counts one suggestion shown to the user.
+    func recordShown() { mutate { $0.shown += 1 } }
+
+    /// Counts one accepted suggestion.
+    func recordAcceptedSuggestion() { mutate { $0.acceptedSuggestions += 1 } }
+
+    /// Counts one suppressed completion under `reason`.
+    func recordSuppressed(reason: String) { mutate { $0.suppressedByReason[reason, default: 0] += 1 } }
+
+    /// Zeroes every counter in memory, then deletes the persisted copy.
+    func reset() {
+        counters = Counters()
+        userDefaults.removeObject(forKey: Self.defaultsKey)
+    }
+
+    /// Applies `change` to a copy of the counters, stamps `firstRecordedAt` on the first
+    /// event, then publishes the copy with a single assignment and persists it as JSON.
+    private func mutate(_ change: (inout Counters) -> Void) {
+        var next = counters
+        change(&next)
+        next.firstRecordedAt = next.firstRecordedAt ?? Date()
+        counters = next
+        guard let payload = try? JSONEncoder().encode(next) else { return }
+        userDefaults.set(payload, forKey: Self.defaultsKey)
+    }
+}