Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions Cotabby.xcodeproj/project.pbxproj

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions Cotabby/App/Coordinators/SettingsCoordinator.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate {
private let modelDownloadManager: ModelDownloadManager
private let huggingFaceSearchService: HuggingFaceSearchService
private let performanceMetricsStore: PerformanceMetricsStore
private let qualityMetricsStore: SuggestionQualityMetricsStore
private let systemMetricsStore: SystemMetricsStore
private let onShowWelcome: () -> Void
private let clearEmojiHistory: () -> Void
Expand All @@ -36,6 +37,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate {
modelDownloadManager: ModelDownloadManager,
huggingFaceSearchService: HuggingFaceSearchService,
performanceMetricsStore: PerformanceMetricsStore,
qualityMetricsStore: SuggestionQualityMetricsStore,
systemMetricsStore: SystemMetricsStore,
onShowWelcome: @escaping () -> Void,
clearEmojiHistory: @escaping () -> Void
Expand All @@ -49,6 +51,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate {
self.modelDownloadManager = modelDownloadManager
self.huggingFaceSearchService = huggingFaceSearchService
self.performanceMetricsStore = performanceMetricsStore
self.qualityMetricsStore = qualityMetricsStore
self.systemMetricsStore = systemMetricsStore
self.onShowWelcome = onShowWelcome
self.clearEmojiHistory = clearEmojiHistory
Expand Down Expand Up @@ -76,6 +79,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate {
modelDownloadManager: modelDownloadManager,
huggingFaceSearchService: huggingFaceSearchService,
performanceMetricsStore: performanceMetricsStore,
qualityMetricsStore: qualityMetricsStore,
systemMetricsStore: systemMetricsStore,
onShowWelcome: onShowWelcome,
clearEmojiHistory: clearEmojiHistory
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ extension SuggestionCoordinator {

deferAcceptanceBookkeeping { [weak self] in
self?.recordAcceptedWords(from: acceptedChunk)
self?.recordSuggestionAcceptedIfFirstChunk(of: sessionForAcceptance)
}

cancelPredictionWork()
Expand Down Expand Up @@ -563,6 +564,14 @@ extension SuggestionCoordinator {
}
}

/// Marks the session's suggestion accepted in the quality counters, once per suggestion: only
/// the first chunk counts, so word-by-word walks of one suggestion add nothing further and the
/// acceptance rate stays suggestions-accepted over suggestions-shown.
private func recordSuggestionAcceptedIfFirstChunk(of session: ActiveSuggestionSession) {
guard session.consumedCharacterCount == 0 else { return }
Comment on lines 564 to +571

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Correction acceptances inflate acceptedSuggestions without a matching shown increment

recordSuggestionAcceptedIfFirstChunk only guards on consumedCharacterCount == 0; it fires for any session kind, including .correction(typoWord:). presentCorrection never calls recordShown(), so every accepted correction increments acceptedSuggestions without a corresponding shown entry. Since acceptanceRate = acceptedSuggestions / shown, a session where the user accepts even one correction while the shown count is in single digits produces a rate above 100%, making the metric actively misleading.

Add guard case .continuation = session.kind else { return } before the recordAcceptedSuggestion() call to restrict the counter to generated completions only.

Fix in Codex Fix in Claude Code

qualityMetricsStore.recordAcceptedSuggestion()
}

/// Updates the global productivity counter from text accepted via Tab.
func recordAcceptedWords(from acceptedChunk: String) {
let acceptedWordCount = SuggestionSessionReconciler.acceptedWordCount(in: acceptedChunk)
Expand Down
27 changes: 25 additions & 2 deletions Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,18 @@ extension SuggestionCoordinator {
return
}

// The debounce window adapts to the last generation latency: snappier when the model is
// fast, calmer when it is slow (fewer doomed generations to cancel). The configured value
// is the fallback until a first latency exists.
let debounceMilliseconds = DebouncePolicy.milliseconds(
lastGenerationLatencyMilliseconds: latestLatencyMilliseconds,
fallback: settingsSnapshot.debounceMilliseconds
)
// The debounce clock starts at the keystroke, not here. The host-publish poll has already
// consumed real wall time waiting for the host to publish the keystroke to AX, and that
// wait collapses bursts just as well as sleeping does. Stacking the full debounce on top
// of the publish wait was pure added latency, so only the unconsumed remainder is slept.
let remainingDelay = max(0, settingsSnapshot.debounceMilliseconds - consumedDelayMilliseconds)
let remainingDelay = max(0, debounceMilliseconds - consumedDelayMilliseconds)

// Task cancellation in Swift is cooperative, so we also use an explicit work id.
// That gives us strict "latest request wins" semantics even if an old task wakes up late.
Expand All @@ -42,7 +49,7 @@ extension SuggestionCoordinator {
logStage(
"debouncing",
workID: workID,
message: "Debouncing (\(settingsSnapshot.debounceMilliseconds)ms window) before generating."
message: "Debouncing (\(debounceMilliseconds)ms window, \(remainingDelay)ms remaining) before generating."
)
}

Expand Down Expand Up @@ -496,6 +503,10 @@ extension SuggestionCoordinator {
guard liveContext.generation == result.generation else {

latestRawModelOutput = SuggestionDebugLogger.debugPreview(result.rawText)
// Lifecycle discards are counted under their own reasons so `generated` always equals
// `shown` plus the suppression histogram; without this, every drop here silently
Comment on lines 503 to +507

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Double-counted suppressions when engine suppression meets coordinator discard

The stale-drop, selected-text, and stale-accept-echo gates unconditionally call recordSuppressed, but the router already called recordSuppressed(reason: result.suppressionReason) for the same result whenever the engine attributed a suppression (e.g., "lowConfidence" or a normalizer reason). Any result that was engine-suppressed AND then discarded by a coordinator gate (common during fast typing at ~187 ms p50 latency, where ~21% of results hit the confidence floor and stale-drop fires frequently) will increment suppressedTotal twice, breaking the invariant stated in the adjacent comment ("generated always equals shown plus the suppression histogram"). The empty-result gate already guards with if result.suppressionReason == nil — the three lifecycle-discard gates need the same guard.

Fix in Codex Fix in Claude Code

// inflated the generated count against the others.
qualityMetricsStore.recordSuppressed(reason: "discardedStaleContext")
logStage(
"stale-drop",
workID: workID,
Expand All @@ -514,6 +525,11 @@ extension SuggestionCoordinator {
clearSuggestion()
hideOverlay(reason: "Overlay hidden because the model returned an empty continuation.")
state = .idle
// The router already counted engine-attributed suppressions (normalizer, confidence
// floor); only the unattributed "model produced nothing" case needs a ledger entry.
if result.suppressionReason == nil {
qualityMetricsStore.recordSuppressed(reason: "emptyUnattributed")
}
logStage(
"empty-result",
workID: workID,
Expand All @@ -529,6 +545,7 @@ extension SuggestionCoordinator {
clearSuggestion(clearDiagnostics: true)
hideOverlay(reason: "Overlay hidden because text is selected.")
state = .idle
qualityMetricsStore.recordSuppressed(reason: "discardedSelection")
logStage(
"selected-text",
workID: workID,
Expand All @@ -553,6 +570,7 @@ extension SuggestionCoordinator {
clearSuggestion(clearDiagnostics: false)
hideOverlay(reason: "Overlay hidden because the regeneration only echoed the just-accepted text before the host published it.")
state = .idle
qualityMetricsStore.recordSuppressed(reason: "discardedAcceptEcho")
logStage(
"stale-accept-echo",
workID: workID,
Expand All @@ -576,6 +594,8 @@ extension SuggestionCoordinator {
clearSuggestion()
hideOverlay(reason: "Overlay hidden because the completion failed the seam guard.")
state = .idle
let seamReason = if case .seamMisspelling = seamVerdict { "seamMisspelling" } else { "seamJunkPunctuationRun" }
qualityMetricsStore.recordSuppressed(reason: seamReason)
logStage(
"seam-suppressed",
workID: workID,
Expand All @@ -589,6 +609,9 @@ extension SuggestionCoordinator {

latestLatencyMilliseconds = Int(result.latency * 1000)
latestGenerationNumber = liveContext.generation
// One shown event per suggestion: this is the only place a fresh generation becomes
// visible (re-presentations after partial accepts reuse the same session).
qualityMetricsStore.recordShown()
let session = interactionState.startSession(
fullText: result.text,
liveContext: liveContext,
Expand Down
5 changes: 5 additions & 0 deletions Cotabby/App/Coordinators/SuggestionCoordinator.swift
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ final class SuggestionCoordinator: ObservableObject {
/// `CotabbyAppEnvironment`) so the underlying `NSSpellChecker` document tag persists across the
/// coordinator's lifetime instead of churning per keystroke.
let spellChecker: CurrentWordSpellChecker
/// Always-on quality counters (shown / suppressed / accepted). The router counts generation
/// outcomes; the coordinator owns the display-time and acceptance events only it can see.
let qualityMetricsStore: SuggestionQualityMetricsStore
/// Frequency-ranked correction source (SymSpell). Used first for the correction word, with
/// `spellChecker` as the fallback while its index is still loading or when it has no suggestion.
let symSpellCorrector: SymSpellCorrector
Expand Down Expand Up @@ -163,6 +166,7 @@ final class SuggestionCoordinator: ObservableObject {
spellChecker: CurrentWordSpellChecker,
symSpellCorrector: SymSpellCorrector,
spellingLanguageResolver: SpellingLanguageResolver = SpellingLanguageResolver(),
qualityMetricsStore: SuggestionQualityMetricsStore,
userDefaults: UserDefaults = .standard
) {
let storedTotalTabAcceptedWordCount = userDefaults.integer(
Expand All @@ -184,6 +188,7 @@ final class SuggestionCoordinator: ObservableObject {
self.spellChecker = spellChecker
self.symSpellCorrector = symSpellCorrector
self.spellingLanguageResolver = spellingLanguageResolver
self.qualityMetricsStore = qualityMetricsStore
self.userDefaults = userDefaults
settingsSnapshot = suggestionSettings.snapshot
// These collaborators isolate "how overlay/logging works" from "when the coordinator
Expand Down
10 changes: 9 additions & 1 deletion Cotabby/App/Core/CotabbyAppEnvironment.swift
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ final class CotabbyAppEnvironment {
let welcomeCoordinator: WelcomeCoordinator
let huggingFaceSearchService: HuggingFaceSearchService
let performanceMetricsStore: PerformanceMetricsStore
let qualityMetricsStore: SuggestionQualityMetricsStore
let settingsCoordinator: SettingsCoordinator
let activationIndicatorController: ActivationIndicatorController
let focusDebugOverlayController: FocusDebugOverlayController?
Expand Down Expand Up @@ -113,6 +114,9 @@ final class CotabbyAppEnvironment {
)
let huggingFaceSearchService = HuggingFaceSearchService()
let performanceMetricsStore = PerformanceMetricsStore()
// Always-on quality counters (generated / shown / suppressed-by-reason / accepted).
// Counters only, no content, so unlike latency tracking there is no opt-in gate.
let qualityMetricsStore = SuggestionQualityMetricsStore()
// Live CPU/RAM graph backing for the Performance pane. Holds no state until the pane asks it
// to start sampling, so constructing it eagerly here costs nothing.
let systemMetricsStore = SystemMetricsStore()
Expand Down Expand Up @@ -157,6 +161,7 @@ final class CotabbyAppEnvironment {
foundationModelEngine: foundationModelEngine,
llamaEngine: LlamaSuggestionEngine(runtimeManager: runtimeManager),
performanceMetricsStore: performanceMetricsStore,
qualityMetricsStore: qualityMetricsStore,
llamaModelNameProvider: { [weak runtimeManager] in
runtimeManager?.currentModelFilename
}
Expand All @@ -176,6 +181,7 @@ final class CotabbyAppEnvironment {
modelDownloadManager: modelDownloadManager,
huggingFaceSearchService: huggingFaceSearchService,
performanceMetricsStore: performanceMetricsStore,
qualityMetricsStore: qualityMetricsStore,
systemMetricsStore: systemMetricsStore,
onShowWelcome: { [weak welcomeCoordinator] in
welcomeCoordinator?.showWelcome()
Expand Down Expand Up @@ -213,7 +219,8 @@ final class CotabbyAppEnvironment {
configuration: configuration,
spellChecker: spellChecker,
symSpellCorrector: symSpellCorrector,
spellingLanguageResolver: SpellingLanguageResolver()
spellingLanguageResolver: SpellingLanguageResolver(),
qualityMetricsStore: qualityMetricsStore
)

// The emoji picker is a sibling to the suggestion coordinator. It reuses the input monitor,
Expand Down Expand Up @@ -276,6 +283,7 @@ final class CotabbyAppEnvironment {
self.welcomeCoordinator = welcomeCoordinator
self.huggingFaceSearchService = huggingFaceSearchService
self.performanceMetricsStore = performanceMetricsStore
self.qualityMetricsStore = qualityMetricsStore
self.settingsCoordinator = settingsCoordinator
self.activationIndicatorController = activationIndicatorController
self.focusDebugOverlayController = FocusDebugOverlayController.isEnabled
Expand Down
22 changes: 22 additions & 0 deletions Cotabby/Models/LlamaRuntimeModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,28 @@ struct LlamaGenerationOptions: Equatable, Sendable {
/// degenerate instant stops (e.g. a lone leading period). Lives here so length presets can tune
/// the floor without reaching into `DecodeStopPolicy`; the default preserves prior behavior.
var sentenceStopMinimumTokens: Int = 2

/// Stop decoding the moment the raw distribution's most-likely next token is end-of-generation,
/// even when the stochastic sampler drew something else. The model's top choice being "stop"
/// is the strongest anti-rambling signal available per token, and the engine computes it while
/// the logits row is hot, so honoring it costs nothing here.
var stopAtArgmaxEOG: Bool = true
}

/// The result of a single generation: the produced text together with the confidence signals a
/// caller needs for suppression accounting.
///
/// A structured value is returned rather than a bare string so that a completion withheld for low
/// confidence is attributed to that reason instead of looking like "the model produced nothing".
struct LlamaGenerationOutput: Equatable, Sendable {
    /// The text carried by this output.
    let text: String

    /// Mean per-token log-probability of the generated tokens. `nil` when confidence gating was
    /// off (the engine then skips the per-token logprob work entirely) or nothing was generated.
    let averageLogprob: Double?

    /// `true` when the completion was withheld because `averageLogprob` fell below the floor.
    let suppressedByLowConfidence: Bool

    /// Wraps plain text in an output that carries no confidence information: no average logprob
    /// and not marked as suppressed.
    ///
    /// - Parameter text: The text to wrap.
    /// - Returns: An output with `averageLogprob == nil` and `suppressedByLowConfidence == false`.
    static func text(_ text: String) -> LlamaGenerationOutput {
        return Self(
            text: text,
            averageLogprob: nil,
            suppressedByLowConfidence: false
        )
    }
}

/// The concrete runtime assets selected during bootstrap after checking available model files.
Expand Down
19 changes: 19 additions & 0 deletions Cotabby/Models/SuggestionModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,25 @@ struct SuggestionResult: Equatable, Sendable {
let rawText: String
let text: String
let latency: TimeInterval
/// Raw value of the `CompletionSuppressionReason` that emptied `text`, when one applies.
/// Carried as a string so the coordinator's quality accounting never needs the normalizer
/// type, and so engine-specific reasons can ride along without enum churn. The explicit
/// initializer default keeps existing call sites compiling unchanged.
let suppressionReason: String?

/// Memberwise-style initializer written out by hand so `suppressionReason` can default to `nil`;
/// call sites that predate the field keep compiling without changes.
///
/// - Parameters:
///   - generation: Stored as-is in `generation`.
///   - rawText: Stored as-is in `rawText`.
///   - text: Stored as-is in `text`.
///   - latency: Stored as-is in `latency`.
///   - suppressionReason: Raw value of the reason that emptied `text`, or `nil` (the default)
///     when no suppression reason applies.
init(
    generation: UInt64,
    rawText: String,
    text: String,
    latency: TimeInterval,
    suppressionReason: String? = nil
) {
    // Plain field-by-field copy; no validation or normalization happens here.
    self.suppressionReason = suppressionReason
    self.latency = latency
    self.text = text
    self.rawText = rawText
    self.generation = generation
}
}

/// Represents one active inline-completion session after the model has produced a suggestion.
Expand Down
86 changes: 86 additions & 0 deletions Cotabby/Models/SuggestionQualityMetricsStore.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import Combine
import Foundation

/// Local, always-on counters that answer "is suggestion quality improving for real use": how many
/// completions were generated, how many were shown, why the withheld ones were withheld, and how
/// many shown suggestions the user actually accepted.
///
/// Latency tracking (`PerformanceMetricsStore`) stays opt-in because it records per-request rows;
/// these are lifetime counters with zero content, so they run unconditionally and survive restarts.
/// Acceptance rate (accepted / shown) is the closest thing to ground truth the app can measure on
/// device, and the suppression histogram tells the difference between "the model produced nothing"
/// and "a specific guard fired", which otherwise only exists scattered through debug-only JSONL.
///
/// Persistence: every recorded event re-encodes the whole `Counters` value as JSON and writes it
/// to the injected `UserDefaults` under a single key; `init` reads that key back.
@MainActor
final class SuggestionQualityMetricsStore: ObservableObject {
    /// The persisted counter snapshot. Decoding is deliberately tolerant of missing keys (see the
    /// `init(from:)` extension below) so a blob written by a build with fewer fields still loads.
    struct Counters: Codable, Equatable {
        var generated = 0
        var shown = 0
        /// Sessions the user accepted at least once. Counted per suggestion, not per Tab press,
        /// so word-by-word acceptance of one suggestion is one acceptance.
        var acceptedSuggestions = 0
        /// Keyed by `CompletionSuppressionReason` raw values plus coordinator-level reasons
        /// (the seam guard verdicts). String-keyed so new reasons never need a schema migration.
        var suppressedByReason: [String: Int] = [:]
        /// Timestamp of the first recorded event; `nil` until something has been recorded.
        var firstRecordedAt: Date?

        /// Sum of every bucket in the suppression histogram.
        var suppressedTotal: Int { suppressedByReason.values.reduce(0, +) }

        /// `acceptedSuggestions / shown`, or `nil` while nothing has been shown. The value is not
        /// clamped: it exceeds 1 if acceptances are recorded without a matching shown event.
        var acceptanceRate: Double? {
            guard shown > 0 else { return nil }
            return Double(acceptedSuggestions) / Double(shown)
        }

        /// Declared explicitly so the tolerant decoder and the synthesized encoder share one key
        /// set. The key names equal the property names, so the on-disk format is unchanged.
        enum CodingKeys: String, CodingKey {
            case generated
            case shown
            case acceptedSuggestions
            case suppressedByReason
            case firstRecordedAt
        }
    }

    @Published private(set) var counters: Counters

    private let userDefaults: UserDefaults
    private static let defaultsKey = "cotabbyQualityMetricsCounters"

    /// Stored-property @MainActor classes deallocated inside app-hosted tests double-free without
    /// an explicitly nonisolated deinit (the isolated-deinit runtime path over-releases). Same
    /// workaround as the other main-actor stores exercised by tests.
    nonisolated deinit {}

    /// Restores previously persisted counters from `userDefaults`.
    ///
    /// Starts from zeroed counters when nothing is stored or the stored blob cannot be decoded
    /// (for example a value of the wrong type); keys that are merely absent decode to defaults.
    ///
    /// - Parameter userDefaults: Backing store; injectable so tests can use an isolated suite.
    init(userDefaults: UserDefaults = .standard) {
        self.userDefaults = userDefaults
        if let data = userDefaults.data(forKey: Self.defaultsKey),
           let decoded = try? JSONDecoder().decode(Counters.self, from: data) {
            counters = decoded
        } else {
            counters = Counters()
        }
    }

    /// Counts one generated completion.
    func recordGenerated() {
        mutate { $0.generated += 1 }
    }

    /// Counts one suggestion shown to the user.
    func recordShown() {
        mutate { $0.shown += 1 }
    }

    /// Counts one accepted suggestion.
    func recordAcceptedSuggestion() {
        mutate { $0.acceptedSuggestions += 1 }
    }

    /// Adds one to the histogram bucket for `reason`, creating the bucket on first use.
    func recordSuppressed(reason: String) {
        mutate { $0.suppressedByReason[reason, default: 0] += 1 }
    }

    /// Zeroes every counter in memory and removes the persisted blob.
    func reset() {
        counters = Counters()
        userDefaults.removeObject(forKey: Self.defaultsKey)
    }

    /// Applies `change` to a copy of the counters, stamps `firstRecordedAt` on the first event,
    /// publishes the result once, and persists it.
    private func mutate(_ change: (inout Counters) -> Void) {
        // Work on a copy so `@Published` fires a single time per event.
        var updated = counters
        change(&updated)
        if updated.firstRecordedAt == nil {
            updated.firstRecordedAt = Date()
        }
        counters = updated
        // Best effort: an encoding failure leaves the in-memory counters authoritative.
        if let data = try? JSONEncoder().encode(updated) {
            userDefaults.set(data, forKey: Self.defaultsKey)
        }
    }
}

extension SuggestionQualityMetricsStore.Counters {
    /// Tolerant decoder: a key missing from the stored blob falls back to that field's default
    /// instead of failing the whole decode. The synthesized decoder requires every non-optional
    /// key to be present, so a blob written before a counter existed would otherwise be rejected
    /// and the lifetime counters silently reset to zero.
    ///
    /// Lives in an extension so the struct keeps its compiler-generated memberwise initializers.
    ///
    /// - Throws: `DecodingError` when a present key holds a value of the wrong type.
    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.init()
        generated = try container.decodeIfPresent(Int.self, forKey: .generated) ?? 0
        shown = try container.decodeIfPresent(Int.self, forKey: .shown) ?? 0
        acceptedSuggestions = try container.decodeIfPresent(Int.self, forKey: .acceptedSuggestions) ?? 0
        suppressedByReason = try container.decodeIfPresent([String: Int].self, forKey: .suppressedByReason) ?? [:]
        firstRecordedAt = try container.decodeIfPresent(Date.self, forKey: .firstRecordedAt)
    }
}
Loading