From d5ca06ef9f4c5cc864a2fc5f2c4f1900004bb9ca Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:44:09 -0700 Subject: [PATCH 1/5] feat(decode): confidence floor at -1.5 and argmax-EOG early stop The runtime now returns a typed output (text, average logprob, withheld flag) instead of a bare string, so a confidence-suppressed completion is attributed as lowConfidence rather than reading as 'the model produced nothing'. The shipped floor of -1.5 mean per-token log-probability came from an eval sweep over nine values: floors at or below -2 never fire on this model at temperature 0.1, -1 and tighter buy precision at a brutal coverage cost, and -1.5 is the unique point where the composite quality score rose (0.734 to 0.744), wrong-shows fell 27% relative (0.188 to 0.137), and zero must-show cases were lost. Enabling the floor turns on per-token logprob computation (eval p50 187ms to 200ms); cotabbyConfidenceFloorOverride adjusts it without a rebuild, and -infinity restores the old posture entirely. The decode loop also stops the moment the raw distribution's most-likely next token is end-of-generation (computed by the engine while the logits row is hot). At temperature 0.1 sampling is near-greedy so the eval shows no delta; the stop exists for the sampling tail where the dist sampler draws past the model's intended stop. cotabbyArgmaxStopDisabled switches it off. 
--- Cotabby/Models/LlamaRuntimeModels.swift | 22 ++++++++++ Cotabby/Models/SuggestionModels.swift | 19 ++++++++ .../Models/SuggestionSubsystemContracts.swift | 11 +++-- .../FoundationModelSuggestionEngine.swift | 3 +- .../Services/Runtime/LlamaRuntimeCore.swift | 33 +++++++++++--- .../Runtime/LlamaRuntimeManager.swift | 4 +- .../Runtime/LlamaSuggestionEngine.swift | 44 ++++++++++++++++--- .../Support/SuggestionTextNormalizer.swift | 5 +++ ...amaSuggestionEngineCancellationTests.swift | 24 ++++++++-- 9 files changed, 145 insertions(+), 20 deletions(-) diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift index 954c44d0..1755e434 100644 --- a/Cotabby/Models/LlamaRuntimeModels.swift +++ b/Cotabby/Models/LlamaRuntimeModels.swift @@ -197,6 +197,28 @@ struct LlamaGenerationOptions: Equatable, Sendable { /// degenerate instant stops (e.g. a lone leading period). Lives here so length presets can tune /// the floor without reaching into `DecodeStopPolicy`; the default preserves prior behavior. var sentenceStopMinimumTokens: Int = 2 + + /// Stop decoding the moment the raw distribution's most-likely next token is end-of-generation, + /// even when the stochastic sampler drew something else. The model's top choice being "stop" + /// is the strongest anti-rambling signal available per token, and the engine computes it while + /// the logits row is hot, so honoring it costs nothing here. + var stopAtArgmaxEOG: Bool = true +} + +/// One generation's text plus the confidence signals the caller needs for suppression accounting. +/// Returned instead of a bare string so a confidence-suppressed completion is attributed to the +/// real reason rather than reading as "the model produced nothing". 
+struct LlamaGenerationOutput: Equatable, Sendable { + let text: String + /// Mean per-token log-probability of the generated tokens; nil when confidence gating was off + /// (the engine skips the per-token logprob work entirely) or nothing was generated. + let averageLogprob: Double? + /// True when the completion was withheld because `averageLogprob` fell below the floor. + let suppressedByLowConfidence: Bool + + static func text(_ text: String) -> LlamaGenerationOutput { + LlamaGenerationOutput(text: text, averageLogprob: nil, suppressedByLowConfidence: false) + } } /// The concrete runtime assets selected during bootstrap after checking available model files. diff --git a/Cotabby/Models/SuggestionModels.swift b/Cotabby/Models/SuggestionModels.swift index a0f55c88..dfc44b15 100644 --- a/Cotabby/Models/SuggestionModels.swift +++ b/Cotabby/Models/SuggestionModels.swift @@ -449,6 +449,25 @@ struct SuggestionResult: Equatable, Sendable { let rawText: String let text: String let latency: TimeInterval + /// Raw value of the `CompletionSuppressionReason` that emptied `text`, when one applies. + /// Carried as a string so the coordinator's quality accounting never needs the normalizer + /// type, and so engine-specific reasons can ride along without enum churn. The explicit + /// initializer default keeps existing call sites compiling unchanged. + let suppressionReason: String? + + init( + generation: UInt64, + rawText: String, + text: String, + latency: TimeInterval, + suppressionReason: String? = nil + ) { + self.generation = generation + self.rawText = rawText + self.text = text + self.latency = latency + self.suppressionReason = suppressionReason + } } /// Represents one active inline-completion session after the model has produced a suggestion. 
diff --git a/Cotabby/Models/SuggestionSubsystemContracts.swift b/Cotabby/Models/SuggestionSubsystemContracts.swift index 77ece468..5f0acb63 100644 --- a/Cotabby/Models/SuggestionSubsystemContracts.swift +++ b/Cotabby/Models/SuggestionSubsystemContracts.swift @@ -127,16 +127,21 @@ extension SuggestionGenerating { /// a fake runtime instead of loading a real model. `LlamaRuntimeManager` is the production conformer. @MainActor protocol LlamaRuntimeGenerating: AnyObject { - func generate(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws -> String + func generate( + prompt: String, + cachedPrefixBytes: Int?, + options: LlamaGenerationOptions + ) async throws -> LlamaGenerationOutput /// Streaming variant: `onPartialRawText` receives the cumulative raw completion after each /// sampled token, called from the decode thread (hence `@Sendable`); callers own hopping to - /// their actor. The returned string is still the authoritative final completion. + /// their actor. The returned output's text is still the authoritative final completion, and + /// its confidence fields describe the whole generation (partials are pre-gate by nature). func generate( prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions, onPartialRawText: (@Sendable (String) -> Void)? - ) async throws -> String + ) async throws -> LlamaGenerationOutput func resetPromptCache() /// Decodes `prompt` into the native prompt cache without sampling any tokens, so the next /// `generate` whose prompt extends this one only decodes the typed delta. 
Best-effort warmup: diff --git a/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift b/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift index 570a6dc9..fae711e0 100644 --- a/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift @@ -175,7 +175,8 @@ final class FoundationModelSuggestionEngine { generation: request.generation, rawText: rawSuggestion, text: normalizedSuggestion, - latency: latency + latency: latency, + suppressionReason: normalization.suppression?.rawValue ) } catch is CancellationError { CotabbyLogger.suggestion.debug("Foundation model generation cancelled", metadata: baseMetadata) diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index f9313d6c..492a25c2 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -140,7 +140,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { cachedPrefixBytes: Int? = nil, options: LlamaGenerationOptions, onPartialRawText: ((String) -> Void)? = nil - ) throws -> String { + ) throws -> LlamaGenerationOutput { let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "generate") lifecycleCondition.lock() @@ -199,7 +199,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { engine.destroySequence(sequenceID) autocompleteSequenceID = -1 } - return decode.text + return decode.output } /// Decodes `prompt` into the autocomplete KV cache without sampling, so the next `generate` @@ -364,7 +364,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { sequenceID: Int32, options: LlamaGenerationOptions, onPartialRawText: ((String) -> Void)? 
= nil - ) -> (text: String, engineCancelled: Bool) { + ) -> (output: LlamaGenerationOutput, engineCancelled: Bool) { var generatedText = "" var tokensGenerated = 0 var sumLogprob = 0.0 @@ -392,6 +392,14 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { stopReason = "eos" break } + // The raw distribution's most-likely token is end-of-generation: the model wants to + // stop here even though the stochastic sampler drew something else. Finalize with the + // text accumulated so far and discard the sampled-but-unwanted token; this is the + // anti-rambling stop the sentence classifier cannot express (lists, fragments, code). + if options.stopAtArgmaxEOG, result.argmax_is_eog { + stopReason = "argmax_eog" + break + } let piece = Self.extractPiece(result) generatedText += piece @@ -428,10 +436,25 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { ] ) + // The average is only meaningful when the engine actually computed per-token logprobs, + // which is keyed on the floor being enabled (see setComputeLogprob at sequence setup). + let averageLogprob: Double? = options.confidenceFloor > -.infinity && tokensGenerated > 0 + ? 
sumLogprob / Double(tokensGenerated) + : nil if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) { - return ("", engineCancelled) + let suppressed = LlamaGenerationOutput( + text: "", + averageLogprob: averageLogprob, + suppressedByLowConfidence: true + ) + return (suppressed, engineCancelled) } - return (generatedText, engineCancelled) + let output = LlamaGenerationOutput( + text: generatedText, + averageLogprob: averageLogprob, + suppressedByLowConfidence: false + ) + return (output, engineCancelled) } /// Low-confidence gate for the sampled decoder: drop completions the model itself was unsure diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift index 889522ca..af02d8e5 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift @@ -100,7 +100,7 @@ final class LlamaRuntimeManager: ObservableObject { prompt: String, cachedPrefixBytes: Int? = nil, options: LlamaGenerationOptions - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { try await generate( prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, @@ -116,7 +116,7 @@ final class LlamaRuntimeManager: ObservableObject { cachedPrefixBytes: Int? = nil, options: LlamaGenerationOptions, onPartialRawText: (@Sendable (String) -> Void)? - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { _ = try await preparedRuntime() let core = self.core diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift index 4092bbb9..d38d6407 100644 --- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift @@ -20,6 +20,24 @@ final class LlamaSuggestionEngine { self.runtimeManager = runtimeManager } + /// Shipped confidence floor (mean per-token log-probability). 
-infinity disables the gate AND + /// the per-token logprob computation behind it; any other value turns both on. -1.5 came from + /// an eval sweep over {off, -4, -3, -2.5, -2, -1.5, -1, -0.5, -0.3}: floors at or below -2 + /// never fire on this model at temperature 0.1, -1 and tighter buy precision at a brutal + /// coverage cost, and -1.5 is the unique point where the composite quality score rose + /// (0.734 to 0.744), wrong-shows fell 27% relative, and not a single must-show case was lost. + static let defaultConfidenceFloor: Double = -1.5 + /// `defaults write` escape hatches for dogfooding and field diagnosis without a rebuild. + static let confidenceFloorOverrideKey = "cotabbyConfidenceFloorOverride" + static let argmaxStopDisabledKey = "cotabbyArgmaxStopDisabled" + + private static func resolvedConfidenceFloor(_ defaults: UserDefaults = .standard) -> Double { + guard defaults.object(forKey: confidenceFloorOverrideKey) != nil else { + return defaultConfidenceFloor + } + return defaults.double(forKey: confidenceFloorOverrideKey) + } + /// Prefills the prompt KV for the field the user just focused, so the first real suggestion /// there only decodes the typed delta instead of the whole cold prompt. 
/// @@ -89,9 +107,9 @@ final class LlamaSuggestionEngine { ]) { _, new in new } ) let options = Self.makeGenerationOptions(for: request) - let rawSuggestion: String + let output: LlamaGenerationOutput if let onPartial { - rawSuggestion = try await runtimeManager.generate( + output = try await runtimeManager.generate( prompt: request.prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, @@ -115,7 +133,7 @@ final class LlamaSuggestionEngine { } ) } else { - rawSuggestion = try await runtimeManager.generate( + output = try await runtimeManager.generate( prompt: request.prompt, cachedPrefixBytes: cachedPrefixBytes, options: options @@ -124,7 +142,15 @@ final class LlamaSuggestionEngine { try Task.checkCancellation() promptCacheHintTracker.recordSuccessfulRequest(request) - let normalization = SuggestionTextNormalizer.normalizeDetailed(rawSuggestion, for: request) + let rawSuggestion = output.text + // A confidence-suppressed completion never reaches the normalizer (the runtime already + // withheld the text); attribute the real reason instead of "the model produced nothing". + // Streamed partials are pre-gate by nature: a completion the floor later withholds can + // briefly paint and then clear, which is the same contract as any other final-result + // suppression under streaming. + let normalization = output.suppressedByLowConfidence + ? SuggestionNormalizationResult(text: "", suppression: .lowConfidence) + : SuggestionTextNormalizer.normalizeDetailed(rawSuggestion, for: request) let normalizedSuggestion = normalization.text let latency = Date().timeIntervalSince(startTime) let rawChars = rawSuggestion.count @@ -133,12 +159,14 @@ final class LlamaSuggestionEngine { // `suppression_reason` distinguishes an empty ghost text caused by the model producing // nothing from one a filter dropped — the join key for judging decode quality on device. let suppressionReason = normalization.suppression?.rawValue ?? 
"none" + let averageLogprobDescription = output.averageLogprob.map { String(format: "%.3f", $0) } ?? "off" CotabbyLogger.suggestion.debug( "Llama generated", metadata: baseMetadata.merging([ "raw_chars": .stringConvertible(rawChars), "normalized_chars": .stringConvertible(normalizedChars), "suppression_reason": .string(suppressionReason), + "avg_logprob": .string(averageLogprobDescription), "latency_ms": .stringConvertible(latencyMs) ]) { _, new in new } ) @@ -152,6 +180,7 @@ final class LlamaSuggestionEngine { "raw_chars": .stringConvertible(rawChars), "normalized_chars": .stringConvertible(normalizedChars), "suppression_reason": .string(suppressionReason), + "avg_logprob": .string(averageLogprobDescription), "latency_ms": .stringConvertible(latencyMs), "cache_hint_bytes": .string(hintDesc), "max_tokens": .stringConvertible(request.maxPredictionTokens) @@ -161,7 +190,8 @@ final class LlamaSuggestionEngine { generation: request.generation, rawText: rawSuggestion, text: normalizedSuggestion, - latency: latency + latency: latency, + suppressionReason: normalization.suppression?.rawValue ) } catch is CancellationError { CotabbyLogger.suggestion.debug("Llama generation cancelled", metadata: baseMetadata) @@ -233,7 +263,9 @@ final class LlamaSuggestionEngine { forceWordContinuation: MidWordContinuationPolicy.shouldForceContinuation( precedingText: request.context.precedingText, trailingText: request.context.trailingText - ) + ), + confidenceFloor: resolvedConfidenceFloor(), + stopAtArgmaxEOG: !UserDefaults.standard.bool(forKey: argmaxStopDisabledKey) ) } } diff --git a/Cotabby/Support/SuggestionTextNormalizer.swift b/Cotabby/Support/SuggestionTextNormalizer.swift index eafd1422..8a90df82 100644 --- a/Cotabby/Support/SuggestionTextNormalizer.swift +++ b/Cotabby/Support/SuggestionTextNormalizer.swift @@ -24,6 +24,11 @@ enum CompletionSuppressionReason: String, Sendable, Equatable { case echoesPrecedingText /// Printable characters survived but carried control/replacement 
glyphs the safety gate rejects. case unsafeToInsert + /// The runtime withheld the completion because its mean per-token log-probability fell below + /// the confidence floor: the model itself was unsure, and showing nothing beats a guess. + /// Attributed by the engine (the runtime reports it on `LlamaGenerationOutput`), not by the + /// normalizer, which never sees the withheld text. + case lowConfidence } /// Outcome of normalizing one raw completion: the ghost text, plus the attributable reason when that diff --git a/CotabbyTests/LlamaSuggestionEngineCancellationTests.swift b/CotabbyTests/LlamaSuggestionEngineCancellationTests.swift index 336c6ed0..ba7dcd41 100644 --- a/CotabbyTests/LlamaSuggestionEngineCancellationTests.swift +++ b/CotabbyTests/LlamaSuggestionEngineCancellationTests.swift @@ -53,7 +53,7 @@ final class LlamaSuggestionEngineCancellationTests: XCTestCase { func test_successfulGeneration_doesNotResetCache() async throws { let runtime = FakeLlamaRuntime() - runtime.generateResult = .success("world") + runtime.generateResult = .success(.text("world")) let engine = LlamaSuggestionEngine(runtimeManager: runtime) let result = try await engine.generateSuggestion(for: makeRequest(prompt: "hello ")) @@ -62,6 +62,24 @@ final class LlamaSuggestionEngineCancellationTests: XCTestCase { XCTAssertEqual(runtime.resetCount, 0) } + func test_lowConfidenceSuppression_isAttributedAsLowConfidence() async throws { + let runtime = FakeLlamaRuntime() + runtime.generateResult = .success( + LlamaGenerationOutput(text: "", averageLogprob: -5.2, suppressedByLowConfidence: true) + ) + let engine = LlamaSuggestionEngine(runtimeManager: runtime) + + let result = try await engine.generateSuggestion(for: makeRequest(prompt: "hello ")) + + XCTAssertEqual(result.text, "") + XCTAssertEqual( + result.suppressionReason, + CompletionSuppressionReason.lowConfidence.rawValue, + "a runtime-withheld completion must not read as 'the model produced nothing'" + ) + 
XCTAssertEqual(runtime.resetCount, 0) + } + func test_suggestionClientError_resetsCache_andRethrowsSameError() async { // A `SuggestionClientError` crossing the runtime boundary is a genuine failure, so it must // reset the cache but keep its original case and message for the coordinator's diagnostics. @@ -168,14 +186,14 @@ private struct UnexpectedRuntimeBoom: LocalizedError { /// so the engine's failure routing can be exercised without loading a real model. @MainActor private final class FakeLlamaRuntime: LlamaRuntimeGenerating { - var generateResult: Result = .success("") + var generateResult: Result = .success(.text("")) private(set) var resetCount = 0 func generate( prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { try generateResult.get() } From 8f6d0d22a3adcd564799c27d083cf24854a8c727 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:44:30 -0700 Subject: [PATCH 2/5] feat(metrics): always-on suggestion quality counters with a Performance pane readout Local lifetime counters answering 'is quality improving for real use': generations, suggestions shown, why withheld ones were withheld (reason histogram spanning the normalizer, the confidence floor, and the seam guard), and how many shown suggestions were accepted (counted once per suggestion, so word-by-word walks do not inflate the rate). The router counts generation outcomes because it is the single point every finished result passes through; the coordinator records the display-time and acceptance events only it can see. Counters carry no content, so unlike the per-request latency log there is no opt-in gate; the Performance pane shows the counts, acceptance rate, and top withhold reasons with a reset control, indexed for settings search. 
Acceptance rate over the suppression histogram is the on-device ground truth that decides whether future decode changes actually help. --- Cotabby.xcodeproj/project.pbxproj | 10 +++ .../Coordinators/SettingsCoordinator.swift | 4 + .../SuggestionCoordinator+Acceptance.swift | 9 ++ .../SuggestionCoordinator+Prediction.swift | 5 ++ .../Coordinators/SuggestionCoordinator.swift | 5 ++ Cotabby/App/Core/CotabbyAppEnvironment.swift | 10 ++- .../SuggestionQualityMetricsStore.swift | 86 +++++++++++++++++++ .../Runtime/SuggestionEngineRouter.swift | 18 ++++ .../Settings/Panes/PerformancePaneView.swift | 68 +++++++++++++++ .../UI/Settings/SettingsContainerView.swift | 2 + Cotabby/UI/Settings/SettingsIndex.swift | 8 +- CotabbyTests/PromptPolicyTests.swift | 1 + ...SuggestionCoordinatorAcceptanceTests.swift | 3 + .../SuggestionQualityMetricsStoreTests.swift | 59 +++++++++++++ 14 files changed, 286 insertions(+), 2 deletions(-) create mode 100644 Cotabby/Models/SuggestionQualityMetricsStore.swift create mode 100644 CotabbyTests/SuggestionQualityMetricsStoreTests.swift diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj index a3fbc84e..c19605ad 100644 --- a/Cotabby.xcodeproj/project.pbxproj +++ b/Cotabby.xcodeproj/project.pbxproj @@ -70,6 +70,7 @@ 1681C0F22323FB1156579D99 /* AGPL-3.0.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6F0EE728C0B1A7AD6B19CD0C /* AGPL-3.0.txt */; }; 175C4FA56C29DEE58C2D4D7E /* SuggestionSettingsModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 86460C747AA883FDE756BDBA /* SuggestionSettingsModel.swift */; }; 18382D1919D90E3C1EE143C2 /* AppSurfaceClassifierTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = C451E144D220D5C63372A8C0 /* AppSurfaceClassifierTests.swift */; }; + 18680D0D66469A2954A50B6C /* SuggestionQualityMetricsStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = 81718CA62FBC775A6CEBCED1 /* SuggestionQualityMetricsStore.swift */; }; 1899BC5A35DC96B4D04B18A5 /* es.txt in Resources */ 
= {isa = PBXBuildFile; fileRef = 0B6816DF5D33863F966240B4 /* es.txt */; }; 19386985A3A91D0843092086 /* AboutPaneView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A3FA53BBC3D81503C1D17477 /* AboutPaneView.swift */; }; 19CA1BF8B508E0E219EF4485 /* SuggestionEngineModelsTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 470A7DAE3D6A2C873B395AE3 /* SuggestionEngineModelsTests.swift */; }; @@ -248,6 +249,7 @@ 55EDBFF489D4C31276E2A67F /* PermissionHostApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = B6ACCB12E4DB32D2F2BEA567 /* PermissionHostApp.swift */; }; 5614E22EAA5F5C37A9E4F7B6 /* LlamaRuntimeManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = A52D0B550E00EF173A5D157E /* LlamaRuntimeManager.swift */; }; 56611BA0087710277140E9E6 /* DisplayCoordinateConverterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = C1C5DE0F3FF63545000E2453 /* DisplayCoordinateConverterTests.swift */; }; + 5687320132AD97B4086260DF /* SuggestionQualityMetricsStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = 81718CA62FBC775A6CEBCED1 /* SuggestionQualityMetricsStore.swift */; }; 576B3FF30FB457EF04F9A715 /* SuggestionTextColorCodec.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1CE61E74928C221B8BB261C6 /* SuggestionTextColorCodec.swift */; }; 586B36CD813E1432D0AB1380 /* DecodeStopPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = D12ABBCE23A946C22894945B /* DecodeStopPolicy.swift */; }; 58AC3193D846FDE88513377D /* BundledRuntimeLocatorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 18D990E515E1AE4F312F4E95 /* BundledRuntimeLocatorTests.swift */; }; @@ -297,6 +299,7 @@ 66D9E37B12A9265D4733E72E /* LlamaRuntimeCore.swift in Sources */ = {isa = PBXBuildFile; fileRef = 944065A858D9BC936CB12B23 /* LlamaRuntimeCore.swift */; }; 68DA5F93B7185B4F5E6DB4C3 /* it.txt in Resources */ = {isa = PBXBuildFile; fileRef = 0397F1DACB094A0F6A66BC0E /* it.txt */; }; 6955C3A4D7AB3EEF7FA7C469 /* InputSuppressionController.swift in Sources */ = 
{isa = PBXBuildFile; fileRef = 2D1F9CEBAB0F330F8E7B61D8 /* InputSuppressionController.swift */; }; + 695E431AC3FF79769E2C5EEF /* SuggestionQualityMetricsStoreTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = B4CC566AC1DE33FD0CD30E1E /* SuggestionQualityMetricsStoreTests.swift */; }; 6A4E62EC9B7B970695F87136 /* TextDirectionDetector.swift in Sources */ = {isa = PBXBuildFile; fileRef = 328847A0F494360033366791 /* TextDirectionDetector.swift */; }; 6A8454A989104AE150308BCF /* it-100k.txt in Resources */ = {isa = PBXBuildFile; fileRef = 2D8AA55C2B730110E8598F91 /* it-100k.txt */; }; 6AE0B46FB52D189D94E1F79A /* WordCountFormatterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1E0513E3B23937B099A3CFF2 /* WordCountFormatterTests.swift */; }; @@ -840,6 +843,7 @@ 7F4C4A7EAF886E0CC945BFEF /* TerminalAppDetector.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TerminalAppDetector.swift; sourceTree = "<group>"; }; 807148A920E003DEF8BA6092 /* SystemMetricsStore.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SystemMetricsStore.swift; sourceTree = "<group>"; }; 815F2ABAF6AB75DA3AFBBCEF /* WordCountFormatter.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WordCountFormatter.swift; sourceTree = "<group>"; }; + 81718CA62FBC775A6CEBCED1 /* SuggestionQualityMetricsStore.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionQualityMetricsStore.swift; sourceTree = "<group>"; }; 82E7794DF60664B1FA8F6E7B /* UnitConversionEvaluator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = UnitConversionEvaluator.swift; sourceTree = "<group>"; }; 82F7F7355967725162DF2D1B /* CustomRulesEditor.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CustomRulesEditor.swift; sourceTree = "<group>"; }; 83A810F9D28A18BA6F2066C7 /* MenuBarSections.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = 
MenuBarSections.swift; sourceTree = "<group>"; }; @@ -929,6 +933,7 @@ B41F06FEF208B30ECCF23A6F /* MacroModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MacroModels.swift; sourceTree = "<group>"; }; B424E2AC97C99D335B0D5751 /* SuggestionTextNormalizer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionTextNormalizer.swift; sourceTree = "<group>"; }; B4B4A2E2DD6733658EC05BD8 /* DownloadFileRescuer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadFileRescuer.swift; sourceTree = "<group>"; }; + B4CC566AC1DE33FD0CD30E1E /* SuggestionQualityMetricsStoreTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionQualityMetricsStoreTests.swift; sourceTree = "<group>"; }; B6ACCB12E4DB32D2F2BEA567 /* PermissionHostApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PermissionHostApp.swift; sourceTree = "<group>"; }; B6D36DB66629CF22C1783945 /* CompletionSeamGuardTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CompletionSeamGuardTests.swift; sourceTree = "<group>"; }; B6D42CD456B4B3C988B148A6 /* FocusTrackingModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusTrackingModel.swift; sourceTree = "<group>"; }; @@ -1304,6 +1309,7 @@ F4D9DF8723AF32C058BFACDE /* SpellingDictionaryCatalog.swift */, ADBE3E6CC585C1683787C877 /* SuggestionEngineModels.swift */, 386C98FFCF76EC1C8C7E82BB /* SuggestionModels.swift */, + 81718CA62FBC775A6CEBCED1 /* SuggestionQualityMetricsStore.swift */, D93563FDA25DFC0038E5F887 /* SuggestionSettingsData.swift */, 86460C747AA883FDE756BDBA /* SuggestionSettingsModel.swift */, DEB16474A67CE1D210B944C9 /* SuggestionSubsystemContracts.swift */, @@ -1466,6 +1472,7 @@ 45A896811745673061AF3612 /* SuggestionFocusFreshnessTests.swift */, 8CB1D4F2681FAF59014AE115 /* SuggestionInteractionStateTests.swift */, CDB25ABC4FFB0E63477CDCB0 /* 
SuggestionOverlayStabilityGateTests.swift */, + B4CC566AC1DE33FD0CD30E1E /* SuggestionQualityMetricsStoreTests.swift */, EE94342B888A5A2CCF66BC93 /* SuggestionRequestFactoryTests.swift */, 9C8F07AC52C7A482F5FE34C5 /* SuggestionSessionReconcilerTests.swift */, 00BB95A341A8B5F4A1725640 /* SuggestionSettingsModelTests.swift */, @@ -2081,6 +2088,7 @@ 7DEFC57991AB0C5379AD9CBF /* SuggestionModels.swift in Sources */, EE2C9177CE615298595215A8 /* SuggestionOverlayPresenter.swift in Sources */, DFF3AA49E0770DE3CFBC24C1 /* SuggestionOverlayStabilityGate.swift in Sources */, + 18680D0D66469A2954A50B6C /* SuggestionQualityMetricsStore.swift in Sources */, B691B8378FD73E186A72450C /* SuggestionRequestFactory.swift in Sources */, 532283A7651F7E66635F4281 /* SuggestionSessionReconciler.swift in Sources */, C8CA6DACEAA83336551D4EFA /* SuggestionSettingsData.swift in Sources */, @@ -2310,6 +2318,7 @@ 0AF568AB234033BA2DE4CAA7 /* SuggestionModels.swift in Sources */, 02DA43985CDAE6859014F14F /* SuggestionOverlayPresenter.swift in Sources */, 0F3267956257401F39386773 /* SuggestionOverlayStabilityGate.swift in Sources */, + 5687320132AD97B4086260DF /* SuggestionQualityMetricsStore.swift in Sources */, 46F341472191BC451B6BF6B5 /* SuggestionRequestFactory.swift in Sources */, CA5B2D226FBAA5419E78F14F /* SuggestionSessionReconciler.swift in Sources */, 7EEE6AEBFBD419FFE7C544BA /* SuggestionSettingsData.swift in Sources */, @@ -2472,6 +2481,7 @@ 5CED06E89FBEF557DCD6C684 /* SuggestionFocusFreshnessTests.swift in Sources */, 6CBEF02FCDFCF406E378C27C /* SuggestionInteractionStateTests.swift in Sources */, 4C6D8ED0A7B45D2EADF06DA5 /* SuggestionOverlayStabilityGateTests.swift in Sources */, + 695E431AC3FF79769E2C5EEF /* SuggestionQualityMetricsStoreTests.swift in Sources */, B93AB7E845086F6FBB068369 /* SuggestionRequestFactoryTests.swift in Sources */, 7E9413CE7C999C4612348248 /* SuggestionSessionReconcilerTests.swift in Sources */, 7C6D42EAD04C8144538B132A /* SuggestionSettingsModelTests.swift 
in Sources */, diff --git a/Cotabby/App/Coordinators/SettingsCoordinator.swift b/Cotabby/App/Coordinators/SettingsCoordinator.swift index 96a8132e..092b54a4 100644 --- a/Cotabby/App/Coordinators/SettingsCoordinator.swift +++ b/Cotabby/App/Coordinators/SettingsCoordinator.swift @@ -20,6 +20,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate { private let modelDownloadManager: ModelDownloadManager private let huggingFaceSearchService: HuggingFaceSearchService private let performanceMetricsStore: PerformanceMetricsStore + private let qualityMetricsStore: SuggestionQualityMetricsStore private let systemMetricsStore: SystemMetricsStore private let onShowWelcome: () -> Void private let clearEmojiHistory: () -> Void @@ -36,6 +37,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate { modelDownloadManager: ModelDownloadManager, huggingFaceSearchService: HuggingFaceSearchService, performanceMetricsStore: PerformanceMetricsStore, + qualityMetricsStore: SuggestionQualityMetricsStore, systemMetricsStore: SystemMetricsStore, onShowWelcome: @escaping () -> Void, clearEmojiHistory: @escaping () -> Void @@ -49,6 +51,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate { self.modelDownloadManager = modelDownloadManager self.huggingFaceSearchService = huggingFaceSearchService self.performanceMetricsStore = performanceMetricsStore + self.qualityMetricsStore = qualityMetricsStore self.systemMetricsStore = systemMetricsStore self.onShowWelcome = onShowWelcome self.clearEmojiHistory = clearEmojiHistory @@ -76,6 +79,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate { modelDownloadManager: modelDownloadManager, huggingFaceSearchService: huggingFaceSearchService, performanceMetricsStore: performanceMetricsStore, + qualityMetricsStore: qualityMetricsStore, systemMetricsStore: systemMetricsStore, onShowWelcome: onShowWelcome, clearEmojiHistory: clearEmojiHistory diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Acceptance.swift 
b/Cotabby/App/Coordinators/SuggestionCoordinator+Acceptance.swift index f5290b64..ba27b091 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator+Acceptance.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Acceptance.swift @@ -122,6 +122,7 @@ extension SuggestionCoordinator { deferAcceptanceBookkeeping { [weak self] in self?.recordAcceptedWords(from: acceptedChunk) + self?.recordSuggestionAcceptedIfFirstChunk(of: sessionForAcceptance) } cancelPredictionWork() @@ -563,6 +564,14 @@ extension SuggestionCoordinator { } } + /// Marks the session's suggestion accepted in the quality counters, once per suggestion: only + /// the first chunk counts, so word-by-word walks of one suggestion add nothing further and the + /// acceptance rate stays suggestions-accepted over suggestions-shown. + private func recordSuggestionAcceptedIfFirstChunk(of session: ActiveSuggestionSession) { + guard session.consumedCharacterCount == 0 else { return } + qualityMetricsStore.recordAcceptedSuggestion() + } + /// Updates the global productivity counter from text accepted via Tab. 
func recordAcceptedWords(from acceptedChunk: String) { let acceptedWordCount = SuggestionSessionReconciler.acceptedWordCount(in: acceptedChunk) diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift index 33b009b4..faff1fc3 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift @@ -576,6 +576,8 @@ extension SuggestionCoordinator { clearSuggestion() hideOverlay(reason: "Overlay hidden because the completion failed the seam guard.") state = .idle + let seamReason = if case .seamMisspelling = seamVerdict { "seamMisspelling" } else { "seamJunkPunctuationRun" } + qualityMetricsStore.recordSuppressed(reason: seamReason) logStage( "seam-suppressed", workID: workID, @@ -589,6 +591,9 @@ extension SuggestionCoordinator { latestLatencyMilliseconds = Int(result.latency * 1000) latestGenerationNumber = liveContext.generation + // One shown event per suggestion: this is the only place a fresh generation becomes + // visible (re-presentations after partial accepts reuse the same session). + qualityMetricsStore.recordShown() let session = interactionState.startSession( fullText: result.text, liveContext: liveContext, diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator.swift b/Cotabby/App/Coordinators/SuggestionCoordinator.swift index 61516984..18733160 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator.swift @@ -54,6 +54,9 @@ final class SuggestionCoordinator: ObservableObject { /// `CotabbyAppEnvironment`) so the underlying `NSSpellChecker` document tag persists across the /// coordinator's lifetime instead of churning per keystroke. let spellChecker: CurrentWordSpellChecker + /// Always-on quality counters (shown / suppressed / accepted). 
The router counts generation + /// outcomes; the coordinator owns the display-time and acceptance events only it can see. + let qualityMetricsStore: SuggestionQualityMetricsStore /// Frequency-ranked correction source (SymSpell). Used first for the correction word, with /// `spellChecker` as the fallback while its index is still loading or when it has no suggestion. let symSpellCorrector: SymSpellCorrector @@ -163,6 +166,7 @@ final class SuggestionCoordinator: ObservableObject { spellChecker: CurrentWordSpellChecker, symSpellCorrector: SymSpellCorrector, spellingLanguageResolver: SpellingLanguageResolver = SpellingLanguageResolver(), + qualityMetricsStore: SuggestionQualityMetricsStore, userDefaults: UserDefaults = .standard ) { let storedTotalTabAcceptedWordCount = userDefaults.integer( @@ -184,6 +188,7 @@ final class SuggestionCoordinator: ObservableObject { self.spellChecker = spellChecker self.symSpellCorrector = symSpellCorrector self.spellingLanguageResolver = spellingLanguageResolver + self.qualityMetricsStore = qualityMetricsStore self.userDefaults = userDefaults settingsSnapshot = suggestionSettings.snapshot // These collaborators isolate "how overlay/logging works" from "when the coordinator diff --git a/Cotabby/App/Core/CotabbyAppEnvironment.swift b/Cotabby/App/Core/CotabbyAppEnvironment.swift index 9eed835a..06d4d389 100644 --- a/Cotabby/App/Core/CotabbyAppEnvironment.swift +++ b/Cotabby/App/Core/CotabbyAppEnvironment.swift @@ -34,6 +34,7 @@ final class CotabbyAppEnvironment { let welcomeCoordinator: WelcomeCoordinator let huggingFaceSearchService: HuggingFaceSearchService let performanceMetricsStore: PerformanceMetricsStore + let qualityMetricsStore: SuggestionQualityMetricsStore let settingsCoordinator: SettingsCoordinator let activationIndicatorController: ActivationIndicatorController let focusDebugOverlayController: FocusDebugOverlayController? 
@@ -113,6 +114,9 @@ final class CotabbyAppEnvironment { ) let huggingFaceSearchService = HuggingFaceSearchService() let performanceMetricsStore = PerformanceMetricsStore() + // Always-on quality counters (generated / shown / suppressed-by-reason / accepted). + // Counters only, no content, so unlike latency tracking there is no opt-in gate. + let qualityMetricsStore = SuggestionQualityMetricsStore() // Live CPU/RAM graph backing for the Performance pane. Holds no state until the pane asks it // to start sampling, so constructing it eagerly here costs nothing. let systemMetricsStore = SystemMetricsStore() @@ -157,6 +161,7 @@ final class CotabbyAppEnvironment { foundationModelEngine: foundationModelEngine, llamaEngine: LlamaSuggestionEngine(runtimeManager: runtimeManager), performanceMetricsStore: performanceMetricsStore, + qualityMetricsStore: qualityMetricsStore, llamaModelNameProvider: { [weak runtimeManager] in runtimeManager?.currentModelFilename } @@ -176,6 +181,7 @@ final class CotabbyAppEnvironment { modelDownloadManager: modelDownloadManager, huggingFaceSearchService: huggingFaceSearchService, performanceMetricsStore: performanceMetricsStore, + qualityMetricsStore: qualityMetricsStore, systemMetricsStore: systemMetricsStore, onShowWelcome: { [weak welcomeCoordinator] in welcomeCoordinator?.showWelcome() @@ -213,7 +219,8 @@ final class CotabbyAppEnvironment { configuration: configuration, spellChecker: spellChecker, symSpellCorrector: symSpellCorrector, - spellingLanguageResolver: SpellingLanguageResolver() + spellingLanguageResolver: SpellingLanguageResolver(), + qualityMetricsStore: qualityMetricsStore ) // The emoji picker is a sibling to the suggestion coordinator. 
It reuses the input monitor, @@ -276,6 +283,7 @@ final class CotabbyAppEnvironment { self.welcomeCoordinator = welcomeCoordinator self.huggingFaceSearchService = huggingFaceSearchService self.performanceMetricsStore = performanceMetricsStore + self.qualityMetricsStore = qualityMetricsStore self.settingsCoordinator = settingsCoordinator self.activationIndicatorController = activationIndicatorController self.focusDebugOverlayController = FocusDebugOverlayController.isEnabled diff --git a/Cotabby/Models/SuggestionQualityMetricsStore.swift b/Cotabby/Models/SuggestionQualityMetricsStore.swift new file mode 100644 index 00000000..12d81612 --- /dev/null +++ b/Cotabby/Models/SuggestionQualityMetricsStore.swift @@ -0,0 +1,86 @@ +import Combine +import Foundation + +/// Local, always-on counters that answer "is suggestion quality improving for real use": how many +/// completions were generated, how many were shown, why the withheld ones were withheld, and how +/// many shown suggestions the user actually accepted. +/// +/// Latency tracking (`PerformanceMetricsStore`) stays opt-in because it records per-request rows; +/// these are lifetime counters with zero content, so they run unconditionally and survive restarts. +/// Acceptance rate (accepted / shown) is the closest thing to ground truth the app can measure on +/// device, and the suppression histogram tells the difference between "the model produced nothing" +/// and "a specific guard fired", which otherwise only exists scattered through debug-only JSONL. +@MainActor +final class SuggestionQualityMetricsStore: ObservableObject { + struct Counters: Codable, Equatable { + var generated = 0 + var shown = 0 + /// Sessions the user accepted at least once. Counted per suggestion, not per Tab press, + /// so word-by-word acceptance of one suggestion is one acceptance. + var acceptedSuggestions = 0 + /// Keyed by `CompletionSuppressionReason` raw values plus coordinator-level reasons + /// (the seam guard verdicts). 
String-keyed so new reasons never need a schema migration. + var suppressedByReason: [String: Int] = [:] + var firstRecordedAt: Date? + + var suppressedTotal: Int { suppressedByReason.values.reduce(0, +) } + + var acceptanceRate: Double? { + guard shown > 0 else { return nil } + return Double(acceptedSuggestions) / Double(shown) + } + } + + @Published private(set) var counters: Counters + + private let userDefaults: UserDefaults + private static let defaultsKey = "cotabbyQualityMetricsCounters" + + /// Stored-property @MainActor classes deallocated inside app-hosted tests double-free without + /// an explicitly nonisolated deinit (the isolated-deinit runtime path over-releases). Same + /// workaround as the other main-actor stores exercised by tests. + nonisolated deinit {} + + init(userDefaults: UserDefaults = .standard) { + self.userDefaults = userDefaults + if let data = userDefaults.data(forKey: Self.defaultsKey), + let decoded = try? JSONDecoder().decode(Counters.self, from: data) { + counters = decoded + } else { + counters = Counters() + } + } + + func recordGenerated() { + mutate { $0.generated += 1 } + } + + func recordShown() { + mutate { $0.shown += 1 } + } + + func recordAcceptedSuggestion() { + mutate { $0.acceptedSuggestions += 1 } + } + + func recordSuppressed(reason: String) { + mutate { $0.suppressedByReason[reason, default: 0] += 1 } + } + + func reset() { + counters = Counters() + userDefaults.removeObject(forKey: Self.defaultsKey) + } + + private func mutate(_ change: (inout Counters) -> Void) { + var updated = counters + change(&updated) + if updated.firstRecordedAt == nil { + updated.firstRecordedAt = Date() + } + counters = updated + if let data = try? 
JSONEncoder().encode(updated) { + userDefaults.set(data, forKey: Self.defaultsKey) + } + } +} diff --git a/Cotabby/Services/Runtime/SuggestionEngineRouter.swift b/Cotabby/Services/Runtime/SuggestionEngineRouter.swift index 65b0cfbe..8d217f6e 100644 --- a/Cotabby/Services/Runtime/SuggestionEngineRouter.swift +++ b/Cotabby/Services/Runtime/SuggestionEngineRouter.swift @@ -11,6 +11,7 @@ final class SuggestionEngineRouter { private let foundationModelEngine: any SuggestionGenerating private let llamaEngine: any SuggestionGenerating private let performanceMetricsStore: PerformanceMetricsStore + private let qualityMetricsStore: SuggestionQualityMetricsStore /// Closure that returns the currently selected llama model filename (e.g. `Qwen3-0.6B-Q8_0.gguf`). /// A closure instead of a direct `LlamaRuntimeManager` reference keeps the router from depending /// on the concrete runtime type — useful for tests that want to fake the model label. @@ -21,12 +22,14 @@ final class SuggestionEngineRouter { foundationModelEngine: any SuggestionGenerating, llamaEngine: any SuggestionGenerating, performanceMetricsStore: PerformanceMetricsStore, + qualityMetricsStore: SuggestionQualityMetricsStore, llamaModelNameProvider: @escaping @MainActor () -> String? 
) { self.suggestionSettings = suggestionSettings self.foundationModelEngine = foundationModelEngine self.llamaEngine = llamaEngine self.performanceMetricsStore = performanceMetricsStore + self.qualityMetricsStore = qualityMetricsStore self.llamaModelNameProvider = llamaModelNameProvider } @@ -48,6 +51,7 @@ final class SuggestionEngineRouter { do { let result = try await foundationModelEngine.generateSuggestion(for: request, onPartial: onPartial) recordPerformanceMetric(modelName: "Apple Intelligence", latency: result.latency) + recordQualityOutcome(result) return result } catch SuggestionClientError.unsupportedLanguageOrLocale(let message) { CotabbyLogger.suggestion.info( @@ -67,10 +71,23 @@ final class SuggestionEngineRouter { CotabbyLogger.suggestion.debug("Routing to open-source llama engine", metadata: metadata) let result = try await llamaEngine.generateSuggestion(for: request, onPartial: onPartial) recordPerformanceMetric(modelName: llamaModelNameProvider() ?? "Llama", latency: result.latency) + recordQualityOutcome(result) return result } } + /// Counts every finished generation plus the engine-attributed suppression reason when the + /// pipeline emptied the text. The router is the single point that sees every finished result + /// regardless of engine or fallback, which keeps these counters complete by construction. + /// Display-time outcomes (shown, seam-guard suppressions, acceptance) are recorded by the + /// coordinator, the only layer that knows them. + private func recordQualityOutcome(_ result: SuggestionResult) { + qualityMetricsStore.recordGenerated() + if let reason = result.suppressionReason { + qualityMetricsStore.recordSuppressed(reason: reason) + } + } + /// Persists one (timestamp, model, latency) triple into the rolling ring buffer when the /// Performance pane toggle is on. 
The router is the right home for this seam because it is /// the single point that sees a finished `SuggestionResult` and knows which engine produced @@ -121,6 +138,7 @@ final class SuggestionEngineRouter { do { let result = try await llamaEngine.generateSuggestion(for: request, onPartial: onPartial) recordPerformanceMetric(modelName: llamaModelNameProvider() ?? "Llama", latency: result.latency) + recordQualityOutcome(result) return result } catch SuggestionClientError.cancelled { throw SuggestionClientError.cancelled diff --git a/Cotabby/UI/Settings/Panes/PerformancePaneView.swift b/Cotabby/UI/Settings/Panes/PerformancePaneView.swift index b001167f..13d99ba1 100644 --- a/Cotabby/UI/Settings/Panes/PerformancePaneView.swift +++ b/Cotabby/UI/Settings/Panes/PerformancePaneView.swift @@ -11,12 +11,15 @@ import SwiftUI struct PerformancePaneView: View { @ObservedObject var suggestionSettings: SuggestionSettingsModel @ObservedObject var performanceMetricsStore: PerformanceMetricsStore + @ObservedObject var qualityMetricsStore: SuggestionQualityMetricsStore @ObservedObject var systemMetricsStore: SystemMetricsStore var body: some View { SettingsPaneScaffold { liveResourceSection + suggestionQualitySection + Section("Tracking") { Toggle(isOn: trackingEnabledBinding) { SettingsRowLabel( @@ -59,6 +62,71 @@ struct PerformancePaneView: View { .onDisappear { systemMetricsStore.endSampling() } } + // MARK: - Suggestion quality counters + + /// Lifetime counters: how often suggestions appear, why withheld ones were withheld, and how + /// many shown suggestions the user accepted. Always on (counters carry no content), unlike the + /// per-request latency log below, which records timestamps and stays opt-in. 
+ private var suggestionQualitySection: some View { + Section { + qualityCounterRow(label: "Suggestions shown", value: "\(qualityMetricsStore.counters.shown)") + qualityCounterRow(label: "Accepted", value: acceptedLabel) + qualityCounterRow(label: "Generations", value: "\(qualityMetricsStore.counters.generated)") + if !topSuppressionReasons.isEmpty { + qualityCounterRow( + label: "Withheld (\(qualityMetricsStore.counters.suppressedTotal))", + value: topSuppressionReasons + ) + } + } header: { + HStack { + Text(qualityHeaderLabel) + Spacer() + if qualityMetricsStore.counters.shown > 0 || qualityMetricsStore.counters.generated > 0 { + Button("Reset") { + qualityMetricsStore.reset() + } + .buttonStyle(.borderless) + .controlSize(.small) + } + } + } + } + + private func qualityCounterRow(label: String, value: String) -> some View { + HStack(alignment: .firstTextBaseline) { + Text(label) + Spacer() + Text(value) + .foregroundStyle(.secondary) + .multilineTextAlignment(.trailing) + } + } + + private var acceptedLabel: String { + let accepted = qualityMetricsStore.counters.acceptedSuggestions + guard let rate = qualityMetricsStore.counters.acceptanceRate else { + return "\(accepted)" + } + return "\(accepted) (\(Int((rate * 100).rounded()))%)" + } + + private var topSuppressionReasons: String { + qualityMetricsStore.counters.suppressedByReason + .sorted { lhs, rhs in lhs.value == rhs.value ? 
lhs.key < rhs.key : lhs.value > rhs.value } + .prefix(4) + .map { "\($0.key) \($0.value)" } + .joined(separator: ", ") + } + + private var qualityHeaderLabel: String { + guard let since = qualityMetricsStore.counters.firstRecordedAt else { + return "Suggestion Quality" + } + let formatted = since.formatted(date: .abbreviated, time: .omitted) + return "Suggestion Quality (since \(formatted))" + } + // MARK: - Live resource graphs private var liveResourceSection: some View { diff --git a/Cotabby/UI/Settings/SettingsContainerView.swift b/Cotabby/UI/Settings/SettingsContainerView.swift index 9300aacc..0059d760 100644 --- a/Cotabby/UI/Settings/SettingsContainerView.swift +++ b/Cotabby/UI/Settings/SettingsContainerView.swift @@ -21,6 +21,7 @@ struct SettingsContainerView: View { @ObservedObject var modelDownloadManager: ModelDownloadManager @ObservedObject var huggingFaceSearchService: HuggingFaceSearchService @ObservedObject var performanceMetricsStore: PerformanceMetricsStore + @ObservedObject var qualityMetricsStore: SuggestionQualityMetricsStore @ObservedObject var systemMetricsStore: SystemMetricsStore let onShowWelcome: () -> Void @@ -130,6 +131,7 @@ struct SettingsContainerView: View { PerformancePaneView( suggestionSettings: suggestionSettings, performanceMetricsStore: performanceMetricsStore, + qualityMetricsStore: qualityMetricsStore, systemMetricsStore: systemMetricsStore ) case .about: diff --git a/Cotabby/UI/Settings/SettingsIndex.swift b/Cotabby/UI/Settings/SettingsIndex.swift index ef13b543..9865d002 100644 --- a/Cotabby/UI/Settings/SettingsIndex.swift +++ b/Cotabby/UI/Settings/SettingsIndex.swift @@ -71,6 +71,7 @@ enum SettingsItem: String, CaseIterable, Identifiable { case screenRecording // Performance case performanceTracking + case suggestionQualityStats case resourceUsage case recentRequests // About @@ -137,6 +138,7 @@ enum SettingsItem: String, CaseIterable, Identifiable { case .inputMonitoring: return "Input Monitoring" case .screenRecording: 
return "Screen Recording" case .performanceTracking: return "Enable Performance Tracking" + case .suggestionQualityStats: return "Suggestion Quality" case .resourceUsage: return "Live Resource Usage" case .recentRequests: return "Recent Requests" case .checkForUpdates: return "Check for Updates" @@ -202,6 +204,7 @@ enum SettingsItem: String, CaseIterable, Identifiable { case .inputMonitoring: return "keyboard" case .screenRecording: return "camera.viewfinder" case .performanceTracking: return "stopwatch" + case .suggestionQualityStats: return "checkmark.seal" case .resourceUsage: return "chart.line.uptrend.xyaxis" case .recentRequests: return "list.bullet.clipboard" case .checkForUpdates: return "arrow.triangle.2.circlepath" @@ -238,7 +241,7 @@ enum SettingsItem: String, CaseIterable, Identifiable { return .apps case .accessibility, .inputMonitoring, .screenRecording: return .permissions - case .performanceTracking, .resourceUsage, .recentRequests: + case .performanceTracking, .suggestionQualityStats, .resourceUsage, .recentRequests: return .performance case .checkForUpdates, .support, .githubRepository, .wiki, .acknowledgements, .uninstall: @@ -409,6 +412,9 @@ enum SettingsItem: String, CaseIterable, Identifiable { case .performanceTracking: return ["performance", "tracking", "latency", "metrics", "timing", "telemetry", "analytics", "diagnostics", "measure"] + case .suggestionQualityStats: + return ["quality", "acceptance", "accepted", "shown", "suppressed", "withheld", + "rate", "stats", "counters", "suggestions"] case .resourceUsage: return ["cpu", "memory", "ram", "usage", "resource", "graph", "chart", "live", "load", "monitor"] diff --git a/CotabbyTests/PromptPolicyTests.swift b/CotabbyTests/PromptPolicyTests.swift index ef23a211..bfdb5f5a 100644 --- a/CotabbyTests/PromptPolicyTests.swift +++ b/CotabbyTests/PromptPolicyTests.swift @@ -308,6 +308,7 @@ final class SuggestionEngineRouterTests: XCTestCase { foundationModelEngine: appleEngine, llamaEngine: 
openSourceEngine, performanceMetricsStore: PerformanceMetricsStore(userDefaults: makeUserDefaults()), + qualityMetricsStore: SuggestionQualityMetricsStore(userDefaults: makeUserDefaults()), llamaModelNameProvider: { nil } ) diff --git a/CotabbyTests/SuggestionCoordinatorAcceptanceTests.swift b/CotabbyTests/SuggestionCoordinatorAcceptanceTests.swift index efaafafa..3b701689 100644 --- a/CotabbyTests/SuggestionCoordinatorAcceptanceTests.swift +++ b/CotabbyTests/SuggestionCoordinatorAcceptanceTests.swift @@ -312,6 +312,9 @@ final class SuggestionCoordinatorAcceptanceTests: XCTestCase { configuration: .standard, spellChecker: CurrentWordSpellChecker(), symSpellCorrector: SymSpellCorrector(preloadLanguage: nil), + qualityMetricsStore: SuggestionQualityMetricsStore( + userDefaults: UserDefaults(suiteName: "CotabbyTests.quality.\(UUID().uuidString)") ?? .standard + ), userDefaults: UserDefaults(suiteName: "CotabbyTests.\(UUID().uuidString)") ?? .standard ) Self.retainedCoordinators.append(coordinator) diff --git a/CotabbyTests/SuggestionQualityMetricsStoreTests.swift b/CotabbyTests/SuggestionQualityMetricsStoreTests.swift new file mode 100644 index 00000000..112aa8e8 --- /dev/null +++ b/CotabbyTests/SuggestionQualityMetricsStoreTests.swift @@ -0,0 +1,59 @@ +import XCTest +@testable import Cotabby + +@MainActor +final class SuggestionQualityMetricsStoreTests: XCTestCase { + private func freshDefaults() -> UserDefaults { + UserDefaults(suiteName: "CotabbyTests.qualityMetrics.\(UUID().uuidString)") ?? 
.standard + } + + func testCountersAccumulate() { + let store = SuggestionQualityMetricsStore(userDefaults: freshDefaults()) + store.recordGenerated() + store.recordGenerated() + store.recordShown() + store.recordAcceptedSuggestion() + store.recordSuppressed(reason: "lowConfidence") + store.recordSuppressed(reason: "lowConfidence") + store.recordSuppressed(reason: "seamMisspelling") + + XCTAssertEqual(store.counters.generated, 2) + XCTAssertEqual(store.counters.shown, 1) + XCTAssertEqual(store.counters.acceptedSuggestions, 1) + XCTAssertEqual(store.counters.suppressedByReason["lowConfidence"], 2) + XCTAssertEqual(store.counters.suppressedByReason["seamMisspelling"], 1) + XCTAssertEqual(store.counters.suppressedTotal, 3) + XCTAssertNotNil(store.counters.firstRecordedAt) + } + + func testAcceptanceRate() { + let store = SuggestionQualityMetricsStore(userDefaults: freshDefaults()) + XCTAssertNil(store.counters.acceptanceRate, "no rate without shown suggestions") + store.recordShown() + store.recordShown() + store.recordShown() + store.recordShown() + store.recordAcceptedSuggestion() + XCTAssertEqual(store.counters.acceptanceRate ?? 
0, 0.25, accuracy: 0.0001) + } + + func testPersistsAcrossInstances() { + let defaults = freshDefaults() + let first = SuggestionQualityMetricsStore(userDefaults: defaults) + first.recordShown() + first.recordSuppressed(reason: "emptyGeneration") + + let second = SuggestionQualityMetricsStore(userDefaults: defaults) + XCTAssertEqual(second.counters.shown, 1) + XCTAssertEqual(second.counters.suppressedByReason["emptyGeneration"], 1) + } + + func testResetClearsEverything() { + let defaults = freshDefaults() + let store = SuggestionQualityMetricsStore(userDefaults: defaults) + store.recordShown() + store.reset() + XCTAssertEqual(store.counters, SuggestionQualityMetricsStore.Counters()) + XCTAssertEqual(SuggestionQualityMetricsStore(userDefaults: defaults).counters.shown, 0) + } +} From 01efabeb444c118fc73e7dd3f5694a477733ba52 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:44:50 -0700 Subject: [PATCH 3/5] feat(suggestion): adapt the prediction debounce to the last generation latency A fixed debounce serves two masters badly: on fast hardware it adds avoidable delay before every suggestion, and on slow hardware it lets keystrokes pile doomed generations onto a model that cannot keep up (every cancel still costs decode setup and teardown). The debounce now keys off the most recent generation latency: 15ms when the model answers within 70ms, 25ms within 140ms, 55ms beyond that, with the configured value as the fallback until a first latency exists. 
--- Cotabby.xcodeproj/project.pbxproj | 10 ++++++++ .../SuggestionCoordinator+Prediction.swift | 11 +++++++-- .../Models/SuggestionSubsystemContracts.swift | 2 +- Cotabby/Support/DebouncePolicy.swift | 21 ++++++++++++++++ CotabbyTests/DebouncePolicyTests.swift | 24 +++++++++++++++++++ .../LlamaSuggestionEnginePrewarmTests.swift | 4 ++-- .../LlamaSuggestionEngineStreamingTests.swift | 8 +++---- .../SuggestionCoordinatorTestSupport.swift | 3 +++ .../SuggestionEngineRouterTests.swift | 1 + 9 files changed, 75 insertions(+), 9 deletions(-) create mode 100644 Cotabby/Support/DebouncePolicy.swift create mode 100644 CotabbyTests/DebouncePolicyTests.swift diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj index c19605ad..02c1fdd8 100644 --- a/Cotabby.xcodeproj/project.pbxproj +++ b/Cotabby.xcodeproj/project.pbxproj @@ -304,6 +304,7 @@ 6A8454A989104AE150308BCF /* it-100k.txt in Resources */ = {isa = PBXBuildFile; fileRef = 2D8AA55C2B730110E8598F91 /* it-100k.txt */; }; 6AE0B46FB52D189D94E1F79A /* WordCountFormatterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1E0513E3B23937B099A3CFF2 /* WordCountFormatterTests.swift */; }; 6BE0C8F9D054A2C0D9018001 /* ConfidenceSuppressionPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1BD71ECC2AE4821B643E0935 /* ConfidenceSuppressionPolicy.swift */; }; + 6C59B369AAC6948C53E41654 /* DebouncePolicyTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = EB23CDF7CAA1DEAD606B46B3 /* DebouncePolicyTests.swift */; }; 6CBEF02FCDFCF406E378C27C /* SuggestionInteractionStateTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8CB1D4F2681FAF59014AE115 /* SuggestionInteractionStateTests.swift */; }; 6D0E79CF3C1A8CE53046FCE5 /* AXTextGeometryResolverTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = C046CB4F3CB4BFE9391DB5DE /* AXTextGeometryResolverTests.swift */; }; 6D57E3CDF56127422311C065 /* TerminalAppDetector.swift in Sources */ = {isa = PBXBuildFile; fileRef = 
7F4C4A7EAF886E0CC945BFEF /* TerminalAppDetector.swift */; }; @@ -487,6 +488,7 @@ B782EC08B7516791BDB21172 /* FieldStyleCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = B7FBF2B766E728F25899B64E /* FieldStyleCache.swift */; }; B7A98BC225304E4DFED9E622 /* OnboardingTemplateRecommender.swift in Sources */ = {isa = PBXBuildFile; fileRef = FA878B447441BB4F3E327CC8 /* OnboardingTemplateRecommender.swift */; }; B816C6191738AB616F2E8D2D /* SuggestionCoordinatorTestSupport.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4C174D8294858BF9DF3D361D /* SuggestionCoordinatorTestSupport.swift */; }; + B849D68E0474CECAE809881C /* DebouncePolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6DC17643448271DE5DE61A89 /* DebouncePolicy.swift */; }; B93AB7E845086F6FBB068369 /* SuggestionRequestFactoryTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = EE94342B888A5A2CCF66BC93 /* SuggestionRequestFactoryTests.swift */; }; B9623395B31459D9D45B1320 /* CurrentWordExtractor.swift in Sources */ = {isa = PBXBuildFile; fileRef = 247561C626843957CFB4B632 /* CurrentWordExtractor.swift */; }; B9F400BCC20757DA5DB0B5F9 /* FoundationModelSuggestionEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5664E34B23FBDF69292FEF43 /* FoundationModelSuggestionEngine.swift */; }; @@ -585,6 +587,7 @@ E311B80968761E90FBA19A8A /* TypoGate.swift in Sources */ = {isa = PBXBuildFile; fileRef = B8412FE2BAC406421248A03B /* TypoGate.swift */; }; E313639E71AE1374D2B9A956 /* SuggestionWorkController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6B2D97BAA3618A7D0357AC44 /* SuggestionWorkController.swift */; }; E38801433B99E65BD7E45A0E /* LlamaPromptCacheHintTrackerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0CA88BB29BC8727878C99E95 /* LlamaPromptCacheHintTrackerTests.swift */; }; + E3C0326597083762BA6D76CA /* DebouncePolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6DC17643448271DE5DE61A89 /* DebouncePolicy.swift */; }; E3CAAEFAAB5BB24CEE16445B /* 
LLMIOFileHandler.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8D610FCA3A97249DCCE7D0B8 /* LLMIOFileHandler.swift */; }; E4382BEA8A8551612E5966B9 /* BaseCompletionPromptRenderer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 85EF79E6144D6C6AD062B569 /* BaseCompletionPromptRenderer.swift */; }; E46F50AEDA8FE13B02E3FA8D /* AXHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = AC70775535A3428991025AB8 /* AXHelper.swift */; }; @@ -815,6 +818,7 @@ 6CF1FBAABEF545B620AF8D78 /* ru-100k.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = "ru-100k.txt"; sourceTree = ""; }; 6D4C1EF008B9DFA753D561D3 /* LlamaEvalScoringTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaEvalScoringTests.swift; sourceTree = ""; }; 6DB982BF30B3601F57277776 /* fr-100k.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = "fr-100k.txt"; sourceTree = ""; }; + 6DC17643448271DE5DE61A89 /* DebouncePolicy.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DebouncePolicy.swift; sourceTree = ""; }; 6DC693E00430F46E41CB56E6 /* RequestID.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RequestID.swift; sourceTree = ""; }; 6E3B1232C4BE8072A5183F9C /* SymSpell.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SymSpell.swift; sourceTree = ""; }; 6E3EC87078D3A4C21DB3252C /* RandomMacroEvaluator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RandomMacroEvaluator.swift; sourceTree = ""; }; @@ -1023,6 +1027,7 @@ E68BE6A22BA0D42C8DD9868C /* SelfCaptureGate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SelfCaptureGate.swift; sourceTree = ""; }; E7F42112F14026E6253BB865 /* PermissionAndContextModelTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PermissionAndContextModelTests.swift; sourceTree = ""; }; EAAE6B395FAB604DF059280A /* 
KeyCodeLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = KeyCodeLabels.swift; sourceTree = ""; }; + EB23CDF7CAA1DEAD606B46B3 /* DebouncePolicyTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DebouncePolicyTests.swift; sourceTree = ""; }; EB630F9814388203DD1CA2EC /* ShortcutsPaneView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ShortcutsPaneView.swift; sourceTree = ""; }; EC04832FBD5311352F35241B /* SuggestionCaretLayoutRepairTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionCaretLayoutRepairTests.swift; sourceTree = ""; }; EC4A3C4BC38793EB11F484F1 /* CompositionInputModeClassifierTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CompositionInputModeClassifierTests.swift; sourceTree = ""; }; @@ -1378,6 +1383,7 @@ 1C4751DFE9DA372FBC40BA30 /* CurrentWordExtractorTests.swift */, AD752451330486FE270018B0 /* CustomRulesTests.swift */, 313EDBA60565836F32CEEC10 /* DateMacroEvaluatorTests.swift */, + EB23CDF7CAA1DEAD606B46B3 /* DebouncePolicyTests.swift */, B3B09064903B760D6DF2DF7D /* DecodeStopPolicyTests.swift */, 8F20A19A24D20E16D25ADCDA /* DeepGeometryWalkThrottleTests.swift */, C49F67B3EEB2F2A577A54085 /* DeviceInfoTests.swift */, @@ -1631,6 +1637,7 @@ 9CC2D6472ACD377FD73A5801 /* ControlTokenMarkers.swift */, C7B2D34A6F3AC9DFD61350F7 /* CotabbyDebugOptions.swift */, 247561C626843957CFB4B632 /* CurrentWordExtractor.swift */, + 6DC17643448271DE5DE61A89 /* DebouncePolicy.swift */, D12ABBCE23A946C22894945B /* DecodeStopPolicy.swift */, 29ED42C4BDD0C521101AF95E /* DeviceInfo.swift */, 74BD1D4DB27D5D96D1E06096 /* DisplayCoordinateConverter.swift */, @@ -1945,6 +1952,7 @@ 55D4E6FB63E3475749E61EB3 /* CustomRulesCatalog.swift in Sources */, F7237FDB0665465F1C7EDCDE /* CustomRulesEditor.swift in Sources */, E0DAB3CDE782C330AF21FC0D /* DateMacroEvaluator.swift in Sources */, + 
E3C0326597083762BA6D76CA /* DebouncePolicy.swift in Sources */, FA25762161F068F59BEC86EB /* DecodeStopPolicy.swift in Sources */, 400E1A5145FC8C5BA2FAED0A /* DeepGeometryWalkThrottle.swift in Sources */, CF39EB76C3ECF8F764C1B4FB /* DeviceInfo.swift in Sources */, @@ -2175,6 +2183,7 @@ 0431AE1DBEE36C90C7F39C19 /* CustomRulesCatalog.swift in Sources */, 4B4DDB569CAD806F765224DE /* CustomRulesEditor.swift in Sources */, DAD77998F793468D4D64B705 /* DateMacroEvaluator.swift in Sources */, + B849D68E0474CECAE809881C /* DebouncePolicy.swift in Sources */, 586B36CD813E1432D0AB1380 /* DecodeStopPolicy.swift in Sources */, 261FA692D19C48E53D6999BC /* DeepGeometryWalkThrottle.swift in Sources */, 1450746C690B3D98203B71EC /* DeviceInfo.swift in Sources */, @@ -2387,6 +2396,7 @@ 64599CD334AAD79266224689 /* CurrentWordExtractorTests.swift in Sources */, 91D1F16B8C5DA281D4B7F699 /* CustomRulesTests.swift in Sources */, 4CCF29A7EA1B7D37841C135D /* DateMacroEvaluatorTests.swift in Sources */, + 6C59B369AAC6948C53E41654 /* DebouncePolicyTests.swift in Sources */, 79B0AEA0D2FC6A865E9303F9 /* DecodeStopPolicyTests.swift in Sources */, 664A5D62A723EB204ADEF2F9 /* DeepGeometryWalkThrottleTests.swift in Sources */, 43DED8ABEFF9894ED54097A9 /* DeviceInfoTests.swift in Sources */, diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift index faff1fc3..ab443354 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift @@ -20,11 +20,18 @@ extension SuggestionCoordinator { return } + // The debounce window adapts to the last generation latency: snappier when the model is + // fast, calmer when it is slow (fewer doomed generations to cancel). The configured value + // is the fallback until a first latency exists. 
+ let debounceMilliseconds = DebouncePolicy.milliseconds( + lastGenerationLatencyMilliseconds: latestLatencyMilliseconds, + fallback: settingsSnapshot.debounceMilliseconds + ) // The debounce clock starts at the keystroke, not here. The host-publish poll has already // consumed real wall time waiting for the host to publish the keystroke to AX, and that // wait collapses bursts just as well as sleeping does. Stacking the full debounce on top // of the publish wait was pure added latency, so only the unconsumed remainder is slept. - let remainingDelay = max(0, settingsSnapshot.debounceMilliseconds - consumedDelayMilliseconds) + let remainingDelay = max(0, debounceMilliseconds - consumedDelayMilliseconds) // Task cancellation in Swift is cooperative, so we also use an explicit work id. // That gives us strict "latest request wins" semantics even if an old task wakes up late. @@ -42,7 +49,7 @@ extension SuggestionCoordinator { logStage( "debouncing", workID: workID, - message: "Debouncing (\(settingsSnapshot.debounceMilliseconds)ms window) before generating." + message: "Debouncing (\(debounceMilliseconds)ms window, \(remainingDelay)ms remaining) before generating." ) } diff --git a/Cotabby/Models/SuggestionSubsystemContracts.swift b/Cotabby/Models/SuggestionSubsystemContracts.swift index 5f0acb63..79655f7c 100644 --- a/Cotabby/Models/SuggestionSubsystemContracts.swift +++ b/Cotabby/Models/SuggestionSubsystemContracts.swift @@ -162,7 +162,7 @@ extension LlamaRuntimeGenerating { cachedPrefixBytes: Int?, options: LlamaGenerationOptions, onPartialRawText: (@Sendable (String) -> Void)? 
- ) async throws -> String { + ) async throws -> LlamaGenerationOutput { try await generate(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options) } } diff --git a/Cotabby/Support/DebouncePolicy.swift b/Cotabby/Support/DebouncePolicy.swift new file mode 100644 index 00000000..6d2af60a --- /dev/null +++ b/Cotabby/Support/DebouncePolicy.swift @@ -0,0 +1,21 @@ +import Foundation + +/// Chooses the prediction debounce from the last observed generation latency. +/// +/// A fixed debounce serves two masters badly: on fast hardware it adds avoidable delay before +/// every suggestion, and on slow hardware it lets keystrokes pile doomed generations onto a model +/// that cannot keep up (each cancel still costs a decode setup and teardown). Keying the debounce +/// to the most recent generation latency makes fast machines snappier and slow machines calmer, +/// with no configuration. The configured value remains the fallback until a first latency exists. +nonisolated enum DebouncePolicy { + static func milliseconds(lastGenerationLatencyMilliseconds: Int?, fallback: Int) -> Int { + guard let last = lastGenerationLatencyMilliseconds, last > 0 else { + return fallback + } + switch last { + case ...70: return 15 + case ...140: return 25 + default: return 55 + } + } +} diff --git a/CotabbyTests/DebouncePolicyTests.swift b/CotabbyTests/DebouncePolicyTests.swift new file mode 100644 index 00000000..bcb1afbc --- /dev/null +++ b/CotabbyTests/DebouncePolicyTests.swift @@ -0,0 +1,24 @@ +import XCTest +@testable import Cotabby + +final class DebouncePolicyTests: XCTestCase { + func testNoLatencyDataUsesFallback() { + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: nil, fallback: 20), 20) + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 0, fallback: 20), 20) + } + + func testFastGenerationsGetTheShortDebounce() { + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 45, fallback: 20), 
15) + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 70, fallback: 20), 15) + } + + func testMediumGenerationsGetTheMiddleDebounce() { + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 71, fallback: 20), 25) + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 140, fallback: 20), 25) + } + + func testSlowGenerationsBackOff() { + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 141, fallback: 20), 55) + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 900, fallback: 20), 55) + } +} diff --git a/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift b/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift index f66c4f57..c351779f 100644 --- a/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift +++ b/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift @@ -106,7 +106,7 @@ final class LlamaSuggestionEnginePrewarmTests: XCTestCase { @MainActor private final class RecordingPrewarmRuntime: LlamaRuntimeGenerating { var prefillError: Error? - var generateResult: Result = .success("ok") + var generateResult: Result = .success(.text("ok")) private(set) var prefillPrompts: [String] = [] private(set) var generateCachedPrefixBytes: [Int?] 
= [] @@ -114,7 +114,7 @@ private final class RecordingPrewarmRuntime: LlamaRuntimeGenerating { prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { generateCachedPrefixBytes.append(cachedPrefixBytes) return try generateResult.get() } diff --git a/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift b/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift index fc138f70..4e28451e 100644 --- a/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift +++ b/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift @@ -107,8 +107,8 @@ private final class StreamingFakeRuntime: LlamaRuntimeGenerating { prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions - ) async throws -> String { - finalText + ) async throws -> LlamaGenerationOutput { + .text(finalText) } func generate( @@ -116,12 +116,12 @@ private final class StreamingFakeRuntime: LlamaRuntimeGenerating { cachedPrefixBytes: Int?, options: LlamaGenerationOptions, onPartialRawText: (@Sendable (String) -> Void)? - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { streamingCallCount += 1 for partial in partialRawTexts { onPartialRawText?(partial) } - return finalText + return .text(finalText) } func resetPromptCache() {} diff --git a/CotabbyTests/SuggestionCoordinatorTestSupport.swift b/CotabbyTests/SuggestionCoordinatorTestSupport.swift index 37a13b57..42953014 100644 --- a/CotabbyTests/SuggestionCoordinatorTestSupport.swift +++ b/CotabbyTests/SuggestionCoordinatorTestSupport.swift @@ -245,6 +245,9 @@ func makeCoordinatorRig( configuration: .standard, spellChecker: CurrentWordSpellChecker(), symSpellCorrector: SymSpellCorrector(preloadLanguage: nil), + qualityMetricsStore: SuggestionQualityMetricsStore( + userDefaults: UserDefaults(suiteName: "CotabbyTests.rig.quality.\(UUID().uuidString)") ?? .standard + ), userDefaults: UserDefaults(suiteName: "CotabbyTests.rig.\(UUID().uuidString)") ?? 
.standard ) return CoordinatorRig( diff --git a/CotabbyTests/SuggestionEngineRouterTests.swift b/CotabbyTests/SuggestionEngineRouterTests.swift index 75e4f395..156d8fa9 100644 --- a/CotabbyTests/SuggestionEngineRouterTests.swift +++ b/CotabbyTests/SuggestionEngineRouterTests.swift @@ -67,6 +67,7 @@ final class SuggestionEngineRouterRoutingTests: XCTestCase { foundationModelEngine: foundation, llamaEngine: llama, performanceMetricsStore: metrics, + qualityMetricsStore: SuggestionQualityMetricsStore(userDefaults: defaults), llamaModelNameProvider: { llamaModelName } ) Self.retained.append(contentsOf: [router, settings, metrics] as [AnyObject]) From 24ac0227b6409b6abd795b3355ed03278161fa2d Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:31:06 -0700 Subject: [PATCH 4/5] review: balance the quality ledger at lifecycle exits; injectable argmax toggle The stale-drop, unattributed-empty, selected-text, and accept-echo exits returned without recording either shown or suppressed, so the generated counter silently outgrew the others; each now records a lifecycle discard reason (engine-attributed empties stay counted by the router alone). The argmax-EOG toggle now mirrors the confidence floor's injectable-defaults seam, with tests covering both escape hatches against an isolated suite. 
--- Cotabby.xcodeproj/project.pbxproj | 4 ++ .../SuggestionCoordinator+Prediction.swift | 11 +++++ .../Runtime/LlamaSuggestionEngine.swift | 10 +++- .../LlamaDecodeGateDefaultsTests.swift | 46 +++++++++++++++++++ 4 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 CotabbyTests/LlamaDecodeGateDefaultsTests.swift diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj index 02c1fdd8..6ff8ed51 100644 --- a/Cotabby.xcodeproj/project.pbxproj +++ b/Cotabby.xcodeproj/project.pbxproj @@ -508,6 +508,7 @@ BFCA7FAFDAEBF586AB615567 /* ClipboardRelevanceFilterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90B0D133AB77A2503FB08827 /* ClipboardRelevanceFilterTests.swift */; }; C0537A515AED443F6C61DB2A /* MenuBarSections.swift in Sources */ = {isa = PBXBuildFile; fileRef = 83A810F9D28A18BA6F2066C7 /* MenuBarSections.swift */; }; C0B833234748E82D3382631A /* emoji.json in Resources */ = {isa = PBXBuildFile; fileRef = C379D77029D6E88C8C1B9AF7 /* emoji.json */; }; + C0F757D74758B76DA2962BC5 /* LlamaDecodeGateDefaultsTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2B3E5554AAC5D0007CCC61A7 /* LlamaDecodeGateDefaultsTests.swift */; }; C0FE11D76BDF01A5470C554D /* FocusCapabilityFlickerGate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A44BEC8C23FF227731DD0CD /* FocusCapabilityFlickerGate.swift */; }; C149EAED2CF6F8B6274053E0 /* AppSurfaceClassifier.swift in Sources */ = {isa = PBXBuildFile; fileRef = 94B0830FBE4F2E239F670DBA /* AppSurfaceClassifier.swift */; }; C178E35A9A713BD4D9943E62 /* TypoCaseTransfer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 08CE63B8725EBD71A4C024E1 /* TypoCaseTransfer.swift */; }; @@ -736,6 +737,7 @@ 29CDC8BE5312B9BEFD9B22CB /* SurfaceContextComposerTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SurfaceContextComposerTests.swift; sourceTree = ""; }; 29ED42C4BDD0C521101AF95E /* DeviceInfo.swift */ = {isa = PBXFileReference; lastKnownFileType = 
sourcecode.swift; path = DeviceInfo.swift; sourceTree = ""; }; 2A02336442BB735EE2E8D064 /* SettingsAttentionEvaluator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsAttentionEvaluator.swift; sourceTree = ""; }; + 2B3E5554AAC5D0007CCC61A7 /* LlamaDecodeGateDefaultsTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaDecodeGateDefaultsTests.swift; sourceTree = ""; }; 2B7A28471B8526C2693FFF65 /* AcknowledgementsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AcknowledgementsView.swift; sourceTree = ""; }; 2BC293F6125E2B14DCF05AD9 /* SettingsAttentionEvaluatorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsAttentionEvaluatorTests.swift; sourceTree = ""; }; 2D1F9CEBAB0F330F8E7B61D8 /* InputSuppressionController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputSuppressionController.swift; sourceTree = ""; }; @@ -1426,6 +1428,7 @@ 2960080A726E51198225147A /* InsertionStrategySelectorTests.swift */, 2930EC34057319130393696B /* KeyCodeLabelsTests.swift */, 4793D4EA5D36D7E5CC216C27 /* LanguageSupportTests.swift */, + 2B3E5554AAC5D0007CCC61A7 /* LlamaDecodeGateDefaultsTests.swift */, 906011A6C9D66EEBAF3B5CC0 /* LlamaEvalScoring.swift */, 6D4C1EF008B9DFA753D561D3 /* LlamaEvalScoringTests.swift */, 0CA88BB29BC8727878C99E95 /* LlamaPromptCacheHintTrackerTests.swift */, @@ -2440,6 +2443,7 @@ F66F0D982EBAF5A3E99C5342 /* KeyCodeLabelsTests.swift in Sources */, 475FB7450EEC3C1B16E66CC4 /* LLMIOFileHandlerTests.swift in Sources */, E912D4617AE1376061DF1F00 /* LanguageSupportTests.swift in Sources */, + C0F757D74758B76DA2962BC5 /* LlamaDecodeGateDefaultsTests.swift in Sources */, 3D82280EFF7F7E9F3FFF45ED /* LlamaEvalScoring.swift in Sources */, 3D56E9B3AA378400E2C081E3 /* LlamaEvalScoringTests.swift in Sources */, E38801433B99E65BD7E45A0E /* LlamaPromptCacheHintTrackerTests.swift in 
Sources */, diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift index ab443354..5b0ae7aa 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift @@ -503,6 +503,10 @@ extension SuggestionCoordinator { guard liveContext.generation == result.generation else { latestRawModelOutput = SuggestionDebugLogger.debugPreview(result.rawText) + // Lifecycle discards are counted under their own reasons so `generated` always equals + // `shown` plus the suppression histogram; without this, every drop here silently + // inflated the generated count against the others. + qualityMetricsStore.recordSuppressed(reason: "discardedStaleContext") logStage( "stale-drop", workID: workID, @@ -521,6 +525,11 @@ extension SuggestionCoordinator { clearSuggestion() hideOverlay(reason: "Overlay hidden because the model returned an empty continuation.") state = .idle + // The router already counted engine-attributed suppressions (normalizer, confidence + // floor); only the unattributed "model produced nothing" case needs a ledger entry. 
+ if result.suppressionReason == nil { + qualityMetricsStore.recordSuppressed(reason: "emptyUnattributed") + } logStage( "empty-result", workID: workID, @@ -536,6 +545,7 @@ extension SuggestionCoordinator { clearSuggestion(clearDiagnostics: true) hideOverlay(reason: "Overlay hidden because text is selected.") state = .idle + qualityMetricsStore.recordSuppressed(reason: "discardedSelection") logStage( "selected-text", workID: workID, @@ -560,6 +570,7 @@ extension SuggestionCoordinator { clearSuggestion(clearDiagnostics: false) hideOverlay(reason: "Overlay hidden because the regeneration only echoed the just-accepted text before the host published it.") state = .idle + qualityMetricsStore.recordSuppressed(reason: "discardedAcceptEcho") logStage( "stale-accept-echo", workID: workID, diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift index d38d6407..b090ddc9 100644 --- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift @@ -31,13 +31,19 @@ final class LlamaSuggestionEngine { static let confidenceFloorOverrideKey = "cotabbyConfidenceFloorOverride" static let argmaxStopDisabledKey = "cotabbyArgmaxStopDisabled" - private static func resolvedConfidenceFloor(_ defaults: UserDefaults = .standard) -> Double { + static func resolvedConfidenceFloor(_ defaults: UserDefaults = .standard) -> Double { guard defaults.object(forKey: confidenceFloorOverrideKey) != nil else { return defaultConfidenceFloor } return defaults.double(forKey: confidenceFloorOverrideKey) } + /// Mirrors `resolvedConfidenceFloor`: injectable defaults so the disable toggle is testable + /// against an isolated suite instead of process-global state. 
+ static func resolvedStopAtArgmaxEOG(_ defaults: UserDefaults = .standard) -> Bool { + !defaults.bool(forKey: argmaxStopDisabledKey) + } + /// Prefills the prompt KV for the field the user just focused, so the first real suggestion /// there only decodes the typed delta instead of the whole cold prompt. /// @@ -265,7 +271,7 @@ final class LlamaSuggestionEngine { trailingText: request.context.trailingText ), confidenceFloor: resolvedConfidenceFloor(), - stopAtArgmaxEOG: !UserDefaults.standard.bool(forKey: argmaxStopDisabledKey) + stopAtArgmaxEOG: resolvedStopAtArgmaxEOG() ) } } diff --git a/CotabbyTests/LlamaDecodeGateDefaultsTests.swift b/CotabbyTests/LlamaDecodeGateDefaultsTests.swift new file mode 100644 index 00000000..90ba2da3 --- /dev/null +++ b/CotabbyTests/LlamaDecodeGateDefaultsTests.swift @@ -0,0 +1,46 @@ +import Foundation +import XCTest +@testable import Cotabby + +/// Tests the `defaults write` escape hatches for the decode gates against an isolated suite, so +/// the confidence floor and the argmax-EOG stop are provably adjustable in the field without a +/// rebuild (and without touching process-global defaults from the test host). +@MainActor +final class LlamaDecodeGateDefaultsTests: XCTestCase { + private let suiteName = "LlamaDecodeGateDefaultsTests" + private var defaults: UserDefaults! 
+ + override func setUp() { + super.setUp() + defaults = UserDefaults(suiteName: suiteName) + defaults.removePersistentDomain(forName: suiteName) + } + + override func tearDown() { + defaults.removePersistentDomain(forName: suiteName) + defaults = nil + super.tearDown() + } + + func test_confidenceFloor_defaultsToShippedValue() { + XCTAssertEqual( + LlamaSuggestionEngine.resolvedConfidenceFloor(defaults), + LlamaSuggestionEngine.defaultConfidenceFloor + ) + } + + func test_confidenceFloor_overrideWins_includingDisable() { + defaults.set(-0.8, forKey: LlamaSuggestionEngine.confidenceFloorOverrideKey) + XCTAssertEqual(LlamaSuggestionEngine.resolvedConfidenceFloor(defaults), -0.8) + + defaults.set(-Double.infinity, forKey: LlamaSuggestionEngine.confidenceFloorOverrideKey) + XCTAssertEqual(LlamaSuggestionEngine.resolvedConfidenceFloor(defaults), -.infinity) + } + + func test_argmaxStop_onByDefault_andDisableToggleWorks() { + XCTAssertTrue(LlamaSuggestionEngine.resolvedStopAtArgmaxEOG(defaults)) + + defaults.set(true, forKey: LlamaSuggestionEngine.argmaxStopDisabledKey) + XCTAssertFalse(LlamaSuggestionEngine.resolvedStopAtArgmaxEOG(defaults)) + } +} From 98ca09794307efb43c126df09e6e8fd374589d49 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:39:11 -0700 Subject: [PATCH 5/5] ci: retrigger checks after force-push synchronize was dropped