diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj index a3fbc84e..6ff8ed51 100644 --- a/Cotabby.xcodeproj/project.pbxproj +++ b/Cotabby.xcodeproj/project.pbxproj @@ -70,6 +70,7 @@ 1681C0F22323FB1156579D99 /* AGPL-3.0.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6F0EE728C0B1A7AD6B19CD0C /* AGPL-3.0.txt */; }; 175C4FA56C29DEE58C2D4D7E /* SuggestionSettingsModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 86460C747AA883FDE756BDBA /* SuggestionSettingsModel.swift */; }; 18382D1919D90E3C1EE143C2 /* AppSurfaceClassifierTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = C451E144D220D5C63372A8C0 /* AppSurfaceClassifierTests.swift */; }; + 18680D0D66469A2954A50B6C /* SuggestionQualityMetricsStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = 81718CA62FBC775A6CEBCED1 /* SuggestionQualityMetricsStore.swift */; }; 1899BC5A35DC96B4D04B18A5 /* es.txt in Resources */ = {isa = PBXBuildFile; fileRef = 0B6816DF5D33863F966240B4 /* es.txt */; }; 19386985A3A91D0843092086 /* AboutPaneView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A3FA53BBC3D81503C1D17477 /* AboutPaneView.swift */; }; 19CA1BF8B508E0E219EF4485 /* SuggestionEngineModelsTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 470A7DAE3D6A2C873B395AE3 /* SuggestionEngineModelsTests.swift */; }; @@ -248,6 +249,7 @@ 55EDBFF489D4C31276E2A67F /* PermissionHostApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = B6ACCB12E4DB32D2F2BEA567 /* PermissionHostApp.swift */; }; 5614E22EAA5F5C37A9E4F7B6 /* LlamaRuntimeManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = A52D0B550E00EF173A5D157E /* LlamaRuntimeManager.swift */; }; 56611BA0087710277140E9E6 /* DisplayCoordinateConverterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = C1C5DE0F3FF63545000E2453 /* DisplayCoordinateConverterTests.swift */; }; + 5687320132AD97B4086260DF /* SuggestionQualityMetricsStore.swift in Sources */ = {isa = PBXBuildFile; fileRef = 81718CA62FBC775A6CEBCED1 /* SuggestionQualityMetricsStore.swift */; }; 576B3FF30FB457EF04F9A715 /* SuggestionTextColorCodec.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1CE61E74928C221B8BB261C6 /* SuggestionTextColorCodec.swift */; }; 586B36CD813E1432D0AB1380 /* DecodeStopPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = D12ABBCE23A946C22894945B /* DecodeStopPolicy.swift */; }; 58AC3193D846FDE88513377D /* BundledRuntimeLocatorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 18D990E515E1AE4F312F4E95 /* BundledRuntimeLocatorTests.swift */; }; @@ -297,10 +299,12 @@ 66D9E37B12A9265D4733E72E /* LlamaRuntimeCore.swift in Sources */ = {isa = PBXBuildFile; fileRef = 944065A858D9BC936CB12B23 /* LlamaRuntimeCore.swift */; }; 68DA5F93B7185B4F5E6DB4C3 /* it.txt in Resources */ = {isa = PBXBuildFile; fileRef = 0397F1DACB094A0F6A66BC0E /* it.txt */; }; 6955C3A4D7AB3EEF7FA7C469 /* InputSuppressionController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2D1F9CEBAB0F330F8E7B61D8 /* InputSuppressionController.swift */; }; + 695E431AC3FF79769E2C5EEF /* SuggestionQualityMetricsStoreTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = B4CC566AC1DE33FD0CD30E1E /* SuggestionQualityMetricsStoreTests.swift */; }; 6A4E62EC9B7B970695F87136 /* TextDirectionDetector.swift in Sources */ = {isa = PBXBuildFile; fileRef = 328847A0F494360033366791 /* TextDirectionDetector.swift */; }; 6A8454A989104AE150308BCF /* it-100k.txt in Resources */ = {isa = PBXBuildFile; fileRef = 2D8AA55C2B730110E8598F91 /* it-100k.txt */; }; 6AE0B46FB52D189D94E1F79A /* WordCountFormatterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1E0513E3B23937B099A3CFF2 /* WordCountFormatterTests.swift */; }; 6BE0C8F9D054A2C0D9018001 /* ConfidenceSuppressionPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1BD71ECC2AE4821B643E0935 /* ConfidenceSuppressionPolicy.swift */; }; + 6C59B369AAC6948C53E41654 /* DebouncePolicyTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = EB23CDF7CAA1DEAD606B46B3 /* DebouncePolicyTests.swift */; }; 6CBEF02FCDFCF406E378C27C /* SuggestionInteractionStateTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8CB1D4F2681FAF59014AE115 /* SuggestionInteractionStateTests.swift */; }; 6D0E79CF3C1A8CE53046FCE5 /* AXTextGeometryResolverTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = C046CB4F3CB4BFE9391DB5DE /* AXTextGeometryResolverTests.swift */; }; 6D57E3CDF56127422311C065 /* TerminalAppDetector.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7F4C4A7EAF886E0CC945BFEF /* TerminalAppDetector.swift */; }; @@ -484,6 +488,7 @@ B782EC08B7516791BDB21172 /* FieldStyleCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = B7FBF2B766E728F25899B64E /* FieldStyleCache.swift */; }; B7A98BC225304E4DFED9E622 /* OnboardingTemplateRecommender.swift in Sources */ = {isa = PBXBuildFile; fileRef = FA878B447441BB4F3E327CC8 /* OnboardingTemplateRecommender.swift */; }; B816C6191738AB616F2E8D2D /* SuggestionCoordinatorTestSupport.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4C174D8294858BF9DF3D361D /* SuggestionCoordinatorTestSupport.swift */; }; + B849D68E0474CECAE809881C /* DebouncePolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6DC17643448271DE5DE61A89 /* DebouncePolicy.swift */; }; B93AB7E845086F6FBB068369 /* SuggestionRequestFactoryTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = EE94342B888A5A2CCF66BC93 /* SuggestionRequestFactoryTests.swift */; }; B9623395B31459D9D45B1320 /* CurrentWordExtractor.swift in Sources */ = {isa = PBXBuildFile; fileRef = 247561C626843957CFB4B632 /* CurrentWordExtractor.swift */; }; B9F400BCC20757DA5DB0B5F9 /* FoundationModelSuggestionEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5664E34B23FBDF69292FEF43 /* FoundationModelSuggestionEngine.swift */; }; @@ -503,6 +508,7 @@ BFCA7FAFDAEBF586AB615567 /* ClipboardRelevanceFilterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 90B0D133AB77A2503FB08827 /* ClipboardRelevanceFilterTests.swift */; }; C0537A515AED443F6C61DB2A /* MenuBarSections.swift in Sources */ = {isa = PBXBuildFile; fileRef = 83A810F9D28A18BA6F2066C7 /* MenuBarSections.swift */; }; C0B833234748E82D3382631A /* emoji.json in Resources */ = {isa = PBXBuildFile; fileRef = C379D77029D6E88C8C1B9AF7 /* emoji.json */; }; + C0F757D74758B76DA2962BC5 /* LlamaDecodeGateDefaultsTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2B3E5554AAC5D0007CCC61A7 /* LlamaDecodeGateDefaultsTests.swift */; }; C0FE11D76BDF01A5470C554D /* FocusCapabilityFlickerGate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6A44BEC8C23FF227731DD0CD /* FocusCapabilityFlickerGate.swift */; }; C149EAED2CF6F8B6274053E0 /* AppSurfaceClassifier.swift in Sources */ = {isa = PBXBuildFile; fileRef = 94B0830FBE4F2E239F670DBA /* AppSurfaceClassifier.swift */; }; C178E35A9A713BD4D9943E62 /* TypoCaseTransfer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 08CE63B8725EBD71A4C024E1 /* TypoCaseTransfer.swift */; }; @@ -582,6 +588,7 @@ E311B80968761E90FBA19A8A /* TypoGate.swift in Sources */ = {isa = PBXBuildFile; fileRef = B8412FE2BAC406421248A03B /* TypoGate.swift */; }; E313639E71AE1374D2B9A956 /* SuggestionWorkController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6B2D97BAA3618A7D0357AC44 /* SuggestionWorkController.swift */; }; E38801433B99E65BD7E45A0E /* LlamaPromptCacheHintTrackerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0CA88BB29BC8727878C99E95 /* LlamaPromptCacheHintTrackerTests.swift */; }; + E3C0326597083762BA6D76CA /* DebouncePolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6DC17643448271DE5DE61A89 /* DebouncePolicy.swift */; }; E3CAAEFAAB5BB24CEE16445B /* LLMIOFileHandler.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8D610FCA3A97249DCCE7D0B8 /* LLMIOFileHandler.swift */; }; E4382BEA8A8551612E5966B9 /* BaseCompletionPromptRenderer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 85EF79E6144D6C6AD062B569 /* BaseCompletionPromptRenderer.swift */; }; E46F50AEDA8FE13B02E3FA8D /* AXHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = AC70775535A3428991025AB8 /* AXHelper.swift */; }; @@ -730,6 +737,7 @@ 29CDC8BE5312B9BEFD9B22CB /* SurfaceContextComposerTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SurfaceContextComposerTests.swift; sourceTree = ""; }; 29ED42C4BDD0C521101AF95E /* DeviceInfo.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DeviceInfo.swift; sourceTree = ""; }; 2A02336442BB735EE2E8D064 /* SettingsAttentionEvaluator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsAttentionEvaluator.swift; sourceTree = ""; }; + 2B3E5554AAC5D0007CCC61A7 /* LlamaDecodeGateDefaultsTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaDecodeGateDefaultsTests.swift; sourceTree = ""; }; 2B7A28471B8526C2693FFF65 /* AcknowledgementsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AcknowledgementsView.swift; sourceTree = ""; }; 2BC293F6125E2B14DCF05AD9 /* SettingsAttentionEvaluatorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsAttentionEvaluatorTests.swift; sourceTree = ""; }; 2D1F9CEBAB0F330F8E7B61D8 /* InputSuppressionController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputSuppressionController.swift; sourceTree = ""; }; @@ -812,6 +820,7 @@ 6CF1FBAABEF545B620AF8D78 /* ru-100k.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = "ru-100k.txt"; sourceTree = ""; }; 6D4C1EF008B9DFA753D561D3 /* LlamaEvalScoringTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaEvalScoringTests.swift; sourceTree = ""; }; 6DB982BF30B3601F57277776 /* fr-100k.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = "fr-100k.txt"; sourceTree = ""; }; + 6DC17643448271DE5DE61A89 /* DebouncePolicy.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DebouncePolicy.swift; sourceTree = ""; }; 6DC693E00430F46E41CB56E6 /* RequestID.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RequestID.swift; sourceTree = ""; }; 6E3B1232C4BE8072A5183F9C /* SymSpell.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SymSpell.swift; sourceTree = ""; }; 6E3EC87078D3A4C21DB3252C /* RandomMacroEvaluator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RandomMacroEvaluator.swift; sourceTree = ""; }; @@ -840,6 +849,7 @@ 7F4C4A7EAF886E0CC945BFEF /* TerminalAppDetector.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TerminalAppDetector.swift; sourceTree = ""; }; 807148A920E003DEF8BA6092 /* SystemMetricsStore.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SystemMetricsStore.swift; sourceTree = ""; }; 815F2ABAF6AB75DA3AFBBCEF /* WordCountFormatter.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WordCountFormatter.swift; sourceTree = ""; }; + 81718CA62FBC775A6CEBCED1 /* SuggestionQualityMetricsStore.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionQualityMetricsStore.swift; sourceTree = ""; }; 82E7794DF60664B1FA8F6E7B /* UnitConversionEvaluator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = UnitConversionEvaluator.swift; sourceTree = ""; }; 82F7F7355967725162DF2D1B /* CustomRulesEditor.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CustomRulesEditor.swift; sourceTree = ""; }; 83A810F9D28A18BA6F2066C7 /* MenuBarSections.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MenuBarSections.swift; sourceTree = ""; }; @@ -929,6 +939,7 @@ B41F06FEF208B30ECCF23A6F /* MacroModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MacroModels.swift; sourceTree = ""; }; B424E2AC97C99D335B0D5751 /* SuggestionTextNormalizer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionTextNormalizer.swift; sourceTree = ""; }; B4B4A2E2DD6733658EC05BD8 /* DownloadFileRescuer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadFileRescuer.swift; sourceTree = ""; }; + B4CC566AC1DE33FD0CD30E1E /* SuggestionQualityMetricsStoreTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionQualityMetricsStoreTests.swift; sourceTree = ""; }; B6ACCB12E4DB32D2F2BEA567 /* PermissionHostApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PermissionHostApp.swift; sourceTree = ""; }; B6D36DB66629CF22C1783945 /* CompletionSeamGuardTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CompletionSeamGuardTests.swift; sourceTree = ""; }; B6D42CD456B4B3C988B148A6 /* FocusTrackingModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusTrackingModel.swift; sourceTree = ""; }; @@ -1018,6 +1029,7 @@ E68BE6A22BA0D42C8DD9868C /* SelfCaptureGate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SelfCaptureGate.swift; sourceTree = ""; }; E7F42112F14026E6253BB865 /* PermissionAndContextModelTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PermissionAndContextModelTests.swift; sourceTree = ""; }; EAAE6B395FAB604DF059280A /* KeyCodeLabels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = KeyCodeLabels.swift; sourceTree = ""; }; + EB23CDF7CAA1DEAD606B46B3 /* DebouncePolicyTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DebouncePolicyTests.swift; sourceTree = ""; }; EB630F9814388203DD1CA2EC /* ShortcutsPaneView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ShortcutsPaneView.swift; sourceTree = ""; }; EC04832FBD5311352F35241B /* SuggestionCaretLayoutRepairTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionCaretLayoutRepairTests.swift; sourceTree = ""; }; EC4A3C4BC38793EB11F484F1 /* CompositionInputModeClassifierTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CompositionInputModeClassifierTests.swift; sourceTree = ""; }; @@ -1304,6 +1316,7 @@ F4D9DF8723AF32C058BFACDE /* SpellingDictionaryCatalog.swift */, ADBE3E6CC585C1683787C877 /* SuggestionEngineModels.swift */, 386C98FFCF76EC1C8C7E82BB /* SuggestionModels.swift */, + 81718CA62FBC775A6CEBCED1 /* SuggestionQualityMetricsStore.swift */, D93563FDA25DFC0038E5F887 /* SuggestionSettingsData.swift */, 86460C747AA883FDE756BDBA /* SuggestionSettingsModel.swift */, DEB16474A67CE1D210B944C9 /* SuggestionSubsystemContracts.swift */, @@ -1372,6 +1385,7 @@ 1C4751DFE9DA372FBC40BA30 /* CurrentWordExtractorTests.swift */, AD752451330486FE270018B0 /* CustomRulesTests.swift */, 313EDBA60565836F32CEEC10 /* DateMacroEvaluatorTests.swift */, + EB23CDF7CAA1DEAD606B46B3 /* DebouncePolicyTests.swift */, B3B09064903B760D6DF2DF7D /* DecodeStopPolicyTests.swift */, 8F20A19A24D20E16D25ADCDA /* DeepGeometryWalkThrottleTests.swift */, C49F67B3EEB2F2A577A54085 /* DeviceInfoTests.swift */, @@ -1414,6 +1428,7 @@ 2960080A726E51198225147A /* InsertionStrategySelectorTests.swift */, 2930EC34057319130393696B /* KeyCodeLabelsTests.swift */, 4793D4EA5D36D7E5CC216C27 /* LanguageSupportTests.swift */, + 2B3E5554AAC5D0007CCC61A7 /* LlamaDecodeGateDefaultsTests.swift */, 906011A6C9D66EEBAF3B5CC0 /* LlamaEvalScoring.swift */, 6D4C1EF008B9DFA753D561D3 /* LlamaEvalScoringTests.swift */, 0CA88BB29BC8727878C99E95 /* LlamaPromptCacheHintTrackerTests.swift */, @@ -1466,6 +1481,7 @@ 45A896811745673061AF3612 /* SuggestionFocusFreshnessTests.swift */, 8CB1D4F2681FAF59014AE115 /* SuggestionInteractionStateTests.swift */, CDB25ABC4FFB0E63477CDCB0 /* SuggestionOverlayStabilityGateTests.swift */, + B4CC566AC1DE33FD0CD30E1E /* SuggestionQualityMetricsStoreTests.swift */, EE94342B888A5A2CCF66BC93 /* SuggestionRequestFactoryTests.swift */, 9C8F07AC52C7A482F5FE34C5 /* SuggestionSessionReconcilerTests.swift */, 00BB95A341A8B5F4A1725640 /* SuggestionSettingsModelTests.swift */, @@ -1624,6 +1640,7 @@ 9CC2D6472ACD377FD73A5801 /* ControlTokenMarkers.swift */, C7B2D34A6F3AC9DFD61350F7 /* CotabbyDebugOptions.swift */, 247561C626843957CFB4B632 /* CurrentWordExtractor.swift */, + 6DC17643448271DE5DE61A89 /* DebouncePolicy.swift */, D12ABBCE23A946C22894945B /* DecodeStopPolicy.swift */, 29ED42C4BDD0C521101AF95E /* DeviceInfo.swift */, 74BD1D4DB27D5D96D1E06096 /* DisplayCoordinateConverter.swift */, @@ -1938,6 +1955,7 @@ 55D4E6FB63E3475749E61EB3 /* CustomRulesCatalog.swift in Sources */, F7237FDB0665465F1C7EDCDE /* CustomRulesEditor.swift in Sources */, E0DAB3CDE782C330AF21FC0D /* DateMacroEvaluator.swift in Sources */, + E3C0326597083762BA6D76CA /* DebouncePolicy.swift in Sources */, FA25762161F068F59BEC86EB /* DecodeStopPolicy.swift in Sources */, 400E1A5145FC8C5BA2FAED0A /* DeepGeometryWalkThrottle.swift in Sources */, CF39EB76C3ECF8F764C1B4FB /* DeviceInfo.swift in Sources */, @@ -2081,6 +2099,7 @@ 7DEFC57991AB0C5379AD9CBF /* SuggestionModels.swift in Sources */, EE2C9177CE615298595215A8 /* SuggestionOverlayPresenter.swift in Sources */, DFF3AA49E0770DE3CFBC24C1 /* SuggestionOverlayStabilityGate.swift in Sources */, + 18680D0D66469A2954A50B6C /* SuggestionQualityMetricsStore.swift in Sources */, B691B8378FD73E186A72450C /* SuggestionRequestFactory.swift in Sources */, 532283A7651F7E66635F4281 /* SuggestionSessionReconciler.swift in Sources */, C8CA6DACEAA83336551D4EFA /* SuggestionSettingsData.swift in Sources */, @@ -2167,6 +2186,7 @@ 0431AE1DBEE36C90C7F39C19 /* CustomRulesCatalog.swift in Sources */, 4B4DDB569CAD806F765224DE /* CustomRulesEditor.swift in Sources */, DAD77998F793468D4D64B705 /* DateMacroEvaluator.swift in Sources */, + B849D68E0474CECAE809881C /* DebouncePolicy.swift in Sources */, 586B36CD813E1432D0AB1380 /* DecodeStopPolicy.swift in Sources */, 261FA692D19C48E53D6999BC /* DeepGeometryWalkThrottle.swift in Sources */, 1450746C690B3D98203B71EC /* DeviceInfo.swift in Sources */, @@ -2310,6 +2330,7 @@ 0AF568AB234033BA2DE4CAA7 /* SuggestionModels.swift in Sources */, 02DA43985CDAE6859014F14F /* SuggestionOverlayPresenter.swift in Sources */, 0F3267956257401F39386773 /* SuggestionOverlayStabilityGate.swift in Sources */, + 5687320132AD97B4086260DF /* SuggestionQualityMetricsStore.swift in Sources */, 46F341472191BC451B6BF6B5 /* SuggestionRequestFactory.swift in Sources */, CA5B2D226FBAA5419E78F14F /* SuggestionSessionReconciler.swift in Sources */, 7EEE6AEBFBD419FFE7C544BA /* SuggestionSettingsData.swift in Sources */, @@ -2378,6 +2399,7 @@ 64599CD334AAD79266224689 /* CurrentWordExtractorTests.swift in Sources */, 91D1F16B8C5DA281D4B7F699 /* CustomRulesTests.swift in Sources */, 4CCF29A7EA1B7D37841C135D /* DateMacroEvaluatorTests.swift in Sources */, + 6C59B369AAC6948C53E41654 /* DebouncePolicyTests.swift in Sources */, 79B0AEA0D2FC6A865E9303F9 /* DecodeStopPolicyTests.swift in Sources */, 664A5D62A723EB204ADEF2F9 /* DeepGeometryWalkThrottleTests.swift in Sources */, 43DED8ABEFF9894ED54097A9 /* DeviceInfoTests.swift in Sources */, @@ -2421,6 +2443,7 @@ F66F0D982EBAF5A3E99C5342 /* KeyCodeLabelsTests.swift in Sources */, 475FB7450EEC3C1B16E66CC4 /* LLMIOFileHandlerTests.swift in Sources */, E912D4617AE1376061DF1F00 /* LanguageSupportTests.swift in Sources */, + C0F757D74758B76DA2962BC5 /* LlamaDecodeGateDefaultsTests.swift in Sources */, 3D82280EFF7F7E9F3FFF45ED /* LlamaEvalScoring.swift in Sources */, 3D56E9B3AA378400E2C081E3 /* LlamaEvalScoringTests.swift in Sources */, E38801433B99E65BD7E45A0E /* LlamaPromptCacheHintTrackerTests.swift in Sources */, @@ -2472,6 +2495,7 @@ 5CED06E89FBEF557DCD6C684 /* SuggestionFocusFreshnessTests.swift in Sources */, 6CBEF02FCDFCF406E378C27C /* SuggestionInteractionStateTests.swift in Sources */, 4C6D8ED0A7B45D2EADF06DA5 /* SuggestionOverlayStabilityGateTests.swift in Sources */, + 695E431AC3FF79769E2C5EEF /* SuggestionQualityMetricsStoreTests.swift in Sources */, B93AB7E845086F6FBB068369 /* SuggestionRequestFactoryTests.swift in Sources */, 7E9413CE7C999C4612348248 /* SuggestionSessionReconcilerTests.swift in Sources */, 7C6D42EAD04C8144538B132A /* SuggestionSettingsModelTests.swift in Sources */, diff --git a/Cotabby/App/Coordinators/SettingsCoordinator.swift b/Cotabby/App/Coordinators/SettingsCoordinator.swift index 96a8132e..092b54a4 100644 --- a/Cotabby/App/Coordinators/SettingsCoordinator.swift +++ b/Cotabby/App/Coordinators/SettingsCoordinator.swift @@ -20,6 +20,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate { private let modelDownloadManager: ModelDownloadManager private let huggingFaceSearchService: HuggingFaceSearchService private let performanceMetricsStore: PerformanceMetricsStore + private let qualityMetricsStore: SuggestionQualityMetricsStore private let systemMetricsStore: SystemMetricsStore private let onShowWelcome: () -> Void private let clearEmojiHistory: () -> Void @@ -36,6 +37,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate { modelDownloadManager: ModelDownloadManager, huggingFaceSearchService: HuggingFaceSearchService, performanceMetricsStore: PerformanceMetricsStore, + qualityMetricsStore: SuggestionQualityMetricsStore, systemMetricsStore: SystemMetricsStore, onShowWelcome: @escaping () -> Void, clearEmojiHistory: @escaping () -> Void @@ -49,6 +51,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate { self.modelDownloadManager = modelDownloadManager self.huggingFaceSearchService = huggingFaceSearchService self.performanceMetricsStore = performanceMetricsStore + self.qualityMetricsStore = qualityMetricsStore self.systemMetricsStore = systemMetricsStore self.onShowWelcome = onShowWelcome self.clearEmojiHistory = clearEmojiHistory @@ -76,6 +79,7 @@ final class SettingsCoordinator: NSObject, NSWindowDelegate { modelDownloadManager: modelDownloadManager, huggingFaceSearchService: huggingFaceSearchService, performanceMetricsStore: performanceMetricsStore, + qualityMetricsStore: qualityMetricsStore, systemMetricsStore: systemMetricsStore, onShowWelcome: onShowWelcome, clearEmojiHistory: clearEmojiHistory diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Acceptance.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Acceptance.swift index f5290b64..ba27b091 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator+Acceptance.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Acceptance.swift @@ -122,6 +122,7 @@ extension SuggestionCoordinator { deferAcceptanceBookkeeping { [weak self] in self?.recordAcceptedWords(from: acceptedChunk) + self?.recordSuggestionAcceptedIfFirstChunk(of: sessionForAcceptance) } cancelPredictionWork() @@ -563,6 +564,14 @@ extension SuggestionCoordinator { } } + /// Marks the session's suggestion accepted in the quality counters, once per suggestion: only + /// the first chunk counts, so word-by-word walks of one suggestion add nothing further and the + /// acceptance rate stays suggestions-accepted over suggestions-shown. + private func recordSuggestionAcceptedIfFirstChunk(of session: ActiveSuggestionSession) { + guard session.consumedCharacterCount == 0 else { return } + qualityMetricsStore.recordAcceptedSuggestion() + } + /// Updates the global productivity counter from text accepted via Tab. func recordAcceptedWords(from acceptedChunk: String) { let acceptedWordCount = SuggestionSessionReconciler.acceptedWordCount(in: acceptedChunk) diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift index 33b009b4..5b0ae7aa 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift @@ -20,11 +20,18 @@ extension SuggestionCoordinator { return } + // The debounce window adapts to the last generation latency: snappier when the model is + // fast, calmer when it is slow (fewer doomed generations to cancel). The configured value + // is the fallback until a first latency exists. + let debounceMilliseconds = DebouncePolicy.milliseconds( + lastGenerationLatencyMilliseconds: latestLatencyMilliseconds, + fallback: settingsSnapshot.debounceMilliseconds + ) // The debounce clock starts at the keystroke, not here. The host-publish poll has already // consumed real wall time waiting for the host to publish the keystroke to AX, and that // wait collapses bursts just as well as sleeping does. Stacking the full debounce on top // of the publish wait was pure added latency, so only the unconsumed remainder is slept. - let remainingDelay = max(0, settingsSnapshot.debounceMilliseconds - consumedDelayMilliseconds) + let remainingDelay = max(0, debounceMilliseconds - consumedDelayMilliseconds) // Task cancellation in Swift is cooperative, so we also use an explicit work id. // That gives us strict "latest request wins" semantics even if an old task wakes up late. @@ -42,7 +49,7 @@ extension SuggestionCoordinator { logStage( "debouncing", workID: workID, - message: "Debouncing (\(settingsSnapshot.debounceMilliseconds)ms window) before generating." + message: "Debouncing (\(debounceMilliseconds)ms window, \(remainingDelay)ms remaining) before generating." ) } @@ -496,6 +503,10 @@ extension SuggestionCoordinator { guard liveContext.generation == result.generation else { latestRawModelOutput = SuggestionDebugLogger.debugPreview(result.rawText) + // Lifecycle discards are counted under their own reasons so `generated` always equals + // `shown` plus the suppression histogram; without this, every drop here silently + // inflated the generated count against the others. + qualityMetricsStore.recordSuppressed(reason: "discardedStaleContext") logStage( "stale-drop", workID: workID, @@ -514,6 +525,11 @@ extension SuggestionCoordinator { clearSuggestion() hideOverlay(reason: "Overlay hidden because the model returned an empty continuation.") state = .idle + // The router already counted engine-attributed suppressions (normalizer, confidence + // floor); only the unattributed "model produced nothing" case needs a ledger entry. + if result.suppressionReason == nil { + qualityMetricsStore.recordSuppressed(reason: "emptyUnattributed") + } logStage( "empty-result", workID: workID, @@ -529,6 +545,7 @@ extension SuggestionCoordinator { clearSuggestion(clearDiagnostics: true) hideOverlay(reason: "Overlay hidden because text is selected.") state = .idle + qualityMetricsStore.recordSuppressed(reason: "discardedSelection") logStage( "selected-text", workID: workID, @@ -553,6 +570,7 @@ extension SuggestionCoordinator { clearSuggestion(clearDiagnostics: false) hideOverlay(reason: "Overlay hidden because the regeneration only echoed the just-accepted text before the host published it.") state = .idle + qualityMetricsStore.recordSuppressed(reason: "discardedAcceptEcho") logStage( "stale-accept-echo", workID: workID, @@ -576,6 +594,8 @@ extension SuggestionCoordinator { clearSuggestion() hideOverlay(reason: "Overlay hidden because the completion failed the seam guard.") state = .idle + let seamReason = if case .seamMisspelling = seamVerdict { "seamMisspelling" } else { "seamJunkPunctuationRun" } + qualityMetricsStore.recordSuppressed(reason: seamReason) logStage( "seam-suppressed", workID: workID, @@ -589,6 +609,9 @@ extension SuggestionCoordinator { latestLatencyMilliseconds = Int(result.latency * 1000) latestGenerationNumber = liveContext.generation + // One shown event per suggestion: this is the only place a fresh generation becomes + // visible (re-presentations after partial accepts reuse the same session). + qualityMetricsStore.recordShown() let session = interactionState.startSession( fullText: result.text, liveContext: liveContext, diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator.swift b/Cotabby/App/Coordinators/SuggestionCoordinator.swift index 61516984..18733160 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator.swift @@ -54,6 +54,9 @@ final class SuggestionCoordinator: ObservableObject { /// `CotabbyAppEnvironment`) so the underlying `NSSpellChecker` document tag persists across the /// coordinator's lifetime instead of churning per keystroke. let spellChecker: CurrentWordSpellChecker + /// Always-on quality counters (shown / suppressed / accepted). The router counts generation + /// outcomes; the coordinator owns the display-time and acceptance events only it can see. + let qualityMetricsStore: SuggestionQualityMetricsStore /// Frequency-ranked correction source (SymSpell). Used first for the correction word, with /// `spellChecker` as the fallback while its index is still loading or when it has no suggestion. let symSpellCorrector: SymSpellCorrector @@ -163,6 +166,7 @@ final class SuggestionCoordinator: ObservableObject { spellChecker: CurrentWordSpellChecker, symSpellCorrector: SymSpellCorrector, spellingLanguageResolver: SpellingLanguageResolver = SpellingLanguageResolver(), + qualityMetricsStore: SuggestionQualityMetricsStore, userDefaults: UserDefaults = .standard ) { let storedTotalTabAcceptedWordCount = userDefaults.integer( @@ -184,6 +188,7 @@ final class SuggestionCoordinator: ObservableObject { self.spellChecker = spellChecker self.symSpellCorrector = symSpellCorrector self.spellingLanguageResolver = spellingLanguageResolver + self.qualityMetricsStore = qualityMetricsStore self.userDefaults = userDefaults settingsSnapshot = suggestionSettings.snapshot // These collaborators isolate "how overlay/logging works" from "when the coordinator diff --git a/Cotabby/App/Core/CotabbyAppEnvironment.swift b/Cotabby/App/Core/CotabbyAppEnvironment.swift index 9eed835a..06d4d389 100644 --- a/Cotabby/App/Core/CotabbyAppEnvironment.swift +++ b/Cotabby/App/Core/CotabbyAppEnvironment.swift @@ -34,6 +34,7 @@ final class CotabbyAppEnvironment { let welcomeCoordinator: WelcomeCoordinator let huggingFaceSearchService: HuggingFaceSearchService let performanceMetricsStore: PerformanceMetricsStore + let qualityMetricsStore: SuggestionQualityMetricsStore let settingsCoordinator: SettingsCoordinator let activationIndicatorController: ActivationIndicatorController let focusDebugOverlayController: FocusDebugOverlayController? @@ -113,6 +114,9 @@ final class CotabbyAppEnvironment { ) let huggingFaceSearchService = HuggingFaceSearchService() let performanceMetricsStore = PerformanceMetricsStore() + // Always-on quality counters (generated / shown / suppressed-by-reason / accepted). + // Counters only, no content, so unlike latency tracking there is no opt-in gate. + let qualityMetricsStore = SuggestionQualityMetricsStore() // Live CPU/RAM graph backing for the Performance pane. Holds no state until the pane asks it // to start sampling, so constructing it eagerly here costs nothing. let systemMetricsStore = SystemMetricsStore() @@ -157,6 +161,7 @@ final class CotabbyAppEnvironment { foundationModelEngine: foundationModelEngine, llamaEngine: LlamaSuggestionEngine(runtimeManager: runtimeManager), performanceMetricsStore: performanceMetricsStore, + qualityMetricsStore: qualityMetricsStore, llamaModelNameProvider: { [weak runtimeManager] in runtimeManager?.currentModelFilename } @@ -176,6 +181,7 @@ final class CotabbyAppEnvironment { modelDownloadManager: modelDownloadManager, huggingFaceSearchService: huggingFaceSearchService, performanceMetricsStore: performanceMetricsStore, + qualityMetricsStore: qualityMetricsStore, systemMetricsStore: systemMetricsStore, onShowWelcome: { [weak welcomeCoordinator] in welcomeCoordinator?.showWelcome() @@ -213,7 +219,8 @@ final class CotabbyAppEnvironment { configuration: configuration, spellChecker: spellChecker, symSpellCorrector: symSpellCorrector, - spellingLanguageResolver: SpellingLanguageResolver() + spellingLanguageResolver: SpellingLanguageResolver(), + qualityMetricsStore: qualityMetricsStore ) // The emoji picker is a sibling to the suggestion coordinator. It reuses the input monitor, @@ -276,6 +283,7 @@ final class CotabbyAppEnvironment { self.welcomeCoordinator = welcomeCoordinator self.huggingFaceSearchService = huggingFaceSearchService self.performanceMetricsStore = performanceMetricsStore + self.qualityMetricsStore = qualityMetricsStore self.settingsCoordinator = settingsCoordinator self.activationIndicatorController = activationIndicatorController self.focusDebugOverlayController = FocusDebugOverlayController.isEnabled diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift index 954c44d0..1755e434 100644 --- a/Cotabby/Models/LlamaRuntimeModels.swift +++ b/Cotabby/Models/LlamaRuntimeModels.swift @@ -197,6 +197,28 @@ struct LlamaGenerationOptions: Equatable, Sendable { /// degenerate instant stops (e.g. a lone leading period). Lives here so length presets can tune /// the floor without reaching into `DecodeStopPolicy`; the default preserves prior behavior. var sentenceStopMinimumTokens: Int = 2 + + /// Stop decoding the moment the raw distribution's most-likely next token is end-of-generation, + /// even when the stochastic sampler drew something else. The model's top choice being "stop" + /// is the strongest anti-rambling signal available per token, and the engine computes it while + /// the logits row is hot, so honoring it costs nothing here. + var stopAtArgmaxEOG: Bool = true +} + +/// One generation's text plus the confidence signals the caller needs for suppression accounting. +/// Returned instead of a bare string so a confidence-suppressed completion is attributed to the +/// real reason rather than reading as "the model produced nothing". +struct LlamaGenerationOutput: Equatable, Sendable { + let text: String + /// Mean per-token log-probability of the generated tokens; nil when confidence gating was off + /// (the engine skips the per-token logprob work entirely) or nothing was generated. + let averageLogprob: Double? + /// True when the completion was withheld because `averageLogprob` fell below the floor. + let suppressedByLowConfidence: Bool + + static func text(_ text: String) -> LlamaGenerationOutput { + LlamaGenerationOutput(text: text, averageLogprob: nil, suppressedByLowConfidence: false) + } } /// The concrete runtime assets selected during bootstrap after checking available model files. diff --git a/Cotabby/Models/SuggestionModels.swift b/Cotabby/Models/SuggestionModels.swift index a0f55c88..dfc44b15 100644 --- a/Cotabby/Models/SuggestionModels.swift +++ b/Cotabby/Models/SuggestionModels.swift @@ -449,6 +449,25 @@ struct SuggestionResult: Equatable, Sendable { let rawText: String let text: String let latency: TimeInterval + /// Raw value of the `CompletionSuppressionReason` that emptied `text`, when one applies. + /// Carried as a string so the coordinator's quality accounting never needs the normalizer + /// type, and so engine-specific reasons can ride along without enum churn. The explicit + /// initializer default keeps existing call sites compiling unchanged. + let suppressionReason: String? + + init( + generation: UInt64, + rawText: String, + text: String, + latency: TimeInterval, + suppressionReason: String? = nil + ) { + self.generation = generation + self.rawText = rawText + self.text = text + self.latency = latency + self.suppressionReason = suppressionReason + } } /// Represents one active inline-completion session after the model has produced a suggestion. diff --git a/Cotabby/Models/SuggestionQualityMetricsStore.swift b/Cotabby/Models/SuggestionQualityMetricsStore.swift new file mode 100644 index 00000000..12d81612 --- /dev/null +++ b/Cotabby/Models/SuggestionQualityMetricsStore.swift @@ -0,0 +1,86 @@ +import Combine +import Foundation + +/// Local, always-on counters that answer "is suggestion quality improving for real use": how many +/// completions were generated, how many were shown, why the withheld ones were withheld, and how +/// many shown suggestions the user actually accepted. +/// +/// Latency tracking (`PerformanceMetricsStore`) stays opt-in because it records per-request rows; +/// these are lifetime counters with zero content, so they run unconditionally and survive restarts. +/// Acceptance rate (accepted / shown) is the closest thing to ground truth the app can measure on +/// device, and the suppression histogram tells the difference between "the model produced nothing" +/// and "a specific guard fired", which otherwise only exists scattered through debug-only JSONL. +@MainActor +final class SuggestionQualityMetricsStore: ObservableObject { + struct Counters: Codable, Equatable { + var generated = 0 + var shown = 0 + /// Sessions the user accepted at least once. Counted per suggestion, not per Tab press, + /// so word-by-word acceptance of one suggestion is one acceptance. + var acceptedSuggestions = 0 + /// Keyed by `CompletionSuppressionReason` raw values plus coordinator-level reasons + /// (the seam guard verdicts). String-keyed so new reasons never need a schema migration. + var suppressedByReason: [String: Int] = [:] + var firstRecordedAt: Date? + + var suppressedTotal: Int { suppressedByReason.values.reduce(0, +) } + + var acceptanceRate: Double? { + guard shown > 0 else { return nil } + return Double(acceptedSuggestions) / Double(shown) + } + } + + @Published private(set) var counters: Counters + + private let userDefaults: UserDefaults + private static let defaultsKey = "cotabbyQualityMetricsCounters" + + /// Stored-property @MainActor classes deallocated inside app-hosted tests double-free without + /// an explicitly nonisolated deinit (the isolated-deinit runtime path over-releases). Same + /// workaround as the other main-actor stores exercised by tests. + nonisolated deinit {} + + init(userDefaults: UserDefaults = .standard) { + self.userDefaults = userDefaults + if let data = userDefaults.data(forKey: Self.defaultsKey), + let decoded = try? JSONDecoder().decode(Counters.self, from: data) { + counters = decoded + } else { + counters = Counters() + } + } + + func recordGenerated() { + mutate { $0.generated += 1 } + } + + func recordShown() { + mutate { $0.shown += 1 } + } + + func recordAcceptedSuggestion() { + mutate { $0.acceptedSuggestions += 1 } + } + + func recordSuppressed(reason: String) { + mutate { $0.suppressedByReason[reason, default: 0] += 1 } + } + + func reset() { + counters = Counters() + userDefaults.removeObject(forKey: Self.defaultsKey) + } + + private func mutate(_ change: (inout Counters) -> Void) { + var updated = counters + change(&updated) + if updated.firstRecordedAt == nil { + updated.firstRecordedAt = Date() + } + counters = updated + if let data = try? JSONEncoder().encode(updated) { + userDefaults.set(data, forKey: Self.defaultsKey) + } + } +} diff --git a/Cotabby/Models/SuggestionSubsystemContracts.swift b/Cotabby/Models/SuggestionSubsystemContracts.swift index 77ece468..79655f7c 100644 --- a/Cotabby/Models/SuggestionSubsystemContracts.swift +++ b/Cotabby/Models/SuggestionSubsystemContracts.swift @@ -127,16 +127,21 @@ extension SuggestionGenerating { /// a fake runtime instead of loading a real model. `LlamaRuntimeManager` is the production conformer. @MainActor protocol LlamaRuntimeGenerating: AnyObject { - func generate(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws -> String + func generate( + prompt: String, + cachedPrefixBytes: Int?, + options: LlamaGenerationOptions + ) async throws -> LlamaGenerationOutput /// Streaming variant: `onPartialRawText` receives the cumulative raw completion after each /// sampled token, called from the decode thread (hence `@Sendable`); callers own hopping to - /// their actor. The returned string is still the authoritative final completion. + /// their actor. The returned output's text is still the authoritative final completion, and + /// its confidence fields describe the whole generation (partials are pre-gate by nature). func generate( prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions, onPartialRawText: (@Sendable (String) -> Void)? - ) async throws -> String + ) async throws -> LlamaGenerationOutput func resetPromptCache() /// Decodes `prompt` into the native prompt cache without sampling any tokens, so the next /// `generate` whose prompt extends this one only decodes the typed delta. Best-effort warmup: @@ -157,7 +162,7 @@ extension LlamaRuntimeGenerating { cachedPrefixBytes: Int?, options: LlamaGenerationOptions, onPartialRawText: (@Sendable (String) -> Void)? - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { try await generate(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options) } } diff --git a/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift b/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift index 570a6dc9..fae711e0 100644 --- a/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift @@ -175,7 +175,8 @@ final class FoundationModelSuggestionEngine { generation: request.generation, rawText: rawSuggestion, text: normalizedSuggestion, - latency: latency + latency: latency, + suppressionReason: normalization.suppression?.rawValue ) } catch is CancellationError { CotabbyLogger.suggestion.debug("Foundation model generation cancelled", metadata: baseMetadata) diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index f9313d6c..492a25c2 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -140,7 +140,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { cachedPrefixBytes: Int? = nil, options: LlamaGenerationOptions, onPartialRawText: ((String) -> Void)? = nil - ) throws -> String { + ) throws -> LlamaGenerationOutput { let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "generate") lifecycleCondition.lock() @@ -199,7 +199,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { engine.destroySequence(sequenceID) autocompleteSequenceID = -1 } - return decode.text + return decode.output } /// Decodes `prompt` into the autocomplete KV cache without sampling, so the next `generate` @@ -364,7 +364,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { sequenceID: Int32, options: LlamaGenerationOptions, onPartialRawText: ((String) -> Void)? = nil - ) -> (text: String, engineCancelled: Bool) { + ) -> (output: LlamaGenerationOutput, engineCancelled: Bool) { var generatedText = "" var tokensGenerated = 0 var sumLogprob = 0.0 @@ -392,6 +392,14 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { stopReason = "eos" break } + // The raw distribution's most-likely token is end-of-generation: the model wants to + // stop here even though the stochastic sampler drew something else. Finalize with the + // text accumulated so far and discard the sampled-but-unwanted token; this is the + // anti-rambling stop the sentence classifier cannot express (lists, fragments, code). + if options.stopAtArgmaxEOG, result.argmax_is_eog { + stopReason = "argmax_eog" + break + } let piece = Self.extractPiece(result) generatedText += piece @@ -428,10 +436,25 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { ] ) + // The average is only meaningful when the engine actually computed per-token logprobs, + // which is keyed on the floor being enabled (see setComputeLogprob at sequence setup). + let averageLogprob: Double? = options.confidenceFloor > -.infinity && tokensGenerated > 0 + ? sumLogprob / Double(tokensGenerated) + : nil if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) { - return ("", engineCancelled) + let suppressed = LlamaGenerationOutput( + text: "", + averageLogprob: averageLogprob, + suppressedByLowConfidence: true + ) + return (suppressed, engineCancelled) } - return (generatedText, engineCancelled) + let output = LlamaGenerationOutput( + text: generatedText, + averageLogprob: averageLogprob, + suppressedByLowConfidence: false + ) + return (output, engineCancelled) } /// Low-confidence gate for the sampled decoder: drop completions the model itself was unsure diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift index 889522ca..af02d8e5 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift @@ -100,7 +100,7 @@ final class LlamaRuntimeManager: ObservableObject { prompt: String, cachedPrefixBytes: Int? = nil, options: LlamaGenerationOptions - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { try await generate( prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, @@ -116,7 +116,7 @@ final class LlamaRuntimeManager: ObservableObject { cachedPrefixBytes: Int? = nil, options: LlamaGenerationOptions, onPartialRawText: (@Sendable (String) -> Void)? - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { _ = try await preparedRuntime() let core = self.core diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift index 4092bbb9..b090ddc9 100644 --- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift @@ -20,6 +20,30 @@ final class LlamaSuggestionEngine { self.runtimeManager = runtimeManager } + /// Shipped confidence floor (mean per-token log-probability). -infinity disables the gate AND + /// the per-token logprob computation behind it; any other value turns both on. -1.5 came from + /// an eval sweep over {off, -4, -3, -2.5, -2, -1.5, -1, -0.5, -0.3}: floors at or below -2 + /// never fire on this model at temperature 0.1, -1 and tighter buy precision at a brutal + /// coverage cost, and -1.5 is the unique point where the composite quality score rose + /// (0.734 to 0.744), wrong-shows fell 27% relative, and not a single must-show case was lost. + static let defaultConfidenceFloor: Double = -1.5 + /// `defaults write` escape hatches for dogfooding and field diagnosis without a rebuild. + static let confidenceFloorOverrideKey = "cotabbyConfidenceFloorOverride" + static let argmaxStopDisabledKey = "cotabbyArgmaxStopDisabled" + + static func resolvedConfidenceFloor(_ defaults: UserDefaults = .standard) -> Double { + guard defaults.object(forKey: confidenceFloorOverrideKey) != nil else { + return defaultConfidenceFloor + } + return defaults.double(forKey: confidenceFloorOverrideKey) + } + + /// Mirrors `resolvedConfidenceFloor`: injectable defaults so the disable toggle is testable + /// against an isolated suite instead of process-global state. + static func resolvedStopAtArgmaxEOG(_ defaults: UserDefaults = .standard) -> Bool { + !defaults.bool(forKey: argmaxStopDisabledKey) + } + /// Prefills the prompt KV for the field the user just focused, so the first real suggestion /// there only decodes the typed delta instead of the whole cold prompt. /// @@ -89,9 +113,9 @@ final class LlamaSuggestionEngine { ]) { _, new in new } ) let options = Self.makeGenerationOptions(for: request) - let rawSuggestion: String + let output: LlamaGenerationOutput if let onPartial { - rawSuggestion = try await runtimeManager.generate( + output = try await runtimeManager.generate( prompt: request.prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, @@ -115,7 +139,7 @@ final class LlamaSuggestionEngine { } ) } else { - rawSuggestion = try await runtimeManager.generate( + output = try await runtimeManager.generate( prompt: request.prompt, cachedPrefixBytes: cachedPrefixBytes, options: options @@ -124,7 +148,15 @@ final class LlamaSuggestionEngine { try Task.checkCancellation() promptCacheHintTracker.recordSuccessfulRequest(request) - let normalization = SuggestionTextNormalizer.normalizeDetailed(rawSuggestion, for: request) + let rawSuggestion = output.text + // A confidence-suppressed completion never reaches the normalizer (the runtime already + // withheld the text); attribute the real reason instead of "the model produced nothing". + // Streamed partials are pre-gate by nature: a completion the floor later withholds can + // briefly paint and then clear, which is the same contract as any other final-result + // suppression under streaming. + let normalization = output.suppressedByLowConfidence + ? SuggestionNormalizationResult(text: "", suppression: .lowConfidence) + : SuggestionTextNormalizer.normalizeDetailed(rawSuggestion, for: request) let normalizedSuggestion = normalization.text let latency = Date().timeIntervalSince(startTime) let rawChars = rawSuggestion.count @@ -133,12 +165,14 @@ final class LlamaSuggestionEngine { // `suppression_reason` distinguishes an empty ghost text caused by the model producing // nothing from one a filter dropped — the join key for judging decode quality on device. let suppressionReason = normalization.suppression?.rawValue ?? "none" + let averageLogprobDescription = output.averageLogprob.map { String(format: "%.3f", $0) } ?? "off" CotabbyLogger.suggestion.debug( "Llama generated", metadata: baseMetadata.merging([ "raw_chars": .stringConvertible(rawChars), "normalized_chars": .stringConvertible(normalizedChars), "suppression_reason": .string(suppressionReason), + "avg_logprob": .string(averageLogprobDescription), "latency_ms": .stringConvertible(latencyMs) ]) { _, new in new } ) @@ -152,6 +186,7 @@ final class LlamaSuggestionEngine { "raw_chars": .stringConvertible(rawChars), "normalized_chars": .stringConvertible(normalizedChars), "suppression_reason": .string(suppressionReason), + "avg_logprob": .string(averageLogprobDescription), "latency_ms": .stringConvertible(latencyMs), "cache_hint_bytes": .string(hintDesc), "max_tokens": .stringConvertible(request.maxPredictionTokens) @@ -161,7 +196,8 @@ final class LlamaSuggestionEngine { generation: request.generation, rawText: rawSuggestion, text: normalizedSuggestion, - latency: latency + latency: latency, + suppressionReason: normalization.suppression?.rawValue ) } catch is CancellationError { CotabbyLogger.suggestion.debug("Llama generation cancelled", metadata: baseMetadata) @@ -233,7 +269,9 @@ final class LlamaSuggestionEngine { forceWordContinuation: MidWordContinuationPolicy.shouldForceContinuation( precedingText: request.context.precedingText, trailingText: request.context.trailingText - ) + ), + confidenceFloor: resolvedConfidenceFloor(), + stopAtArgmaxEOG: resolvedStopAtArgmaxEOG() ) } } diff --git a/Cotabby/Services/Runtime/SuggestionEngineRouter.swift b/Cotabby/Services/Runtime/SuggestionEngineRouter.swift index 65b0cfbe..8d217f6e 100644 --- a/Cotabby/Services/Runtime/SuggestionEngineRouter.swift +++ b/Cotabby/Services/Runtime/SuggestionEngineRouter.swift @@ -11,6 +11,7 @@ final class SuggestionEngineRouter { private let foundationModelEngine: any SuggestionGenerating private let llamaEngine: any SuggestionGenerating private let performanceMetricsStore: PerformanceMetricsStore + private let qualityMetricsStore: SuggestionQualityMetricsStore /// Closure that returns the currently selected llama model filename (e.g. `Qwen3-0.6B-Q8_0.gguf`). /// A closure instead of a direct `LlamaRuntimeManager` reference keeps the router from depending /// on the concrete runtime type — useful for tests that want to fake the model label. @@ -21,12 +22,14 @@ final class SuggestionEngineRouter { foundationModelEngine: any SuggestionGenerating, llamaEngine: any SuggestionGenerating, performanceMetricsStore: PerformanceMetricsStore, + qualityMetricsStore: SuggestionQualityMetricsStore, llamaModelNameProvider: @escaping @MainActor () -> String? ) { self.suggestionSettings = suggestionSettings self.foundationModelEngine = foundationModelEngine self.llamaEngine = llamaEngine self.performanceMetricsStore = performanceMetricsStore + self.qualityMetricsStore = qualityMetricsStore self.llamaModelNameProvider = llamaModelNameProvider } @@ -48,6 +51,7 @@ final class SuggestionEngineRouter { do { let result = try await foundationModelEngine.generateSuggestion(for: request, onPartial: onPartial) recordPerformanceMetric(modelName: "Apple Intelligence", latency: result.latency) + recordQualityOutcome(result) return result } catch SuggestionClientError.unsupportedLanguageOrLocale(let message) { CotabbyLogger.suggestion.info( @@ -67,10 +71,23 @@ final class SuggestionEngineRouter { CotabbyLogger.suggestion.debug("Routing to open-source llama engine", metadata: metadata) let result = try await llamaEngine.generateSuggestion(for: request, onPartial: onPartial) recordPerformanceMetric(modelName: llamaModelNameProvider() ?? "Llama", latency: result.latency) + recordQualityOutcome(result) return result } } + /// Counts every finished generation plus the engine-attributed suppression reason when the + /// pipeline emptied the text. The router is the single point that sees every finished result + /// regardless of engine or fallback, which keeps these counters complete by construction. + /// Display-time outcomes (shown, seam-guard suppressions, acceptance) are recorded by the + /// coordinator, the only layer that knows them. + private func recordQualityOutcome(_ result: SuggestionResult) { + qualityMetricsStore.recordGenerated() + if let reason = result.suppressionReason { + qualityMetricsStore.recordSuppressed(reason: reason) + } + } + /// Persists one (timestamp, model, latency) triple into the rolling ring buffer when the /// Performance pane toggle is on. The router is the right home for this seam because it is /// the single point that sees a finished `SuggestionResult` and knows which engine produced @@ -121,6 +138,7 @@ final class SuggestionEngineRouter { do { let result = try await llamaEngine.generateSuggestion(for: request, onPartial: onPartial) recordPerformanceMetric(modelName: llamaModelNameProvider() ?? "Llama", latency: result.latency) + recordQualityOutcome(result) return result } catch SuggestionClientError.cancelled { throw SuggestionClientError.cancelled diff --git a/Cotabby/Support/DebouncePolicy.swift b/Cotabby/Support/DebouncePolicy.swift new file mode 100644 index 00000000..6d2af60a --- /dev/null +++ b/Cotabby/Support/DebouncePolicy.swift @@ -0,0 +1,21 @@ +import Foundation + +/// Chooses the prediction debounce from the last observed generation latency. +/// +/// A fixed debounce serves two masters badly: on fast hardware it adds avoidable delay before +/// every suggestion, and on slow hardware it lets keystrokes pile doomed generations onto a model +/// that cannot keep up (each cancel still costs a decode setup and teardown). Keying the debounce +/// to the most recent generation latency makes fast machines snappier and slow machines calmer, +/// with no configuration. The configured value remains the fallback until a first latency exists. +nonisolated enum DebouncePolicy { + static func milliseconds(lastGenerationLatencyMilliseconds: Int?, fallback: Int) -> Int { + guard let last = lastGenerationLatencyMilliseconds, last > 0 else { + return fallback + } + switch last { + case ...70: return 15 + case ...140: return 25 + default: return 55 + } + } +} diff --git a/Cotabby/Support/SuggestionTextNormalizer.swift b/Cotabby/Support/SuggestionTextNormalizer.swift index eafd1422..8a90df82 100644 --- a/Cotabby/Support/SuggestionTextNormalizer.swift +++ b/Cotabby/Support/SuggestionTextNormalizer.swift @@ -24,6 +24,11 @@ enum CompletionSuppressionReason: String, Sendable, Equatable { case echoesPrecedingText /// Printable characters survived but carried control/replacement glyphs the safety gate rejects. case unsafeToInsert + /// The runtime withheld the completion because its mean per-token log-probability fell below + /// the confidence floor: the model itself was unsure, and showing nothing beats a guess. + /// Attributed by the engine (the runtime reports it on `LlamaGenerationOutput`), not by the + /// normalizer, which never sees the withheld text. + case lowConfidence } /// Outcome of normalizing one raw completion: the ghost text, plus the attributable reason when that diff --git a/Cotabby/UI/Settings/Panes/PerformancePaneView.swift b/Cotabby/UI/Settings/Panes/PerformancePaneView.swift index b001167f..13d99ba1 100644 --- a/Cotabby/UI/Settings/Panes/PerformancePaneView.swift +++ b/Cotabby/UI/Settings/Panes/PerformancePaneView.swift @@ -11,12 +11,15 @@ import SwiftUI struct PerformancePaneView: View { @ObservedObject var suggestionSettings: SuggestionSettingsModel @ObservedObject var performanceMetricsStore: PerformanceMetricsStore + @ObservedObject var qualityMetricsStore: SuggestionQualityMetricsStore @ObservedObject var systemMetricsStore: SystemMetricsStore var body: some View { SettingsPaneScaffold { liveResourceSection + suggestionQualitySection + Section("Tracking") { Toggle(isOn: trackingEnabledBinding) { SettingsRowLabel( @@ -59,6 +62,71 @@ struct PerformancePaneView: View { .onDisappear { systemMetricsStore.endSampling() } } + // MARK: - Suggestion quality counters + + /// Lifetime counters: how often suggestions appear, why withheld ones were withheld, and how + /// many shown suggestions the user accepted. Always on (counters carry no content), unlike the + /// per-request latency log below, which records timestamps and stays opt-in. + private var suggestionQualitySection: some View { + Section { + qualityCounterRow(label: "Suggestions shown", value: "\(qualityMetricsStore.counters.shown)") + qualityCounterRow(label: "Accepted", value: acceptedLabel) + qualityCounterRow(label: "Generations", value: "\(qualityMetricsStore.counters.generated)") + if !topSuppressionReasons.isEmpty { + qualityCounterRow( + label: "Withheld (\(qualityMetricsStore.counters.suppressedTotal))", + value: topSuppressionReasons + ) + } + } header: { + HStack { + Text(qualityHeaderLabel) + Spacer() + if qualityMetricsStore.counters.shown > 0 || qualityMetricsStore.counters.generated > 0 { + Button("Reset") { + qualityMetricsStore.reset() + } + .buttonStyle(.borderless) + .controlSize(.small) + } + } + } + } + + private func qualityCounterRow(label: String, value: String) -> some View { + HStack(alignment: .firstTextBaseline) { + Text(label) + Spacer() + Text(value) + .foregroundStyle(.secondary) + .multilineTextAlignment(.trailing) + } + } + + private var acceptedLabel: String { + let accepted = qualityMetricsStore.counters.acceptedSuggestions + guard let rate = qualityMetricsStore.counters.acceptanceRate else { + return "\(accepted)" + } + return "\(accepted) (\(Int((rate * 100).rounded()))%)" + } + + private var topSuppressionReasons: String { + qualityMetricsStore.counters.suppressedByReason + .sorted { lhs, rhs in lhs.value == rhs.value ? lhs.key < rhs.key : lhs.value > rhs.value } + .prefix(4) + .map { "\($0.key) \($0.value)" } + .joined(separator: ", ") + } + + private var qualityHeaderLabel: String { + guard let since = qualityMetricsStore.counters.firstRecordedAt else { + return "Suggestion Quality" + } + let formatted = since.formatted(date: .abbreviated, time: .omitted) + return "Suggestion Quality (since \(formatted))" + } + // MARK: - Live resource graphs private var liveResourceSection: some View { diff --git a/Cotabby/UI/Settings/SettingsContainerView.swift b/Cotabby/UI/Settings/SettingsContainerView.swift index 9300aacc..0059d760 100644 --- a/Cotabby/UI/Settings/SettingsContainerView.swift +++ b/Cotabby/UI/Settings/SettingsContainerView.swift @@ -21,6 +21,7 @@ struct SettingsContainerView: View { @ObservedObject var modelDownloadManager: ModelDownloadManager @ObservedObject var huggingFaceSearchService: HuggingFaceSearchService @ObservedObject var performanceMetricsStore: PerformanceMetricsStore + @ObservedObject var qualityMetricsStore: SuggestionQualityMetricsStore @ObservedObject var systemMetricsStore: SystemMetricsStore let onShowWelcome: () -> Void @@ -130,6 +131,7 @@ struct SettingsContainerView: View { PerformancePaneView( suggestionSettings: suggestionSettings, performanceMetricsStore: performanceMetricsStore, + qualityMetricsStore: qualityMetricsStore, systemMetricsStore: systemMetricsStore ) case .about: diff --git a/Cotabby/UI/Settings/SettingsIndex.swift b/Cotabby/UI/Settings/SettingsIndex.swift index ef13b543..9865d002 100644 --- a/Cotabby/UI/Settings/SettingsIndex.swift +++ b/Cotabby/UI/Settings/SettingsIndex.swift @@ -71,6 +71,7 @@ enum SettingsItem: String, CaseIterable, Identifiable { case screenRecording // Performance case performanceTracking + case suggestionQualityStats case resourceUsage case recentRequests // About @@ -137,6 +138,7 @@ enum SettingsItem: String, CaseIterable, Identifiable { case .inputMonitoring: return "Input Monitoring" case .screenRecording: return "Screen Recording" case .performanceTracking: return "Enable Performance Tracking" + case .suggestionQualityStats: return "Suggestion Quality" case .resourceUsage: return "Live Resource Usage" case .recentRequests: return "Recent Requests" case .checkForUpdates: return "Check for Updates" @@ -202,6 +204,7 @@ enum SettingsItem: String, CaseIterable, Identifiable { case .inputMonitoring: return "keyboard" case .screenRecording: return "camera.viewfinder" case .performanceTracking: return "stopwatch" + case .suggestionQualityStats: return "checkmark.seal" case .resourceUsage: return "chart.line.uptrend.xyaxis" case .recentRequests: return "list.bullet.clipboard" case .checkForUpdates: return "arrow.triangle.2.circlepath" @@ -238,7 +241,7 @@ enum SettingsItem: String, CaseIterable, Identifiable { return .apps case .accessibility, .inputMonitoring, .screenRecording: return .permissions - case .performanceTracking, .resourceUsage, .recentRequests: + case .performanceTracking, .suggestionQualityStats, .resourceUsage, .recentRequests: return .performance case .checkForUpdates, .support, .githubRepository, .wiki, .acknowledgements, .uninstall: @@ -409,6 +412,9 @@ enum SettingsItem: String, CaseIterable, Identifiable { case .performanceTracking: return ["performance", "tracking", "latency", "metrics", "timing", "telemetry", "analytics", "diagnostics", "measure"] + case .suggestionQualityStats: + return ["quality", "acceptance", "accepted", "shown", "suppressed", "withheld", + "rate", "stats", "counters", "suggestions"] case .resourceUsage: return ["cpu", "memory", "ram", "usage", "resource", "graph", "chart", "live", "load", "monitor"] diff --git a/CotabbyTests/DebouncePolicyTests.swift b/CotabbyTests/DebouncePolicyTests.swift new file mode 100644 index 00000000..bcb1afbc --- /dev/null +++ b/CotabbyTests/DebouncePolicyTests.swift @@ -0,0 +1,24 @@ +import XCTest +@testable import Cotabby + +final class DebouncePolicyTests: XCTestCase { + func testNoLatencyDataUsesFallback() { + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: nil, fallback: 20), 20) + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 0, fallback: 20), 20) + } + + func testFastGenerationsGetTheShortDebounce() { + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 45, fallback: 20), 15) + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 70, fallback: 20), 15) + } + + func testMediumGenerationsGetTheMiddleDebounce() { + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 71, fallback: 20), 25) + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 140, fallback: 20), 25) + } + + func testSlowGenerationsBackOff() { + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 141, fallback: 20), 55) + XCTAssertEqual(DebouncePolicy.milliseconds(lastGenerationLatencyMilliseconds: 900, fallback: 20), 55) + } +} diff --git a/CotabbyTests/LlamaDecodeGateDefaultsTests.swift b/CotabbyTests/LlamaDecodeGateDefaultsTests.swift new file mode 100644 index 00000000..90ba2da3 --- /dev/null +++ b/CotabbyTests/LlamaDecodeGateDefaultsTests.swift @@ -0,0 +1,46 @@ +import Foundation +import XCTest +@testable import Cotabby + +/// Tests the `defaults write` escape hatches for the decode gates against an isolated suite, so +/// the confidence floor and the argmax-EOG stop are provably adjustable in the field without a +/// rebuild (and without touching process-global defaults from the test host). +@MainActor +final class LlamaDecodeGateDefaultsTests: XCTestCase { + private let suiteName = "LlamaDecodeGateDefaultsTests" + private var defaults: UserDefaults! + + override func setUp() { + super.setUp() + defaults = UserDefaults(suiteName: suiteName) + defaults.removePersistentDomain(forName: suiteName) + } + + override func tearDown() { + defaults.removePersistentDomain(forName: suiteName) + defaults = nil + super.tearDown() + } + + func test_confidenceFloor_defaultsToShippedValue() { + XCTAssertEqual( + LlamaSuggestionEngine.resolvedConfidenceFloor(defaults), + LlamaSuggestionEngine.defaultConfidenceFloor + ) + } + + func test_confidenceFloor_overrideWins_includingDisable() { + defaults.set(-0.8, forKey: LlamaSuggestionEngine.confidenceFloorOverrideKey) + XCTAssertEqual(LlamaSuggestionEngine.resolvedConfidenceFloor(defaults), -0.8) + + defaults.set(-Double.infinity, forKey: LlamaSuggestionEngine.confidenceFloorOverrideKey) + XCTAssertEqual(LlamaSuggestionEngine.resolvedConfidenceFloor(defaults), -.infinity) + } + + func test_argmaxStop_onByDefault_andDisableToggleWorks() { + XCTAssertTrue(LlamaSuggestionEngine.resolvedStopAtArgmaxEOG(defaults)) + + defaults.set(true, forKey: LlamaSuggestionEngine.argmaxStopDisabledKey) + XCTAssertFalse(LlamaSuggestionEngine.resolvedStopAtArgmaxEOG(defaults)) + } +} diff --git a/CotabbyTests/LlamaSuggestionEngineCancellationTests.swift b/CotabbyTests/LlamaSuggestionEngineCancellationTests.swift index 336c6ed0..ba7dcd41 100644 --- a/CotabbyTests/LlamaSuggestionEngineCancellationTests.swift +++ b/CotabbyTests/LlamaSuggestionEngineCancellationTests.swift @@ -53,7 +53,7 @@ final class LlamaSuggestionEngineCancellationTests: XCTestCase { func test_successfulGeneration_doesNotResetCache() async throws { let runtime = FakeLlamaRuntime() - runtime.generateResult = .success("world") + runtime.generateResult = .success(.text("world")) let engine = LlamaSuggestionEngine(runtimeManager: runtime) let result = try await engine.generateSuggestion(for: makeRequest(prompt: "hello ")) @@ -62,6 +62,24 @@ final class LlamaSuggestionEngineCancellationTests: XCTestCase { XCTAssertEqual(runtime.resetCount, 0) } + func test_lowConfidenceSuppression_isAttributedAsLowConfidence() async throws { + let runtime = FakeLlamaRuntime() + runtime.generateResult = .success( + LlamaGenerationOutput(text: "", averageLogprob: -5.2, suppressedByLowConfidence: true) + ) + let engine = LlamaSuggestionEngine(runtimeManager: runtime) + + let result = try await engine.generateSuggestion(for: makeRequest(prompt: "hello ")) + + XCTAssertEqual(result.text, "") + XCTAssertEqual( + result.suppressionReason, + CompletionSuppressionReason.lowConfidence.rawValue, + "a runtime-withheld completion must not read as 'the model produced nothing'" + ) + XCTAssertEqual(runtime.resetCount, 0) + } + func test_suggestionClientError_resetsCache_andRethrowsSameError() async { // A `SuggestionClientError` crossing the runtime boundary is a genuine failure, so it must // reset the cache but keep its original case and message for the coordinator's diagnostics. @@ -168,14 +186,14 @@ private struct UnexpectedRuntimeBoom: LocalizedError { /// so the engine's failure routing can be exercised without loading a real model. @MainActor private final class FakeLlamaRuntime: LlamaRuntimeGenerating { - var generateResult: Result = .success("") + var generateResult: Result = .success(.text("")) private(set) var resetCount = 0 func generate( prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { try generateResult.get() } diff --git a/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift b/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift index f66c4f57..c351779f 100644 --- a/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift +++ b/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift @@ -106,7 +106,7 @@ final class LlamaSuggestionEnginePrewarmTests: XCTestCase { @MainActor private final class RecordingPrewarmRuntime: LlamaRuntimeGenerating { var prefillError: Error? - var generateResult: Result = .success("ok") + var generateResult: Result = .success(.text("ok")) private(set) var prefillPrompts: [String] = [] private(set) var generateCachedPrefixBytes: [Int?] = [] @@ -114,7 +114,7 @@ private final class RecordingPrewarmRuntime: LlamaRuntimeGenerating { prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { generateCachedPrefixBytes.append(cachedPrefixBytes) return try generateResult.get() } diff --git a/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift b/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift index fc138f70..4e28451e 100644 --- a/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift +++ b/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift @@ -107,8 +107,8 @@ private final class StreamingFakeRuntime: LlamaRuntimeGenerating { prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions - ) async throws -> String { - finalText + ) async throws -> LlamaGenerationOutput { + .text(finalText) } func generate( @@ -116,12 +116,12 @@ private final class StreamingFakeRuntime: LlamaRuntimeGenerating { cachedPrefixBytes: Int?, options: LlamaGenerationOptions, onPartialRawText: (@Sendable (String) -> Void)? - ) async throws -> String { + ) async throws -> LlamaGenerationOutput { streamingCallCount += 1 for partial in partialRawTexts { onPartialRawText?(partial) } - return finalText + return .text(finalText) } func resetPromptCache() {} diff --git a/CotabbyTests/PromptPolicyTests.swift b/CotabbyTests/PromptPolicyTests.swift index ef23a211..bfdb5f5a 100644 --- a/CotabbyTests/PromptPolicyTests.swift +++ b/CotabbyTests/PromptPolicyTests.swift @@ -308,6 +308,7 @@ final class SuggestionEngineRouterTests: XCTestCase { foundationModelEngine: appleEngine, llamaEngine: openSourceEngine, performanceMetricsStore: PerformanceMetricsStore(userDefaults: makeUserDefaults()), + qualityMetricsStore: SuggestionQualityMetricsStore(userDefaults: makeUserDefaults()), llamaModelNameProvider: { nil } ) diff --git a/CotabbyTests/SuggestionCoordinatorAcceptanceTests.swift b/CotabbyTests/SuggestionCoordinatorAcceptanceTests.swift index efaafafa..3b701689 100644 --- a/CotabbyTests/SuggestionCoordinatorAcceptanceTests.swift +++ b/CotabbyTests/SuggestionCoordinatorAcceptanceTests.swift @@ -312,6 +312,9 @@ final class SuggestionCoordinatorAcceptanceTests: XCTestCase { configuration: .standard, spellChecker: CurrentWordSpellChecker(), symSpellCorrector: SymSpellCorrector(preloadLanguage: nil), + qualityMetricsStore: SuggestionQualityMetricsStore( + userDefaults: UserDefaults(suiteName: "CotabbyTests.quality.\(UUID().uuidString)") ?? .standard + ), userDefaults: UserDefaults(suiteName: "CotabbyTests.\(UUID().uuidString)") ?? .standard ) Self.retainedCoordinators.append(coordinator) diff --git a/CotabbyTests/SuggestionCoordinatorTestSupport.swift b/CotabbyTests/SuggestionCoordinatorTestSupport.swift index 37a13b57..42953014 100644 --- a/CotabbyTests/SuggestionCoordinatorTestSupport.swift +++ b/CotabbyTests/SuggestionCoordinatorTestSupport.swift @@ -245,6 +245,9 @@ func makeCoordinatorRig( configuration: .standard, spellChecker: CurrentWordSpellChecker(), symSpellCorrector: SymSpellCorrector(preloadLanguage: nil), + qualityMetricsStore: SuggestionQualityMetricsStore( + userDefaults: UserDefaults(suiteName: "CotabbyTests.rig.quality.\(UUID().uuidString)") ?? .standard + ), userDefaults: UserDefaults(suiteName: "CotabbyTests.rig.\(UUID().uuidString)") ?? .standard ) return CoordinatorRig( diff --git a/CotabbyTests/SuggestionEngineRouterTests.swift b/CotabbyTests/SuggestionEngineRouterTests.swift index 75e4f395..156d8fa9 100644 --- a/CotabbyTests/SuggestionEngineRouterTests.swift +++ b/CotabbyTests/SuggestionEngineRouterTests.swift @@ -67,6 +67,7 @@ final class SuggestionEngineRouterRoutingTests: XCTestCase { foundationModelEngine: foundation, llamaEngine: llama, performanceMetricsStore: metrics, + qualityMetricsStore: SuggestionQualityMetricsStore(userDefaults: defaults), llamaModelNameProvider: { llamaModelName } ) Self.retained.append(contentsOf: [router, settings, metrics] as [AnyObject]) diff --git a/CotabbyTests/SuggestionQualityMetricsStoreTests.swift b/CotabbyTests/SuggestionQualityMetricsStoreTests.swift new file mode 100644 index 00000000..112aa8e8 --- /dev/null +++ b/CotabbyTests/SuggestionQualityMetricsStoreTests.swift @@ -0,0 +1,59 @@ +import XCTest +@testable import Cotabby + +@MainActor +final class SuggestionQualityMetricsStoreTests: XCTestCase { + private func freshDefaults() -> UserDefaults { + UserDefaults(suiteName: "CotabbyTests.qualityMetrics.\(UUID().uuidString)") ?? .standard + } + + func testCountersAccumulate() { + let store = SuggestionQualityMetricsStore(userDefaults: freshDefaults()) + store.recordGenerated() + store.recordGenerated() + store.recordShown() + store.recordAcceptedSuggestion() + store.recordSuppressed(reason: "lowConfidence") + store.recordSuppressed(reason: "lowConfidence") + store.recordSuppressed(reason: "seamMisspelling") + + XCTAssertEqual(store.counters.generated, 2) + XCTAssertEqual(store.counters.shown, 1) + XCTAssertEqual(store.counters.acceptedSuggestions, 1) + XCTAssertEqual(store.counters.suppressedByReason["lowConfidence"], 2) + XCTAssertEqual(store.counters.suppressedByReason["seamMisspelling"], 1) + XCTAssertEqual(store.counters.suppressedTotal, 3) + XCTAssertNotNil(store.counters.firstRecordedAt) + } + + func testAcceptanceRate() { + let store = SuggestionQualityMetricsStore(userDefaults: freshDefaults()) + XCTAssertNil(store.counters.acceptanceRate, "no rate without shown suggestions") + store.recordShown() + store.recordShown() + store.recordShown() + store.recordShown() + store.recordAcceptedSuggestion() + XCTAssertEqual(store.counters.acceptanceRate ?? 0, 0.25, accuracy: 0.0001) + } + + func testPersistsAcrossInstances() { + let defaults = freshDefaults() + let first = SuggestionQualityMetricsStore(userDefaults: defaults) + first.recordShown() + first.recordSuppressed(reason: "emptyGeneration") + + let second = SuggestionQualityMetricsStore(userDefaults: defaults) + XCTAssertEqual(second.counters.shown, 1) + XCTAssertEqual(second.counters.suppressedByReason["emptyGeneration"], 1) + } + + func testResetClearsEverything() { + let defaults = freshDefaults() + let store = SuggestionQualityMetricsStore(userDefaults: defaults) + store.recordShown() + store.reset() + XCTAssertEqual(store.counters, SuggestionQualityMetricsStore.Counters()) + XCTAssertEqual(SuggestionQualityMetricsStore(userDefaults: defaults).counters.shown, 0) + } +}