From 46723602f821c7bd3fc0ef9f28b2788f9dd40347 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 22:36:57 -0700 Subject: [PATCH] fix(decode): ship the confidence floor OFF by default (#688 regression) #688 enabled a -1.5 mean-logprob confidence floor by default. It won an offline eval sweep, but that golden set under-represented real typing: production logs show the same floor withholding ~56% of completions, most of them perfectly usable, and under streaming it paints a partial then clears it (suggestions appear to flicker away). It also forces per-token logprob computation, adding latency. Set defaultConfidenceFloor to -.infinity so the gate (and its logprob cost) are off unless opted into via cotabbyConfidenceFloorOverride. The eval harness sets that key to measure with the gate on. Recalibrate against a representative real-usage distribution before re-enabling by default. --- .../Runtime/LlamaSuggestionEngine.swift | 17 +++++++++++------ CotabbyTests/LlamaDecodeGateDefaultsTests.swift | 8 ++++++++ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift index b090ddc9..eb5c4454 100644 --- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift @@ -21,12 +21,17 @@ final class LlamaSuggestionEngine { } /// Shipped confidence floor (mean per-token log-probability). -infinity disables the gate AND - /// the per-token logprob computation behind it; any other value turns both on. -1.5 came from - /// an eval sweep over {off, -4, -3, -2.5, -2, -1.5, -1, -0.5, -0.3}: floors at or below -2 - /// never fire on this model at temperature 0.1, -1 and tighter buy precision at a brutal - /// coverage cost, and -1.5 is the unique point where the composite quality score rose - /// (0.734 to 0.744), wrong-shows fell 27% relative, and not a single must-show case was lost. - static let defaultConfidenceFloor: Double = -1.5 + /// the per-token logprob computation behind it; any other value turns both on. + /// + /// Shipped OFF. A floor of -1.5 won an offline eval sweep (composite quality 0.734 to 0.744, + /// wrong-shows down 27%, no must-show case lost), but that golden set badly under-represented + /// real free-form typing: in production logs the same -1.5 floor withheld ~56% of completions, + /// most of them perfectly usable (passed and suppressed completions sat on either side of the + /// threshold with no quality difference). Under streaming the gate also paints a partial and + /// then clears it, which reads as suggestions flickering away. Until the floor is recalibrated + /// against a representative real-usage distribution, it stays opt-in via + /// `cotabbyConfidenceFloorOverride`; the eval harness sets that key to measure with the gate on. + static let defaultConfidenceFloor: Double = -.infinity /// `defaults write` escape hatches for dogfooding and field diagnosis without a rebuild. static let confidenceFloorOverrideKey = "cotabbyConfidenceFloorOverride" static let argmaxStopDisabledKey = "cotabbyArgmaxStopDisabled" diff --git a/CotabbyTests/LlamaDecodeGateDefaultsTests.swift b/CotabbyTests/LlamaDecodeGateDefaultsTests.swift index 90ba2da3..4c5cabe9 100644 --- a/CotabbyTests/LlamaDecodeGateDefaultsTests.swift +++ b/CotabbyTests/LlamaDecodeGateDefaultsTests.swift @@ -29,6 +29,14 @@ final class LlamaDecodeGateDefaultsTests: XCTestCase { ) } + /// The gate ships OFF: a -1.5 floor withheld ~56% of real completions, so confidence + /// suppression is opt-in until it is recalibrated against real usage. -infinity also turns off + /// the per-token logprob computation, so this lock guards both the coverage and the latency. + func test_confidenceFloor_shippedOff_byDefault() { + XCTAssertEqual(LlamaSuggestionEngine.defaultConfidenceFloor, -.infinity) + XCTAssertEqual(LlamaSuggestionEngine.resolvedConfidenceFloor(defaults), -.infinity) + } + func test_confidenceFloor_overrideWins_includingDisable() { defaults.set(-0.8, forKey: LlamaSuggestionEngine.confidenceFloorOverrideKey) XCTAssertEqual(LlamaSuggestionEngine.resolvedConfidenceFloor(defaults), -0.8)