From 46723602f821c7bd3fc0ef9f28b2788f9dd40347 Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Thu, 11 Jun 2026 22:36:57 -0700
Subject: [PATCH] fix(decode): ship the confidence floor OFF by default (#688
 regression)

PR #688 enabled a -1.5 mean-logprob confidence floor by default. It won an
offline eval sweep, but that golden set under-represented real typing:
production logs show the same floor withholding ~56% of completions, most of
them perfectly usable, and under streaming it paints a partial suggestion and
then clears it (suggestions appear to flicker away). It also forces per-token
logprob computation, adding latency.

Set defaultConfidenceFloor to -.infinity so the gate and its logprob cost are
both off unless opted into via cotabbyConfidenceFloorOverride. The eval harness
sets that key to measure with the gate on. Recalibrate against a representative
real-usage distribution before re-enabling by default.
---
 .../Runtime/LlamaSuggestionEngine.swift         | 17 +++++++++++------
 CotabbyTests/LlamaDecodeGateDefaultsTests.swift |  8 ++++++++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
index b090ddc9..eb5c4454 100644
--- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
+++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
@@ -21,12 +21,17 @@ final class LlamaSuggestionEngine {
     }
 
     /// Shipped confidence floor (mean per-token log-probability). -infinity disables the gate AND
-    /// the per-token logprob computation behind it; any other value turns both on. -1.5 came from
-    /// an eval sweep over {off, -4, -3, -2.5, -2, -1.5, -1, -0.5, -0.3}: floors at or below -2
-    /// never fire on this model at temperature 0.1, -1 and tighter buy precision at a brutal
-    /// coverage cost, and -1.5 is the unique point where the composite quality score rose
-    /// (0.734 to 0.744), wrong-shows fell 27% relative, and not a single must-show case was lost.
-    static let defaultConfidenceFloor: Double = -1.5
+    /// the per-token logprob computation behind it; any other value turns both on.
+    ///
+    /// Shipped OFF. A floor of -1.5 won an offline eval sweep (composite quality 0.734 to 0.744,
+    /// wrong-shows down 27%, no must-show case lost), but that golden set badly under-represented
+    /// real free-form typing: in production logs the same -1.5 floor withheld ~56% of completions,
+    /// most of them perfectly usable (passed and suppressed completions sat on either side of the
+    /// threshold with no quality difference). Under streaming the gate also paints a partial and
+    /// then clears it, which reads as suggestions flickering away. Until the floor is recalibrated
+    /// against a representative real-usage distribution, it stays opt-in via
+    /// `cotabbyConfidenceFloorOverride`; the eval harness sets that key to measure with the gate on.
+    static let defaultConfidenceFloor: Double = -.infinity
     /// `defaults write` escape hatches for dogfooding and field diagnosis without a rebuild.
     static let confidenceFloorOverrideKey = "cotabbyConfidenceFloorOverride"
     static let argmaxStopDisabledKey = "cotabbyArgmaxStopDisabled"
diff --git a/CotabbyTests/LlamaDecodeGateDefaultsTests.swift b/CotabbyTests/LlamaDecodeGateDefaultsTests.swift
index 90ba2da3..4c5cabe9 100644
--- a/CotabbyTests/LlamaDecodeGateDefaultsTests.swift
+++ b/CotabbyTests/LlamaDecodeGateDefaultsTests.swift
@@ -29,6 +29,14 @@ final class LlamaDecodeGateDefaultsTests: XCTestCase {
         )
     }
 
+    /// The gate ships OFF: a -1.5 floor withheld ~56% of real completions, so confidence
+    /// suppression is opt-in until recalibrated against real usage. -infinity also turns off the
+    /// per-token logprob computation, so this regression test pins both coverage and latency.
+    func test_confidenceFloor_shippedOff_byDefault() {
+        XCTAssertEqual(LlamaSuggestionEngine.defaultConfidenceFloor, -.infinity)
+        XCTAssertEqual(LlamaSuggestionEngine.resolvedConfidenceFloor(defaults), -.infinity)
+    }
+
     func test_confidenceFloor_overrideWins_includingDisable() {
         defaults.set(-0.8, forKey: LlamaSuggestionEngine.confidenceFloorOverrideKey)
         XCTAssertEqual(LlamaSuggestionEngine.resolvedConfidenceFloor(defaults), -0.8)