From 495044c4176a26c3f6b3b5007a10da1ef4bcd1b2 Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Thu, 11 Jun 2026 17:43:13 -0700
Subject: [PATCH] Decode-quality primitives: scaffolding-token mask and
 argmax-is-EOG stop signal

Two low-overhead additions for autocomplete decode quality (the mask is built
once at model load; the stop signal costs one O(vocab) pass per token):

- buildTokenMasks now probes each token's special-rendered piece and hard-masks
  single-token chat/instruct/FIM scaffolding (<|im_end|>, <start_of_turn>, [INST],
  FIM families) that the GGUF did not flag as a control token, plus an unflagged
  BOS. EOG tokens stay exempt so natural stops keep firing. Well-formed GGUFs
  already flag these as control, so the common count is 0; the rule is insurance
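  against vocabularies that ship them unflagged. The number of tokens masked by
  the piece rule (an unflagged BOS is masked separately and is not counted) is
  exposed via getMaskedScaffoldingTokenCount for tests/diagnostics.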

- SampleResult gains argmax_is_eog: whether the raw distribution's single
  most-likely token at this position is an end-of-generation token. Stochastic
  sampling can draw past the point where the model wants to stop; this lets
  callers detect that stop intent on the exact step it appears. Computed in C++
  while the logits row is hot (one O(vocab) pass per token, tens of microseconds);
  the seed token's verdict is captured at decodePrompt while its row is still
  resident. Field is appended, so existing Swift call sites that only read
  members keep compiling; SamplingConfig is untouched.
---
 .../CotabbyInferenceEngine.cpp                | 91 +++++++++++++++++-
 .../include/CotabbyInferenceEngine.h          | 11 +++
 .../LlamaMiddlewareTests.swift                | 93 +++++++++++++++++++
 3 files changed, 193 insertions(+), 2 deletions(-)
diff --git a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp
index 57353e3..d871596 100644
--- a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp
+++ b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp
@@ -10,6 +10,7 @@
 #include <mutex>
 #include <random>
 #include <string>
+#include <string_view>
 #include <thread>
 #include <unordered_map>
 #include <vector>
@@ -23,6 +24,36 @@
 
 static void silenced_log_callback(ggml_log_level, const char*, void*) {}
 
+// Reports whether `piece` (its first `length` bytes; null/empty -> false) is single-token
+// chat/instruct/FIM scaffolding that must never surface in autocomplete text. Most GGUFs flag
+// these as control tokens (masked by the base rule); this catches vocabularies that ship them
+// as ordinary text. The caller passes the special-rendered piece and exempts EOG-flagged tokens
+// so the natural stop check keeps firing. Exact match, or prefix match for the FIM/repo families.
+static bool isScaffoldingMarkerPiece(const char* piece, int length) {
+    if (!piece || length <= 0) return false;
+    const std::string_view view(piece, static_cast<size_t>(length));
+    static constexpr std::string_view exact_markers[] = {  // must equal the whole piece
+        "<|im_start|>", "<|im_end|>",
+        "<|user|>", "<|assistant|>", "<|system|>",
+        "<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>",
+        "<|end|>", "<|endoftext|>",
+        "<start_of_turn>", "<end_of_turn>",
+        "[INST]", "[/INST]",
+    };
+    for (const auto marker : exact_markers) {
+        if (view == marker) return true;
+    }
+    static constexpr std::string_view prefix_markers[] = {  // piece need only start with one
+        "<|fim_", "<fim_", "<|file_sep", "<|repo_name",
+    };
+    for (const auto prefix : prefix_markers) {
+        if (view.size() >= prefix.size() && view.substr(0, prefix.size()) == prefix) {
+            return true;
+        }
+    }
+    return false;
+}
+
 // Decode threads should match the *performance* core count, not the logical core count.
 // llama.cpp's CPU work is a per-layer parallel matmul with a barrier at each layer: schedule any
 // of those threads onto efficiency cores and every P-core finishes early only to stall at the
@@ -96,6 +127,10 @@ struct SequenceState {
     // Log-probability of the seed token, computed at decodePrompt and returned with the seed.
     float seed_logprob = 0.0f;
 
+    // argmax-is-EOG verdict for the seed token's logits row, captured at decodePrompt while
+    // those logits are still resident; the row is gone by the time sampleNext returns the seed.
+    bool seed_argmax_is_eog = false;
+
     ~SequenceState() {
         if (sampler) { llama_sampler_free(sampler); }
     }
@@ -114,7 +149,8 @@ struct SequenceState {
           has_pending_input(o.has_pending_input),
           force_word_continuation(o.force_word_continuation),
           compute_logprob(o.compute_logprob),
-          seed_logprob(o.seed_logprob) {
+          seed_logprob(o.seed_logprob),
+          seed_argmax_is_eog(o.seed_argmax_is_eog) {
         o.sampler = nullptr;
     }
     SequenceState& operator=(SequenceState&&) = delete;
@@ -182,6 +218,10 @@ struct CotabbyInferenceEngine::Impl {
     std::vector<llama_logit_bias> nonprintable_bias;
     std::vector<llama_logit_bias> linebreak_bias;
     std::vector<bool> starts_new_word;
+    // How many of the nonprintable entries came from the scaffolding-piece rule rather than
+    // the control/unknown/unused attributes. Surfaced via getMaskedScaffoldingTokenCount so
+    // tests and diagnostics can confirm the rule's reach on a given vocabulary.
+    int scaffolding_masked_count = 0;
 
     // Public-facing sequence map (external int32_t IDs → state) and the
     // internal `llama_seq_id` slot allocator.
@@ -299,11 +339,16 @@ struct CotabbyInferenceEngine::Impl {
         nonprintable_bias.clear();
         linebreak_bias.clear();
         starts_new_word.clear();
+        scaffolding_masked_count = 0;
         if (!vocab) return;
 
         const int32_t n = llama_vocab_n_tokens(vocab);
         starts_new_word.assign(static_cast<size_t>(n), false);
 
+        // BOS belongs at sequence start only; some vocabularies ship it without the control
+        // attribute, which would otherwise let it be sampled mid-text.
+        const llama_token bos_token = llama_vocab_bos(vocab);
+
         char piece[64];
         for (llama_token t = 0; t < n; ++t) {
             const bool is_eog = llama_vocab_is_eog(vocab, t);
@@ -314,7 +359,20 @@ struct CotabbyInferenceEngine::Impl {
                 const enum llama_token_attr attr = llama_vocab_get_attr(vocab, t);
                 const bool junk_attr =
                     (attr & (LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_UNUSED)) != 0;
-                if (llama_vocab_is_control(vocab, t) || junk_attr) {
+                bool masked = llama_vocab_is_control(vocab, t) || junk_attr || t == bos_token;
+                if (!masked) {
+                    // Probe with special rendering: control-style markers decode to an empty
+                    // piece under the plain rendering used below, so the scaffolding rule must
+                    // look at the special-rendered text instead.
+                    const int special_written =
+                        llama_token_to_piece(vocab, t, piece, sizeof(piece), 0, true);
+                    if (special_written > 0 &&
+                        isScaffoldingMarkerPiece(piece, special_written)) {
+                        masked = true;
+                        ++scaffolding_masked_count;
+                    }
+                }
+                if (masked) {
                     nonprintable_bias.push_back({ t, -INFINITY });
                 }
             }
@@ -374,6 +432,25 @@ struct CotabbyInferenceEngine::Impl {
         );
     }
 
+    // Whether the raw logits row `logits_row` has its highest logit on an end-of-generation
+    // token (false if the context, vocab, or row is unavailable). One O(vocab) pass over the row
+    // just sampled from; the sampler chain works on a copied candidate array, so it is unmutated.
+    bool argmaxIsEOG(int logits_row) const {
+        if (!shared_ctx || !vocab) return false;
+        const float* logits = llama_get_logits_ith(shared_ctx, logits_row);
+        if (!logits) return false;
+        const int32_t n = llama_vocab_n_tokens(vocab);
+        llama_token argmax = 0;  // remains token 0 if no logit beats -inf
+        float best = -INFINITY;
+        for (llama_token t = 0; t < n; ++t) {
+            if (logits[t] > best) {  // strict '>': ties keep the lowest token id
+                best = logits[t];
+                argmax = t;
+            }
+        }
+        return llama_vocab_is_eog(vocab, argmax) || argmax == llama_vocab_eos(vocab);
+    }
+
     void destroyAllSequences() {
         std::lock_guard<std::mutex> lock(sequences_mutex);
         for (auto& [id, seq] : sequences) {
@@ -481,6 +558,10 @@ struct CotabbyInferenceEngine::Impl {
                 llama_token next = llama_sampler_sample(
                     req.sampler, shared_ctx, i
                 );
+                // Computed on the raw logits row after sampling: llama_sampler_sample applies
+                // the chain to a copied candidate array, so row i is still the model's
+                // unbiased distribution here.
+                r.argmax_is_eog = argmaxIsEOG(i);
                 if (next == llama_vocab_eos(vocab) ||
                     llama_vocab_is_eog(vocab, next)) {
                     r.token = next;
@@ -902,6 +983,7 @@ EngineStatus CotabbyInferenceEngine::decodePrompt(int32_t sequence_id,
     llama_sampler_accept(seq->sampler, seed);
     seq->seed_token = seed;
     seq->seed_logprob = seq->compute_logprob ? impl_->computeLogprob(-1, seed) : 0.0f;
+    seq->seed_argmax_is_eog = impl_->argmaxIsEOG(-1);
     seq->has_seed_token = true;
     seq->has_pending_input = false;
 
@@ -946,6 +1028,7 @@ SampleResult CotabbyInferenceEngine::sampleNext(int32_t sequence_id) {
     if (seq->has_seed_token) {
         llama_token next = seq->seed_token;
         seq->has_seed_token = false;
+        result.argmax_is_eog = seq->seed_argmax_is_eog;
 
         if (next == llama_vocab_eos(impl_->vocab) ||
             llama_vocab_is_eog(impl_->vocab, next)) {
@@ -1231,3 +1314,7 @@ int CotabbyInferenceEngine::getThreadCount() const {
 int CotabbyInferenceEngine::getGPULayerCount() const {
     return impl_->gpu_layer_count;
 }
+
+int CotabbyInferenceEngine::getMaskedScaffoldingTokenCount() const {
+    return impl_->scaffolding_masked_count;  // piece-rule matches only; 0 until a vocab is scanned
+}
diff --git a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h
index 4550877..8da29de 100644
--- a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h
+++ b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h
@@ -27,6 +27,12 @@ struct SWIFT_SELF_CONTAINED SampleResult {
     // Log-probability of the chosen token under the raw model distribution (<= 0). Used as a
     // confidence signal; 0 for the EOS/cancelled cases where it carries no meaning.
     float logprob;
+    // True when the single most-likely token of the raw distribution this token was sampled
+    // from is an end-of-generation token. Stochastic sampling can draw past the point where
+    // the model wants to stop; this flag lets callers detect that stop intent on the very
+    // step it appears, even when the sampled token is something else. Appended after the
+    // existing fields so Swift call sites that only read members keep compiling.
+    bool argmax_is_eog;
 };
 
 enum class EngineStatus : int {
@@ -168,6 +174,11 @@ class CotabbyInferenceEngine {
     int getBatchSize() const;
     int getThreadCount() const;
     int getGPULayerCount() const;
+    // Number of vocabulary tokens that were hard-masked at model load because their rendered
+    // piece is chat/instruct/FIM scaffolding that the GGUF did not flag as a control token.
+    // 0 is the common (and healthy) case: well-formed GGUFs flag these as control already,
+    // and control tokens are masked by the base rule rather than counted here.
+    int getMaskedScaffoldingTokenCount() const;
 
 private:
     struct Impl;
diff --git a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift
index 138d8d0..53dfaa2 100644
--- a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift
+++ b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift
@@ -509,4 +509,97 @@ final class LlamaMiddlewareTests: XCTestCase {
         }
         engine.destroySequence(seq)
     }
+
+    func testMaskedScaffoldingCountDefaultsToZero() {
+        let engine = CotabbyInferenceEngine()  // fresh engine: loadModel is never called
+        XCTAssertEqual(engine.getMaskedScaffoldingTokenCount(), 0)
+    }
+
+    // Single-token chat/template markers must never surface as sampled text, whether flagged as
+    // control tokens (base rule) or shipped unflagged (scaffolding rule): no piece sampled at high
+    // temperature from a template-bait prompt may equal a marker (exact markers only, not FIM).
+    // NOTE(review): the `>= 0` count assertion below cannot fail; it only exercises the getter.
+    func testSamplingNeverEmitsScaffoldingMarkerPieces() throws {
+        guard let modelPath = ProcessInfo.processInfo.environment["COTABBY_TEST_MODEL_PATH"] else {
+            try XCTSkipIf(true, "Set COTABBY_TEST_MODEL_PATH to a .gguf file to run this test")
+            return
+        }
+        var engine = CotabbyInferenceEngine()
+        XCTAssertEqual(engine.loadModel(modelPath, -1, 1024, 256), EngineStatus.ok)
+        defer { engine.unloadModel() }
+        XCTAssertGreaterThanOrEqual(engine.getMaskedScaffoldingTokenCount(), 0)
+
+        let markers: Set<String> = [
+            "<|im_start|>", "<|im_end|>", "<|user|>", "<|assistant|>", "<|system|>",
+            "<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>", "<|end|>",
+            "<|endoftext|>", "<start_of_turn>", "<end_of_turn>", "[INST]", "[/INST]"
+        ]
+        let config = SamplingConfig(
+            max_prediction_tokens: 64, temperature: 1.8,
+            top_k: 0, top_p: 0, min_p: 0,
+            repetition_penalty: 1.0, seed: 7,
+            single_line: false
+        )
+        let seq = engine.createSequence(config)
+        let prompt = "<|im_start|>user\nWrite a reply<|im_end|>\n<|im_start|>assistant\n"
+        var tokens = Array(engine.tokenize(prompt, Int32(prompt.utf8.count)))
+        XCTAssertEqual(engine.decodePrompt(seq, &tokens, Int32(tokens.count), 0), EngineStatus.ok)
+
+        for _ in 0..<64 {
+            let result = engine.sampleNext(seq)
+            if result.is_eos || result.was_cancelled { break }
+            guard let piecePointer = result.piece else { continue }
+            let piece = String(
+                bytes: UnsafeRawBufferPointer(start: piecePointer, count: Int(result.piece_length)),
+                encoding: .utf8
+            ) ?? ""  // invalid UTF-8 decodes to "" and never matches
+            XCTAssertFalse(
+                markers.contains(piece),
+                "Sampled a complete scaffolding marker piece: \(piece)"
+            )
+        }
+        engine.destroySequence(seq)
+    }
+
+    // Under greedy sampling with penalties disabled, the sampled token is expected to be the
+    // raw argmax, so `argmax_is_eog` should agree with `isEndOfGenerationToken(sampled token)` on
+    // every step. This pins the flag's semantics without needing the model to reach a natural EOS.
+    func testArgmaxIsEOGMatchesGreedyChoice() throws {
+        guard let modelPath = ProcessInfo.processInfo.environment["COTABBY_TEST_MODEL_PATH"] else {
+            try XCTSkipIf(true, "Set COTABBY_TEST_MODEL_PATH to a .gguf file to run this test")
+            return
+        }
+        var engine = CotabbyInferenceEngine()
+        XCTAssertEqual(engine.loadModel(modelPath, -1, 1024, 256), EngineStatus.ok)
+        defer { engine.unloadModel() }
+
+        let config = SamplingConfig(
+            max_prediction_tokens: 24, temperature: 0,
+            top_k: 0, top_p: 0, min_p: 0,
+            repetition_penalty: 1.0, seed: 0,
+            single_line: false
+        )
+        let seq = engine.createSequence(config)
+        let prompt = "The capital of France is"
+        var tokens = Array(engine.tokenize(prompt, Int32(prompt.utf8.count)))
+        XCTAssertEqual(engine.decodePrompt(seq, &tokens, Int32(tokens.count), 0), EngineStatus.ok)
+
+        var steps = 0
+        for _ in 0..<24 {
+            let result = engine.sampleNext(seq)
+            if result.was_cancelled { break }
+            // Greedy + no penalties: sampled token == raw argmax unless the argmax is masked.
+            // NOTE(review): a raw EOG argmax is never masked, so it should be picked; but a masked
+            // non-EOG argmax whose runner-up is EOG would give an EOG pick with a false flag.
+            XCTAssertEqual(
+                result.argmax_is_eog,
+                engine.isEndOfGenerationToken(result.token),
+                "argmax_is_eog disagreed with the greedy token's EOG status at step \(steps)"
+            )
+            steps += 1
+            if result.is_eos { break }
+        }
+        XCTAssertGreaterThan(steps, 0, "Expected at least one sampled step")
+        engine.destroySequence(seq)
+    }
 }