From 495044c4176a26c3f6b3b5007a10da1ef4bcd1b2 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 17:43:13 -0700 Subject: [PATCH] Decode-quality primitives: scaffolding-token mask and argmax-is-EOG stop signal Two zero-hot-loop-cost additions for autocomplete decode quality: - buildTokenMasks now probes each token's special-rendered piece and hard-masks single-token chat/instruct/FIM scaffolding (<|im_end|>, , [INST], FIM families) that the GGUF did not flag as a control token, plus an unflagged BOS. EOG tokens stay exempt so natural stops keep firing. Well-formed GGUFs already flag these as control, so the common count is 0; the rule is insurance against vocabularies that ship them unflagged. Exposed via getMaskedScaffoldingTokenCount for tests/diagnostics. - SampleResult gains argmax_is_eog: whether the raw distribution's single most-likely token at this position is an end-of-generation token. Stochastic sampling can draw past the point where the model wants to stop; this lets callers detect that stop intent on the exact step it appears. Computed in C++ while the logits row is hot (one O(vocab) pass per token, tens of microseconds); the seed token's verdict is captured at decodePrompt while its row is still resident. Field is appended, so existing Swift call sites that only read members keep compiling; SamplingConfig is untouched. --- .../CotabbyInferenceEngine.cpp | 91 +++++++++++++++++- .../include/CotabbyInferenceEngine.h | 11 +++ .../LlamaMiddlewareTests.swift | 93 +++++++++++++++++++ 3 files changed, 193 insertions(+), 2 deletions(-) diff --git a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp index 57353e3..d871596 100644 --- a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp +++ b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,36 @@ static void silenced_log_callback(ggml_log_level, const char*, void*) {} +// Single-token chat/instruct/FIM scaffolding that must never surface in autocomplete text. +// Most GGUFs flag these as control tokens (masked by the base rule); this catches vocabularies +// that ship them as ordinary text tokens. EOG-flagged tokens are exempted by the caller so the +// natural stop check keeps firing. Matching is exact (or prefix for the FIM/repo families) +// against the special-rendered piece, so ordinary text like "<|" fragments is never affected. +static bool isScaffoldingMarkerPiece(const char* piece, int length) { + if (!piece || length <= 0) return false; + const std::string_view view(piece, static_cast(length)); + static constexpr std::string_view exact_markers[] = { + "<|im_start|>", "<|im_end|>", + "<|user|>", "<|assistant|>", "<|system|>", + "<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>", + "<|end|>", "<|endoftext|>", + "", "", + "[INST]", "[/INST]", + }; + for (const auto marker : exact_markers) { + if (view == marker) return true; + } + static constexpr std::string_view prefix_markers[] = { + "<|fim_", "= prefix.size() && view.substr(0, prefix.size()) == prefix) { + return true; + } + } + return false; +} + // Decode threads should match the *performance* core count, not the logical core count. // llama.cpp's CPU work is a per-layer parallel matmul with a barrier at each layer: schedule any // of those threads onto efficiency cores and every P-core finishes early only to stall at the @@ -96,6 +127,10 @@ struct SequenceState { // Log-probability of the seed token, computed at decodePrompt and returned with the seed. float seed_logprob = 0.0f; + // argmax-is-EOG verdict for the seed token's logits row, captured at decodePrompt while + // those logits are still resident; the row is gone by the time sampleNext returns the seed. + bool seed_argmax_is_eog = false; + ~SequenceState() { if (sampler) { llama_sampler_free(sampler); } } @@ -114,7 +149,8 @@ struct SequenceState { has_pending_input(o.has_pending_input), force_word_continuation(o.force_word_continuation), compute_logprob(o.compute_logprob), - seed_logprob(o.seed_logprob) { + seed_logprob(o.seed_logprob), + seed_argmax_is_eog(o.seed_argmax_is_eog) { o.sampler = nullptr; } SequenceState& operator=(SequenceState&&) = delete; @@ -182,6 +218,10 @@ struct CotabbyInferenceEngine::Impl { std::vector nonprintable_bias; std::vector linebreak_bias; std::vector starts_new_word; + // How many of the nonprintable entries came from the scaffolding-piece rule rather than + // the control/unknown/unused attributes. Surfaced via getMaskedScaffoldingTokenCount so + // tests and diagnostics can confirm the rule's reach on a given vocabulary. + int scaffolding_masked_count = 0; // Public-facing sequence map (external int32_t IDs → state) and the // internal `llama_seq_id` slot allocator. @@ -299,11 +339,16 @@ struct CotabbyInferenceEngine::Impl { nonprintable_bias.clear(); linebreak_bias.clear(); starts_new_word.clear(); + scaffolding_masked_count = 0; if (!vocab) return; const int32_t n = llama_vocab_n_tokens(vocab); starts_new_word.assign(static_cast(n), false); + // BOS belongs at sequence start only; some vocabularies ship it without the control + // attribute, which would otherwise let it be sampled mid-text. + const llama_token bos_token = llama_vocab_bos(vocab); + char piece[64]; for (llama_token t = 0; t < n; ++t) { const bool is_eog = llama_vocab_is_eog(vocab, t); @@ -314,7 +359,20 @@ struct CotabbyInferenceEngine::Impl { const enum llama_token_attr attr = llama_vocab_get_attr(vocab, t); const bool junk_attr = (attr & (LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_UNUSED)) != 0; - if (llama_vocab_is_control(vocab, t) || junk_attr) { + bool masked = llama_vocab_is_control(vocab, t) || junk_attr || t == bos_token; + if (!masked) { + // Probe with special rendering: control-style markers decode to an empty + // piece under the plain rendering used below, so the scaffolding rule must + // look at the special-rendered text instead. + const int special_written = + llama_token_to_piece(vocab, t, piece, sizeof(piece), 0, true); + if (special_written > 0 && + isScaffoldingMarkerPiece(piece, special_written)) { + masked = true; + ++scaffolding_masked_count; + } + } + if (masked) { nonprintable_bias.push_back({ t, -INFINITY }); } } @@ -374,6 +432,25 @@ struct CotabbyInferenceEngine::Impl { ); } + // Whether the raw distribution at `logits_row` puts its single highest logit on an + // end-of-generation token. One O(vocab) pass over the row the caller just sampled from; + // the sampler chain works on a copied candidate array, so the row is unmutated here. + bool argmaxIsEOG(int logits_row) const { + if (!shared_ctx || !vocab) return false; + const float* logits = llama_get_logits_ith(shared_ctx, logits_row); + if (!logits) return false; + const int32_t n = llama_vocab_n_tokens(vocab); + llama_token argmax = 0; + float best = -INFINITY; + for (llama_token t = 0; t < n; ++t) { + if (logits[t] > best) { + best = logits[t]; + argmax = t; + } + } + return llama_vocab_is_eog(vocab, argmax) || argmax == llama_vocab_eos(vocab); + } + void destroyAllSequences() { std::lock_guard lock(sequences_mutex); for (auto& [id, seq] : sequences) { @@ -481,6 +558,10 @@ struct CotabbyInferenceEngine::Impl { llama_token next = llama_sampler_sample( req.sampler, shared_ctx, i ); + // Computed on the raw logits row after sampling: llama_sampler_sample applies + // the chain to a copied candidate array, so row i is still the model's + // unbiased distribution here. + r.argmax_is_eog = argmaxIsEOG(i); if (next == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, next)) { r.token = next; @@ -902,6 +983,7 @@ EngineStatus CotabbyInferenceEngine::decodePrompt(int32_t sequence_id, llama_sampler_accept(seq->sampler, seed); seq->seed_token = seed; seq->seed_logprob = seq->compute_logprob ? impl_->computeLogprob(-1, seed) : 0.0f; + seq->seed_argmax_is_eog = impl_->argmaxIsEOG(-1); seq->has_seed_token = true; seq->has_pending_input = false; @@ -946,6 +1028,7 @@ SampleResult CotabbyInferenceEngine::sampleNext(int32_t sequence_id) { if (seq->has_seed_token) { llama_token next = seq->seed_token; seq->has_seed_token = false; + result.argmax_is_eog = seq->seed_argmax_is_eog; if (next == llama_vocab_eos(impl_->vocab) || llama_vocab_is_eog(impl_->vocab, next)) { @@ -1231,3 +1314,7 @@ int CotabbyInferenceEngine::getThreadCount() const { int CotabbyInferenceEngine::getGPULayerCount() const { return impl_->gpu_layer_count; } + +int CotabbyInferenceEngine::getMaskedScaffoldingTokenCount() const { + return impl_->scaffolding_masked_count; +} diff --git a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h index 4550877..8da29de 100644 --- a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h +++ b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h @@ -27,6 +27,12 @@ struct SWIFT_SELF_CONTAINED SampleResult { // Log-probability of the chosen token under the raw model distribution (<= 0). Used as a // confidence signal; 0 for the EOS/cancelled cases where it carries no meaning. float logprob; + // True when the single most-likely token of the raw distribution this token was sampled + // from is an end-of-generation token. Stochastic sampling can draw past the point where + // the model wants to stop; this flag lets callers detect that stop intent on the very + // step it appears, even when the sampled token is something else. Appended after the + // existing fields so Swift call sites that only read members keep compiling. + bool argmax_is_eog; }; enum class EngineStatus : int { @@ -168,6 +174,11 @@ class CotabbyInferenceEngine { int getBatchSize() const; int getThreadCount() const; int getGPULayerCount() const; + // Number of vocabulary tokens that were hard-masked at model load because their rendered + // piece is chat/instruct/FIM scaffolding that the GGUF did not flag as a control token. + // 0 is the common (and healthy) case: well-formed GGUFs flag these as control already, + // and control tokens are masked by the base rule rather than counted here. + int getMaskedScaffoldingTokenCount() const; private: struct Impl; diff --git a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift index 138d8d0..53dfaa2 100644 --- a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift +++ b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift @@ -509,4 +509,97 @@ final class LlamaMiddlewareTests: XCTestCase { } engine.destroySequence(seq) } + + func testMaskedScaffoldingCountDefaultsToZero() { + let engine = CotabbyInferenceEngine() + XCTAssertEqual(engine.getMaskedScaffoldingTokenCount(), 0) + } + + // Single-token chat/template markers must never surface as sampled text. Most vocabularies + // flag them as control tokens (masked by the base rule); the scaffolding rule covers ones + // that ship unflagged. Either way the observable contract is the same: no sampled piece is + // ever a complete marker string, even at high temperature with a template-bait prompt. + func testSamplingNeverEmitsScaffoldingMarkerPieces() throws { + guard let modelPath = ProcessInfo.processInfo.environment["COTABBY_TEST_MODEL_PATH"] else { + try XCTSkipIf(true, "Set COTABBY_TEST_MODEL_PATH to a .gguf file to run this test") + return + } + var engine = CotabbyInferenceEngine() + XCTAssertEqual(engine.loadModel(modelPath, -1, 1024, 256), EngineStatus.ok) + defer { engine.unloadModel() } + XCTAssertGreaterThanOrEqual(engine.getMaskedScaffoldingTokenCount(), 0) + + let markers: Set = [ + "<|im_start|>", "<|im_end|>", "<|user|>", "<|assistant|>", "<|system|>", + "<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>", "<|end|>", + "<|endoftext|>", "", "", "[INST]", "[/INST]" + ] + let config = SamplingConfig( + max_prediction_tokens: 64, temperature: 1.8, + top_k: 0, top_p: 0, min_p: 0, + repetition_penalty: 1.0, seed: 7, + single_line: false + ) + let seq = engine.createSequence(config) + let prompt = "<|im_start|>user\nWrite a reply<|im_end|>\n<|im_start|>assistant\n" + var tokens = Array(engine.tokenize(prompt, Int32(prompt.utf8.count))) + XCTAssertEqual(engine.decodePrompt(seq, &tokens, Int32(tokens.count), 0), EngineStatus.ok) + + for _ in 0..<64 { + let result = engine.sampleNext(seq) + if result.is_eos || result.was_cancelled { break } + guard let piecePointer = result.piece else { continue } + let piece = String( + bytes: UnsafeRawBufferPointer(start: piecePointer, count: Int(result.piece_length)), + encoding: .utf8 + ) ?? "" + XCTAssertFalse( + markers.contains(piece), + "Sampled a complete scaffolding marker piece: \(piece)" + ) + } + engine.destroySequence(seq) + } + + // Under greedy sampling with penalties disabled, the sampled token IS the raw argmax, so + // `argmax_is_eog` must agree with `isEndOfGenerationToken(sampled token)` on every step. + // This pins the flag's semantics without needing the model to reach a natural EOS. + func testArgmaxIsEOGMatchesGreedyChoice() throws { + guard let modelPath = ProcessInfo.processInfo.environment["COTABBY_TEST_MODEL_PATH"] else { + try XCTSkipIf(true, "Set COTABBY_TEST_MODEL_PATH to a .gguf file to run this test") + return + } + var engine = CotabbyInferenceEngine() + XCTAssertEqual(engine.loadModel(modelPath, -1, 1024, 256), EngineStatus.ok) + defer { engine.unloadModel() } + + let config = SamplingConfig( + max_prediction_tokens: 24, temperature: 0, + top_k: 0, top_p: 0, min_p: 0, + repetition_penalty: 1.0, seed: 0, + single_line: false + ) + let seq = engine.createSequence(config) + let prompt = "The capital of France is" + var tokens = Array(engine.tokenize(prompt, Int32(prompt.utf8.count))) + XCTAssertEqual(engine.decodePrompt(seq, &tokens, Int32(tokens.count), 0), EngineStatus.ok) + + var steps = 0 + for _ in 0..<24 { + let result = engine.sampleNext(seq) + if result.was_cancelled { break } + // Greedy + no penalties: sampled token == raw argmax. A masked control token can in + // principle displace the raw argmax from the greedy pick, but EOG tokens are never + // masked, so the EOG verdicts still agree. + XCTAssertEqual( + result.argmax_is_eog, + engine.isEndOfGenerationToken(result.token), + "argmax_is_eog disagreed with the greedy token's EOG status at step \(steps)" + ) + steps += 1 + if result.is_eos { break } + } + XCTAssertGreaterThan(steps, 0, "Expected at least one sampled step") + engine.destroySequence(seq) + } }