Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions quant.h
Original file line number Diff line number Diff line change
Expand Up @@ -15943,19 +15943,21 @@ int tq_generate_chat_text(tq_model_t* model,
if (n_suffix < 0) n_suffix = 0;
}

/* Context overflow: return -2 instead of falling back to a
* dangerous full reprefill. The state still has stale KV at
* positions [n_new..prefix_pos) that would corrupt later tokens.
* Caller should reset the chat and retry. */
int reserve = config->max_tokens > 0 ? config->max_tokens : 256;
if (prefix_pos + n_suffix + reserve + 32 > max_prompt) {
free(suffix_toks);
config->on_token = orig_cb; config->user_data = orig_ud;
*n_cached_io = 0;
if (cached_text_io && *cached_text_io) {
free(*cached_text_io); *cached_text_io = NULL;
if (accum.buf) free(accum.buf);
if (getenv("TQ_CHAT_DEBUG")) {
fprintf(stderr,
"[chat-text] OVERFLOW prefix_pos=%d n_suffix=%d reserve=%d max=%d\n",
prefix_pos, n_suffix, reserve, max_prompt);
}
int n2 = tq_generate_continue(model, tokenizer, state, prompt, config,
cached_tokens_io, n_cached_io, cached_capacity_io,
output, output_size);
generated = n2;
goto update_cache;
return -2;
}

int needed = prefix_pos + n_suffix + reserve + 16;
Expand Down
35 changes: 23 additions & 12 deletions src/engine/tq_generate.c
Original file line number Diff line number Diff line change
Expand Up @@ -603,12 +603,17 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
}

/* ============================================================================
* tq_generate_continue — chat-mode generation with KV cache reuse.
* tq_generate_continue — chat-mode generation with KV cache reuse (token LCP).
*
* Caller-managed state: state and cached_tokens persist across calls.
* Each call computes the longest common prefix between cached_tokens and
* the new prompt, prefills only the diverging suffix, and updates the
* cache record. Turns chat from O(history^2) into O(new_tokens_per_turn).
*
* NOTE: This is a lower-level API. It does NOT track cached_text. If a
* sliding window triggers (n_cached_io is reset to 0), any out-of-band
* cached_text the caller maintains becomes stale. Higher-level callers
* should use tq_generate_chat_text instead, which handles this safely.
* ============================================================================ */
static int tq_lcp_int(const int* a, int na, const int* b, int nb) {
int lim = na < nb ? na : nb;
Expand Down Expand Up @@ -918,22 +923,28 @@ int tq_generate_chat_text(tq_model_t* model,
if (n_suffix < 0) n_suffix = 0;
}

/* Sliding window if needed (drop from start of cached) */
/* Context overflow check.
* The previous "fall back to tq_generate_continue with full
* reprefill" approach was UNSAFE: state already had the previous
* KV at positions [0..prefix_pos), and tq_generate_continue would
* write new positions [0..n_new), leaving stale KV at positions
* [n_new..prefix_pos) that subsequent generation might read.
*
* Correct behavior: return -2 (overflow) and let the caller
* decide — most callers should reset the chat and retry with a
* shorter prompt. Server can return HTTP 413, Python can raise
* an exception, WASM can show an error to the user. */
int reserve = config->max_tokens > 0 ? config->max_tokens : 256;
if (prefix_pos + n_suffix + reserve + 32 > max_prompt) {
/* Force a full reprefill — simpler than partial cache shift */
free(suffix_toks);
config->on_token = orig_cb; config->user_data = orig_ud;
*n_cached_io = 0;
if (cached_text_io && *cached_text_io) {
free(*cached_text_io); *cached_text_io = NULL;
if (accum.buf) free(accum.buf);
if (getenv("TQ_CHAT_DEBUG")) {
fprintf(stderr,
"[chat-text] OVERFLOW prefix_pos=%d n_suffix=%d reserve=%d max=%d\n",
prefix_pos, n_suffix, reserve, max_prompt);
}
int n2 = tq_generate_continue(model, tokenizer, state, prompt, config,
cached_tokens_io, n_cached_io, cached_capacity_io,
output, output_size);
/* fall-through path captures cached_text below */
generated = n2;
goto update_cache;
return -2;
}

/* Grow cache buffer */
Expand Down
33 changes: 31 additions & 2 deletions src/server/tq_server.c
Original file line number Diff line number Diff line change
Expand Up @@ -779,12 +779,23 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod
kv_session_t* sess = get_or_create_session(server, req.session_id,
gen_cfg.kv_type,
gen_cfg.value_quant_bits);
tq_generate_chat_text(server->config.model, server->config.tokenizer,
int gen_rc = tq_generate_chat_text(server->config.model, server->config.tokenizer,
sess->kv_state, req.prompt, &gen_cfg,
&sess->cached_text,
&sess->cached_tokens, &sess->n_cached,
&sess->cached_capacity,
output, sizeof(output));
if (gen_rc == -2) {
/* Context overflow — auto-reset session and surface error.
* Client should retry with a shorter conversation history. */
LOG_ERROR("Session %s: context overflow, auto-reset", sess->id);
tq_free_state(sess->kv_state);
sess->kv_state = tq_create_state_ex(
&server->config.model->config, gen_cfg.kv_type, gen_cfg.value_quant_bits);
if (sess->cached_tokens) { free(sess->cached_tokens); sess->cached_tokens = NULL; }
sess->n_cached = 0; sess->cached_capacity = 0;
if (sess->cached_text) { free(sess->cached_text); sess->cached_text = NULL; }
}

/* Send final chunk with finish_reason */
char final_chunk[SSE_CHUNK_SIZE];
Expand Down Expand Up @@ -817,12 +828,30 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod
kv_session_t* sess = get_or_create_session(server, req.session_id,
gen_cfg.kv_type,
gen_cfg.value_quant_bits);
tq_generate_chat_text(server->config.model, server->config.tokenizer,
int gen_rc = tq_generate_chat_text(server->config.model, server->config.tokenizer,
sess->kv_state, req.prompt, &gen_cfg,
&sess->cached_text,
&sess->cached_tokens, &sess->n_cached,
&sess->cached_capacity,
output, sizeof(output));
if (gen_rc == -2) {
/* Context overflow — return HTTP 413 instead of garbage. */
LOG_ERROR("Session %s: context overflow, returning 413", sess->id);
tq_free_state(sess->kv_state);
sess->kv_state = tq_create_state_ex(
&server->config.model->config, gen_cfg.kv_type, gen_cfg.value_quant_bits);
if (sess->cached_tokens) { free(sess->cached_tokens); sess->cached_tokens = NULL; }
sess->n_cached = 0; sess->cached_capacity = 0;
if (sess->cached_text) { free(sess->cached_text); sess->cached_text = NULL; }
free(collect.buf);
pthread_mutex_unlock(&server->inference_mutex);
free_chat_request(&req);
send_json(fd, 413, "Payload Too Large",
"{\"error\":{\"message\":\"Conversation history exceeds context window. "
"Session has been reset; please retry with a shorter history.\","
"\"type\":\"context_overflow\",\"code\":\"context_full\"}}");
return;
}

const char* content = collect.buf ? collect.buf : "";

Expand Down
Binary file modified wasm/quant.wasm
Binary file not shown.
26 changes: 25 additions & 1 deletion wasm/quant_wasm.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,17 @@ int wasm_generate_async(const char* prompt, float temperature, int max_tokens) {
* sees a near-instant response on every turn after the first. */
int n = quant_chat(g_ctx, prompt, on_token_streaming, NULL);
double elapsed = emscripten_get_now() - t0;
if (n == -2) {
/* Context overflow — auto-reset and inform the JS caller so it
* can show a "context full, starting new chat" message and
* optionally retry with a shorter history. */
js_on_status("Context full \xe2\x80\x94 chat reset. Send a shorter message.");
quant_chat(g_ctx, NULL, NULL, NULL);
g_output_pos = 0; g_output[0] = '\0'; g_stream_count = 0;
js_on_done(0, elapsed);
g_generating = 0;
return -2;
}
js_on_done(n > 0 ? n : 0, elapsed);
g_generating = 0;
return 0;
Expand All @@ -107,7 +118,7 @@ int wasm_generate_async(const char* prompt, float temperature, int max_tokens) {
EMSCRIPTEN_KEEPALIVE
int wasm_generate(const char* prompt, float temperature, int max_tokens) {
if (!g_model || !g_ctx || g_generating) return -1;
g_generating = 1; g_output_pos = 0; g_output[0] = '\0';
g_generating = 1; g_output_pos = 0; g_output[0] = '\0'; g_stream_count = 0;

g_ctx->config.temperature = temperature;
g_ctx->config.top_p = 0.9f;
Expand All @@ -116,6 +127,14 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
double t0 = emscripten_get_now();
int n = quant_chat(g_ctx, prompt, on_token_sync, NULL);
double elapsed = emscripten_get_now() - t0;
if (n == -2) {
js_on_status("Context full \xe2\x80\x94 chat reset.");
quant_chat(g_ctx, NULL, NULL, NULL);
g_output_pos = 0; g_output[0] = '\0'; g_stream_count = 0;
js_on_done(0, elapsed);
g_generating = 0;
return -2;
}
js_on_done(n > 0 ? n : 0, elapsed);
g_generating = 0;
return 0;
Expand All @@ -125,6 +144,11 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
EMSCRIPTEN_KEEPALIVE
/* Exported to JS: hard-reset the chat session.
 * Passing a NULL prompt to quant_chat appears to be the library's reset
 * convention — the overflow-recovery paths in wasm_generate/_async call
 * quant_chat(g_ctx, NULL, NULL, NULL) the same way — TODO confirm against
 * the quant_chat contract. */
void wasm_reset_chat(void) {
if (g_ctx) quant_chat(g_ctx, NULL, NULL, NULL);
/* Also reset the streaming output buffer state — otherwise the next
 * generation would append to stale text from the previous chat. */
g_output_pos = 0;
g_output[0] = '\0';
g_stream_count = 0;
}

EMSCRIPTEN_KEEPALIVE const char* wasm_model_info(void) {
Expand Down
Loading