diff --git a/quant.h b/quant.h index 2e4d457..9a2691c 100644 --- a/quant.h +++ b/quant.h @@ -15943,19 +15943,21 @@ int tq_generate_chat_text(tq_model_t* model, if (n_suffix < 0) n_suffix = 0; } + /* Context overflow: return -2 instead of falling back to a + * dangerous full reprefill. The state still has stale KV at + * positions [n_new..prefix_pos) that would corrupt later tokens. + * Caller should reset the chat and retry. */ int reserve = config->max_tokens > 0 ? config->max_tokens : 256; if (prefix_pos + n_suffix + reserve + 32 > max_prompt) { free(suffix_toks); config->on_token = orig_cb; config->user_data = orig_ud; - *n_cached_io = 0; - if (cached_text_io && *cached_text_io) { - free(*cached_text_io); *cached_text_io = NULL; + if (accum.buf) free(accum.buf); + if (getenv("TQ_CHAT_DEBUG")) { + fprintf(stderr, + "[chat-text] OVERFLOW prefix_pos=%d n_suffix=%d reserve=%d max=%d\n", + prefix_pos, n_suffix, reserve, max_prompt); } - int n2 = tq_generate_continue(model, tokenizer, state, prompt, config, - cached_tokens_io, n_cached_io, cached_capacity_io, - output, output_size); - generated = n2; - goto update_cache; + return -2; } int needed = prefix_pos + n_suffix + reserve + 16; diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c index cca9e9b..1f45a35 100644 --- a/src/engine/tq_generate.c +++ b/src/engine/tq_generate.c @@ -603,12 +603,17 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer, } /* ============================================================================ - * tq_generate_continue — chat-mode generation with KV cache reuse. + * tq_generate_continue — chat-mode generation with KV cache reuse (token LCP). * * Caller-managed state: state and cached_tokens persist across calls. * Each call computes the longest common prefix between cached_tokens and * the new prompt, prefills only the diverging suffix, and updates the * cache record. Turns chat from O(history^2) into O(new_tokens_per_turn). + * + * NOTE: This is a lower-level API. It does NOT track cached_text. If a + * sliding window triggers (n_cached_io is reset to 0), any out-of-band + * cached_text the caller maintains becomes stale. Higher-level callers + * should use tq_generate_chat_text instead, which handles this safely. * ============================================================================ */ static int tq_lcp_int(const int* a, int na, const int* b, int nb) { int lim = na < nb ? na : nb; @@ -918,22 +923,28 @@ int tq_generate_chat_text(tq_model_t* model, if (n_suffix < 0) n_suffix = 0; } - /* Sliding window if needed (drop from start of cached) */ + /* Context overflow check. + * The previous "fall back to tq_generate_continue with full + * reprefill" approach was UNSAFE: state already had the previous + * KV at positions [0..prefix_pos), and tq_generate_continue would + * write new positions [0..n_new), leaving stale KV at positions + * [n_new..prefix_pos) that subsequent generation might read. + * + * Correct behavior: return -2 (overflow) and let the caller + * decide — most callers should reset the chat and retry with a + * shorter prompt. Server can return HTTP 413, Python can raise + * an exception, WASM can show an error to the user. */ int reserve = config->max_tokens > 0 ? config->max_tokens : 256; if (prefix_pos + n_suffix + reserve + 32 > max_prompt) { - /* Force a full reprefill — simpler than partial cache shift */ free(suffix_toks); config->on_token = orig_cb; config->user_data = orig_ud; - *n_cached_io = 0; - if (cached_text_io && *cached_text_io) { - free(*cached_text_io); *cached_text_io = NULL; + if (accum.buf) free(accum.buf); + if (getenv("TQ_CHAT_DEBUG")) { + fprintf(stderr, + "[chat-text] OVERFLOW prefix_pos=%d n_suffix=%d reserve=%d max=%d\n", + prefix_pos, n_suffix, reserve, max_prompt); } - int n2 = tq_generate_continue(model, tokenizer, state, prompt, config, - cached_tokens_io, n_cached_io, cached_capacity_io, - output, output_size); - /* fall-through path captures cached_text below */ - generated = n2; - goto update_cache; + return -2; } /* Grow cache buffer */ diff --git a/src/server/tq_server.c b/src/server/tq_server.c index 898c3cb..81db519 100644 --- a/src/server/tq_server.c +++ b/src/server/tq_server.c @@ -779,12 +779,23 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod kv_session_t* sess = get_or_create_session(server, req.session_id, gen_cfg.kv_type, gen_cfg.value_quant_bits); - tq_generate_chat_text(server->config.model, server->config.tokenizer, + int gen_rc = tq_generate_chat_text(server->config.model, server->config.tokenizer, sess->kv_state, req.prompt, &gen_cfg, &sess->cached_text, &sess->cached_tokens, &sess->n_cached, &sess->cached_capacity, output, sizeof(output)); + if (gen_rc == -2) { + /* Context overflow — auto-reset session and surface error. + * Client should retry with a shorter conversation history. */ + LOG_ERROR("Session %s: context overflow, auto-reset", sess->id); + tq_free_state(sess->kv_state); + sess->kv_state = tq_create_state_ex( + &server->config.model->config, gen_cfg.kv_type, gen_cfg.value_quant_bits); + if (sess->cached_tokens) { free(sess->cached_tokens); sess->cached_tokens = NULL; } + sess->n_cached = 0; sess->cached_capacity = 0; + if (sess->cached_text) { free(sess->cached_text); sess->cached_text = NULL; } + } /* Send final chunk with finish_reason */ char final_chunk[SSE_CHUNK_SIZE]; @@ -817,12 +828,30 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod kv_session_t* sess = get_or_create_session(server, req.session_id, gen_cfg.kv_type, gen_cfg.value_quant_bits); - tq_generate_chat_text(server->config.model, server->config.tokenizer, + int gen_rc = tq_generate_chat_text(server->config.model, server->config.tokenizer, sess->kv_state, req.prompt, &gen_cfg, &sess->cached_text, &sess->cached_tokens, &sess->n_cached, &sess->cached_capacity, output, sizeof(output)); + if (gen_rc == -2) { + /* Context overflow — return HTTP 413 instead of garbage. */ + LOG_ERROR("Session %s: context overflow, returning 413", sess->id); + tq_free_state(sess->kv_state); + sess->kv_state = tq_create_state_ex( + &server->config.model->config, gen_cfg.kv_type, gen_cfg.value_quant_bits); + if (sess->cached_tokens) { free(sess->cached_tokens); sess->cached_tokens = NULL; } + sess->n_cached = 0; sess->cached_capacity = 0; + if (sess->cached_text) { free(sess->cached_text); sess->cached_text = NULL; } + free(collect.buf); + pthread_mutex_unlock(&server->inference_mutex); + free_chat_request(&req); + send_json(fd, 413, "Payload Too Large", + "{\"error\":{\"message\":\"Conversation history exceeds context window. " + "Session has been reset; please retry with a shorter history.\"," + "\"type\":\"context_overflow\",\"code\":\"context_full\"}}"); + return; + } const char* content = collect.buf ? collect.buf : ""; diff --git a/wasm/quant.wasm b/wasm/quant.wasm index f75dfa9..061f952 100755 Binary files a/wasm/quant.wasm and b/wasm/quant.wasm differ diff --git a/wasm/quant_wasm.c b/wasm/quant_wasm.c index 9be11f7..281fd31 100644 --- a/wasm/quant_wasm.c +++ b/wasm/quant_wasm.c @@ -99,6 +99,17 @@ int wasm_generate_async(const char* prompt, float temperature, int max_tokens) { * sees a near-instant response on every turn after the first. */ int n = quant_chat(g_ctx, prompt, on_token_streaming, NULL); double elapsed = emscripten_get_now() - t0; + if (n == -2) { + /* Context overflow — auto-reset and inform the JS caller so it + * can show a "context full, starting new chat" message and + * optionally retry with a shorter history. */ + js_on_status("Context full \xe2\x80\x94 chat reset. Send a shorter message."); + quant_chat(g_ctx, NULL, NULL, NULL); + g_output_pos = 0; g_output[0] = '\0'; g_stream_count = 0; + js_on_done(0, elapsed); + g_generating = 0; + return -2; + } js_on_done(n > 0 ? n : 0, elapsed); g_generating = 0; return 0; @@ -107,7 +118,7 @@ int wasm_generate_async(const char* prompt, float temperature, int max_tokens) { EMSCRIPTEN_KEEPALIVE int wasm_generate(const char* prompt, float temperature, int max_tokens) { if (!g_model || !g_ctx || g_generating) return -1; - g_generating = 1; g_output_pos = 0; g_output[0] = '\0'; + g_generating = 1; g_output_pos = 0; g_output[0] = '\0'; g_stream_count = 0; g_ctx->config.temperature = temperature; g_ctx->config.top_p = 0.9f; @@ -116,6 +127,14 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) { double t0 = emscripten_get_now(); int n = quant_chat(g_ctx, prompt, on_token_sync, NULL); double elapsed = emscripten_get_now() - t0; + if (n == -2) { + js_on_status("Context full \xe2\x80\x94 chat reset."); + quant_chat(g_ctx, NULL, NULL, NULL); + g_output_pos = 0; g_output[0] = '\0'; g_stream_count = 0; + js_on_done(0, elapsed); + g_generating = 0; + return -2; + } js_on_done(n > 0 ? n : 0, elapsed); g_generating = 0; return 0; @@ -125,6 +144,11 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) { EMSCRIPTEN_KEEPALIVE void wasm_reset_chat(void) { if (g_ctx) quant_chat(g_ctx, NULL, NULL, NULL); + /* Also reset the streaming output buffer state — otherwise the next + * generation would append to stale text from the previous chat. */ + g_output_pos = 0; + g_output[0] = '\0'; + g_stream_count = 0; } EMSCRIPTEN_KEEPALIVE const char* wasm_model_info(void) {