Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions quant.h
Original file line number Diff line number Diff line change
Expand Up @@ -15943,19 +15943,21 @@ int tq_generate_chat_text(tq_model_t* model,
if (n_suffix < 0) n_suffix = 0;
}

/* Context overflow: return -2 instead of falling back to a
* dangerous full reprefill. The state still has stale KV at
* positions [n_new..prefix_pos) that would corrupt later tokens.
* Caller should reset the chat and retry. */
int reserve = config->max_tokens > 0 ? config->max_tokens : 256;
if (prefix_pos + n_suffix + reserve + 32 > max_prompt) {
free(suffix_toks);
config->on_token = orig_cb; config->user_data = orig_ud;
*n_cached_io = 0;
if (cached_text_io && *cached_text_io) {
free(*cached_text_io); *cached_text_io = NULL;
if (accum.buf) free(accum.buf);
if (getenv("TQ_CHAT_DEBUG")) {
fprintf(stderr,
"[chat-text] OVERFLOW prefix_pos=%d n_suffix=%d reserve=%d max=%d\n",
prefix_pos, n_suffix, reserve, max_prompt);
}
int n2 = tq_generate_continue(model, tokenizer, state, prompt, config,
cached_tokens_io, n_cached_io, cached_capacity_io,
output, output_size);
generated = n2;
goto update_cache;
return -2;
}

int needed = prefix_pos + n_suffix + reserve + 16;
Expand Down
35 changes: 23 additions & 12 deletions src/engine/tq_generate.c
Original file line number Diff line number Diff line change
Expand Up @@ -603,12 +603,17 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
}

/* ============================================================================
* tq_generate_continue — chat-mode generation with KV cache reuse.
* tq_generate_continue — chat-mode generation with KV cache reuse (token LCP).
*
* Caller-managed state: state and cached_tokens persist across calls.
* Each call computes the longest common prefix between cached_tokens and
* the new prompt, prefills only the diverging suffix, and updates the
* cache record. Turns chat from O(history^2) into O(new_tokens_per_turn).
*
* NOTE: This is a lower-level API. It does NOT track cached_text. If a
* sliding window triggers (n_cached_io is reset to 0), any out-of-band
* cached_text the caller maintains becomes stale. Higher-level callers
* should use tq_generate_chat_text instead, which handles this safely.
* ============================================================================ */
static int tq_lcp_int(const int* a, int na, const int* b, int nb) {
int lim = na < nb ? na : nb;
Expand Down Expand Up @@ -918,22 +923,28 @@ int tq_generate_chat_text(tq_model_t* model,
if (n_suffix < 0) n_suffix = 0;
}

/* Sliding window if needed (drop from start of cached) */
/* Context overflow check.
* The previous "fall back to tq_generate_continue with full
* reprefill" approach was UNSAFE: state already had the previous
* KV at positions [0..prefix_pos), and tq_generate_continue would
* write new positions [0..n_new), leaving stale KV at positions
* [n_new..prefix_pos) that subsequent generation might read.
*
* Correct behavior: return -2 (overflow) and let the caller
* decide — most callers should reset the chat and retry with a
* shorter prompt. Server can return HTTP 413, Python can raise
* an exception, WASM can show an error to the user. */
int reserve = config->max_tokens > 0 ? config->max_tokens : 256;
if (prefix_pos + n_suffix + reserve + 32 > max_prompt) {
/* Force a full reprefill — simpler than partial cache shift */
free(suffix_toks);
config->on_token = orig_cb; config->user_data = orig_ud;
*n_cached_io = 0;
if (cached_text_io && *cached_text_io) {
free(*cached_text_io); *cached_text_io = NULL;
if (accum.buf) free(accum.buf);
if (getenv("TQ_CHAT_DEBUG")) {
fprintf(stderr,
"[chat-text] OVERFLOW prefix_pos=%d n_suffix=%d reserve=%d max=%d\n",
prefix_pos, n_suffix, reserve, max_prompt);
}
int n2 = tq_generate_continue(model, tokenizer, state, prompt, config,
cached_tokens_io, n_cached_io, cached_capacity_io,
output, output_size);
/* fall-through path captures cached_text below */
generated = n2;
goto update_cache;
return -2;
}

/* Grow cache buffer */
Expand Down
33 changes: 31 additions & 2 deletions src/server/tq_server.c
Original file line number Diff line number Diff line change
Expand Up @@ -779,12 +779,23 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod
kv_session_t* sess = get_or_create_session(server, req.session_id,
gen_cfg.kv_type,
gen_cfg.value_quant_bits);
tq_generate_chat_text(server->config.model, server->config.tokenizer,
int gen_rc = tq_generate_chat_text(server->config.model, server->config.tokenizer,
sess->kv_state, req.prompt, &gen_cfg,
&sess->cached_text,
&sess->cached_tokens, &sess->n_cached,
&sess->cached_capacity,
output, sizeof(output));
if (gen_rc == -2) {
/* Context overflow — auto-reset session and surface error.
* Client should retry with a shorter conversation history. */
LOG_ERROR("Session %s: context overflow, auto-reset", sess->id);
tq_free_state(sess->kv_state);
sess->kv_state = tq_create_state_ex(
&server->config.model->config, gen_cfg.kv_type, gen_cfg.value_quant_bits);
if (sess->cached_tokens) { free(sess->cached_tokens); sess->cached_tokens = NULL; }
sess->n_cached = 0; sess->cached_capacity = 0;
if (sess->cached_text) { free(sess->cached_text); sess->cached_text = NULL; }
}

/* Send final chunk with finish_reason */
char final_chunk[SSE_CHUNK_SIZE];
Expand Down Expand Up @@ -817,12 +828,30 @@ static void handle_chat_completions(tq_server_t* server, int fd, const char* bod
kv_session_t* sess = get_or_create_session(server, req.session_id,
gen_cfg.kv_type,
gen_cfg.value_quant_bits);
tq_generate_chat_text(server->config.model, server->config.tokenizer,
int gen_rc = tq_generate_chat_text(server->config.model, server->config.tokenizer,
sess->kv_state, req.prompt, &gen_cfg,
&sess->cached_text,
&sess->cached_tokens, &sess->n_cached,
&sess->cached_capacity,
output, sizeof(output));
if (gen_rc == -2) {
/* Context overflow — return HTTP 413 instead of garbage. */
LOG_ERROR("Session %s: context overflow, returning 413", sess->id);
tq_free_state(sess->kv_state);
sess->kv_state = tq_create_state_ex(
&server->config.model->config, gen_cfg.kv_type, gen_cfg.value_quant_bits);
if (sess->cached_tokens) { free(sess->cached_tokens); sess->cached_tokens = NULL; }
sess->n_cached = 0; sess->cached_capacity = 0;
if (sess->cached_text) { free(sess->cached_text); sess->cached_text = NULL; }
free(collect.buf);
pthread_mutex_unlock(&server->inference_mutex);
free_chat_request(&req);
send_json(fd, 413, "Payload Too Large",
"{\"error\":{\"message\":\"Conversation history exceeds context window. "
"Session has been reset; please retry with a shorter history.\","
"\"type\":\"context_overflow\",\"code\":\"context_full\"}}");
return;
}

const char* content = collect.buf ? collect.buf : "";

Expand Down
Binary file modified wasm/quant.wasm
Binary file not shown.
26 changes: 25 additions & 1 deletion wasm/quant_wasm.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,17 @@ int wasm_generate_async(const char* prompt, float temperature, int max_tokens) {
* sees a near-instant response on every turn after the first. */
int n = quant_chat(g_ctx, prompt, on_token_streaming, NULL);
double elapsed = emscripten_get_now() - t0;
if (n == -2) {
/* Context overflow — auto-reset and inform the JS caller so it
* can show a "context full, starting new chat" message and
* optionally retry with a shorter history. */
js_on_status("Context full \xe2\x80\x94 chat reset. Send a shorter message.");
quant_chat(g_ctx, NULL, NULL, NULL);
g_output_pos = 0; g_output[0] = '\0'; g_stream_count = 0;
js_on_done(0, elapsed);
g_generating = 0;
return -2;
}
js_on_done(n > 0 ? n : 0, elapsed);
g_generating = 0;
return 0;
Expand All @@ -107,7 +118,7 @@ int wasm_generate_async(const char* prompt, float temperature, int max_tokens) {
EMSCRIPTEN_KEEPALIVE
int wasm_generate(const char* prompt, float temperature, int max_tokens) {
if (!g_model || !g_ctx || g_generating) return -1;
g_generating = 1; g_output_pos = 0; g_output[0] = '\0';
g_generating = 1; g_output_pos = 0; g_output[0] = '\0'; g_stream_count = 0;

g_ctx->config.temperature = temperature;
g_ctx->config.top_p = 0.9f;
Expand All @@ -116,6 +127,14 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
double t0 = emscripten_get_now();
int n = quant_chat(g_ctx, prompt, on_token_sync, NULL);
double elapsed = emscripten_get_now() - t0;
if (n == -2) {
js_on_status("Context full \xe2\x80\x94 chat reset.");
quant_chat(g_ctx, NULL, NULL, NULL);
g_output_pos = 0; g_output[0] = '\0'; g_stream_count = 0;
js_on_done(0, elapsed);
g_generating = 0;
return -2;
}
js_on_done(n > 0 ? n : 0, elapsed);
g_generating = 0;
return 0;
Expand All @@ -125,6 +144,11 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
EMSCRIPTEN_KEEPALIVE
/* Exported to JS: hard-reset the chat session.
 * Passing a NULL prompt to quant_chat appears to be the library's reset
 * convention — the overflow-recovery paths in wasm_generate/_async call
 * quant_chat(g_ctx, NULL, NULL, NULL) the same way — TODO confirm against
 * the quant_chat contract. */
void wasm_reset_chat(void) {
if (g_ctx) quant_chat(g_ctx, NULL, NULL, NULL);
/* Also reset the streaming output buffer state — otherwise the next
 * generation would append to stale text from the previous chat. */
g_output_pos = 0;
g_output[0] = '\0';
g_stream_count = 0;
}

EMSCRIPTEN_KEEPALIVE const char* wasm_model_info(void) {
Expand Down
Loading