diff --git a/include/turboquant/tq_engine.h b/include/turboquant/tq_engine.h index 7c3c72c..67a6fce 100644 --- a/include/turboquant/tq_engine.h +++ b/include/turboquant/tq_engine.h @@ -63,6 +63,16 @@ typedef struct { float final_logit_softcap; /* logit soft-capping: logits = cap * tanh(logits/cap), 0=disabled */ float attn_logit_softcap; /* attention score soft-capping (Gemma): 0=disabled, typically 50.0 */ int* per_layer_inter_dim; /* [n_layers] per-layer intermediate_dim (NULL = use intermediate_dim) */ + + /* Phi-3 LongRoPE config */ + int rope_orig_ctx_len; /* original context length (e.g., 4096) */ + float rope_attn_factor; /* attention magnitude scaling */ + const float* rope_factors_short; /* [head_dim/2] for short context */ + const float* rope_factors_long; /* [head_dim/2] for long context */ + + /* Phi-3 fused-tensor flags — set during load */ + int has_fused_qkv; /* any layer has gguf_w_qkv */ + int has_fused_up_gate; /* any layer has gguf_w_up_gate */ } tq_model_config_t; /* ============================================================ @@ -178,6 +188,12 @@ typedef struct { const void* gguf_w_up; int gguf_w_up_type; const void* gguf_w_down; int gguf_w_down_type; + /* Phi-3 fused projections (from quant.h, synced 2026-04-12). + * gguf_w_qkv: [hidden, q_dim + k_dim + v_dim] concatenated QKV + * gguf_w_up_gate: [hidden, 2 * intermediate_dim] concatenated gate||up */ + const void* gguf_w_qkv; int gguf_w_qkv_type; + const void* gguf_w_up_gate; int gguf_w_up_gate_type; + /* MoE expert weights (NULL for dense FFN layers) */ void* moe; /* tq_moe_layer_t* (from tq_gguf.h), NULL if dense */ diff --git a/src/backend/metal/tq_metal_dispatch.m b/src/backend/metal/tq_metal_dispatch.m index ff31870..54b8f97 100644 --- a/src/backend/metal/tq_metal_dispatch.m +++ b/src/backend/metal/tq_metal_dispatch.m @@ -621,10 +621,21 @@ void tq_free_metal_backend(void) { return [[tq_mtl_device name] UTF8String]; } +/** + * Disable Metal dispatch globally. 
Called by the model loader when a
+ * fused-tensor architecture (Phi-3) is detected — the Metal matmul
+ * kernels don't handle the non-standard output dimensions.
+ */
+static int tq_metal_disabled = 0;
+void tq_metal_disable(void) {
+    tq_metal_disabled = 1;
+}
+
 /**
  * Check if Metal backend is available and initialized.
  */
 int tq_metal_available(void) {
+    if (tq_metal_disabled) return 0;
     /* Lazy initialization: first call triggers Metal setup */
     static int init_done = 0;
     if (!init_done) {
diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c
index f3a69a4..b9994bf 100644
--- a/src/engine/tq_generate.c
+++ b/src/engine/tq_generate.c
@@ -218,13 +218,21 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     int n_prompt = 0;
     if (tokenizer && prompt) {
-        /* BOS token handling:
-         * Gemma 3/4: BOS=2 (required)
-         * LLaMA 3: BOS=128000 (<|begin_of_text|>) — but tokenizer usually adds it
-         * Qwen3.5: no BOS needed */
+        /* BOS token handling (synced from quant.h 2026-04-12):
+         * Gemma:         always (model_type == 1)
+         * Phi-3 / Llama: yes if `<s>` is in the vocab (id 1).
+         * Qwen3.5:       no BOS needed. */
         int add_bos = 0;
         if (model->config.model_type == 1) {
-            add_bos = 1; /* Gemma: always prepend BOS=2 */
+            add_bos = 1;
+        } else {
+            int s_id = -1;
+            /* SentencePiece BOS "<s>" is conventionally a low id; scan the
+             * first few entries only.  NOTE: the literal must be "<s>" —
+             * an empty string here would match any blank vocab slot. */
+            for (int i = 0; i < tokenizer->vocab_size && i < 8; i++) {
+                if (tokenizer->vocab[i] && strcmp(tokenizer->vocab[i], "<s>") == 0) {
+                    s_id = i; break;
+                }
+            }
+            if (s_id >= 0) add_bos = 1;
         }
         n_prompt = tq_encode(tokenizer, prompt, prompt_tokens, 4096, add_bos);
     } else {
diff --git a/src/engine/tq_model.c b/src/engine/tq_model.c
index 9b9eccb..ef3cbd0 100644
--- a/src/engine/tq_model.c
+++ b/src/engine/tq_model.c
@@ -2931,6 +2931,23 @@ tq_model_t* tq_load_gguf(const char* path) {
         c->attn_logit_softcap = 50.0f;
     }
 
+    /* Phi-3 LongRoPE config + factor tables (synced from quant.h 2026-04-12).
*/ + c->rope_orig_ctx_len = (int)tq_gguf_get_u32(gguf, + GGUF_KEY("rope.scaling.original_context_length"), 0); + c->rope_attn_factor = tq_gguf_get_f32(gguf, + GGUF_KEY("rope.scaling.attn_factor"), 0.0f); + { + const tq_gguf_tensor_t* rfs = tq_gguf_find_tensor(gguf, "rope_factors_short.weight"); + const tq_gguf_tensor_t* rfl = tq_gguf_find_tensor(gguf, "rope_factors_long.weight"); + if (rfs && rfs->type == TQ_GGML_TYPE_F32) c->rope_factors_short = (const float*)rfs->data; + if (rfl && rfl->type == TQ_GGML_TYPE_F32) c->rope_factors_long = (const float*)rfl->data; + if (rfs || rfl) { + fprintf(stderr, + "tq_load_gguf: LongRoPE detected — orig_ctx=%d, attn_factor=%.4f\n", + c->rope_orig_ctx_len, c->rope_attn_factor); + } + } + /* Cap context for memory safety on small machines. * GGUF models often claim 262K context but we cap at 4096 by default. * Users can override with --ctx flag in quant. */ @@ -3219,10 +3236,26 @@ tq_model_t* tq_load_gguf(const char* path) { } } - /* Attention weights — keep as GGUF quantized pointers for on-the-fly dequant. - * We store the raw data pointer + type info using a small struct packed into - * the existing FP32 weight pointer fields. For GGUF models, we use a special - * dispatch: if gguf_ctx is non-NULL, the forward pass uses tq_matmul_gguf. */ + /* Phi-3 fused QKV detection (synced from quant.h 2026-04-12). + * Phi-3 ships `blk.N.attn_qkv.weight` with shape [hidden, 3*hidden] + * instead of three separate `attn_q/k/v.weight` tensors. */ + snprintf(tname, sizeof(tname), "blk.%d.attn_qkv.weight", l); + const tq_gguf_tensor_t* wqkv_t = find_gguf_tensor(gguf, tname); + if (wqkv_t && !layer->delta_a_log) { + /* Only take the fused path when there are NO DeltaNet weights — + * otherwise the DeltaNet code below handles attn_qkv itself. 
*/ + layer->gguf_w_qkv = wqkv_t->data; + layer->gguf_w_qkv_type = wqkv_t->type; + c->has_fused_qkv = 1; + snprintf(tname, sizeof(tname), "blk.%d.attn_output.weight", l); + t = find_gguf_tensor(gguf, tname); + if (t) { layer->gguf_wo = t->data; layer->gguf_wo_type = t->type; } + attn_indices[n_attn_layers++] = l; + goto post_attn_load; + } + + /* Standard llama-style attention weights — keep as GGUF quantized + * pointers for on-the-fly dequant. */ snprintf(tname, sizeof(tname), "blk.%d.attn_q.weight", l); const tq_gguf_tensor_t* wq_t = find_gguf_tensor(gguf, tname); int is_attn_layer = (wq_t != NULL); @@ -3265,6 +3298,7 @@ tq_model_t* tq_load_gguf(const char* path) { attn_indices[n_attn_layers++] = l; } +post_attn_load: /* Check for DeltaNet / SSM weights (Qwen3.5 hybrid) */ snprintf(tname, sizeof(tname), "blk.%d.ssm_a", l); t = find_gguf_tensor(gguf, tname); @@ -3518,13 +3552,28 @@ tq_model_t* tq_load_gguf(const char* path) { if (t) { layer->gguf_w_down = t->data; layer->gguf_w_down_type = t->type; } } } else { - /* Dense model: use GGUF on-the-fly dequant */ + /* Dense model: use GGUF on-the-fly dequant. + * Phi-3 fused FFN: when `ffn_up` has shape [hidden, 2*ff] AND + * there is no separate `ffn_gate`, it's a fused gate||up tensor. 
*/ snprintf(tname, sizeof(tname), "blk.%d.ffn_gate.weight", l); t = find_gguf_tensor(gguf, tname); if (t) { layer->gguf_w_gate = t->data; layer->gguf_w_gate_type = t->type; } + snprintf(tname, sizeof(tname), "blk.%d.ffn_up.weight", l); t = find_gguf_tensor(gguf, tname); - if (t) { layer->gguf_w_up = t->data; layer->gguf_w_up_type = t->type; } + if (t) { + if (!layer->gguf_w_gate && t->n_dims >= 2 && + c->intermediate_dim > 0 && + (int)t->shape[1] == 2 * c->intermediate_dim) { + layer->gguf_w_up_gate = t->data; + layer->gguf_w_up_gate_type = t->type; + c->has_fused_up_gate = 1; + } else { + layer->gguf_w_up = t->data; + layer->gguf_w_up_type = t->type; + } + } + snprintf(tname, sizeof(tname), "blk.%d.ffn_down.weight", l); t = find_gguf_tensor(gguf, tname); if (t) { layer->gguf_w_down = t->data; layer->gguf_w_down_type = t->type; } @@ -3540,6 +3589,20 @@ tq_model_t* tq_load_gguf(const char* path) { n_attn_layers, c->n_layers); } + /* Hard-fail when neither standard self_attn nor DeltaNet was detected. + * (Synced from quant.h — prevents silent garbage from unsupported archs.) */ + if (n_attn_layers == 0 && c->delta_n_heads == 0) { + fprintf(stderr, + "tq_load_gguf: ERROR — model architecture '%s' is not supported.\n" + " Detected 0 self_attn layers and no DeltaNet weights.\n" + " This usually means the model uses fused QKV projection\n" + " (e.g., Phi-3 `attn_qkv`) which this build does not yet handle.\n" + " See docs/supported_models.md for the architecture support matrix.\n", + gguf->arch[0] ? gguf->arch : "unknown"); + tq_free_model(model); + return NULL; + } + /* Set up layer_is_sliding for Gemma hybrid attention. * Detect from K tensor shape: sliding and full layers have different K output dims. * The MAJORITY of layers are sliding (e.g., 25/30 or 28/35). */ @@ -4072,9 +4135,20 @@ skip_q4_conversion: ; * Adding +1 at runtime would double-apply and cause activation explosion. * The Gemma heuristic above (mean > 2.0 check) handles the Gemma case. 
*/ - /* Initialize persistent Metal GPU buffers for layer-level compute */ + /* Initialize persistent Metal GPU buffers for layer-level compute. + * + * Skip Metal for Phi-3 fused-tensor models: the Metal matmul kernels + * assume standard separate-tensor layouts (Q4_K blocks per row, + * fixed output buffer sizes). Fused QKV and fused gate||up produce + * larger output vectors that the Metal kernel doesn't handle. + * + * This is the right trade-off because: + * 1. CPU NEON Q4×Q8 is already faster than Metal for sub-4B models + * (measured: 95 tok/s CPU vs 38 tok/s GPU on SmolLM2). + * 2. Phi-3's 32K vocab means the lm_head matmul (where Metal helps + * most due to large output dim) is small — CPU handles it fine. */ #ifdef TQ_HAS_METAL - { + if (!c->has_fused_qkv && !c->has_fused_up_gate) { extern int tq_metal_gpu_init_buffers(int, int, int, int); extern int tq_metal_gpu_init_attn(int, int, int); int max_q_dim = c->n_heads * c->head_dim; @@ -4086,9 +4160,14 @@ skip_q4_conversion: ; if (full_kv > max_kv_dim) max_kv_dim = full_kv; } tq_metal_gpu_init_buffers(c->hidden_dim, c->intermediate_dim, max_q_dim, max_kv_dim); - - /* Initialize attention + KV cache GPU buffers for compute graph forward */ tq_metal_gpu_init_attn(c->n_heads, c->max_seq_len, max_kv_dim); + } else { + /* Disable Metal matmul dispatch globally for this process. + * The Metal backend is still initialized (MoE kernels etc.) but + * tq_matmul_gguf will check this flag and skip GPU dispatch. */ + extern void tq_metal_disable(void); + tq_metal_disable(); + fprintf(stderr, "tq_load_gguf: Metal GPU dispatch disabled (fused-tensor model — CPU is faster)\n"); } #endif diff --git a/src/engine/tq_tokenizer.c b/src/engine/tq_tokenizer.c index 0a80a63..21c1c75 100644 --- a/src/engine/tq_tokenizer.c +++ b/src/engine/tq_tokenizer.c @@ -1182,11 +1182,17 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text, int n_tokens = 0; /* Add BOS token if requested. 
-     * Gemma: BOS=2, Qwen: no BOS (uses <|im_start|> instead) */
+     * Different model families use different BOS strings (synced from
+     * quant.h 2026-04-12):
+     *   Gemma:         <bos>             (id 2)
+     *   Llama / Phi-3: <s>               (id 1) — SentencePiece convention
+     *   Qwen / ChatML: <|im_start|>
+     *   Llama-3:       <|begin_of_text|> */
     if (add_bos) {
-        /* Look up token in vocab; default to id 2 (Gemma convention) */
         int bos_id = str_lookup(tok, "<bos>");
-        if (bos_id < 0) { bos_id = str_lookup(tok, "<|im_start|>"); }
+        if (bos_id < 0) bos_id = str_lookup(tok, "<s>");
+        if (bos_id < 0) bos_id = str_lookup(tok, "<|im_start|>");
+        if (bos_id < 0) bos_id = str_lookup(tok, "<|begin_of_text|>");
         if (bos_id >= 0) {
             tokens[n_tokens++] = bos_id;
         }
diff --git a/src/engine/tq_transformer.c b/src/engine/tq_transformer.c
index 441fbd0..2357229 100644
--- a/src/engine/tq_transformer.c
+++ b/src/engine/tq_transformer.c
@@ -185,6 +185,15 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
     if (max_q_dim > max_dim) max_dim = max_q_dim;
     if (q_proj_dim > max_dim) max_dim = q_proj_dim;
     if (delta_qkv_dim > max_dim) max_dim = delta_qkv_dim;
+    /* Phi-3 fused QKV: one matmul writes [Q | K | V] into a temp buffer
+     * (s->xb2), so max_dim must cover the fused output size. */
+    int fused_qkv_dim = q_dim + 2 * (config->n_kv_heads * config->head_dim);
+    if (config->has_fused_qkv && fused_qkv_dim > max_dim) max_dim = fused_qkv_dim;
+
+    /* Phi-3 fused gate||up FFN: one matmul writes 2*inter floats into s->hb,
+     * so s->hb must be sized to 2*inter_dim.
*/ + int inter_dim_alloc = inter_dim; + if (config->has_fused_up_gate) inter_dim_alloc = 2 * inter_dim; s->x = (float*)calloc((size_t)dim, sizeof(float)); s->xb = (float*)calloc((size_t)max_dim, sizeof(float)); @@ -193,7 +202,7 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type, s->k = (float*)calloc((size_t)max_kv_dim, sizeof(float)); s->v = (float*)calloc((size_t)max_kv_dim, sizeof(float)); s->att = (float*)calloc((size_t)n_heads * max_seq, sizeof(float)); - s->hb = (float*)calloc((size_t)inter_dim, sizeof(float)); + s->hb = (float*)calloc((size_t)inter_dim_alloc, sizeof(float)); s->hb2 = (float*)calloc((size_t)inter_dim, sizeof(float)); s->logits = (float*)calloc((size_t)config->vocab_size, sizeof(float)); @@ -957,6 +966,7 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) int has_q2 = (layer->wq_q2 != NULL); int has_q4 = (layer->wq_q4 != NULL); int has_gguf = (layer->gguf_wq != NULL); + int has_fused_qkv_layer = (layer->gguf_w_qkv != NULL); if (has_q2 || has_q4) { tq_quantize_row_q8(s->xb, s->xb_q8, s->xb_q8s, dim); } @@ -974,7 +984,19 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) * layer-level batch scope in tq_forward(). */ float* gate_q = NULL; - if (c->attn_output_gate) { + if (has_fused_qkv_layer) { + /* Phi-3 fused QKV: one matmul produces [Q | K | V] in a temp + * buffer, then memcpy splits into s->q / s->k / s->v. 
*/ + int q_out = n_heads * head_dim; + int kv_out = kv_dim; + int total_out = q_out + 2 * kv_out; + tq_matmul_gguf(s->xb2, s->xb, + layer->gguf_w_qkv, layer->gguf_w_qkv_type, + total_out, dim); + memcpy(s->q, s->xb2, (size_t)q_out * sizeof(float)); + memcpy(s->k, s->xb2 + q_out, (size_t)kv_out * sizeof(float)); + memcpy(s->v, s->xb2 + q_out + kv_out, (size_t)kv_out * sizeof(float)); + } else if (c->attn_output_gate) { int qg_dim = n_heads * head_dim * 2; if (layer->wq_q2) { TQ_MATMUL_Q2_OR_1BIT(s->xb2, s->xb, layer->wq_q2, layer->wq_q2s, s->xb_q8, s->xb_q8s, qg_dim, dim, model->use_1bit_weights); @@ -1015,7 +1037,9 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) tq_matmul(s->q, s->xb, layer->wq, n_heads * head_dim, dim); } } - if (layer->wk_q2) { + if (has_fused_qkv_layer) { + /* Already populated s->q/s->k/s->v above — skip K projection */ + } else if (layer->wk_q2) { TQ_MATMUL_Q2_OR_1BIT(s->k, s->xb, layer->wk_q2, layer->wk_q2s, s->xb_q8, s->xb_q8s, kv_dim, dim, model->use_1bit_weights); } else if (layer->wk_q4) { tq_matmul_q4q2_preq(s->k, layer->wk_q4, layer->wk_q4s, layer->wk_q2, layer->wk_q2s, s->xb_q8, s->xb_q8s, kv_dim, dim); @@ -1026,9 +1050,12 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) } else { tq_matmul(s->k, s->xb, layer->wk, kv_dim, dim); } - /* V projection: if V weights are absent (Gemma 4 K=V), copy K to V */ int has_v_weights = (layer->wv_q2 || layer->wv_q4 || layer->wv_q8 || layer->gguf_wv || layer->wv); + if (has_fused_qkv_layer) { + /* skip — handled by the fused branch */ + } else { + /* V projection: if V weights are absent (Gemma 4 K=V), copy K to V */ if (!has_v_weights) { /* K=V: value is same as key (attention_k_eq_v) */ memcpy(s->v, s->k, kv_dim * sizeof(float)); @@ -1043,6 +1070,7 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) } else { tq_matmul(s->v, s->xb, layer->wv, kv_dim, dim); } + } /* Flush batched Q+K+V GPU dispatches 
before CPU-side RoPE/attention */ if (has_gguf) tq_metal_batch_flush_if_available(); @@ -1193,7 +1221,50 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) model->layer_is_sliding && model->layer_is_sliding[l]) { rope_base = c->rope_local_base_freq; } - tq_rope(s->q, s->k, pos, head_dim, n_heads, n_kv_heads, rope_base); + /* Phi-3 LongRoPE: NeoX-style rotation with per-frequency rescaling */ + if (c->rope_factors_short || c->rope_factors_long) { + const float* factors = + (pos >= c->rope_orig_ctx_len && c->rope_factors_long) + ? c->rope_factors_long + : (c->rope_factors_short ? c->rope_factors_short + : c->rope_factors_long); + int half = head_dim / 2; + for (int h = 0; h < n_heads; h++) { + float* qh = s->q + h * head_dim; + for (int i = 0; i < half; i++) { + float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)head_dim); + float freq = base_freq / factors[i]; + float theta = pos * freq; + float cos_t = cosf(theta); + float sin_t = sinf(theta); + float q0 = qh[i]; + float q1 = qh[i + half]; + qh[i] = q0 * cos_t - q1 * sin_t; + qh[i + half] = q0 * sin_t + q1 * cos_t; + } + } + for (int h = 0; h < n_kv_heads; h++) { + float* kh = s->k + h * head_dim; + for (int i = 0; i < half; i++) { + float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)head_dim); + float freq = base_freq / factors[i]; + float theta = pos * freq; + float cos_t = cosf(theta); + float sin_t = sinf(theta); + float k0 = kh[i]; + float k1 = kh[i + half]; + kh[i] = k0 * cos_t - k1 * sin_t; + kh[i + half] = k0 * sin_t + k1 * cos_t; + } + } + if (pos >= c->rope_orig_ctx_len && c->rope_attn_factor > 0.0f) { + float scale = c->rope_attn_factor; + int n_q = n_heads * head_dim; + for (int i = 0; i < n_q; i++) s->q[i] *= scale; + } + } else { + tq_rope(s->q, s->k, pos, head_dim, n_heads, n_kv_heads, rope_base); + } } /* Store K,V in cache. 
@@ -2325,6 +2396,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) { if (layer->delta_a_log) { /* DeltaNet layer */ deltanet_forward(model, s, l); + } else if (layer->gguf_w_qkv) { + /* Phi-3 fused QKV — gguf_wq/wk/wv are NULL because Q, K + * and V are concatenated into gguf_w_qkv. self_attn_forward + * handles the fused dispatch internally. */ + self_attn_forward(model, s, l, pos); } else if ((layer->wq || layer->wq_q8 || layer->wq_q4 || layer->gguf_wq || layer->wq_q2) && (layer->wk || layer->wk_q8 || layer->wk_q4 || layer->gguf_wk || layer->wk_q2) && (layer->wv || layer->wv_q8 || layer->wv_q4 || layer->gguf_wv || layer->wv_q2 || @@ -2508,8 +2584,8 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) { * Qwen: layers are either MoE or dense, NOT both. * Gemma 3 non-MoE layers: run dense FFN. */ if (!did_moe && - (layer->w_gate || layer->w_gate_q8 || layer->w_gate_q4 || layer->w_gate_q2 || layer->gguf_w_gate) && - (layer->w_up || layer->w_up_q8 || layer->w_up_q4 || layer->w_up_q2 || layer->gguf_w_up) && + (layer->w_gate || layer->w_gate_q8 || layer->w_gate_q4 || layer->w_gate_q2 || layer->gguf_w_gate || layer->gguf_w_up_gate) && + (layer->w_up || layer->w_up_q8 || layer->w_up_q4 || layer->w_up_q2 || layer->gguf_w_up || layer->gguf_w_up_gate) && (layer->w_down || layer->w_down_q8 || layer->w_down_q4 || layer->w_down_q2 || layer->gguf_w_down)) { /* Pre-FFN norm: Gemma 4 dual-FFN uses pre_ffw_norm_2 for the dense FFN. @@ -2557,6 +2633,13 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) { s->xb_q8, s->xb_q8s, inter, dim); tq_matmul_q4_preq(s->hb2, layer->w_up_q4, layer->w_up_q4s, s->xb_q8, s->xb_q8s, inter, dim); + } else if (layer->gguf_w_up_gate) { + /* Phi-3 fused gate||up: one matmul produces [gate | up], + * then we split into s->hb (gate) and s->hb2 (up). 
*/ + tq_matmul_gguf(s->hb, s->xb, + layer->gguf_w_up_gate, layer->gguf_w_up_gate_type, + 2 * inter, dim); + memcpy(s->hb2, s->hb + inter, (size_t)inter * sizeof(float)); } else if (layer->gguf_w_gate) { /* Gate+up GPU dispatches batched by layer-level batch scope */ tq_matmul_gguf(s->hb, s->xb, layer->gguf_w_gate, layer->gguf_w_gate_type, inter, dim); diff --git a/tools/phi3_kvcomp_test.c b/tools/phi3_kvcomp_test.c new file mode 100644 index 0000000..b331933 --- /dev/null +++ b/tools/phi3_kvcomp_test.c @@ -0,0 +1,100 @@ +/* phi3_kvcomp_test — validate Phi-3 forward path against quant.cpp's KV + * cache compression layers. + * + * The Phi-3 architecture support PR (#65) was validated end-to-end with + * KV compression DISABLED (kv_compress=0). The fused QKV / fused gate+up + * forward branches do not touch the KV cache code path directly, but + * the way s->k is written into the cache (and read back during attention) + * goes through the same KV-quant code paths as Llama / SmolLM2. This + * test exercises that interaction. + * + * Modes covered: + * off baseline (matches PR #65 validation) + * 1 / 4-bit UNIFORM_4B K + 4-bit V + * 1 / 4-bit + progressive 128 + last 128 tokens of K kept FP32 + * 1 / 4-bit + aggressive 512 + last 512 tokens of K kept FP32 + * 2 / delta+3-bit UNIFORM_3B K + 4-bit V + delta encoding + * + * For each mode we generate 80 greedy tokens from a fixed prompt and + * print the output. A working mode produces coherent English; a broken + * mode produces fragmented garbage. Compare modes side-by-side. 
+ */ +#define QUANT_IMPLEMENTATION +#include "../quant.h" + +#include +#include +#include +#include + +static void print_token(const char* text, void* ud) { + (void)ud; + fputs(text, stdout); + fflush(stdout); +} + +static int run_one(quant_model* model, const char* label, + int kv_compress, int k_highres_window, + const char* prompt) { + /* max_tokens=256 deliberately exceeds the 128-token progressive + * window so we actually exercise the boundary where recent keys + * shift from FP32 (highres buffer) into the quantized cache. */ + quant_config cfg = { + .temperature = 0.0f, /* greedy */ + .top_p = 1.0f, + .max_tokens = 256, + .n_threads = 4, + .kv_compress = kv_compress, + .context_length = 0, + .k_highres_window = k_highres_window, + }; + quant_ctx* ctx = quant_new(model, &cfg); + if (!ctx) { + fprintf(stderr, "quant_new failed for mode %s\n", label); + return -1; + } + + fprintf(stderr, "\n=== %s ===\n", label); + struct timespec t0, t1; + clock_gettime(CLOCK_MONOTONIC, &t0); + int n = quant_generate(ctx, prompt, print_token, NULL); + clock_gettime(CLOCK_MONOTONIC, &t1); + double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9; + fprintf(stderr, "\n[%s] %d tokens in %.2fs (%.1f tok/s)\n", + label, n, secs, secs > 0 ? n / secs : 0.0); + + quant_free_ctx(ctx); + return n; +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "usage: %s [prompt]\n", argv[0]); + return 1; + } + const char* user_msg = (argc >= 3) ? 
argv[2] + : "Explain in two paragraphs why the sky appears blue during the day."; + + /* Phi-3 chat template */ + char prompt[1024]; + snprintf(prompt, sizeof(prompt), + "<|user|>\n%s<|end|>\n<|assistant|>\n", user_msg); + + fprintf(stderr, "Loading %s ...\n", argv[1]); + quant_model* model = quant_load(argv[1]); + if (!model) { + fprintf(stderr, "quant_load failed\n"); + return 2; + } + + int rc = 0; + rc |= run_one(model, "off (baseline, FP32 KV)", 0, 0, prompt) < 0; + rc |= run_one(model, "kv_compress=1 (4-bit, no progressive)", 1, 0, prompt) < 0; + rc |= run_one(model, "kv_compress=1 + progressive(128)", 1, 128, prompt) < 0; + rc |= run_one(model, "kv_compress=1 + aggressive(512)", 1, 512, prompt) < 0; + rc |= run_one(model, "kv_compress=2 (delta+3-bit)", 2, 0, prompt) < 0; + + quant_free_model(model); + fputc('\n', stderr); + return rc ? 4 : 0; +}