diff --git a/include/turboquant/tq_engine.h b/include/turboquant/tq_engine.h index 7c3c72c..67a6fce 100644 --- a/include/turboquant/tq_engine.h +++ b/include/turboquant/tq_engine.h @@ -63,6 +63,16 @@ typedef struct { float final_logit_softcap; /* logit soft-capping: logits = cap * tanh(logits/cap), 0=disabled */ float attn_logit_softcap; /* attention score soft-capping (Gemma): 0=disabled, typically 50.0 */ int* per_layer_inter_dim; /* [n_layers] per-layer intermediate_dim (NULL = use intermediate_dim) */ + + /* Phi-3 LongRoPE config */ + int rope_orig_ctx_len; /* original context length (e.g., 4096) */ + float rope_attn_factor; /* attention magnitude scaling */ + const float* rope_factors_short; /* [head_dim/2] for short context */ + const float* rope_factors_long; /* [head_dim/2] for long context */ + + /* Phi-3 fused-tensor flags — set during load */ + int has_fused_qkv; /* any layer has gguf_w_qkv */ + int has_fused_up_gate; /* any layer has gguf_w_up_gate */ } tq_model_config_t; /* ============================================================ @@ -178,6 +188,12 @@ typedef struct { const void* gguf_w_up; int gguf_w_up_type; const void* gguf_w_down; int gguf_w_down_type; + /* Phi-3 fused projections (from quant.h, synced 2026-04-12). + * gguf_w_qkv: [hidden, q_dim + k_dim + v_dim] concatenated QKV + * gguf_w_up_gate: [hidden, 2 * intermediate_dim] concatenated gate||up */ + const void* gguf_w_qkv; int gguf_w_qkv_type; + const void* gguf_w_up_gate; int gguf_w_up_gate_type; + /* MoE expert weights (NULL for dense FFN layers) */ void* moe; /* tq_moe_layer_t* (from tq_gguf.h), NULL if dense */ diff --git a/src/backend/metal/tq_metal_dispatch.m b/src/backend/metal/tq_metal_dispatch.m index ff31870..54b8f97 100644 --- a/src/backend/metal/tq_metal_dispatch.m +++ b/src/backend/metal/tq_metal_dispatch.m @@ -621,10 +621,21 @@ void tq_free_metal_backend(void) { return [[tq_mtl_device name] UTF8String]; } +/** + * Disable Metal dispatch globally. 
Called by the model loader when a
+ * fused-tensor architecture (Phi-3) is detected — the Metal matmul
+ * kernels don't handle the non-standard output dimensions.
+ */
+static int tq_metal_disabled = 0;
+void tq_metal_disable(void) {
+    tq_metal_disabled = 1;
+}
+
 /**
  * Check if Metal backend is available and initialized.
  */
 int tq_metal_available(void) {
+    if (tq_metal_disabled) return 0;
     /* Lazy initialization: first call triggers Metal setup */
     static int init_done = 0;
     if (!init_done) {
diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c
index f3a69a4..b9994bf 100644
--- a/src/engine/tq_generate.c
+++ b/src/engine/tq_generate.c
@@ -218,13 +218,21 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     int n_prompt = 0;
     if (tokenizer && prompt) {
-        /* BOS token handling:
-         * Gemma 3/4: BOS=2 (required)
-         * LLaMA 3: BOS=128000 (<|begin_of_text|>) — but tokenizer usually adds it
-         * Qwen3.5: no BOS needed */
+        /* BOS token handling (synced from quant.h 2026-04-12):
+         * Gemma:         always (model_type == 1)
+         * Phi-3 / Llama: yes if `<s>` is in the vocab (id 1).
+         * Qwen3.5:       no BOS needed. */
         int add_bos = 0;
         if (model->config.model_type == 1) {
-            add_bos = 1; /* Gemma: always prepend BOS=2 */
+            add_bos = 1;
+        } else {
+            int s_id = -1;
+            /* SentencePiece BOS "<s>" is conventionally a low id; scan the
+             * first few entries only.  NOTE: the literal must be "<s>" —
+             * an empty string here would match any blank vocab slot. */
+            for (int i = 0; i < tokenizer->vocab_size && i < 8; i++) {
+                if (tokenizer->vocab[i] && strcmp(tokenizer->vocab[i], "<s>") == 0) {
+                    s_id = i; break;
+                }
+            }
+            if (s_id >= 0) add_bos = 1;
         }
         n_prompt = tq_encode(tokenizer, prompt, prompt_tokens, 4096, add_bos);
     } else {
diff --git a/src/engine/tq_model.c b/src/engine/tq_model.c
index 9b9eccb..ef3cbd0 100644
--- a/src/engine/tq_model.c
+++ b/src/engine/tq_model.c
@@ -2931,6 +2931,23 @@ tq_model_t* tq_load_gguf(const char* path) {
         c->attn_logit_softcap = 50.0f;
     }
 
+    /* Phi-3 LongRoPE config + factor tables (synced from quant.h 2026-04-12).
*/ + c->rope_orig_ctx_len = (int)tq_gguf_get_u32(gguf, + GGUF_KEY("rope.scaling.original_context_length"), 0); + c->rope_attn_factor = tq_gguf_get_f32(gguf, + GGUF_KEY("rope.scaling.attn_factor"), 0.0f); + { + const tq_gguf_tensor_t* rfs = tq_gguf_find_tensor(gguf, "rope_factors_short.weight"); + const tq_gguf_tensor_t* rfl = tq_gguf_find_tensor(gguf, "rope_factors_long.weight"); + if (rfs && rfs->type == TQ_GGML_TYPE_F32) c->rope_factors_short = (const float*)rfs->data; + if (rfl && rfl->type == TQ_GGML_TYPE_F32) c->rope_factors_long = (const float*)rfl->data; + if (rfs || rfl) { + fprintf(stderr, + "tq_load_gguf: LongRoPE detected — orig_ctx=%d, attn_factor=%.4f\n", + c->rope_orig_ctx_len, c->rope_attn_factor); + } + } + /* Cap context for memory safety on small machines. * GGUF models often claim 262K context but we cap at 4096 by default. * Users can override with --ctx flag in quant. */ @@ -3219,10 +3236,26 @@ tq_model_t* tq_load_gguf(const char* path) { } } - /* Attention weights — keep as GGUF quantized pointers for on-the-fly dequant. - * We store the raw data pointer + type info using a small struct packed into - * the existing FP32 weight pointer fields. For GGUF models, we use a special - * dispatch: if gguf_ctx is non-NULL, the forward pass uses tq_matmul_gguf. */ + /* Phi-3 fused QKV detection (synced from quant.h 2026-04-12). + * Phi-3 ships `blk.N.attn_qkv.weight` with shape [hidden, 3*hidden] + * instead of three separate `attn_q/k/v.weight` tensors. */ + snprintf(tname, sizeof(tname), "blk.%d.attn_qkv.weight", l); + const tq_gguf_tensor_t* wqkv_t = find_gguf_tensor(gguf, tname); + if (wqkv_t && !layer->delta_a_log) { + /* Only take the fused path when there are NO DeltaNet weights — + * otherwise the DeltaNet code below handles attn_qkv itself. 
*/ + layer->gguf_w_qkv = wqkv_t->data; + layer->gguf_w_qkv_type = wqkv_t->type; + c->has_fused_qkv = 1; + snprintf(tname, sizeof(tname), "blk.%d.attn_output.weight", l); + t = find_gguf_tensor(gguf, tname); + if (t) { layer->gguf_wo = t->data; layer->gguf_wo_type = t->type; } + attn_indices[n_attn_layers++] = l; + goto post_attn_load; + } + + /* Standard llama-style attention weights — keep as GGUF quantized + * pointers for on-the-fly dequant. */ snprintf(tname, sizeof(tname), "blk.%d.attn_q.weight", l); const tq_gguf_tensor_t* wq_t = find_gguf_tensor(gguf, tname); int is_attn_layer = (wq_t != NULL); @@ -3265,6 +3298,7 @@ tq_model_t* tq_load_gguf(const char* path) { attn_indices[n_attn_layers++] = l; } +post_attn_load: /* Check for DeltaNet / SSM weights (Qwen3.5 hybrid) */ snprintf(tname, sizeof(tname), "blk.%d.ssm_a", l); t = find_gguf_tensor(gguf, tname); @@ -3518,13 +3552,28 @@ tq_model_t* tq_load_gguf(const char* path) { if (t) { layer->gguf_w_down = t->data; layer->gguf_w_down_type = t->type; } } } else { - /* Dense model: use GGUF on-the-fly dequant */ + /* Dense model: use GGUF on-the-fly dequant. + * Phi-3 fused FFN: when `ffn_up` has shape [hidden, 2*ff] AND + * there is no separate `ffn_gate`, it's a fused gate||up tensor. 
*/ snprintf(tname, sizeof(tname), "blk.%d.ffn_gate.weight", l); t = find_gguf_tensor(gguf, tname); if (t) { layer->gguf_w_gate = t->data; layer->gguf_w_gate_type = t->type; } + snprintf(tname, sizeof(tname), "blk.%d.ffn_up.weight", l); t = find_gguf_tensor(gguf, tname); - if (t) { layer->gguf_w_up = t->data; layer->gguf_w_up_type = t->type; } + if (t) { + if (!layer->gguf_w_gate && t->n_dims >= 2 && + c->intermediate_dim > 0 && + (int)t->shape[1] == 2 * c->intermediate_dim) { + layer->gguf_w_up_gate = t->data; + layer->gguf_w_up_gate_type = t->type; + c->has_fused_up_gate = 1; + } else { + layer->gguf_w_up = t->data; + layer->gguf_w_up_type = t->type; + } + } + snprintf(tname, sizeof(tname), "blk.%d.ffn_down.weight", l); t = find_gguf_tensor(gguf, tname); if (t) { layer->gguf_w_down = t->data; layer->gguf_w_down_type = t->type; } @@ -3540,6 +3589,20 @@ tq_model_t* tq_load_gguf(const char* path) { n_attn_layers, c->n_layers); } + /* Hard-fail when neither standard self_attn nor DeltaNet was detected. + * (Synced from quant.h — prevents silent garbage from unsupported archs.) */ + if (n_attn_layers == 0 && c->delta_n_heads == 0) { + fprintf(stderr, + "tq_load_gguf: ERROR — model architecture '%s' is not supported.\n" + " Detected 0 self_attn layers and no DeltaNet weights.\n" + " This usually means the model uses fused QKV projection\n" + " (e.g., Phi-3 `attn_qkv`) which this build does not yet handle.\n" + " See docs/supported_models.md for the architecture support matrix.\n", + gguf->arch[0] ? gguf->arch : "unknown"); + tq_free_model(model); + return NULL; + } + /* Set up layer_is_sliding for Gemma hybrid attention. * Detect from K tensor shape: sliding and full layers have different K output dims. * The MAJORITY of layers are sliding (e.g., 25/30 or 28/35). */ @@ -4072,9 +4135,20 @@ skip_q4_conversion: ; * Adding +1 at runtime would double-apply and cause activation explosion. * The Gemma heuristic above (mean > 2.0 check) handles the Gemma case. 
*/ - /* Initialize persistent Metal GPU buffers for layer-level compute */ + /* Initialize persistent Metal GPU buffers for layer-level compute. + * + * Skip Metal for Phi-3 fused-tensor models: the Metal matmul kernels + * assume standard separate-tensor layouts (Q4_K blocks per row, + * fixed output buffer sizes). Fused QKV and fused gate||up produce + * larger output vectors that the Metal kernel doesn't handle. + * + * This is the right trade-off because: + * 1. CPU NEON Q4×Q8 is already faster than Metal for sub-4B models + * (measured: 95 tok/s CPU vs 38 tok/s GPU on SmolLM2). + * 2. Phi-3's 32K vocab means the lm_head matmul (where Metal helps + * most due to large output dim) is small — CPU handles it fine. */ #ifdef TQ_HAS_METAL - { + if (!c->has_fused_qkv && !c->has_fused_up_gate) { extern int tq_metal_gpu_init_buffers(int, int, int, int); extern int tq_metal_gpu_init_attn(int, int, int); int max_q_dim = c->n_heads * c->head_dim; @@ -4086,9 +4160,14 @@ skip_q4_conversion: ; if (full_kv > max_kv_dim) max_kv_dim = full_kv; } tq_metal_gpu_init_buffers(c->hidden_dim, c->intermediate_dim, max_q_dim, max_kv_dim); - - /* Initialize attention + KV cache GPU buffers for compute graph forward */ tq_metal_gpu_init_attn(c->n_heads, c->max_seq_len, max_kv_dim); + } else { + /* Disable Metal matmul dispatch globally for this process. + * The Metal backend is still initialized (MoE kernels etc.) but + * tq_matmul_gguf will check this flag and skip GPU dispatch. */ + extern void tq_metal_disable(void); + tq_metal_disable(); + fprintf(stderr, "tq_load_gguf: Metal GPU dispatch disabled (fused-tensor model — CPU is faster)\n"); } #endif diff --git a/src/engine/tq_tokenizer.c b/src/engine/tq_tokenizer.c index 0a80a63..21c1c75 100644 --- a/src/engine/tq_tokenizer.c +++ b/src/engine/tq_tokenizer.c @@ -1182,11 +1182,17 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text, int n_tokens = 0; /* Add BOS token if requested. 
-     * Gemma: BOS=2, Qwen: no BOS (uses <|im_start|> instead) */
+     * Different model families use different BOS strings (synced from
+     * quant.h 2026-04-12):
+     *   Gemma:         <bos>             (id 2)
+     *   Llama / Phi-3: <s>               (id 1) — SentencePiece convention
+     *   Qwen / ChatML: <|im_start|>
+     *   Llama-3:       <|begin_of_text|> */
     if (add_bos) {
-        /* Look up token in vocab; default to id 2 (Gemma convention) */
         int bos_id = str_lookup(tok, "<bos>");
-        if (bos_id < 0) { bos_id = str_lookup(tok, "<|im_start|>"); }
+        if (bos_id < 0) bos_id = str_lookup(tok, "<s>");
+        if (bos_id < 0) bos_id = str_lookup(tok, "<|im_start|>");
+        if (bos_id < 0) bos_id = str_lookup(tok, "<|begin_of_text|>");
         if (bos_id >= 0) {
             tokens[n_tokens++] = bos_id;
         }
diff --git a/src/engine/tq_transformer.c b/src/engine/tq_transformer.c
index 441fbd0..2357229 100644
--- a/src/engine/tq_transformer.c
+++ b/src/engine/tq_transformer.c
@@ -185,6 +185,15 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
     if (max_q_dim > max_dim) max_dim = max_q_dim;
     if (q_proj_dim > max_dim) max_dim = q_proj_dim;
     if (delta_qkv_dim > max_dim) max_dim = delta_qkv_dim;
+    /* Phi-3 fused QKV: one matmul writes [Q | K | V] into a temp buffer
+     * (s->xb2), so max_dim must cover the fused output size. */
+    int fused_qkv_dim = q_dim + 2 * (config->n_kv_heads * config->head_dim);
+    if (config->has_fused_qkv && fused_qkv_dim > max_dim) max_dim = fused_qkv_dim;
+
+    /* Phi-3 fused gate||up FFN: one matmul writes 2*inter floats into s->hb,
+     * so s->hb must be sized to 2*inter_dim.
*/ + int inter_dim_alloc = inter_dim; + if (config->has_fused_up_gate) inter_dim_alloc = 2 * inter_dim; s->x = (float*)calloc((size_t)dim, sizeof(float)); s->xb = (float*)calloc((size_t)max_dim, sizeof(float)); @@ -193,7 +202,7 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type, s->k = (float*)calloc((size_t)max_kv_dim, sizeof(float)); s->v = (float*)calloc((size_t)max_kv_dim, sizeof(float)); s->att = (float*)calloc((size_t)n_heads * max_seq, sizeof(float)); - s->hb = (float*)calloc((size_t)inter_dim, sizeof(float)); + s->hb = (float*)calloc((size_t)inter_dim_alloc, sizeof(float)); s->hb2 = (float*)calloc((size_t)inter_dim, sizeof(float)); s->logits = (float*)calloc((size_t)config->vocab_size, sizeof(float)); @@ -957,6 +966,7 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) int has_q2 = (layer->wq_q2 != NULL); int has_q4 = (layer->wq_q4 != NULL); int has_gguf = (layer->gguf_wq != NULL); + int has_fused_qkv_layer = (layer->gguf_w_qkv != NULL); if (has_q2 || has_q4) { tq_quantize_row_q8(s->xb, s->xb_q8, s->xb_q8s, dim); } @@ -974,7 +984,19 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) * layer-level batch scope in tq_forward(). */ float* gate_q = NULL; - if (c->attn_output_gate) { + if (has_fused_qkv_layer) { + /* Phi-3 fused QKV: one matmul produces [Q | K | V] in a temp + * buffer, then memcpy splits into s->q / s->k / s->v. 
*/ + int q_out = n_heads * head_dim; + int kv_out = kv_dim; + int total_out = q_out + 2 * kv_out; + tq_matmul_gguf(s->xb2, s->xb, + layer->gguf_w_qkv, layer->gguf_w_qkv_type, + total_out, dim); + memcpy(s->q, s->xb2, (size_t)q_out * sizeof(float)); + memcpy(s->k, s->xb2 + q_out, (size_t)kv_out * sizeof(float)); + memcpy(s->v, s->xb2 + q_out + kv_out, (size_t)kv_out * sizeof(float)); + } else if (c->attn_output_gate) { int qg_dim = n_heads * head_dim * 2; if (layer->wq_q2) { TQ_MATMUL_Q2_OR_1BIT(s->xb2, s->xb, layer->wq_q2, layer->wq_q2s, s->xb_q8, s->xb_q8s, qg_dim, dim, model->use_1bit_weights); @@ -1015,7 +1037,9 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) tq_matmul(s->q, s->xb, layer->wq, n_heads * head_dim, dim); } } - if (layer->wk_q2) { + if (has_fused_qkv_layer) { + /* Already populated s->q/s->k/s->v above — skip K projection */ + } else if (layer->wk_q2) { TQ_MATMUL_Q2_OR_1BIT(s->k, s->xb, layer->wk_q2, layer->wk_q2s, s->xb_q8, s->xb_q8s, kv_dim, dim, model->use_1bit_weights); } else if (layer->wk_q4) { tq_matmul_q4q2_preq(s->k, layer->wk_q4, layer->wk_q4s, layer->wk_q2, layer->wk_q2s, s->xb_q8, s->xb_q8s, kv_dim, dim); @@ -1026,9 +1050,12 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) } else { tq_matmul(s->k, s->xb, layer->wk, kv_dim, dim); } - /* V projection: if V weights are absent (Gemma 4 K=V), copy K to V */ int has_v_weights = (layer->wv_q2 || layer->wv_q4 || layer->wv_q8 || layer->gguf_wv || layer->wv); + if (has_fused_qkv_layer) { + /* skip — handled by the fused branch */ + } else { + /* V projection: if V weights are absent (Gemma 4 K=V), copy K to V */ if (!has_v_weights) { /* K=V: value is same as key (attention_k_eq_v) */ memcpy(s->v, s->k, kv_dim * sizeof(float)); @@ -1043,6 +1070,7 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) } else { tq_matmul(s->v, s->xb, layer->wv, kv_dim, dim); } + } /* Flush batched Q+K+V GPU dispatches 
before CPU-side RoPE/attention */ if (has_gguf) tq_metal_batch_flush_if_available(); @@ -1193,7 +1221,50 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) model->layer_is_sliding && model->layer_is_sliding[l]) { rope_base = c->rope_local_base_freq; } - tq_rope(s->q, s->k, pos, head_dim, n_heads, n_kv_heads, rope_base); + /* Phi-3 LongRoPE: NeoX-style rotation with per-frequency rescaling */ + if (c->rope_factors_short || c->rope_factors_long) { + const float* factors = + (pos >= c->rope_orig_ctx_len && c->rope_factors_long) + ? c->rope_factors_long + : (c->rope_factors_short ? c->rope_factors_short + : c->rope_factors_long); + int half = head_dim / 2; + for (int h = 0; h < n_heads; h++) { + float* qh = s->q + h * head_dim; + for (int i = 0; i < half; i++) { + float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)head_dim); + float freq = base_freq / factors[i]; + float theta = pos * freq; + float cos_t = cosf(theta); + float sin_t = sinf(theta); + float q0 = qh[i]; + float q1 = qh[i + half]; + qh[i] = q0 * cos_t - q1 * sin_t; + qh[i + half] = q0 * sin_t + q1 * cos_t; + } + } + for (int h = 0; h < n_kv_heads; h++) { + float* kh = s->k + h * head_dim; + for (int i = 0; i < half; i++) { + float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)head_dim); + float freq = base_freq / factors[i]; + float theta = pos * freq; + float cos_t = cosf(theta); + float sin_t = sinf(theta); + float k0 = kh[i]; + float k1 = kh[i + half]; + kh[i] = k0 * cos_t - k1 * sin_t; + kh[i + half] = k0 * sin_t + k1 * cos_t; + } + } + if (pos >= c->rope_orig_ctx_len && c->rope_attn_factor > 0.0f) { + float scale = c->rope_attn_factor; + int n_q = n_heads * head_dim; + for (int i = 0; i < n_q; i++) s->q[i] *= scale; + } + } else { + tq_rope(s->q, s->k, pos, head_dim, n_heads, n_kv_heads, rope_base); + } } /* Store K,V in cache. 
@@ -2325,6 +2396,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) { if (layer->delta_a_log) { /* DeltaNet layer */ deltanet_forward(model, s, l); + } else if (layer->gguf_w_qkv) { + /* Phi-3 fused QKV — gguf_wq/wk/wv are NULL because Q, K + * and V are concatenated into gguf_w_qkv. self_attn_forward + * handles the fused dispatch internally. */ + self_attn_forward(model, s, l, pos); } else if ((layer->wq || layer->wq_q8 || layer->wq_q4 || layer->gguf_wq || layer->wq_q2) && (layer->wk || layer->wk_q8 || layer->wk_q4 || layer->gguf_wk || layer->wk_q2) && (layer->wv || layer->wv_q8 || layer->wv_q4 || layer->gguf_wv || layer->wv_q2 || @@ -2508,8 +2584,8 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) { * Qwen: layers are either MoE or dense, NOT both. * Gemma 3 non-MoE layers: run dense FFN. */ if (!did_moe && - (layer->w_gate || layer->w_gate_q8 || layer->w_gate_q4 || layer->w_gate_q2 || layer->gguf_w_gate) && - (layer->w_up || layer->w_up_q8 || layer->w_up_q4 || layer->w_up_q2 || layer->gguf_w_up) && + (layer->w_gate || layer->w_gate_q8 || layer->w_gate_q4 || layer->w_gate_q2 || layer->gguf_w_gate || layer->gguf_w_up_gate) && + (layer->w_up || layer->w_up_q8 || layer->w_up_q4 || layer->w_up_q2 || layer->gguf_w_up || layer->gguf_w_up_gate) && (layer->w_down || layer->w_down_q8 || layer->w_down_q4 || layer->w_down_q2 || layer->gguf_w_down)) { /* Pre-FFN norm: Gemma 4 dual-FFN uses pre_ffw_norm_2 for the dense FFN. @@ -2557,6 +2633,13 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) { s->xb_q8, s->xb_q8s, inter, dim); tq_matmul_q4_preq(s->hb2, layer->w_up_q4, layer->w_up_q4s, s->xb_q8, s->xb_q8s, inter, dim); + } else if (layer->gguf_w_up_gate) { + /* Phi-3 fused gate||up: one matmul produces [gate | up], + * then we split into s->hb (gate) and s->hb2 (up). 
*/ + tq_matmul_gguf(s->hb, s->xb, + layer->gguf_w_up_gate, layer->gguf_w_up_gate_type, + 2 * inter, dim); + memcpy(s->hb2, s->hb + inter, (size_t)inter * sizeof(float)); } else if (layer->gguf_w_gate) { /* Gate+up GPU dispatches batched by layer-level batch scope */ tq_matmul_gguf(s->hb, s->xb, layer->gguf_w_gate, layer->gguf_w_gate_type, inter, dim); diff --git a/tools/phi3_kvcomp_test.c b/tools/phi3_kvcomp_test.c new file mode 100644 index 0000000..b331933 --- /dev/null +++ b/tools/phi3_kvcomp_test.c @@ -0,0 +1,100 @@ +/* phi3_kvcomp_test — validate Phi-3 forward path against quant.cpp's KV + * cache compression layers. + * + * The Phi-3 architecture support PR (#65) was validated end-to-end with + * KV compression DISABLED (kv_compress=0). The fused QKV / fused gate+up + * forward branches do not touch the KV cache code path directly, but + * the way s->k is written into the cache (and read back during attention) + * goes through the same KV-quant code paths as Llama / SmolLM2. This + * test exercises that interaction. + * + * Modes covered: + * off baseline (matches PR #65 validation) + * 1 / 4-bit UNIFORM_4B K + 4-bit V + * 1 / 4-bit + progressive 128 + last 128 tokens of K kept FP32 + * 1 / 4-bit + aggressive 512 + last 512 tokens of K kept FP32 + * 2 / delta+3-bit UNIFORM_3B K + 4-bit V + delta encoding + * + * For each mode we generate 80 greedy tokens from a fixed prompt and + * print the output. A working mode produces coherent English; a broken + * mode produces fragmented garbage. Compare modes side-by-side. 
+ */ +#define QUANT_IMPLEMENTATION +#include "../quant.h" + +#include +#include +#include +#include + +static void print_token(const char* text, void* ud) { + (void)ud; + fputs(text, stdout); + fflush(stdout); +} + +static int run_one(quant_model* model, const char* label, + int kv_compress, int k_highres_window, + const char* prompt) { + /* max_tokens=256 deliberately exceeds the 128-token progressive + * window so we actually exercise the boundary where recent keys + * shift from FP32 (highres buffer) into the quantized cache. */ + quant_config cfg = { + .temperature = 0.0f, /* greedy */ + .top_p = 1.0f, + .max_tokens = 256, + .n_threads = 4, + .kv_compress = kv_compress, + .context_length = 0, + .k_highres_window = k_highres_window, + }; + quant_ctx* ctx = quant_new(model, &cfg); + if (!ctx) { + fprintf(stderr, "quant_new failed for mode %s\n", label); + return -1; + } + + fprintf(stderr, "\n=== %s ===\n", label); + struct timespec t0, t1; + clock_gettime(CLOCK_MONOTONIC, &t0); + int n = quant_generate(ctx, prompt, print_token, NULL); + clock_gettime(CLOCK_MONOTONIC, &t1); + double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9; + fprintf(stderr, "\n[%s] %d tokens in %.2fs (%.1f tok/s)\n", + label, n, secs, secs > 0 ? n / secs : 0.0); + + quant_free_ctx(ctx); + return n; +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "usage: %s [prompt]\n", argv[0]); + return 1; + } + const char* user_msg = (argc >= 3) ? 
argv[2] + : "Explain in two paragraphs why the sky appears blue during the day."; + + /* Phi-3 chat template */ + char prompt[1024]; + snprintf(prompt, sizeof(prompt), + "<|user|>\n%s<|end|>\n<|assistant|>\n", user_msg); + + fprintf(stderr, "Loading %s ...\n", argv[1]); + quant_model* model = quant_load(argv[1]); + if (!model) { + fprintf(stderr, "quant_load failed\n"); + return 2; + } + + int rc = 0; + rc |= run_one(model, "off (baseline, FP32 KV)", 0, 0, prompt) < 0; + rc |= run_one(model, "kv_compress=1 (4-bit, no progressive)", 1, 0, prompt) < 0; + rc |= run_one(model, "kv_compress=1 + progressive(128)", 1, 128, prompt) < 0; + rc |= run_one(model, "kv_compress=1 + aggressive(512)", 1, 512, prompt) < 0; + rc |= run_one(model, "kv_compress=2 (delta+3-bit)", 2, 0, prompt) < 0; + + quant_free_model(model); + fputc('\n', stderr); + return rc ? 4 : 0; +}