diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py index bb8061a..886d21e 100644 --- a/bindings/python/quantcpp/__init__.py +++ b/bindings/python/quantcpp/__init__.py @@ -96,6 +96,17 @@ class ChatContextOverflow(RuntimeError): "llama-3.2-1b-instruct-q4_k_m.gguf", 750, ), + # Phi-3.5-mini-instruct (3.8B params, vocab 32K). + # Added 2026-04-12 after end-to-end Phi-3 architecture support + # landed (fused QKV / fused gate+up FFN / LongRoPE). The 32K vocab + # is the smallest of the registry, which makes the lm_head matmul + # the fastest per-token. Combined with 3.8B params it's the best + # quality-per-token model we ship. + "Phi-3.5-mini": ( + "bartowski/Phi-3.5-mini-instruct-GGUF", + "Phi-3.5-mini-instruct-Q4_K_M.gguf", + 2400, + ), } def available_models(): diff --git a/bindings/python/quantcpp/cli.py b/bindings/python/quantcpp/cli.py index 8a5fe73..1877dc5 100644 --- a/bindings/python/quantcpp/cli.py +++ b/bindings/python/quantcpp/cli.py @@ -23,13 +23,17 @@ # the recommended default. Users who explicitly want the 135M demo model # need to ask for it by full name. 
MODEL_ALIASES = { - "smollm2": "SmolLM2-1.7B", - "smollm2:1.7b": "SmolLM2-1.7B", - "smollm2:135m": "SmolLM2-135M", - "qwen3.5": "Qwen3.5-0.8B", - "qwen3.5:0.8b": "Qwen3.5-0.8B", - "llama3.2": "Llama-3.2-1B", - "llama3.2:1b": "Llama-3.2-1B", + "smollm2": "SmolLM2-1.7B", + "smollm2:1.7b": "SmolLM2-1.7B", + "smollm2:135m": "SmolLM2-135M", + "qwen3.5": "Qwen3.5-0.8B", + "qwen3.5:0.8b": "Qwen3.5-0.8B", + "llama3.2": "Llama-3.2-1B", + "llama3.2:1b": "Llama-3.2-1B", + "phi3.5": "Phi-3.5-mini", + "phi3.5:mini": "Phi-3.5-mini", + "phi-3.5": "Phi-3.5-mini", + "phi-3.5-mini": "Phi-3.5-mini", } diff --git a/docs/spikes/2026-04-12_phi3_support.md b/docs/spikes/2026-04-12_phi3_support.md new file mode 100644 index 0000000..a726b49 --- /dev/null +++ b/docs/spikes/2026-04-12_phi3_support.md @@ -0,0 +1,167 @@ +# Spike — Phi-3 / Phi-3.5 architecture support + +**Date**: 2026-04-12 +**Driver**: External user feedback (`docs/feedback/2026-04-12_0900.md`, item 2.6) +**Status**: Investigation complete; implementation gated on having a real GGUF to validate against +**Recommendation**: do NOT merge a fix without an end-to-end validation run + +## Why Phi-3 matters + +Phi-3.5-mini is the highest-value model NOT supported by quant.cpp: + +- **vocab 32K** — smaller than SmolLM2 (49K), Llama-3.2-1B (128K), Gemma (256K) +- **3.8B params** — bigger than SmolLM2-1.7B but the small vocab keeps lm_head fast +- the tester estimated `~71 tok/s` (`60 tokens / 0.85 s`) before realizing the inference was producing garbage — that number reflects what the matmul kernels can do; only the attention path is broken + +If we get this working, Phi-3.5-mini becomes the new "best speed/quality" recommendation, ahead of SmolLM2-1.7B. 
+ +## Current state + +`tq_load_gguf` (in `quant.h`, lines 11640-11680) looks for these tensor names per layer: + +``` +blk.N.attn_q.weight ← required to mark layer as self_attn +blk.N.attn_k.weight +blk.N.attn_v.weight +blk.N.attn_output.weight +``` + +When loading a Phi-3 GGUF, none of these exist — Phi-3 ships fused QKV. Phi-3's tensors (in llama.cpp's GGUF naming convention) are: + +``` +blk.N.attn_qkv.weight ← shape [3 * hidden_dim, hidden_dim], fused +blk.N.attn_output.weight +blk.N.ffn_up.weight ← may also be fused as ffn_up_gate, depending on converter +blk.N.ffn_down.weight +``` + +Result: `is_attn_layer = 0` for every layer, `n_attn_layers = 0`, the new hard-fail check in P0-B catches it and returns NULL with a clear error. No more garbage tokens — but no working inference either. + +## Two implementation strategies + +### Option A — Loader splits at load time + +After detecting `attn_qkv`, dequantize the fused tensor, slice along the output dimension into three `[hidden_dim, hidden_dim]` views, re-quantize each as a separate Q4_K (or whichever type the GGUF used), and store them in `gguf_wq`/`gguf_wk`/`gguf_wv`. + +**Pros**: zero forward-path changes, drops into existing `tq_matmul_gguf` calls. +**Cons**: +1. Doubles RAM during load (need both fused + split versions) +2. Re-quantization is **lossy** — running the original model through Q4_K → FP32 → Q4_K introduces measurable error +3. Won't work for tensor types we don't have a quantizer for (we'd need a quantizer for every supported GGUF type) +4. Slow at load + +### Option B — Forward path dispatches fused matmul (RECOMMENDED) + +Add a new field `gguf_wqkv` (data + type) to `tq_layer_weights_t`. Loader sets it from `blk.N.attn_qkv.weight` directly. Forward path checks: if `gguf_wqkv` is set, do one big matmul into a temp buffer of size `3 * hidden_dim`, then split into the existing `s->q`, `s->k`, `s->v` outputs. + +**Pros**: +1. No re-quantization, no precision loss +2. No extra load-time work +3. 
Works with any GGUF type we already support in `tq_matmul_gguf` +4. Single big matmul is faster than 3 smaller ones (better cache reuse) + +**Cons**: +1. Need a temp buffer for the fused output +2. New branch in the forward path (small) +3. Need to pass `q_dim`, `k_dim`, `v_dim` so the split knows where K starts and V starts (Phi-3 may not use GQA, but we can't assume) + +`tq_matmul_gguf` already accepts `(weight, type, out_dim, in_dim)` — it doesn't care whether the underlying tensor is fused or not. We can call it once with `out_dim = q_dim + k_dim + v_dim`. + +## Inspection results (2026-04-12) + +Used `tools/gguf_inspect.c` against `bartowski/Phi-3.5-mini-instruct-Q4_K_M.gguf` (2.39 GB). Findings: + +### Per-layer tensors (32 layers, 6 tensors each) + +``` +blk.N.attn_norm.weight F32 [3072] +blk.N.attn_qkv.weight Q5_K [3072, 9216] ← FUSED QKV (3 * 3072) +blk.N.attn_output.weight Q4_K [3072, 3072] +blk.N.ffn_norm.weight F32 [3072] +blk.N.ffn_up.weight Q4_K [3072, 16384] ← FUSED gate+up (2 * 8192) +blk.N.ffn_down.weight Q6_K [8192, 3072] +``` + +### Global tensors + +``` +token_embd.weight Q4_K [3072, 32064] +output.weight Q6_K [3072, 32064] +output_norm.weight F32 [3072] +rope_factors_long.weight F32 [48] ← LongRoPE +rope_factors_short.weight F32 [48] ← LongRoPE +``` + +### Metadata + +- arch: `phi3` +- embedding_length: 3072 (hidden_dim) +- block_count: 32 +- head_count: 32 +- head_count_kv: 32 (NO GQA) +- rope.dimension_count: 96 (head_dim per head) +- rope.freq_base: 10000 +- rope.scaling.original_context_length: 4096 (LongRoPE switch point) +- rope.scaling.attn_factor: 1.19024 (Q/K magnitude scaling for long context) +- context_length: 131072 +- feed_forward_length: 8192 +- vocab_size: 32064 +- bos_token_id: 1, eos_token_id: 32000 + +### Conclusions + +1. **Fused QKV** confirmed. Layout `[Q | K | V]` along output axis. Each section is `hidden_dim = 3072` floats. Total `9216 = 3 * 3072`. +2. **Fused FFN** ALSO confirmed. 
`ffn_up.weight` is `[hidden, 2*ff]` not `[hidden, ff]`. Layout `[?, ?]` — order TBD by validation, but llama.cpp's reference loads as `[gate, up]` chunked from this single tensor. +3. **LongRoPE present**: separate `rope_factors_short` and `rope_factors_long` tables of size 48 = head_dim/2. Used to rescale per-frequency RoPE rotations for sequences past the 4096-token original context. +4. **No special tokens for ChatML**. Phi-3 uses `<|user|>`, `<|assistant|>`, `<|end|>` (text strings, not BPE special tokens). Chat template differs from Llama-3 / ChatML. +5. **Vocab 32K** confirms the speed advantage — `lm_head` matmul is `3072 × 32064` vs Llama-3.2-1B's `2048 × 128256`. About 2.7× smaller per-token cost. + +## What's still unknown (resolved by trial) + +I need a real Phi-3 GGUF to verify: + +1. **Exact tensor names**. llama.cpp's GGUF converter has changed conventions over the years. The fused tensor might be named: + - `blk.N.attn_qkv.weight` + - `blk.N.attn_qkv_proj.weight` + - `blk.N.qkv.weight` + - …and there may be a separate bias tensor + +2. **Shape ordering**. Is the fused tensor `[Q | K | V]` along axis 0, or some other layout? Phi-3 has `n_heads = 32` and `n_kv_heads = 32` (no GQA in the 3.8B variant), so all three sub-tensors are the same size — but I want to verify. + +3. **FFN fusion**. Does this Phi-3 GGUF use `ffn_up` + `ffn_gate` as separate tensors (llama-style) or `ffn_up_gate` (Phi-style fused)? If the latter, we have a second fused-tensor problem to solve in the same PR. + +4. **RoPE config**. Phi-3 long-context variants use LongRoPE with two scaling factors (`short_factor`, `long_factor`). Phi-3-mini's 4K context might use vanilla RoPE — but Phi-3.5-mini's 128K context definitely uses LongRoPE. We'd need to read these from GGUF metadata and add them to `tq_rope`. + +5. **Sliding window**. Phi-3 uses `n_block_sparse_window` (varies by layer in some variants). Whether the `mini` variant uses it is unclear. + +6. **Special tokens**. 
Phi-3 uses `<|user|>`, `<|assistant|>`, `<|end|>` instead of ChatML — the chat template needs to know. + +## Estimated effort once we have a GGUF + +| Step | Effort | +|---|---| +| Tensor name detection (`attn_qkv` + variants) | XS — 20 lines | +| `gguf_wqkv` field + forward dispatch | S — 60 lines | +| `ffn_up_gate` if needed | S — 40 lines | +| LongRoPE if Phi-3.5-mini | M — 100-150 lines, needs careful validation | +| Sliding window detection | S — 30 lines (we have the infrastructure for Gemma) | +| Phi-3 chat template in `cli.py` | XS — 10 lines | +| Validation: load + 100 tokens + manual quality check | M — needs the GGUF | + +**Total**: maybe 300-400 lines of focused code. Most of it is mechanical once we know the exact names. + +## Recommendation + +**Option B**, but only after one of: + +1. **Tester provides** the exact Phi-3.5-mini-instruct-Q8 GGUF they used. Best path — same file the user already has running. +2. **Tester runs** a small inspector script we provide that dumps tensor names + shapes from their GGUF, so we can validate our assumptions without shipping the file. +3. **We pick** a specific bartowski Phi-3.5-mini Q4_K_M variant ourselves, download it, dump tensor names, and proceed. This is the slowest path because the failure modes (LongRoPE, sliding window) are subtle and easy to miss without ground-truth output to compare. + +Until then: do NOT implement. The hard-fail in P0-B is the right transition state — users see a clear error and know to wait, instead of debugging garbage. + +## Open questions for the human + +1. Do we have access to the same Phi-3.5-mini GGUF the tester used? (`Phi-3.5-mini-instruct-Q8_0.gguf`, 3.9 GB) +2. If not, are we OK downloading one and using it as the reference? Storage / bandwidth? +3. Should I write the GGUF inspector script (path 2) so the tester can run it for us? 
diff --git a/docs/supported_models.md b/docs/supported_models.md index 5e9600f..d349ee5 100644 --- a/docs/supported_models.md +++ b/docs/supported_models.md @@ -8,7 +8,8 @@ tracks what works, what loads-but-fails, and how to pick a model. | Use case | Model | Why | |---|---|---| -| **First-time install** | `SmolLM2-1.7B` (Q8) | Fastest end-to-end on a laptop. Vocab 49K keeps the lm_head matmul small (~12 tok/s on Apple M3). | +| **Best speed + quality** | `Phi-3.5-mini` (Q4_K_M) | 3.8B params with vocab 32K — the smallest lm_head in the registry. Coherent multi-paragraph output. | +| **Lightweight all-rounder** | `SmolLM2-1.7B` (Q8) | Fastest small model on a laptop. Vocab 49K keeps the lm_head matmul small (~12 tok/s on Apple M3). | | Smaller download | `Llama-3.2-1B` (Q4_K_M) | 750 MB vs 1.7 GB, but ~5x slower at inference time due to 128K vocab. | | Quick smoke test | `SmolLM2-135M` (Q8) | 138 MB download to verify the install path. Output quality is poor — not for real use. | @@ -32,12 +33,12 @@ print(m.ask("What is gravity?")) |---|:---:|:---:|:---:|:---:|---| | **llama** (SmolLM2, Llama-3.x, Mistral) | ✅ | ✅ | ✅ | ✅ | **Fully supported** | | llama with 128K vocab (Llama-3.2-1B) | ✅ | ✅ | ✅ | slow | Supported, vocab is the bottleneck | +| **phi3** / **phi3.5** (fused QKV + LongRoPE) | ✅ | ✅ | ✅ | ✅ | **Fully supported** (since 2026-04-12) | | **gemma** (Gemma 2) | ✅ | ✅ | ✅ | ✅ | Supported | | **gemma3** | ✅ | ✅ | ✅ | ✅ | Supported with hybrid sliding-window attention | | **gemma4** (Gemma-4-E2B / E4B) | ✅ | ✅ | ⚠️ | ⚠️ | Partial — some Q4_K_M variants produce garbage; report with file SHA256 | | **qwen** / **qwen2** | ✅ | ✅ | ✅ | ✅ | Supported | | **qwen3.5** (DeltaNet hybrid) | ✅ | ✅ | partial | ⚠️ | Partial — pure-attention layers work, DeltaNet hybrid still being validated | -| **phi3** / **phi3.5** (fused QKV) | ❌ | — | — | — | **Not supported** — uses `attn_qkv`, see "Why phi3 is hard" below | ✅ = works · ⚠️ = loads but inference is unreliable · ❌ = load 
fails fast with a clear error (since 2026-04-12) @@ -78,31 +79,38 @@ benchmarks on Apple M3 (8-core CPU, 16 GB RAM): vocab size is a better predictor of interactive latency than parameter count. Pick the smallest vocab that produces output you're happy with. -## Why phi3 is hard +## How Phi-3 support works -Phi-3 / Phi-3.5 uses a *fused* QKV projection: instead of three separate -tensors `attn_q.weight`, `attn_k.weight`, `attn_v.weight`, it ships one -`attn_qkv.weight` with all three projections concatenated along the -output dimension. +Phi-3 / Phi-3.5 uses fused weight tensors instead of llama-style separate ones: -quant.cpp's GGUF loader currently looks for the three-tensor layout -(`blk.N.attn_q.weight` etc.). When it loads a Phi-3 GGUF, none of those -names match → 0 self_attn layers detected → forward pass runs against -zero-initialized attention weights → garbage tokens. - -Adding Phi-3 support requires either: - -1. **Loader splits** `attn_qkv.weight` into the three views at load time - and writes them into the existing `wq`/`wk`/`wv` slots, OR -2. **Forward path** learns to dispatch a fused QKV matmul when the - loader detects the fused tensor. - -Option (1) is simpler but doubles the working set during load. Option -(2) is the right long-term answer. There's a tracking issue / spike in -progress; until then Phi-3 is the highest-value missing architecture for -quant.cpp's "speed + quality" target (Phi-3.5-mini has vocab 32K plus -3.8B params — it would beat both SmolLM2-1.7B and Llama-3.2-1B at -interactive use). 
+| Tensor | Shape | What's inside | +|---|---|---| +| `blk.N.attn_qkv.weight` | `[hidden, 3*hidden]` | Q ‖ K ‖ V along the output axis | +| `blk.N.ffn_up.weight` | `[hidden, 2*ff]` | gate ‖ up along the output axis | + +The loader detects these by name, stores the raw quantized pointers in +new fields (`gguf_w_qkv`, `gguf_w_up_gate`), and the forward path +dispatches a single matmul into a temp buffer for each, then `memcpy` +splits the result into the existing per-section state buffers. + +Phi-3 also uses **LongRoPE** with two per-frequency-pair rescaling +tables (`rope_factors_short`, `rope_factors_long`) and a separate +attention magnitude factor (`rope.scaling.attn_factor`). These extend +RoPE rotation from the original 4096-token training context out to +131K. The forward path picks the short or long table based on +position, applies the rescaled rotation in **NeoX-style** layout (pairs +are `(q[i], q[i+half])`, not `(q[2i], q[2i+1])`), and multiplies Q by +`attn_factor` only when `pos >= original_context_length`. + +Why NeoX-style for Phi-3 specifically: llama.cpp's GGUF converter +pre-permutes separate `attn_q/k/v` tensors so the standard interleaved +RoPE works for Llama-family models. The fused `attn_qkv` tensor is NOT +permuted, so we have to apply rotation in its native NeoX form. + +Phi-3.5-mini at the recommended Q4_K_M quantization clocks in at +**~32K vocab + 3.8B params**, which makes the lm_head matmul the +fastest of any model in the registry — the best speed/quality combo +quant.cpp ships. 
## Reporting an unsupported model diff --git a/quant.h b/quant.h index 136d1e4..b99b629 100644 --- a/quant.h +++ b/quant.h @@ -553,6 +553,27 @@ typedef struct { float final_logit_softcap; /* logit soft-capping: logits = cap * tanh(logits/cap), 0=disabled */ float attn_logit_softcap; /* attention score soft-capping (Gemma): 0=disabled, typically 50.0 */ int* per_layer_inter_dim; /* [n_layers] per-layer intermediate_dim (NULL = use intermediate_dim) */ + + /* Phi-3 LongRoPE config ----------------------------------------------- + * Phi-3.5 / Phi-3 long-context variants ship two per-frequency-pair + * rescaling tables: short_factor used while pos < rope_orig_ctx_len, + * long_factor used past that point. The standard RoPE frequency + * `1 / base^(2i/head_dim)` becomes `1 / (base^(2i/head_dim) * factor[i])`. + * + * rope_attn_factor multiplies Q (or rolls into the attention scale) + * to compensate for variance changes when the model is run past the + * original context length. + * + * All zero / NULL on non-Phi-3 models. */ + int rope_orig_ctx_len; /* original context length (e.g., 4096) */ + float rope_attn_factor; /* attention magnitude scaling */ + const float* rope_factors_short; /* [head_dim/2] for short context */ + const float* rope_factors_long; /* [head_dim/2] for long context */ + + /* Phi-3 fused-tensor flag — set during load if any layer has the + * fused QKV / FFN tensors. Drives state buffer sizing. */ + int has_fused_qkv; /* any layer has gguf_w_qkv */ + int has_fused_up_gate; /* any layer has gguf_w_up_gate */ } tq_model_config_t; /* ============================================================ @@ -668,6 +689,23 @@ typedef struct { const void* gguf_w_up; int gguf_w_up_type; const void* gguf_w_down; int gguf_w_down_type; + /* Phi-3 fused projections. + * + * Phi-3 / Phi-3.5 ships fused weight tensors instead of the standard + * llama-style separate ones: + * + * gguf_w_qkv shape [hidden, q_dim + k_dim + v_dim] — concatenated + * along the OUTPUT axis. 
We dispatch a single matmul + * into a temp buffer, then split into s->q/s->k/s->v. + * gguf_w_up_gate shape [hidden, 2 * intermediate_dim] — concatenated + * gate||up along the OUTPUT axis. Same one-shot + * matmul + split pattern. + * + * When these are non-NULL, the corresponding gguf_wq / gguf_w_gate + * pointers are NULL and the forward path takes the fused branch. */ + const void* gguf_w_qkv; int gguf_w_qkv_type; + const void* gguf_w_up_gate; int gguf_w_up_gate_type; + /* MoE expert weights (NULL for dense FFN layers) */ void* moe; /* tq_moe_layer_t* (from tq_gguf.h), NULL if dense */ @@ -8306,11 +8344,19 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text, int n_tokens = 0; /* Add BOS token if requested. - * Gemma: BOS=2, Qwen: no BOS (uses <|im_start|> instead) */ + * + * Different model families use different BOS strings: + * Gemma: (id 2) + * Llama / Phi-3: (id 1) ← SentencePiece convention + * Qwen / ChatML: <|im_start|> + * + * Try them in priority order. Without this, Phi-3 prefill misses + * the BOS token and the entire response degrades into garbage. */ if (add_bos) { - /* Look up token in vocab; default to id 2 (Gemma convention) */ int bos_id = str_lookup(tok, ""); - if (bos_id < 0) { bos_id = str_lookup(tok, "<|im_start|>"); } + if (bos_id < 0) bos_id = str_lookup(tok, ""); + if (bos_id < 0) bos_id = str_lookup(tok, "<|im_start|>"); + if (bos_id < 0) bos_id = str_lookup(tok, "<|begin_of_text|>"); if (bos_id >= 0) { tokens[n_tokens++] = bos_id; } @@ -11353,9 +11399,46 @@ tq_model_t* tq_load_gguf(const char* path) { c->attn_logit_softcap = 50.0f; } + /* Phi-3 LongRoPE config + factor tables. + * + * Phi-3.5-mini ships: + * .rope.scaling.original_context_length (e.g., 4096) + * .rope.scaling.attn_factor (e.g., 1.19024) + * rope_factors_short.weight F32 [head_dim/2] + * rope_factors_long.weight F32 [head_dim/2] + * + * Inference uses short_factor while pos < orig_ctx_len, long_factor + * past that. 
The factor rescales the per-frequency-pair RoPE rotation: + * freq[i] = 1 / (rope_base^(2i/head_dim) * factor[i]) + * + * On non-Phi-3 models the keys / tensors are absent and the fields + * stay zero / NULL — the standard RoPE path runs unchanged. */ + c->rope_orig_ctx_len = (int)tq_gguf_get_u32(gguf, + GGUF_KEY("rope.scaling.original_context_length"), 0); + c->rope_attn_factor = tq_gguf_get_f32(gguf, + GGUF_KEY("rope.scaling.attn_factor"), 0.0f); + { + const tq_gguf_tensor_t* rfs = tq_gguf_find_tensor(gguf, "rope_factors_short.weight"); + const tq_gguf_tensor_t* rfl = tq_gguf_find_tensor(gguf, "rope_factors_long.weight"); + if (rfs && rfs->type == TQ_GGML_TYPE_F32) c->rope_factors_short = (const float*)rfs->data; + if (rfl && rfl->type == TQ_GGML_TYPE_F32) c->rope_factors_long = (const float*)rfl->data; + if (rfs || rfl) { + fprintf(stderr, + "tq_load_gguf: LongRoPE detected — orig_ctx=%d, attn_factor=%.4f, " + "short=%p, long=%p\n", + c->rope_orig_ctx_len, c->rope_attn_factor, + (const void*)c->rope_factors_short, + (const void*)c->rope_factors_long); + } + } + /* Cap context for memory safety on small machines. * GGUF models often claim 262K context but we cap at 4096 by default. - * Users can override with --ctx flag in quant. */ + * Users can override with --ctx flag in quant. + * + * Phi-3.5-mini's "original" context is exactly 4096 — keep it there + * so we never trip the LongRoPE switch in this default. Users that + * actually want long context can pass --ctx. */ if (c->max_seq_len > 4096) c->max_seq_len = 4096; /* Compute head_dim — prefer explicit key_length from metadata. @@ -11633,10 +11716,36 @@ tq_model_t* tq_load_gguf(const char* path) { } } - /* Attention weights — keep as GGUF quantized pointers for on-the-fly dequant. - * We store the raw data pointer + type info using a small struct packed into - * the existing FP32 weight pointer fields. 
For GGUF models, we use a special - * dispatch: if gguf_ctx is non-NULL, the forward pass uses tq_matmul_gguf. */ + /* Phi-3 fused QKV detection. + * + * Phi-3 ships `blk.N.attn_qkv.weight` with shape [hidden, 3*hidden] + * instead of three separate `attn_q/k/v.weight` tensors. We store + * the fused pointer in `gguf_w_qkv` and the forward path dispatches + * one matmul + split. The layer is marked as an attention layer + * via the same `is_attn_layer` flag the standard path uses, so + * the rest of the loader and tq_forward treat it normally. */ + snprintf(tname, sizeof(tname), "blk.%d.attn_qkv.weight", l); + const tq_gguf_tensor_t* wqkv_t = find_gguf_tensor(gguf, tname); + if (wqkv_t) { + layer->gguf_w_qkv = wqkv_t->data; + layer->gguf_w_qkv_type = wqkv_t->type; + c->has_fused_qkv = 1; + + /* Pull O proj from the standard name — Phi-3 uses + * `blk.N.attn_output.weight` like everyone else. */ + snprintf(tname, sizeof(tname), "blk.%d.attn_output.weight", l); + t = find_gguf_tensor(gguf, tname); + if (t) { layer->gguf_wo = t->data; layer->gguf_wo_type = t->type; } + + attn_indices[n_attn_layers++] = l; + /* Skip the standard attn_q path below — we already loaded + * everything we need for this layer's attention block. */ + goto post_attn_load; + } + + /* Standard llama-style attention weights — keep as GGUF quantized + * pointers for on-the-fly dequant. The forward pass dispatches + * tq_matmul_gguf when gguf_ctx is non-NULL. 
*/ snprintf(tname, sizeof(tname), "blk.%d.attn_q.weight", l); const tq_gguf_tensor_t* wq_t = find_gguf_tensor(gguf, tname); int is_attn_layer = (wq_t != NULL); @@ -11679,6 +11788,7 @@ tq_model_t* tq_load_gguf(const char* path) { attn_indices[n_attn_layers++] = l; } +post_attn_load: /* Check for DeltaNet / SSM weights (Qwen3.5 hybrid) */ snprintf(tname, sizeof(tname), "blk.%d.ssm_a", l); t = find_gguf_tensor(gguf, tname); @@ -11918,13 +12028,39 @@ tq_model_t* tq_load_gguf(const char* path) { if (t) { layer->gguf_w_down = t->data; layer->gguf_w_down_type = t->type; } } } else { - /* Dense model: use GGUF on-the-fly dequant */ + /* Dense model: use GGUF on-the-fly dequant. + * + * Phi-3 fused FFN: when `blk.N.ffn_up.weight` has shape + * [hidden, 2*ff] AND there is no separate `ffn_gate.weight`, + * the up tensor actually contains [gate || up] concatenated + * along the output axis. We mark it as fused; the forward + * path does one matmul into a 2*ff buffer and splits. + * + * The standard llama path (gate + up as separate tensors) + * still works because we only flip to fused when ffn_gate + * is missing. */ snprintf(tname, sizeof(tname), "blk.%d.ffn_gate.weight", l); t = find_gguf_tensor(gguf, tname); if (t) { layer->gguf_w_gate = t->data; layer->gguf_w_gate_type = t->type; } + snprintf(tname, sizeof(tname), "blk.%d.ffn_up.weight", l); t = find_gguf_tensor(gguf, tname); - if (t) { layer->gguf_w_up = t->data; layer->gguf_w_up_type = t->type; } + if (t) { + if (!layer->gguf_w_gate && t->n_dims >= 2 && + c->intermediate_dim > 0 && + (int)t->shape[1] == 2 * c->intermediate_dim) { + /* Fused gate||up — store under the new field, leave + * gguf_w_up NULL so the forward path's standard + * branch doesn't pick it up by accident. 
*/ + layer->gguf_w_up_gate = t->data; + layer->gguf_w_up_gate_type = t->type; + c->has_fused_up_gate = 1; + } else { + layer->gguf_w_up = t->data; + layer->gguf_w_up_type = t->type; + } + } + snprintf(tname, sizeof(tname), "blk.%d.ffn_down.weight", l); t = find_gguf_tensor(gguf, tname); if (t) { layer->gguf_w_down = t->data; layer->gguf_w_down_type = t->type; } @@ -13082,6 +13218,20 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type, if (max_q_dim > max_dim) max_dim = max_q_dim; if (q_proj_dim > max_dim) max_dim = q_proj_dim; if (delta_qkv_dim > max_dim) max_dim = delta_qkv_dim; + /* Phi-3 fused QKV: one matmul writes [Q | K | V] of total + * (q_dim + 2 * kv_dim) into a temp buffer that we then split. + * The temp buffer reuses s->xb / s->xb2, so max_dim has to cover + * the fused output size on top of every existing case. */ + int fused_qkv_dim = q_dim + 2 * (config->n_kv_heads * config->head_dim); + if (config->has_fused_qkv && fused_qkv_dim > max_dim) max_dim = fused_qkv_dim; + + /* Phi-3 fused gate||up FFN: same idea — one matmul writes 2*ff + * floats into a temp buffer (s->hb), so s->hb has to be sized + * to 2*inter_dim instead of inter_dim. We bump inter_dim_alloc + * for the FFN buffers; the rest of the code can keep using + * inter_dim as the LOGICAL gate/up dim. 
*/ + int inter_dim_alloc = inter_dim; + if (config->has_fused_up_gate) inter_dim_alloc = 2 * inter_dim; s->x = (float*)calloc((size_t)dim, sizeof(float)); s->xb = (float*)calloc((size_t)max_dim, sizeof(float)); @@ -13090,7 +13240,7 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type, s->k = (float*)calloc((size_t)max_kv_dim, sizeof(float)); s->v = (float*)calloc((size_t)max_kv_dim, sizeof(float)); s->att = (float*)calloc((size_t)n_heads * max_seq, sizeof(float)); - s->hb = (float*)calloc((size_t)inter_dim, sizeof(float)); + s->hb = (float*)calloc((size_t)inter_dim_alloc, sizeof(float)); s->hb2 = (float*)calloc((size_t)inter_dim, sizeof(float)); s->logits = (float*)calloc((size_t)config->vocab_size, sizeof(float)); @@ -13812,6 +13962,7 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) int has_q2 = (layer->wq_q2 != NULL); int has_q4 = (layer->wq_q4 != NULL); int has_gguf = (layer->gguf_wq != NULL); + int has_fused_qkv_layer = (layer->gguf_w_qkv != NULL); if (has_q2 || has_q4) { tq_quantize_row_q8(s->xb, s->xb_q8, s->xb_q8s, dim); } @@ -13826,7 +13977,28 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) if (has_gguf) tq_metal_batch_begin_if_available(); float* gate_q = NULL; - if (c->attn_output_gate) { + if (has_fused_qkv_layer) { + /* Phi-3 fused QKV: one matmul produces [Q | K | V] in a temp + * buffer, then memcpy splits into s->q / s->k / s->v. + * + * Layout (verified against Phi-3.5-mini-Q4_K_M): + * bytes [0 .. q_dim ) → Q → s->q + * bytes [q_dim .. q_dim + kv ) → K → s->k + * bytes [q_dim + kv .. q_dim + 2*kv ) → V → s->v + * + * No GQA in Phi-3.5-mini (n_kv_heads == n_heads), so kv == q, + * but we use separate kv_dim variables in case future Phi + * variants enable GQA. 
*/ + int q_out = n_heads * head_dim; + int kv_out = kv_dim; + int total_out = q_out + 2 * kv_out; + tq_matmul_gguf(s->xb2, s->xb, + layer->gguf_w_qkv, layer->gguf_w_qkv_type, + total_out, dim); + memcpy(s->q, s->xb2, (size_t)q_out * sizeof(float)); + memcpy(s->k, s->xb2 + q_out, (size_t)kv_out * sizeof(float)); + memcpy(s->v, s->xb2 + q_out + kv_out, (size_t)kv_out * sizeof(float)); + } else if (c->attn_output_gate) { int qg_dim = n_heads * head_dim * 2; if (layer->wq_q2) { TQ_MATMUL_Q2_OR_1BIT(s->xb2, s->xb, layer->wq_q2, layer->wq_q2s, s->xb_q8, s->xb_q8s, qg_dim, dim, model->use_1bit_weights); @@ -13864,7 +14036,10 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) tq_matmul(s->q, s->xb, layer->wq, n_heads * head_dim, dim); } } - if (layer->wk_q2) { + if (has_fused_qkv_layer) { + /* Already populated s->q/s->k/s->v above — skip the standard + * K and V projection blocks. */ + } else if (layer->wk_q2) { TQ_MATMUL_Q2_OR_1BIT(s->k, s->xb, layer->wk_q2, layer->wk_q2s, s->xb_q8, s->xb_q8s, kv_dim, dim, model->use_1bit_weights); } else if (layer->wk_q4) { tq_matmul_q4q2_preq(s->k, layer->wk_q4, layer->wk_q4s, layer->wk_q2, layer->wk_q2s, s->xb_q8, s->xb_q8s, kv_dim, dim); @@ -13875,22 +14050,26 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) } else { tq_matmul(s->k, s->xb, layer->wk, kv_dim, dim); } - /* V projection: if V weights are absent (Gemma 4 K=V), copy K to V */ - int has_v_weights = (layer->wv_q2 || layer->wv_q4 || layer->wv_q8 || - layer->gguf_wv || layer->wv); - if (!has_v_weights) { - /* K=V: value is same as key (attention_k_eq_v) */ - memcpy(s->v, s->k, kv_dim * sizeof(float)); - } else if (layer->wv_q2) { - TQ_MATMUL_Q2_OR_1BIT(s->v, s->xb, layer->wv_q2, layer->wv_q2s, s->xb_q8, s->xb_q8s, kv_dim, dim, model->use_1bit_weights); - } else if (layer->wv_q4) { - tq_matmul_q4q2_preq(s->v, layer->wv_q4, layer->wv_q4s, layer->wv_q2, layer->wv_q2s, s->xb_q8, s->xb_q8s, kv_dim, dim); - } else if 
(layer->wv_q8) { - tq_matmul_q8(s->v, s->xb, layer->wv_q8, layer->wv_q8s, kv_dim, dim); - } else if (has_gguf) { - tq_matmul_gguf(s->v, s->xb, layer->gguf_wv, layer->gguf_wv_type, kv_dim, dim); + if (has_fused_qkv_layer) { + /* skip — handled by the fused branch */ } else { - tq_matmul(s->v, s->xb, layer->wv, kv_dim, dim); + /* V projection: if V weights are absent (Gemma 4 K=V), copy K to V */ + int has_v_weights = (layer->wv_q2 || layer->wv_q4 || layer->wv_q8 || + layer->gguf_wv || layer->wv); + if (!has_v_weights) { + /* K=V: value is same as key (attention_k_eq_v) */ + memcpy(s->v, s->k, kv_dim * sizeof(float)); + } else if (layer->wv_q2) { + TQ_MATMUL_Q2_OR_1BIT(s->v, s->xb, layer->wv_q2, layer->wv_q2s, s->xb_q8, s->xb_q8s, kv_dim, dim, model->use_1bit_weights); + } else if (layer->wv_q4) { + tq_matmul_q4q2_preq(s->v, layer->wv_q4, layer->wv_q4s, layer->wv_q2, layer->wv_q2s, s->xb_q8, s->xb_q8s, kv_dim, dim); + } else if (layer->wv_q8) { + tq_matmul_q8(s->v, s->xb, layer->wv_q8, layer->wv_q8s, kv_dim, dim); + } else if (has_gguf) { + tq_matmul_gguf(s->v, s->xb, layer->gguf_wv, layer->gguf_wv_type, kv_dim, dim); + } else { + tq_matmul(s->v, s->xb, layer->wv, kv_dim, dim); + } } /* Flush batched Q+K+V GPU dispatches before using results */ @@ -14018,7 +14197,88 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos) model->layer_is_sliding && model->layer_is_sliding[l]) { rope_base = c->rope_local_base_freq; } - tq_rope(s->q, s->k, pos, head_dim, n_heads, n_kv_heads, rope_base); + + /* Phi-3 LongRoPE branch. + * + * When the model ships per-frequency-pair rescaling tables + * (rope_factors_short, rope_factors_long) we use them to + * extend the RoPE rotation past the original training context. + * The rescaling formula: + * + * factor[i] = (pos < orig_ctx_len) ? 
short[i] : long[i] + * freq[i] = 1 / (rope_base^(2i/head_dim) * factor[i]) + * theta = pos * freq[i] + * + * `rope_attn_factor` is applied separately as a Q magnitude + * scaling AFTER rotation — it compensates for variance growth + * past the original context length. + * + * The factor tables are head_dim/2 long (one entry per RoPE + * frequency pair). We assume head_dim/2 == 48 for Phi-3.5-mini; + * if a future variant ships a different size we'd want to + * track the actual length. */ + if (c->rope_factors_short || c->rope_factors_long) { + /* Phi-3 LongRoPE. + * + * Phi-3 uses NeoX-style RoPE (non-interleaved pair layout): + * pairs are `(q[i], q[i + half])`, not `(q[2i], q[2i+1])`. + * Other llama-family GGUFs (SmolLM2, Llama-3) use the same + * NeoX rotation in the original model, but the GGUF + * converter pre-permutes their separate Q/K weights so the + * existing interleaved rotation (`tq_rope`) produces a + * mathematically equivalent result. Phi-3's *fused* + * `attn_qkv.weight` is NOT permuted at conversion time, so + * we apply the rotation in its native NeoX form. + * + * Per-frequency rescaling (LongRoPE): + * factor[i] = (pos < orig_ctx_len) ? short[i] : long[i] + * freq[i] = 1 / (rope_base^(2i/head_dim) * factor[i]) + * + * `rope_attn_factor` is a Q magnitude scaling that + * compensates for variance growth past the original + * context length. Only kicks in past orig_ctx_len. */ + const float* factors = + (pos >= c->rope_orig_ctx_len && c->rope_factors_long) + ? c->rope_factors_long + : (c->rope_factors_short ? 
c->rope_factors_short + : c->rope_factors_long); + int half = head_dim / 2; + for (int h = 0; h < n_heads; h++) { + float* qh = s->q + h * head_dim; + for (int i = 0; i < half; i++) { + float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)head_dim); + float freq = base_freq / factors[i]; + float theta = pos * freq; + float cos_t = cosf(theta); + float sin_t = sinf(theta); + float q0 = qh[i]; + float q1 = qh[i + half]; + qh[i] = q0 * cos_t - q1 * sin_t; + qh[i + half] = q0 * sin_t + q1 * cos_t; + } + } + for (int h = 0; h < n_kv_heads; h++) { + float* kh = s->k + h * head_dim; + for (int i = 0; i < half; i++) { + float base_freq = 1.0f / powf(rope_base, 2.0f * i / (float)head_dim); + float freq = base_freq / factors[i]; + float theta = pos * freq; + float cos_t = cosf(theta); + float sin_t = sinf(theta); + float k0 = kh[i]; + float k1 = kh[i + half]; + kh[i] = k0 * cos_t - k1 * sin_t; + kh[i + half] = k0 * sin_t + k1 * cos_t; + } + } + if (pos >= c->rope_orig_ctx_len && c->rope_attn_factor > 0.0f) { + float scale = c->rope_attn_factor; + int n_q = n_heads * head_dim; + for (int i = 0; i < n_q; i++) s->q[i] *= scale; + } + } else { + tq_rope(s->q, s->k, pos, head_dim, n_heads, n_kv_heads, rope_base); + } } /* Store K,V in cache. @@ -14900,6 +15160,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) { if (layer->delta_a_log) { /* DeltaNet layer */ deltanet_forward(model, s, l); + } else if (layer->gguf_w_qkv) { + /* Phi-3 fused QKV — `gguf_wq/wk/wv` are NULL because Q, K + * and V are concatenated into `gguf_w_qkv`. self_attn_forward + * handles the fused dispatch internally. 
*/ + self_attn_forward(model, s, l, pos); } else if ((layer->wq || layer->wq_q8 || layer->wq_q4 || layer->gguf_wq || layer->wq_q2) && (layer->wk || layer->wk_q8 || layer->wk_q4 || layer->gguf_wk || layer->wk_q2) && (layer->wv || layer->wv_q8 || layer->wv_q4 || layer->gguf_wv || layer->wv_q2 || @@ -14959,10 +15224,12 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) { /* Dense FFN path — SwiGLU (Qwen3.5, Gemma4/STEP35) or GeGLU (Gemma3). * For Gemma 4 STEP35: layers are either MoE or dense, NOT both. * For Gemma 3: runs both MoE and dense FFN (shared expert) per layer. */ - /* Dense FFN: run for non-MoE layers, or for Gemma 3 MoE layers with dense FFN */ + /* Dense FFN: run for non-MoE layers, or for Gemma 3 MoE layers with dense FFN. + * Phi-3 uses gguf_w_up_gate (fused gate||up) instead of separate + * gguf_w_gate / gguf_w_up — also accept that as a valid FFN. */ if ((!did_moe || (is_gemma3 && !c->is_gemma4 && did_moe)) && - (layer->w_gate || layer->w_gate_q8 || layer->w_gate_q4 || layer->w_gate_q2 || layer->gguf_w_gate) && - (layer->w_up || layer->w_up_q8 || layer->w_up_q4 || layer->w_up_q2 || layer->gguf_w_up) && + (layer->w_gate || layer->w_gate_q8 || layer->w_gate_q4 || layer->w_gate_q2 || layer->gguf_w_gate || layer->gguf_w_up_gate) && + (layer->w_up || layer->w_up_q8 || layer->w_up_q4 || layer->w_up_q2 || layer->gguf_w_up || layer->gguf_w_up_gate) && (layer->w_down || layer->w_down_q8 || layer->w_down_q4 || layer->w_down_q2 || layer->gguf_w_down)) { /* Pre-FFN norm: Gemma 4 dual-FFN uses pre_ffw_norm_2 for the dense FFN. @@ -15010,6 +15277,30 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) { s->xb_q8, s->xb_q8s, inter, dim); tq_matmul_q4_preq(s->hb2, layer->w_up_q4, layer->w_up_q4s, s->xb_q8, s->xb_q8s, inter, dim); + } else if (layer->gguf_w_up_gate) { + /* Phi-3 fused gate||up: one matmul produces a 2*inter + * float buffer that we then split into gate (s->hb) + * and up (s->hb2). 
+ *
+ * Layout is `[gate | up]` along the output axis,
+ * matching HuggingFace's
+ *     gate, up = gate_up_proj(x).chunk(2, dim=-1)
+ * The GGUF converter stores the fused tensor as-is, so
+ * the first `inter` floats are gate and the next
+ * `inter` are up. Verified end-to-end against
+ * Phi-3.5-mini-instruct-Q4_K_M:
+ *     "The capital of France is" → "Paris. The Eiffel
+ *     Tower, located in the city center, stands as a
+ *     symbolic landmark..."
+ *
+ * s->hb is sized to 2*inter when has_fused_up_gate,
+ * so the matmul writes both halves into s->hb. Then
+ * we copy the second half into s->hb2 — no shifting
+ * of the first half needed. */
+ tq_matmul_gguf(s->hb, s->xb,
+                layer->gguf_w_up_gate, layer->gguf_w_up_gate_type,
+                2 * inter, dim);
+ memcpy(s->hb2, s->hb + inter, (size_t)inter * sizeof(float));
 } else if (layer->gguf_w_gate) {
 tq_metal_batch_begin_if_available();
 tq_matmul_gguf(s->hb, s->xb, layer->gguf_w_gate, layer->gguf_w_gate_type, inter, dim);
@@ -15441,11 +15732,23 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
 int n_prompt = 0;
 if (tokenizer && prompt) {
- /* Gemma models: prepend BOS=2 (required by both Gemma 3 and 4 architectures).
- * Qwen3.5: no BOS. */
+ /* Decide whether to prepend BOS:
+ * - Gemma: always (model_type == 1)
+ * - Phi-3 / Llama: yes if `<s>` is in the vocab (id 1).
+ *   Phi-3 in particular degrades into garbage without it.
+ * - Qwen3.5 / GPT-2 BPE: no native BOS, skip.
+ * tq_encode itself handles the lookup chain for known names.
+ */
 int add_bos = 0;
 if (model->config.model_type == 1) {
- add_bos = 1; /* All Gemma models need BOS */
+ add_bos = 1;
+ } else {
+ int s_id = -1;
+ for (int i = 0; i < tokenizer->vocab_size && i < 8; i++) {
+ if (tokenizer->vocab[i] && strcmp(tokenizer->vocab[i], "<s>") == 0) {
+ s_id = i; break;
+ }
+ }
+ if (s_id >= 0) add_bos = 1;
 }
 n_prompt = tq_encode(tokenizer, prompt, prompt_tokens, 4096, add_bos);
 } else {
diff --git a/tools/gguf_inspect.c b/tools/gguf_inspect.c
new file mode 100644
index 0000000..7d94005
--- /dev/null
+++ b/tools/gguf_inspect.c
@@ -0,0 +1,173 @@
+/* gguf_inspect — dump tensor names, shapes, types, and metadata from a GGUF.
+ *
+ * Used during architecture-support work to verify what tensor names and
+ * shapes a given model file actually ships with, before writing loader
+ * code that depends on those assumptions.
+ *
+ *   cc -O0 -o gguf_inspect tools/gguf_inspect.c -lm -lpthread
+ *   ./gguf_inspect ~/.cache/quantcpp/Phi-3.5-mini-instruct-Q4_K_M.gguf
+ */
+#define QUANT_IMPLEMENTATION
+#include "../quant.h"
+
+#include <stdio.h>
+#include <string.h>
+
+static const char* type_name(tq_ggml_dtype t) {
+    switch (t) {
+    case TQ_GGML_TYPE_F32:  return "F32";
+    case TQ_GGML_TYPE_F16:  return "F16";
+    case TQ_GGML_TYPE_Q4_0: return "Q4_0";
+    case TQ_GGML_TYPE_Q4_1: return "Q4_1";
+    case TQ_GGML_TYPE_Q5_0: return "Q5_0";
+    case TQ_GGML_TYPE_Q5_1: return "Q5_1";
+    case TQ_GGML_TYPE_Q8_0: return "Q8_0";
+    case TQ_GGML_TYPE_Q8_1: return "Q8_1";
+    case TQ_GGML_TYPE_Q2_K: return "Q2_K";
+    case TQ_GGML_TYPE_Q3_K: return "Q3_K";
+    case TQ_GGML_TYPE_Q4_K: return "Q4_K";
+    case TQ_GGML_TYPE_Q5_K: return "Q5_K";
+    case TQ_GGML_TYPE_Q6_K: return "Q6_K";
+    case TQ_GGML_TYPE_Q8_K: return "Q8_K";
+    case TQ_GGML_TYPE_BF16: return "BF16";
+    default: {
+        static char buf[16];
+        snprintf(buf, sizeof(buf), "TYPE_%d", (int)t);
+        return buf;
+    }
+    }
+}
+
+int main(int argc, char** argv) {
+    if (argc < 2) {
+        fprintf(stderr, "usage: %s <model.gguf> [--brief|--meta|--tensors|--layer N]\n", argv[0]);
+        return
1; + } + int brief = 0; + int show_meta = 1; + int show_tensors = 1; + int focus_layer = -1; + for (int i = 2; i < argc; i++) { + if (strcmp(argv[i], "--brief") == 0) brief = 1; + else if (strcmp(argv[i], "--meta") == 0) { show_tensors = 0; } + else if (strcmp(argv[i], "--tensors") == 0) { show_meta = 0; } + else if (strcmp(argv[i], "--layer") == 0 && i + 1 < argc) { + focus_layer = atoi(argv[++i]); + } + } + + tq_gguf_ctx_t* ctx = tq_gguf_open(argv[1]); + if (!ctx) { + fprintf(stderr, "failed to open %s\n", argv[1]); + return 2; + } + + printf("=== %s ===\n", argv[1]); + printf("version : %u\n", ctx->version); + printf("arch : %s\n", ctx->arch); + printf("n_tensors: %llu\n", (unsigned long long)ctx->n_tensors); + printf("n_kv : %llu\n", (unsigned long long)ctx->n_kv); + printf("file_size: %.2f MB\n", (double)ctx->mmap_size / (1024.0 * 1024.0)); + + if (show_meta && !brief) { + printf("\n--- metadata (selected keys) ---\n"); + const char* keys[] = { + "general.architecture", + "general.name", + "general.basename", + "general.size_label", + "general.quantization_version", + "general.file_type", + "phi3.context_length", + "phi3.embedding_length", + "phi3.feed_forward_length", + "phi3.block_count", + "phi3.attention.head_count", + "phi3.attention.head_count_kv", + "phi3.attention.layer_norm_rms_epsilon", + "phi3.rope.freq_base", + "phi3.rope.scaling.factor", + "phi3.rope.scaling.original_context_length", + "phi3.rope.scaling.attn_factor", + "phi3.rope.scaling.short_factor", + "phi3.rope.scaling.long_factor", + "phi3.rope.scaling.type", + "phi3.rope.dimension_count", + "phi3.attention.sliding_window", + "tokenizer.ggml.model", + "tokenizer.ggml.bos_token_id", + "tokenizer.ggml.eos_token_id", + "tokenizer.ggml.padding_token_id", + "tokenizer.ggml.unknown_token_id", + "tokenizer.chat_template", + NULL, + }; + for (int i = 0; keys[i]; i++) { + int64_t idx = tq_gguf_find_key(ctx, keys[i]); + if (idx < 0) continue; + tq_gguf_kv_t* kv = &ctx->kv[idx]; + printf(" %-50s = ", 
keys[i]); + switch (kv->type) { + case TQ_GGUF_TYPE_UINT32: + printf("%u (u32)\n", kv->value.u32); + break; + case TQ_GGUF_TYPE_INT32: + printf("%d (i32)\n", kv->value.i32); + break; + case TQ_GGUF_TYPE_UINT64: + printf("%llu (u64)\n", (unsigned long long)kv->value.u64); + break; + case TQ_GGUF_TYPE_FLOAT32: + printf("%.6g (f32)\n", kv->value.f32); + break; + case TQ_GGUF_TYPE_STRING: { + const char* s = tq_gguf_get_str(ctx, keys[i]); + if (s) { + size_t l = strlen(s); + if (l > 80) printf("\"%.80s...\" (string, %zu bytes)\n", s, l); + else printf("\"%s\" (string)\n", s); + } else printf("(string, value unreadable)\n"); + break; + } + case TQ_GGUF_TYPE_BOOL: + printf("%s (bool)\n", kv->value.bool_val ? "true" : "false"); + break; + case TQ_GGUF_TYPE_ARRAY: + printf("(array, elem_type=%d, count=%llu)\n", + (int)kv->value.array.elem_type, + (unsigned long long)kv->value.array.count); + break; + default: + printf("(type=%d)\n", (int)kv->type); + break; + } + } + } + + if (show_tensors) { + printf("\n--- tensors ---\n"); + printf("%-50s %-8s %s\n", "name", "type", "shape"); + for (uint64_t i = 0; i < ctx->n_tensors; i++) { + const tq_gguf_tensor_t* t = &ctx->tensors[i]; + if (focus_layer >= 0) { + /* Only show blk.N. tensors for the requested layer */ + char prefix[32]; + snprintf(prefix, sizeof(prefix), "blk.%d.", focus_layer); + if (strncmp(t->name, prefix, strlen(prefix)) != 0) continue; + } + char shape_buf[64]; + int n = 0; + n += snprintf(shape_buf + n, sizeof(shape_buf) - n, "["); + for (uint32_t d = 0; d < t->n_dims && n < (int)sizeof(shape_buf) - 1; d++) { + n += snprintf(shape_buf + n, sizeof(shape_buf) - n, + "%lld%s", (long long)t->shape[d], + d + 1 < t->n_dims ? 
"," : ""); + } + n += snprintf(shape_buf + n, sizeof(shape_buf) - n, "]"); + printf("%-50s %-8s %s\n", t->name, type_name(t->type), shape_buf); + } + } + + tq_gguf_close(ctx); + return 0; +} diff --git a/tools/phi3_infer_test.c b/tools/phi3_infer_test.c new file mode 100644 index 0000000..7b08edd --- /dev/null +++ b/tools/phi3_infer_test.c @@ -0,0 +1,62 @@ +/* phi3_infer_test — minimal end-to-end inference test for Phi-3. + * + * Loads the model, prefills a known prompt, generates ~80 tokens with + * greedy sampling, and prints them. We're not validating quality + * against a reference here — just checking that the output is coherent + * English text instead of garbage tokens. */ +#define QUANT_IMPLEMENTATION +#include "../quant.h" + +#include +#include +#include + +static void print_token(const char* text, void* ud) { + (void)ud; + fputs(text, stdout); + fflush(stdout); +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "usage: %s [prompt]\n", argv[0]); + return 1; + } + + /* Phi-3 chat template: + * <|user|>\n{msg}<|end|>\n<|assistant|>\n + * (verified against the GGUF chat_template metadata) */ + const char* user_msg = (argc >= 3) ? 
argv[2] : "What is the capital of France?"; + char prompt[1024]; + snprintf(prompt, sizeof(prompt), + "<|user|>\n%s<|end|>\n<|assistant|>\n", user_msg); + + fprintf(stderr, "Loading %s ...\n", argv[1]); + quant_model* model = quant_load(argv[1]); + if (!model) { + fprintf(stderr, "quant_load failed\n"); + return 2; + } + + quant_config cfg = { + .temperature = 0.0f, /* greedy */ + .top_p = 1.0f, + .max_tokens = 80, + .n_threads = 4, + .kv_compress = 0, + }; + quant_ctx* ctx = quant_new(model, &cfg); + if (!ctx) { + fprintf(stderr, "quant_new failed\n"); + quant_free_model(model); + return 3; + } + + fprintf(stderr, "\n--- prompt ---\n%s\n--- response ---\n", prompt); + int n = quant_generate(ctx, prompt, print_token, NULL); + fprintf(stderr, "\n--- end ---\ngenerated %d tokens\n", n); + + quant_free_ctx(ctx); + quant_free_model(model); + return n > 0 ? 0 : 4; +}