Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions include/turboquant/tq_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ typedef struct {
float final_logit_softcap; /* logit soft-capping: logits = cap * tanh(logits/cap), 0=disabled */
float attn_logit_softcap; /* attention score soft-capping (Gemma): 0=disabled, typically 50.0 */
int* per_layer_inter_dim; /* [n_layers] per-layer intermediate_dim (NULL = use intermediate_dim) */

/* Phi-3 LongRoPE config */
int rope_orig_ctx_len; /* original context length (e.g., 4096) */
float rope_attn_factor; /* attention magnitude scaling */
const float* rope_factors_short; /* [head_dim/2] for short context */
const float* rope_factors_long; /* [head_dim/2] for long context */

/* Phi-3 fused-tensor flags — set during load */
int has_fused_qkv; /* any layer has gguf_w_qkv */
int has_fused_up_gate; /* any layer has gguf_w_up_gate */
} tq_model_config_t;

/* ============================================================
Expand Down Expand Up @@ -178,6 +188,12 @@ typedef struct {
const void* gguf_w_up; int gguf_w_up_type;
const void* gguf_w_down; int gguf_w_down_type;

/* Phi-3 fused projections (from quant.h, synced 2026-04-12).
* gguf_w_qkv: [hidden, q_dim + k_dim + v_dim] concatenated QKV
* gguf_w_up_gate: [hidden, 2 * intermediate_dim] concatenated gate||up */
const void* gguf_w_qkv; int gguf_w_qkv_type;
const void* gguf_w_up_gate; int gguf_w_up_gate_type;

/* MoE expert weights (NULL for dense FFN layers) */
void* moe; /* tq_moe_layer_t* (from tq_gguf.h), NULL if dense */

Expand Down
11 changes: 11 additions & 0 deletions src/backend/metal/tq_metal_dispatch.m
Original file line number Diff line number Diff line change
Expand Up @@ -621,10 +621,21 @@ void tq_free_metal_backend(void) {
return [[tq_mtl_device name] UTF8String];
}

/* Global kill-switch for Metal dispatch.
 *
 * The model loader invokes tq_metal_disable() when it detects a
 * fused-tensor architecture (Phi-3): the Metal matmul kernels cannot
 * cope with the non-standard output dimensions, so every later
 * availability check must report "unavailable". Sticky for the
 * lifetime of the process — there is deliberately no re-enable path. */
static int tq_metal_disabled = 0;

void tq_metal_disable(void) {
    tq_metal_disabled = 1;
}

/**
* Check if Metal backend is available and initialized.
*/
int tq_metal_available(void) {
if (tq_metal_disabled) return 0;
/* Lazy initialization: first call triggers Metal setup */
static int init_done = 0;
if (!init_done) {
Expand Down
18 changes: 13 additions & 5 deletions src/engine/tq_generate.c
Original file line number Diff line number Diff line change
Expand Up @@ -218,13 +218,21 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
int n_prompt = 0;

if (tokenizer && prompt) {
/* BOS token handling:
* Gemma 3/4: BOS=2 (required)
* LLaMA 3: BOS=128000 (<|begin_of_text|>) — but tokenizer usually adds it
* Qwen3.5: no BOS needed */
/* BOS token handling (synced from quant.h 2026-04-12):
* Gemma: always (model_type == 1).
* Phi-3 / Llama: yes if `<s>` appears among the first few vocab
*   entries (SentencePiece convention puts it at id 1; the scan
*   below only checks ids 0-7, not the whole vocab).
* Qwen3.5: no BOS needed. */
int add_bos = 0;
if (model->config.model_type == 1) {
add_bos = 1; /* Gemma: always prepend BOS=2 */
add_bos = 1;
} else {
int s_id = -1;
for (int i = 0; i < tokenizer->vocab_size && i < 8; i++) {
if (tokenizer->vocab[i] && strcmp(tokenizer->vocab[i], "<s>") == 0) {
s_id = i; break;
}
}
if (s_id >= 0) add_bos = 1;
}
n_prompt = tq_encode(tokenizer, prompt, prompt_tokens, 4096, add_bos);
} else {
Expand Down
99 changes: 89 additions & 10 deletions src/engine/tq_model.c
Original file line number Diff line number Diff line change
Expand Up @@ -2931,6 +2931,23 @@ tq_model_t* tq_load_gguf(const char* path) {
c->attn_logit_softcap = 50.0f;
}

/* Phi-3 LongRoPE config + factor tables (synced from quant.h 2026-04-12). */
c->rope_orig_ctx_len = (int)tq_gguf_get_u32(gguf,
GGUF_KEY("rope.scaling.original_context_length"), 0);
c->rope_attn_factor = tq_gguf_get_f32(gguf,
GGUF_KEY("rope.scaling.attn_factor"), 0.0f);
{
const tq_gguf_tensor_t* rfs = tq_gguf_find_tensor(gguf, "rope_factors_short.weight");
const tq_gguf_tensor_t* rfl = tq_gguf_find_tensor(gguf, "rope_factors_long.weight");
if (rfs && rfs->type == TQ_GGML_TYPE_F32) c->rope_factors_short = (const float*)rfs->data;
if (rfl && rfl->type == TQ_GGML_TYPE_F32) c->rope_factors_long = (const float*)rfl->data;
if (rfs || rfl) {
fprintf(stderr,
"tq_load_gguf: LongRoPE detected — orig_ctx=%d, attn_factor=%.4f\n",
c->rope_orig_ctx_len, c->rope_attn_factor);
}
}

/* Cap context for memory safety on small machines.
* GGUF models often claim 262K context but we cap at 4096 by default.
* Users can override with --ctx flag in quant. */
Expand Down Expand Up @@ -3219,10 +3236,26 @@ tq_model_t* tq_load_gguf(const char* path) {
}
}

/* Attention weights — keep as GGUF quantized pointers for on-the-fly dequant.
* We store the raw data pointer + type info using a small struct packed into
* the existing FP32 weight pointer fields. For GGUF models, we use a special
* dispatch: if gguf_ctx is non-NULL, the forward pass uses tq_matmul_gguf. */
/* Phi-3 fused QKV detection (synced from quant.h 2026-04-12).
* Phi-3 ships a single `blk.N.attn_qkv.weight` tensor concatenating
* the Q, K and V projections — [hidden, q_dim + k_dim + v_dim],
* which is [hidden, 3*hidden] only when there is no GQA — instead of
* three separate `attn_q/k/v.weight` tensors. */
snprintf(tname, sizeof(tname), "blk.%d.attn_qkv.weight", l);
const tq_gguf_tensor_t* wqkv_t = find_gguf_tensor(gguf, tname);
if (wqkv_t && !layer->delta_a_log) {
/* Only take the fused path when there are NO DeltaNet weights —
* otherwise the DeltaNet code below handles attn_qkv itself.
* NOTE(review): delta_a_log is populated by the DeltaNet loading
* code *after* post_attn_load, so unless it is set by an earlier
* pass this check is always true here — confirm the ordering. */
layer->gguf_w_qkv = wqkv_t->data;
layer->gguf_w_qkv_type = wqkv_t->type;
c->has_fused_qkv = 1;
snprintf(tname, sizeof(tname), "blk.%d.attn_output.weight", l);
t = find_gguf_tensor(gguf, tname);
if (t) { layer->gguf_wo = t->data; layer->gguf_wo_type = t->type; }
attn_indices[n_attn_layers++] = l;
goto post_attn_load;
}

/* Standard llama-style attention weights — keep as GGUF quantized
* pointers for on-the-fly dequant. */
snprintf(tname, sizeof(tname), "blk.%d.attn_q.weight", l);
const tq_gguf_tensor_t* wq_t = find_gguf_tensor(gguf, tname);
int is_attn_layer = (wq_t != NULL);
Expand Down Expand Up @@ -3265,6 +3298,7 @@ tq_model_t* tq_load_gguf(const char* path) {
attn_indices[n_attn_layers++] = l;
}

post_attn_load:
/* Check for DeltaNet / SSM weights (Qwen3.5 hybrid) */
snprintf(tname, sizeof(tname), "blk.%d.ssm_a", l);
t = find_gguf_tensor(gguf, tname);
Expand Down Expand Up @@ -3518,13 +3552,28 @@ tq_model_t* tq_load_gguf(const char* path) {
if (t) { layer->gguf_w_down = t->data; layer->gguf_w_down_type = t->type; }
}
} else {
/* Dense model: use GGUF on-the-fly dequant */
/* Dense model: use GGUF on-the-fly dequant.
* Phi-3 fused FFN: when `ffn_up` has shape [hidden, 2*ff] AND
* there is no separate `ffn_gate`, it's a fused gate||up tensor. */
snprintf(tname, sizeof(tname), "blk.%d.ffn_gate.weight", l);
t = find_gguf_tensor(gguf, tname);
if (t) { layer->gguf_w_gate = t->data; layer->gguf_w_gate_type = t->type; }

snprintf(tname, sizeof(tname), "blk.%d.ffn_up.weight", l);
t = find_gguf_tensor(gguf, tname);
if (t) { layer->gguf_w_up = t->data; layer->gguf_w_up_type = t->type; }
if (t) {
if (!layer->gguf_w_gate && t->n_dims >= 2 &&
c->intermediate_dim > 0 &&
(int)t->shape[1] == 2 * c->intermediate_dim) {
layer->gguf_w_up_gate = t->data;
layer->gguf_w_up_gate_type = t->type;
c->has_fused_up_gate = 1;
} else {
layer->gguf_w_up = t->data;
layer->gguf_w_up_type = t->type;
}
}

snprintf(tname, sizeof(tname), "blk.%d.ffn_down.weight", l);
t = find_gguf_tensor(gguf, tname);
if (t) { layer->gguf_w_down = t->data; layer->gguf_w_down_type = t->type; }
Expand All @@ -3540,6 +3589,20 @@ tq_model_t* tq_load_gguf(const char* path) {
n_attn_layers, c->n_layers);
}

/* Hard-fail when neither standard self_attn nor DeltaNet was detected.
* (Synced from quant.h — prevents silent garbage from unsupported archs.) */
if (n_attn_layers == 0 && c->delta_n_heads == 0) {
fprintf(stderr,
"tq_load_gguf: ERROR — model architecture '%s' is not supported.\n"
" Detected 0 self_attn layers and no DeltaNet weights.\n"
" This usually means the model uses fused QKV projection\n"
" (e.g., Phi-3 `attn_qkv`) which this build does not yet handle.\n"
" See docs/supported_models.md for the architecture support matrix.\n",
gguf->arch[0] ? gguf->arch : "unknown");
tq_free_model(model);
return NULL;
}

/* Set up layer_is_sliding for Gemma hybrid attention.
* Detect from K tensor shape: sliding and full layers have different K output dims.
* The MAJORITY of layers are sliding (e.g., 25/30 or 28/35). */
Expand Down Expand Up @@ -4072,9 +4135,20 @@ skip_q4_conversion: ;
* Adding +1 at runtime would double-apply and cause activation explosion.
* The Gemma heuristic above (mean > 2.0 check) handles the Gemma case. */

/* Initialize persistent Metal GPU buffers for layer-level compute */
/* Initialize persistent Metal GPU buffers for layer-level compute.
*
* Skip Metal for Phi-3 fused-tensor models: the Metal matmul kernels
* assume standard separate-tensor layouts (Q4_K blocks per row,
* fixed output buffer sizes). Fused QKV and fused gate||up produce
* larger output vectors that the Metal kernel doesn't handle.
*
* This is the right trade-off because:
* 1. CPU NEON Q4×Q8 is already faster than Metal for sub-4B models
* (measured: 95 tok/s CPU vs 38 tok/s GPU on SmolLM2).
* 2. Phi-3's 32K vocab means the lm_head matmul (where Metal helps
* most due to large output dim) is small — CPU handles it fine. */
#ifdef TQ_HAS_METAL
{
if (!c->has_fused_qkv && !c->has_fused_up_gate) {
extern int tq_metal_gpu_init_buffers(int, int, int, int);
extern int tq_metal_gpu_init_attn(int, int, int);
int max_q_dim = c->n_heads * c->head_dim;
Expand All @@ -4086,9 +4160,14 @@ skip_q4_conversion: ;
if (full_kv > max_kv_dim) max_kv_dim = full_kv;
}
tq_metal_gpu_init_buffers(c->hidden_dim, c->intermediate_dim, max_q_dim, max_kv_dim);

/* Initialize attention + KV cache GPU buffers for compute graph forward */
tq_metal_gpu_init_attn(c->n_heads, c->max_seq_len, max_kv_dim);
} else {
/* Disable Metal matmul dispatch globally for this process.
* The Metal backend is still initialized (MoE kernels etc.) but
* tq_matmul_gguf will check this flag and skip GPU dispatch. */
extern void tq_metal_disable(void);
tq_metal_disable();
fprintf(stderr, "tq_load_gguf: Metal GPU dispatch disabled (fused-tensor model — CPU is faster)\n");
}
#endif

Expand Down
12 changes: 9 additions & 3 deletions src/engine/tq_tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1182,11 +1182,17 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text,
int n_tokens = 0;

/* Add BOS token if requested.
* Gemma: BOS=2, Qwen: no BOS (uses <|im_start|> instead) */
* Different model families use different BOS strings (synced from
* quant.h 2026-04-12):
* Gemma: <bos> (id 2)
* Llama / Phi-3: <s> (id 1) — SentencePiece convention
* Qwen / ChatML: <|im_start|>
* Llama-3: <|begin_of_text|> */
if (add_bos) {
/* Look up <bos> token in vocab; default to id 2 (Gemma convention) */
int bos_id = str_lookup(tok, "<bos>");
if (bos_id < 0) { bos_id = str_lookup(tok, "<|im_start|>"); }
if (bos_id < 0) bos_id = str_lookup(tok, "<s>");
if (bos_id < 0) bos_id = str_lookup(tok, "<|im_start|>");
if (bos_id < 0) bos_id = str_lookup(tok, "<|begin_of_text|>");
if (bos_id >= 0) {
tokens[n_tokens++] = bos_id;
}
Expand Down
Loading