Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 33 additions & 3 deletions bindings/python/quantcpp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,9 @@ def cmd_run(args):
m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature,
n_threads=args.threads)

if args.prompt:
question = " ".join(args.prompt) if isinstance(args.prompt, list) else args.prompt
prompt_parts = args.prompt if args.prompt else None
if prompt_parts:
question = " ".join(prompt_parts) if isinstance(prompt_parts, list) else prompt_parts
for tok in m.generate(question):
print(tok, end="", flush=True)
print()
Expand Down Expand Up @@ -357,6 +358,8 @@ def cmd_chat_default(args):
def main():
import argparse

from quantcpp import __version__

parser = argparse.ArgumentParser(
prog="quantcpp",
description="Chat with a local LLM. No API key, no GPU, no server.",
Expand Down Expand Up @@ -387,6 +390,8 @@ def main():
""",
)

parser.add_argument("--version", action="version", version=f"quantcpp {__version__}")

sub = parser.add_subparsers(dest="command")

# pull
Expand Down Expand Up @@ -433,7 +438,32 @@ def main():
parser.add_argument("--temperature", "-t", type=float, default=0.7)
parser.add_argument("--threads", "-j", type=int, default=4)

args = parser.parse_args()
# Backwards-compat (issue #54): if the first positional arg is not a
# known subcommand, treat all positionals as a prompt. We must detect
# this BEFORE argparse sees the argv, because the subparser will reject
# unknown choices with an error.
known_commands = {"pull", "list", "run", "serve", "client"}
argv = sys.argv[1:]

first_pos = None
for a in argv:
if a.startswith("-"):
continue
first_pos = a
break

if first_pos and first_pos not in known_commands:
# Parse with a minimal parser that has no subcommands
compat = argparse.ArgumentParser(prog="quantcpp", add_help=False)
compat.add_argument("prompt", nargs="*", default=None)
compat.add_argument("--model", "-m", default=None)
compat.add_argument("--max-tokens", "-n", type=int, default=256)
compat.add_argument("--temperature", "-t", type=float, default=0.7)
compat.add_argument("--threads", "-j", type=int, default=4)
args = compat.parse_args(argv)
return cmd_chat_default(args)

args = parser.parse_args(argv)

if args.command == "pull":
return cmd_pull(args)
Expand Down
14 changes: 14 additions & 0 deletions include/turboquant/tq_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ typedef struct {
float final_logit_softcap; /* logit soft-capping: logits = cap * tanh(logits/cap), 0=disabled */
float attn_logit_softcap; /* attention score soft-capping (Gemma): 0=disabled, typically 50.0 */
int* per_layer_inter_dim; /* [n_layers] per-layer intermediate_dim (NULL = use intermediate_dim) */

/* Phi-3 LongRoPE parameters */
int rope_orig_ctx_len; /* original context length (e.g., 4096) */
float rope_attn_factor; /* attention magnitude scaling */
const float* rope_factors_short; /* [head_dim/2] for short context */
const float* rope_factors_long; /* [head_dim/2] for long context */

/* Phi-3 fused-tensor flags — drive state buffer sizing */
int has_fused_qkv; /* any layer has gguf_w_qkv */
int has_fused_up_gate; /* any layer has gguf_w_up_gate */
} tq_model_config_t;

/* ============================================================
Expand Down Expand Up @@ -173,6 +183,10 @@ typedef struct {
const void* gguf_delta_a; int gguf_delta_a_type;
const void* gguf_delta_b; int gguf_delta_b_type;
const void* gguf_delta_out; int gguf_delta_out_type;
/* Phi-3 fused projections — one matmul + memcpy split */
const void* gguf_w_qkv; int gguf_w_qkv_type; /* [hidden, q+k+v] fused QKV */
const void* gguf_w_up_gate; int gguf_w_up_gate_type; /* [hidden, 2*inter] fused gate||up */

/* GGUF FFN (dense layers in MoE models) */
const void* gguf_w_gate; int gguf_w_gate_type;
const void* gguf_w_up; int gguf_w_up_type;
Expand Down
26 changes: 18 additions & 8 deletions quant.h
Original file line number Diff line number Diff line change
Expand Up @@ -13317,12 +13317,16 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
s->delta_dvec = (float*)calloc((size_t)dv, sizeof(float));
}

/* Quantization workspace */
/* Quantization workspace — use MAX head_dim for hybrid attention (Gemma 4).
* Sliding layers have head_dim=256, full layers have head_dim=512.
* Quantized cache must accommodate the larger dimension. (issue #61) */
size_t block_size = tq_type_block_size(kv_type);
size_t type_size = tq_type_type_size(kv_type);
if (block_size == 0) block_size = TQ_BK;
if (type_size == 0) type_size = sizeof(block_tq_uniform_4b);
size_t n_blocks_per_head = ((size_t)config->head_dim + block_size - 1) / block_size;
int max_head_dim = config->head_dim;
if (config->full_head_dim > max_head_dim) max_head_dim = config->full_head_dim;
size_t n_blocks_per_head = ((size_t)max_head_dim + block_size - 1) / block_size;
/* quant_key_buf is used as a gather buffer for integer attention:
* we collect quantized key blocks for one KV head across all seq positions.
* Size needed: max_seq_len * blocks_per_head * type_size */
Expand All @@ -13337,7 +13341,10 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
* Layout: [n_layers][max_seq_len][n_kv_heads][blocks_per_head * type_size]
* Each key vector is quantized when stored, then reused for fast Q4xQ8 attention. */
s->quant_head_stride = n_blocks_per_head * type_size;
size_t quant_pos_stride = s->quant_head_stride * (size_t)config->n_kv_heads;
/* Use max kv_heads for position stride (hybrid: sliding=8, full=2 but larger heads) */
int max_kv_heads = config->n_kv_heads;
if (config->full_n_kv_heads > max_kv_heads) max_kv_heads = config->full_n_kv_heads;
size_t quant_pos_stride = s->quant_head_stride * (size_t)max_kv_heads;
s->quant_kv_stride = quant_pos_stride * (size_t)max_seq;
if (kv_type < TQ_TYPE_COUNT) {
s->quant_key_cache = calloc((size_t)n_layers * s->quant_kv_stride, 1);
Expand Down Expand Up @@ -14388,15 +14395,17 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
/* Quantized KV cache: stride was allocated with sliding dims (c->n_kv_heads, c->head_dim).
* For hybrid attention full layers with different head_dim, skip quant cache
* (quant_head_stride doesn't match). Fall back to FP32 cache for those layers. */
/* Hybrid attention KV cache: allocated with max(sliding, full) dimensions.
* quant_head_stride uses max_head_dim, quant_pos_stride uses max_kv_heads.
* Both sliding and full layers can use the quantized cache. (issue #61) */
int cache_n_kv_heads = c->n_kv_heads;
if (head_dim != c->head_dim) {
/* Full layer: head_dim mismatch with quant cache allocation.
* Disable both quantized and integer attention → use FP32 path. */
if (c->full_n_kv_heads > cache_n_kv_heads) cache_n_kv_heads = c->full_n_kv_heads;
if (head_dim != c->head_dim && c->full_head_dim == 0) {
/* Non-hybrid head_dim mismatch — disable quantized path */
use_quant_kv = 0;
use_int_attn = 0;
/* Ensure K is stored in FP32 cache (may have been skipped above) */
memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
} else if (use_int_attn && head_dim != c->head_dim) {
} else if (use_int_attn && head_dim != c->head_dim && c->full_head_dim == 0) {
use_int_attn = 0;
memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
}
Expand Down Expand Up @@ -16297,6 +16306,7 @@ static int chat_find_marker(const char* h, int hlen, const char* m) {
/* End-of-turn marker strings scanned for in generated chat output
 * (see chat_find_marker above).  The array is NULL-terminated so
 * callers can iterate without a separate count; the string order is
 * preserved from the original list. */
static const char* const CHAT_END_MARKERS[] = {
    "<|im_end|>",
    "<|eot_id|>",
    "<end_of_turn>",
    "<|endoftext|>",
    "<|im_start|>",
    "<|start_header_id|>",
    "<|eom_id|>",
    "</s>",    /* added alongside Phi-3 support in this change */
    "<|end|>", /* added alongside Phi-3 support in this change */
    NULL,      /* sentinel */
};

Expand Down
41 changes: 41 additions & 0 deletions src/backend/cpu/tq_cpu_dispatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,17 @@ extern void tq_qjl_attention_avx2(const float* q, const void* kv,
float* s, int seq, int hd);
#endif

#if defined(__ARM_FEATURE_SVE)
/* SVE optimized implementations (stubs — delegate to reference for now) */
extern void tq_uniform_4b_quantize_sve(const float* src, void* dst, int n);
extern void tq_uniform_4b_dequantize_sve(const void* src, float* dst, int n);
extern void tq_polar_quantize_sve(const float* src, void* dst, int n);
extern void tq_polar_dequantize_sve(const void* src, float* dst, int n);
extern void tq_qjl_quantize_sve(const float* src, void* dst, int n);
extern void tq_qjl_attention_sve(const float* q, const void* kv,
float* s, int seq, int hd);
#endif

/* ================================================================
* CPU feature detection
* ================================================================ */
Expand Down Expand Up @@ -118,6 +129,23 @@ void tq_cpu_dispatch_init(void) {
tq_dispatch_table[TQ_TYPE_QJL_1B].attention = tq_qjl_attention_neon;
#endif

/* --- ARM SVE dispatch (compile-time detection) --- */
#if defined(__ARM_FEATURE_SVE)
/* SVE takes priority over NEON when available (wider vectors).
* Currently stubs that delegate to reference — swap with real
* SVE implementations as they are developed. */
tq_dispatch_table[TQ_TYPE_UNIFORM_4B].quantize = tq_uniform_4b_quantize_sve;
tq_dispatch_table[TQ_TYPE_UNIFORM_4B].dequantize = tq_uniform_4b_dequantize_sve;

tq_dispatch_table[TQ_TYPE_POLAR_3B].quantize = tq_polar_quantize_sve;
tq_dispatch_table[TQ_TYPE_POLAR_3B].dequantize = tq_polar_dequantize_sve;
tq_dispatch_table[TQ_TYPE_POLAR_4B].quantize = tq_polar_quantize_sve;
tq_dispatch_table[TQ_TYPE_POLAR_4B].dequantize = tq_polar_dequantize_sve;

tq_dispatch_table[TQ_TYPE_QJL_1B].quantize = tq_qjl_quantize_sve;
tq_dispatch_table[TQ_TYPE_QJL_1B].attention = tq_qjl_attention_sve;
#endif

/* --- x86 AVX2 dispatch (runtime detection) --- */
#if defined(__AVX2__)
/* If compiled with -mavx2, AVX2 is always available */
Expand Down Expand Up @@ -173,6 +201,19 @@ const char* tq_get_dispatch_backend(tq_type type) {
if (!tq_dispatch_initialized) tq_cpu_dispatch_init();
if (type < 0 || type >= TQ_TYPE_COUNT) return "unknown";

#if defined(__ARM_FEATURE_SVE)
/* Check if using SVE versions */
if (type == TQ_TYPE_UNIFORM_4B &&
tq_dispatch_table[type].quantize == tq_uniform_4b_quantize_sve)
return "sve";
if ((type == TQ_TYPE_POLAR_3B || type == TQ_TYPE_POLAR_4B) &&
tq_dispatch_table[type].quantize == tq_polar_quantize_sve)
return "sve";
if (type == TQ_TYPE_QJL_1B &&
tq_dispatch_table[type].quantize == tq_qjl_quantize_sve)
return "sve";
#endif

#if defined(__ARM_NEON)
/* Check if using NEON versions */
if (type == TQ_TYPE_UNIFORM_4B &&
Expand Down
6 changes: 5 additions & 1 deletion src/engine/tq_generate.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,11 +220,14 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
if (tokenizer && prompt) {
/* BOS token handling:
* Gemma 3/4: BOS=2 (required)
* Phi-3: BOS via <s> (required — garbage without it)
* LLaMA 3: BOS=128000 (<|begin_of_text|>) — but tokenizer usually adds it
* Qwen3.5: no BOS needed */
int add_bos = 0;
if (model->config.model_type == 1) {
add_bos = 1; /* Gemma: always prepend BOS=2 */
} else if (model->config.has_fused_qkv) {
add_bos = 1; /* Phi-3: requires <s> BOS */
}
n_prompt = tq_encode(tokenizer, prompt, prompt_tokens, 4096, add_bos);
} else {
Expand Down Expand Up @@ -645,7 +648,7 @@ int tq_generate_continue(tq_model_t* model,
if (!new_tokens) return -1;
int n_new = 0;
if (tokenizer && prompt) {
int add_bos = (model->config.model_type == 1) ? 1 : 0;
int add_bos = (model->config.model_type == 1 || model->config.has_fused_qkv) ? 1 : 0;
n_new = tq_encode(tokenizer, prompt, new_tokens, max_prompt, add_bos);
}
if (n_new <= 0) {
Expand Down Expand Up @@ -905,6 +908,7 @@ static int chat_find_marker(const char* h, int hlen, const char* m) {
/* Terminator strings checked against generated chat output
 * (see chat_find_marker above).  NULL-terminated so callers can
 * iterate without a length; order of entries is unchanged from the
 * original list. */
static const char* const CHAT_END_MARKERS[] = {
    "<|im_end|>",
    "<|eot_id|>",
    "<end_of_turn>",
    "<|endoftext|>",
    "<|im_start|>",
    "<|start_header_id|>",
    "<|eom_id|>",
    "</s>",    /* added alongside Phi-3 support in this change */
    "<|end|>", /* added alongside Phi-3 support in this change */
    NULL,      /* sentinel */
};

Expand Down
82 changes: 81 additions & 1 deletion src/engine/tq_model.c
Original file line number Diff line number Diff line change
Expand Up @@ -2931,6 +2931,20 @@ tq_model_t* tq_load_gguf(const char* path) {
c->attn_logit_softcap = 50.0f;
}

/* LongRoPE config (Phi-3 etc.) */
c->rope_orig_ctx_len = (int)tq_gguf_get_u32(gguf, GGUF_KEY("rope.scaling.original_context_length"), 0);
c->rope_attn_factor = tq_gguf_get_f32(gguf, GGUF_KEY("rope.scaling.attn_factor"), 0.0f);
{
const tq_gguf_tensor_t* rfs = tq_gguf_find_tensor(gguf, "rope_factors_short.weight");
const tq_gguf_tensor_t* rfl = tq_gguf_find_tensor(gguf, "rope_factors_long.weight");
if (rfs && rfs->type == TQ_GGML_TYPE_F32) c->rope_factors_short = (const float*)rfs->data;
if (rfl && rfl->type == TQ_GGML_TYPE_F32) c->rope_factors_long = (const float*)rfl->data;
if (rfs || rfl) {
fprintf(stderr, "tq_load_gguf: LongRoPE detected — orig_ctx=%d, attn_factor=%.4f\n",
c->rope_orig_ctx_len, c->rope_attn_factor);
}
}

/* Cap context for memory safety on small machines.
* GGUF models often claim 262K context but we cap at 4096 by default.
* Users can override with --ctx flag in quant. */
Expand Down Expand Up @@ -3223,6 +3237,23 @@ tq_model_t* tq_load_gguf(const char* path) {
* We store the raw data pointer + type info using a small struct packed into
* the existing FP32 weight pointer fields. For GGUF models, we use a special
* dispatch: if gguf_ctx is non-NULL, the forward pass uses tq_matmul_gguf. */

/* Fused QKV detection (Phi-3 etc.): attn_qkv.weight contains Q, K, V concatenated */
snprintf(tname, sizeof(tname), "blk.%d.attn_qkv.weight", l);
const tq_gguf_tensor_t* wqkv_t = find_gguf_tensor(gguf, tname);
if (wqkv_t) {
layer->gguf_w_qkv = wqkv_t->data;
layer->gguf_w_qkv_type = wqkv_t->type;
c->has_fused_qkv = 1;

snprintf(tname, sizeof(tname), "blk.%d.attn_output.weight", l);
t = find_gguf_tensor(gguf, tname);
if (t) { layer->gguf_wo = t->data; layer->gguf_wo_type = t->type; }

attn_indices[n_attn_layers++] = l;
goto post_attn_load; /* Skip standard attn_q/k/v loading */
}

snprintf(tname, sizeof(tname), "blk.%d.attn_q.weight", l);
const tq_gguf_tensor_t* wq_t = find_gguf_tensor(gguf, tname);
int is_attn_layer = (wq_t != NULL);
Expand Down Expand Up @@ -3264,6 +3295,7 @@ tq_model_t* tq_load_gguf(const char* path) {

attn_indices[n_attn_layers++] = l;
}
post_attn_load: ; /* Both fused QKV and standard Q/K/V paths converge here */

/* Check for DeltaNet / SSM weights (Qwen3.5 hybrid) */
snprintf(tname, sizeof(tname), "blk.%d.ssm_a", l);
Expand Down Expand Up @@ -3524,7 +3556,18 @@ tq_model_t* tq_load_gguf(const char* path) {
if (t) { layer->gguf_w_gate = t->data; layer->gguf_w_gate_type = t->type; }
snprintf(tname, sizeof(tname), "blk.%d.ffn_up.weight", l);
t = find_gguf_tensor(gguf, tname);
if (t) { layer->gguf_w_up = t->data; layer->gguf_w_up_type = t->type; }
if (t) {
/* Phi-3 fused gate||up: ffn_up contains both gate and up projections
* concatenated along output dim (shape[1] == 2 * intermediate_dim) */
if (c->intermediate_dim > 0 && (int)t->shape[1] == 2 * c->intermediate_dim) {
layer->gguf_w_up_gate = t->data;
layer->gguf_w_up_gate_type = t->type;
c->has_fused_up_gate = 1;
} else {
layer->gguf_w_up = t->data;
layer->gguf_w_up_type = t->type;
}
}
snprintf(tname, sizeof(tname), "blk.%d.ffn_down.weight", l);
t = find_gguf_tensor(gguf, tname);
if (t) { layer->gguf_w_down = t->data; layer->gguf_w_down_type = t->type; }
Expand Down Expand Up @@ -4412,6 +4455,43 @@ void tq_free_model(tq_model_t* model) {
}
}
free(model->moe_config);

/* Free dequantized norm/embedding buffers (GGUF path only).
* In the GGUF path, dequant_tensor_fp32() individually malloc's each
* norm weight. In the SafeTensor path, these point into _converted_data
* (freed above), so we must NOT free them again. (issue #60) */
if (model->gguf_ctx && model->layers) {
for (int l = 0; l < model->config.n_layers; l++) {
tq_layer_weights_t* layer = &model->layers[l];
free(layer->attn_norm);
free(layer->ffn_norm);
free(layer->q_norm);
free(layer->k_norm);
free(layer->post_attn_norm);
free(layer->post_ffn_norm);
free(layer->pre_ffn_norm);
free(layer->post_ffn_norm_1);
free(layer->pre_ffn_norm_2);
free(layer->post_ffn_norm_2);
free(layer->ple_norm);
free(layer->delta_a_log);
free(layer->delta_conv1d);
free(layer->delta_dt_bias);
free(layer->delta_in_proj_qkv);
free(layer->delta_in_proj_z);
free(layer->delta_norm);
free(layer->delta_in_proj_a);
free(layer->delta_in_proj_b);
free(layer->delta_out_proj);
}
free(model->token_embedding);
free(model->output_weight);
free(model->output_norm);
free(model->rope_freqs);
free(model->ple_proj);
free(model->ple_proj_norm);
}

free(model->layers);

/* Free GGUF context (handles munmap internally) */
Expand Down
1 change: 1 addition & 0 deletions src/engine/tq_tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1186,6 +1186,7 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text,
if (add_bos) {
/* Look up <bos> token in vocab; default to id 2 (Gemma convention) */
int bos_id = str_lookup(tok, "<bos>");
if (bos_id < 0) { bos_id = str_lookup(tok, "<s>"); }
if (bos_id < 0) { bos_id = str_lookup(tok, "<|im_start|>"); }
if (bos_id >= 0) {
tokens[n_tokens++] = bos_id;
Expand Down
Loading
Loading