Skip to content

Commit 34e5539

Browse files
committed
Merge branch 'fix/qwen35-quant-ask' — 13 issues resolved + sync hardening
2 parents eb4f7d1 + 221efbb commit 34e5539

11 files changed

Lines changed: 600 additions & 49 deletions

File tree

bindings/python/quantcpp/cli.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,9 @@ def cmd_run(args):
153153
m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature,
154154
n_threads=args.threads)
155155

156-
if args.prompt:
157-
question = " ".join(args.prompt) if isinstance(args.prompt, list) else args.prompt
156+
prompt_parts = args.prompt if args.prompt else None
157+
if prompt_parts:
158+
question = " ".join(prompt_parts) if isinstance(prompt_parts, list) else prompt_parts
158159
for tok in m.generate(question):
159160
print(tok, end="", flush=True)
160161
print()
@@ -357,6 +358,8 @@ def cmd_chat_default(args):
357358
def main():
358359
import argparse
359360

361+
from quantcpp import __version__
362+
360363
parser = argparse.ArgumentParser(
361364
prog="quantcpp",
362365
description="Chat with a local LLM. No API key, no GPU, no server.",
@@ -387,6 +390,8 @@ def main():
387390
""",
388391
)
389392

393+
parser.add_argument("--version", action="version", version=f"quantcpp {__version__}")
394+
390395
sub = parser.add_subparsers(dest="command")
391396

392397
# pull
@@ -433,7 +438,32 @@ def main():
433438
parser.add_argument("--temperature", "-t", type=float, default=0.7)
434439
parser.add_argument("--threads", "-j", type=int, default=4)
435440

436-
args = parser.parse_args()
441+
# Backwards-compat (issue #54): if the first positional arg is not a
442+
# known subcommand, treat all positionals as a prompt. We must detect
443+
# this BEFORE argparse sees the argv, because the subparser will reject
444+
# unknown choices with an error.
445+
known_commands = {"pull", "list", "run", "serve", "client"}
446+
argv = sys.argv[1:]
447+
448+
first_pos = None
449+
for a in argv:
450+
if a.startswith("-"):
451+
continue
452+
first_pos = a
453+
break
454+
455+
if first_pos and first_pos not in known_commands:
456+
# Parse with a minimal parser that has no subcommands
457+
compat = argparse.ArgumentParser(prog="quantcpp", add_help=False)
458+
compat.add_argument("prompt", nargs="*", default=None)
459+
compat.add_argument("--model", "-m", default=None)
460+
compat.add_argument("--max-tokens", "-n", type=int, default=256)
461+
compat.add_argument("--temperature", "-t", type=float, default=0.7)
462+
compat.add_argument("--threads", "-j", type=int, default=4)
463+
args = compat.parse_args(argv)
464+
return cmd_chat_default(args)
465+
466+
args = parser.parse_args(argv)
437467

438468
if args.command == "pull":
439469
return cmd_pull(args)

include/turboquant/tq_engine.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,16 @@ typedef struct {
6363
float final_logit_softcap; /* logit soft-capping: logits = cap * tanh(logits/cap), 0=disabled */
6464
float attn_logit_softcap; /* attention score soft-capping (Gemma): 0=disabled, typically 50.0 */
6565
int* per_layer_inter_dim; /* [n_layers] per-layer intermediate_dim (NULL = use intermediate_dim) */
66+
67+
/* Phi-3 LongRoPE parameters */
68+
int rope_orig_ctx_len; /* original context length (e.g., 4096) */
69+
float rope_attn_factor; /* attention magnitude scaling */
70+
const float* rope_factors_short; /* [head_dim/2] for short context */
71+
const float* rope_factors_long; /* [head_dim/2] for long context */
72+
73+
/* Phi-3 fused-tensor flags — drive state buffer sizing */
74+
int has_fused_qkv; /* any layer has gguf_w_qkv */
75+
int has_fused_up_gate; /* any layer has gguf_w_up_gate */
6676
} tq_model_config_t;
6777

6878
/* ============================================================
@@ -173,6 +183,10 @@ typedef struct {
173183
const void* gguf_delta_a; int gguf_delta_a_type;
174184
const void* gguf_delta_b; int gguf_delta_b_type;
175185
const void* gguf_delta_out; int gguf_delta_out_type;
186+
/* Phi-3 fused projections — one matmul + memcpy split */
187+
const void* gguf_w_qkv; int gguf_w_qkv_type; /* [hidden, q+k+v] fused QKV */
188+
const void* gguf_w_up_gate; int gguf_w_up_gate_type; /* [hidden, 2*inter] fused gate||up */
189+
176190
/* GGUF FFN (dense layers in MoE models) */
177191
const void* gguf_w_gate; int gguf_w_gate_type;
178192
const void* gguf_w_up; int gguf_w_up_type;

quant.h

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12942,6 +12942,43 @@ void tq_free_model(tq_model_t* model) {
1294212942
}
1294312943
}
1294412944
free(model->moe_config);
12945+
12946+
/* Free dequantized norm/embedding buffers (GGUF path only).
12947+
* In the GGUF path, dequant_tensor_fp32() individually malloc's each
12948+
* norm weight. In the SafeTensor path, these point into _converted_data
12949+
* (freed above), so we must NOT free them again. */
12950+
if (model->gguf_ctx && model->layers) {
12951+
for (int l = 0; l < model->config.n_layers; l++) {
12952+
tq_layer_weights_t* layer = &model->layers[l];
12953+
free(layer->attn_norm);
12954+
free(layer->ffn_norm);
12955+
free(layer->q_norm);
12956+
free(layer->k_norm);
12957+
free(layer->post_attn_norm);
12958+
free(layer->post_ffn_norm);
12959+
free(layer->pre_ffn_norm);
12960+
free(layer->post_ffn_norm_1);
12961+
free(layer->pre_ffn_norm_2);
12962+
free(layer->post_ffn_norm_2);
12963+
free(layer->ple_norm);
12964+
free(layer->delta_a_log);
12965+
free(layer->delta_conv1d);
12966+
free(layer->delta_dt_bias);
12967+
free(layer->delta_in_proj_qkv);
12968+
free(layer->delta_in_proj_z);
12969+
free(layer->delta_norm);
12970+
free(layer->delta_in_proj_a);
12971+
free(layer->delta_in_proj_b);
12972+
free(layer->delta_out_proj);
12973+
}
12974+
free(model->token_embedding);
12975+
free(model->output_weight);
12976+
free(model->output_norm);
12977+
free(model->rope_freqs);
12978+
free(model->ple_proj);
12979+
free(model->ple_proj_norm);
12980+
}
12981+
1294512982
free(model->layers);
1294612983

1294712984
/* Free GGUF context (handles munmap internally) */
@@ -13317,12 +13354,16 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
1331713354
s->delta_dvec = (float*)calloc((size_t)dv, sizeof(float));
1331813355
}
1331913356

13320-
/* Quantization workspace */
13357+
/* Quantization workspace — use MAX head_dim for hybrid attention (Gemma 4).
13358+
* Sliding layers have head_dim=256, full layers have head_dim=512.
13359+
* Quantized cache must accommodate the larger dimension. (issue #61) */
1332113360
size_t block_size = tq_type_block_size(kv_type);
1332213361
size_t type_size = tq_type_type_size(kv_type);
1332313362
if (block_size == 0) block_size = TQ_BK;
1332413363
if (type_size == 0) type_size = sizeof(block_tq_uniform_4b);
13325-
size_t n_blocks_per_head = ((size_t)config->head_dim + block_size - 1) / block_size;
13364+
int max_head_dim = config->head_dim;
13365+
if (config->full_head_dim > max_head_dim) max_head_dim = config->full_head_dim;
13366+
size_t n_blocks_per_head = ((size_t)max_head_dim + block_size - 1) / block_size;
1332613367
/* quant_key_buf is used as a gather buffer for integer attention:
1332713368
* we collect quantized key blocks for one KV head across all seq positions.
1332813369
* Size needed: max_seq_len * blocks_per_head * type_size */
@@ -13337,7 +13378,10 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
1333713378
* Layout: [n_layers][max_seq_len][n_kv_heads][blocks_per_head * type_size]
1333813379
* Each key vector is quantized when stored, then reused for fast Q4xQ8 attention. */
1333913380
s->quant_head_stride = n_blocks_per_head * type_size;
13340-
size_t quant_pos_stride = s->quant_head_stride * (size_t)config->n_kv_heads;
13381+
/* Use max kv_heads for position stride (hybrid: sliding=8, full=2 but larger heads) */
13382+
int max_kv_heads = config->n_kv_heads;
13383+
if (config->full_n_kv_heads > max_kv_heads) max_kv_heads = config->full_n_kv_heads;
13384+
size_t quant_pos_stride = s->quant_head_stride * (size_t)max_kv_heads;
1334113385
s->quant_kv_stride = quant_pos_stride * (size_t)max_seq;
1334213386
if (kv_type < TQ_TYPE_COUNT) {
1334313387
s->quant_key_cache = calloc((size_t)n_layers * s->quant_kv_stride, 1);
@@ -14388,15 +14432,17 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
1438814432
/* Quantized KV cache: stride was allocated with sliding dims (c->n_kv_heads, c->head_dim).
1438914433
* For hybrid attention full layers with different head_dim, skip quant cache
1439014434
* (quant_head_stride doesn't match). Fall back to FP32 cache for those layers. */
14435+
/* Hybrid attention KV cache: allocated with max(sliding, full) dimensions.
14436+
* quant_head_stride uses max_head_dim, quant_pos_stride uses max_kv_heads.
14437+
* Both sliding and full layers can use the quantized cache. (issue #61) */
1439114438
int cache_n_kv_heads = c->n_kv_heads;
14392-
if (head_dim != c->head_dim) {
14393-
/* Full layer: head_dim mismatch with quant cache allocation.
14394-
* Disable both quantized and integer attention → use FP32 path. */
14439+
if (c->full_n_kv_heads > cache_n_kv_heads) cache_n_kv_heads = c->full_n_kv_heads;
14440+
if (head_dim != c->head_dim && c->full_head_dim == 0) {
14441+
/* Non-hybrid head_dim mismatch — disable quantized path */
1439514442
use_quant_kv = 0;
1439614443
use_int_attn = 0;
14397-
/* Ensure K is stored in FP32 cache (may have been skipped above) */
1439814444
memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
14399-
} else if (use_int_attn && head_dim != c->head_dim) {
14445+
} else if (use_int_attn && head_dim != c->head_dim && c->full_head_dim == 0) {
1440014446
use_int_attn = 0;
1440114447
memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
1440214448
}
@@ -16297,6 +16343,7 @@ static int chat_find_marker(const char* h, int hlen, const char* m) {
1629716343
static const char* const CHAT_END_MARKERS[] = {
1629816344
"<|im_end|>", "<|eot_id|>", "<end_of_turn>", "<|endoftext|>",
1629916345
"<|im_start|>", "<|start_header_id|>", "<|eom_id|>",
16346+
"</s>", "<|end|>",
1630016347
NULL,
1630116348
};
1630216349

scripts/check_sync.sh

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
#!/usr/bin/env bash
2+
# check_sync.sh — verify critical code sections are in sync between
3+
# quant.h (single header) and src/ (split sources).
4+
#
5+
# This catches the #67-class bug: a feature implemented in quant.h
6+
# but not ported to the split sources (or vice versa).
7+
#
8+
# Usage: bash scripts/check_sync.sh
9+
# Returns 0 if all checks pass, 1 if any drift is detected.
10+
11+
# Fail fast: -e aborts on any error, -u errors on unset variables,
# pipefail propagates a failure from any stage of a pipeline.
set -euo pipefail

# Single-header build that is checked against the split src/ tree.
HEADER="quant.h"

# ANSI color escapes for pass/fail output (NC = no color / reset).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Running count of detected sync failures; reported in the summary and
# used to derive the exit status.
ERRORS=0
21+
# check_marker_list LABEL FILE1 FILE2 PATTERN
# Compare the sorted, de-duplicated matches of PATTERN between two files;
# print a pass/fail line and bump ERRORS on mismatch.
# NOTE(review): currently unused — section [1] below uses an inline
# extract_markers helper instead; kept for future checks.
check_marker_list() {
    local label="$1"
    local file1="$2"
    local file2="$3"
    local pattern="$4"

    local list1 list2
    # grep exits 1 when nothing matches; with `set -euo pipefail` the failing
    # pipeline inside the command substitution would abort the entire script
    # instead of reporting the drift, so neutralize the status with || true.
    list1=$(grep -o "$pattern" "$file1" 2>/dev/null | sort -u || true)
    list2=$(grep -o "$pattern" "$file2" 2>/dev/null | sort -u || true)

    if [ "$list1" = "$list2" ]; then
        echo -e " ${GREEN}${NC} $label"
    else
        echo -e " ${RED}${NC} $label — MISMATCH"
        # diff exits 1 when inputs differ; don't let that kill the script.
        diff <(echo "$list1") <(echo "$list2") || true
        ERRORS=$((ERRORS + 1))
    fi
}
39+
40+
# check_field_exists LABEL FIELD FILE
# Require FIELD to appear somewhere in FILE; bump ERRORS if it is missing.
check_field_exists() {
    local label="$1"
    local field="$2"
    local file="$3"

    # grep -q as an `if` condition is safe under set -e (status is tested).
    if grep -q "$field" "$file" 2>/dev/null; then
        # Quote "$file": unquoted expansion would word-split / glob paths
        # containing spaces or metacharacters.
        echo -e " ${GREEN}${NC} $label: '$field' found in $(basename "$file")"
    else
        echo -e " ${RED}${NC} $label: '$field' MISSING in $(basename "$file")"
        ERRORS=$((ERRORS + 1))
    fi
}
52+
53+
# check_both_have LABEL PATTERN FILE1 FILE2
# PATTERN must be present in both files (or absent from both, which is only
# a warning); exactly one file having it is a sync error.
check_both_have() {
    local label="$1"
    local pattern="$2"
    local file1="$3"
    local file2="$4"

    local has1 has2
    # BUG FIX: `grep -c` prints "0" AND exits 1 when there is no match, so the
    # old `$(grep -c ... || echo 0)` produced the two-line value "0\n0", which
    # made the integer tests below fail ("integer expression expected") and
    # misreport the result. Capture grep's own count and only default to 0
    # when the output is empty (e.g. missing file: grep exits 2, prints nothing).
    has1=$(grep -c "$pattern" "$file1" 2>/dev/null || true)
    has2=$(grep -c "$pattern" "$file2" 2>/dev/null || true)
    has1=${has1:-0}
    has2=${has2:-0}

    if [ "$has1" -gt 0 ] && [ "$has2" -gt 0 ]; then
        echo -e " ${GREEN}${NC} $label: present in both files"
    elif [ "$has1" -eq 0 ] && [ "$has2" -eq 0 ]; then
        echo -e " ${YELLOW}${NC} $label: absent in both (OK if not yet needed)"
    else
        local missing
        # Quote the basename arguments to survive paths with spaces.
        [ "$has1" -eq 0 ] && missing="$(basename "$file1")" || missing="$(basename "$file2")"
        echo -e " ${RED}${NC} $label: MISSING in $missing"
        ERRORS=$((ERRORS + 1))
    fi
}
74+
75+
echo "=== quant.h ↔ split-source sync check ==="
echo ""

# --- 1. CHAT_END_MARKERS list ---
echo "[1] CHAT_END_MARKERS (template token filter)"
# Extract only the quoted markers from the CHAT_END_MARKERS array definition.
# `|| true`: grep -o exits 1 when the array is absent; with set -euo pipefail
# the failing pipeline would otherwise abort the script instead of letting the
# comparison below report the drift.
extract_markers() {
    sed -n '/CHAT_END_MARKERS\[\]/,/NULL/p' "$1" | grep -o '"[^"]*"' | sort -u || true
}
# Renamed from local_m1/local_m2: these are top-level variables, not locals.
markers_header=$(extract_markers "$HEADER")
markers_split=$(extract_markers "src/engine/tq_generate.c")
if [ "$markers_header" = "$markers_split" ]; then
    echo -e " ${GREEN}${NC} End markers"
else
    echo -e " ${RED}${NC} End markers — MISMATCH"
    diff <(echo "$markers_header") <(echo "$markers_split") || true
    ERRORS=$((ERRORS + 1))
fi

# --- 2. Phi-3 fused tensor support ---
echo ""
echo "[2] Phi-3 fused tensor fields"
check_field_exists "Config: has_fused_qkv" "has_fused_qkv" "include/turboquant/tq_engine.h"
check_field_exists "Config: has_fused_up_gate" "has_fused_up_gate" "include/turboquant/tq_engine.h"
check_field_exists "Layer: gguf_w_qkv" "gguf_w_qkv" "include/turboquant/tq_engine.h"
check_field_exists "Layer: gguf_w_up_gate" "gguf_w_up_gate" "include/turboquant/tq_engine.h"
check_field_exists "Config: rope_factors_short" "rope_factors_short" "include/turboquant/tq_engine.h"

# --- 3. Fused QKV forward path ---
echo ""
echo "[3] Fused QKV forward path"
check_both_have "Fused QKV matmul" "gguf_w_qkv" \
    "$HEADER" "src/engine/tq_transformer.c"
check_both_have "Fused FFN gate||up" "gguf_w_up_gate" \
    "$HEADER" "src/engine/tq_transformer.c"

# --- 4. LongRoPE ---
echo ""
echo "[4] LongRoPE rotation"
check_both_have "rope_factors_short" "rope_factors_short" \
    "$HEADER" "src/engine/tq_transformer.c"
check_both_have "rope_factors_long" "rope_factors_long" \
    "$HEADER" "src/engine/tq_transformer.c"

# --- 5. BOS token handling ---
echo ""
echo "[5] BOS token handling"
check_both_have "BOS <s> lookup in tokenizer" '"<s>"' \
    "$HEADER" "src/engine/tq_tokenizer.c"
check_both_have "BOS <s> auto-detect in generate" '"<s>"' \
    "$HEADER" "src/engine/tq_generate.c"
check_both_have "BOS <|begin_of_text|> lookup" '"<|begin_of_text|>"' \
    "$HEADER" "src/engine/tq_tokenizer.c"

# --- 6. Hybrid attention stride (GQA fix) ---
echo ""
echo "[6] Hybrid attention cache stride"
check_both_have "max_head_dim in quant cache" "max_head_dim" \
    "$HEADER" "src/engine/tq_transformer.c"
check_both_have "max_kv_heads in quant cache" "max_kv_heads" \
    "$HEADER" "src/engine/tq_transformer.c"

# --- 7. Memory free completeness ---
echo ""
echo "[7] GGUF dequant memory free"
check_both_have "free(layer->attn_norm)" "free(layer->attn_norm)" \
    "$HEADER" "src/engine/tq_model.c"

# --- Summary ---
echo ""
echo "========================================="
if [ "$ERRORS" -eq 0 ]; then
    echo -e " ${GREEN}ALL CHECKS PASSED${NC}"
else
    echo -e " ${RED}$ERRORS SYNC ISSUES DETECTED${NC}"
fi
echo "========================================="
# Documented contract (header comment): 0 if all checks pass, 1 on any drift.
# `exit "$ERRORS"` would wrap modulo 256 and could report success after
# exactly 256 failures, so normalize to 0/1 here.
if [ "$ERRORS" -eq 0 ]; then
    exit 0
fi
exit 1

0 commit comments

Comments
 (0)