diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py
index e9559ef..bb8061a 100644
--- a/bindings/python/quantcpp/__init__.py
+++ b/bindings/python/quantcpp/__init__.py
@@ -4,11 +4,18 @@
Quick start:
from quantcpp import Model
- m = Model.from_pretrained("Llama-3.2-1B")
+ m = Model.from_pretrained("SmolLM2-1.7B")
print(m.ask("What is gravity?"))
-Note: SmolLM2-135M downloads faster but produces low-quality output.
-Use Llama-3.2-1B (~750 MB, one-time download) for good results.
+Model selection guide:
+ SmolLM2-1.7B (1.7 GB, vocab 49K) — recommended. ~12 tok/s on Apple M3.
+ Llama-3.2-1B (750 MB, vocab 128K) — smaller download but slower
+ due to large vocab (~2 tok/s on M3).
+ SmolLM2-135M (138 MB, vocab 49K) — demo only, low quality output.
+
+Larger vocab = slower lm_head matmul → smaller params with smaller vocab
+often beats larger params with larger vocab. See docs/supported_models.md
+for the architecture support matrix.
"""
try:
@@ -53,17 +60,37 @@ class ChatContextOverflow(RuntimeError):
Path.home() / ".cache" / "quantcpp"))
# name → (HuggingFace repo, filename, approx size in MB)
+# Note: download URL is constructed as
+#   https://huggingface.co/{repo}/resolve/main/{filename}
+# Verify both fields against the actual HuggingFace listing before
+# adding new entries — there is no integrity check at runtime.
_MODEL_REGISTRY = {
+ # 138 MB demo model. Tokenizer + arch are llama-compatible but the
+ # model is too small to produce coherent output for general chat.
+ # Listed only so users can verify the install/load path quickly.
"SmolLM2-135M": (
"Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct",
"smollm2-135m-instruct-q8_0.gguf",
135,
),
+ # Recommended default for first-time users on Apple Silicon / typical
+ # laptops. vocab 49K keeps the lm_head matmul small, so even on a
+ # mid-range M-series chip we measure ~12 tok/s — comfortable for
+ # interactive chat. Same llama arch family as SmolLM2-135M, so it
+ # exercises the most-tested code path.
+ "SmolLM2-1.7B": (
+ "bartowski/SmolLM2-1.7B-Instruct-GGUF",
+ "SmolLM2-1.7B-Instruct-Q8_0.gguf",
+ 1700,
+ ),
"Qwen3.5-0.8B": (
"unsloth/Qwen3.5-0.8B-GGUF",
"Qwen3.5-0.8B-Q4_K_M.gguf",
508,
),
+ # Smaller download than SmolLM2-1.7B but slower at inference time
+ # because of the 128K Llama-3 vocab (~5x slower lm_head matmul on M3).
+ # Kept in the registry for users who specifically want a Llama model.
"Llama-3.2-1B": (
"hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
"llama-3.2-1b-instruct-q4_k_m.gguf",
@@ -170,7 +197,7 @@ class Model:
Examples
--------
- >>> m = Model.from_pretrained("SmolLM2-135M")
+ >>> m = Model.from_pretrained("SmolLM2-1.7B")
>>> m.ask("What is gravity?")
'Gravity is a force that attracts ...'
diff --git a/bindings/python/quantcpp/cli.py b/bindings/python/quantcpp/cli.py
index 830204f..8a5fe73 100644
--- a/bindings/python/quantcpp/cli.py
+++ b/bindings/python/quantcpp/cli.py
@@ -18,9 +18,13 @@
import json
-# Ollama-style short aliases → canonical _MODEL_REGISTRY keys
+# Ollama-style short aliases → canonical _MODEL_REGISTRY keys.
+# Plain "smollm2" without a size suffix points at the 1.7B model — that's
+# the recommended default. Users who explicitly want the 135M demo model
+# need to ask for it by full name.
MODEL_ALIASES = {
- "smollm2": "SmolLM2-135M",
+ "smollm2": "SmolLM2-1.7B",
+ "smollm2:1.7b": "SmolLM2-1.7B",
"smollm2:135m": "SmolLM2-135M",
"qwen3.5": "Qwen3.5-0.8B",
"qwen3.5:0.8b": "Qwen3.5-0.8B",
@@ -329,8 +333,13 @@ def cmd_client(args):
def cmd_chat_default(args):
- """Backwards-compatible default: auto-download Llama-3.2-1B and chat."""
- args.model = args.model or "Llama-3.2-1B"
+ """Backwards-compatible default: auto-download SmolLM2-1.7B and chat.
+
+ Default switched from Llama-3.2-1B to SmolLM2-1.7B (2026-04-12) after
+ user feedback that Llama-3.2-1B's 128K vocab makes it ~5x slower at
+ interactive chat than SmolLM2-1.7B's 49K vocab on Apple Silicon.
+ """
+ args.model = args.model or "SmolLM2-1.7B"
args.threads = getattr(args, "threads", 4)
args.max_tokens = getattr(args, "max_tokens", 256)
args.temperature = getattr(args, "temperature", 0.7)
@@ -354,19 +363,19 @@ def main():
client PROMPT Send a request to a running serve (default: SSE streaming)
examples:
- quantcpp pull llama3.2:1b
+ quantcpp pull smollm2 # recommended: small vocab → fast
quantcpp list
- quantcpp run llama3.2:1b
- quantcpp run llama3.2:1b "What is gravity?"
- quantcpp serve llama3.2:1b --port 8080
+ quantcpp run smollm2
+ quantcpp run smollm2 "What is gravity?"
+ quantcpp serve smollm2 --port 8080
quantcpp client "What is gravity?" # streams from :8080
quantcpp client "Hi" --url http://localhost:8081
quantcpp client "Hi" --no-stream # single JSON response
backwards-compat (no subcommand):
- quantcpp # default chat with Llama-3.2-1B
+ quantcpp # default chat with SmolLM2-1.7B
quantcpp "What is gravity?" # one-shot
- quantcpp --model SmolLM2-135M # different model
+ quantcpp --model llama3.2:1b # different model
""",
)
diff --git a/docs/feedback/2026-04-12_0900.md b/docs/feedback/2026-04-12_0900.md
new file mode 100644
index 0000000..c925007
--- /dev/null
+++ b/docs/feedback/2026-04-12_0900.md
@@ -0,0 +1,195 @@
+# quant.cpp User Feedback — First-Time Setup & Usage Experience
+
+**Date**: 2026-04-12
+**Environment**: macOS (Apple M3, 8-core CPU, 10-core GPU, 16GB Unified Memory)
+**Version tested**: v0.10.1 → v0.12.0 (pip) + latest main (source build)
+**Tested by**: End-user (developer, first-time quant.cpp user)
+
+---
+
+## Summary
+
+pip install부터 `quantcpp serve`, Metal GPU 빌드, 채팅 웹 UI 연동, 다양한 모델 비교까지의 전 과정을 체험했습니다. 전반적으로 "설치 → 모델 다운로드 → 추론"까지의 흐름은 매우 간결했으나, 모델 호환성과 속도 면에서 개선점이 발견되었습니다.
+
+---
+
+## 1. 좋았던 점
+
+### 1.1 설치가 매우 간단
+- `pip install quantcpp` 한 줄로 설치 완료. 의존성 zero.
+- `Model.from_pretrained("Llama-3.2-1B")`으로 모델 자동 다운로드 + 캐시. 매우 편리.
+
+### 1.2 OpenAI 호환 API 서버
+- `quantcpp serve llama3.2:1b --port 8080` 한 줄로 서버 기동.
+- `/v1/chat/completions` 엔드포인트가 OpenAI SDK와 호환되어 기존 코드 재사용 가능.
+- SSE 스트리밍(`stream: true`) 정상 동작.
+- CORS 헤더 (`Access-Control-Allow-Origin: *`) 기본 포함 — 프론트엔드 연동 즉시 가능.
+
+### 1.3 v0.12.0의 CLI 추가
+- `quantcpp "What is gravity?"` 한 줄 질문이 가능해져 체험 진입장벽이 크게 낮아짐.
+- `quantcpp` (인터랙티브 모드)도 직관적.
+
+### 1.4 KV cache reuse (최신 main)
+- 연속 대화 시 두 번째 요청부터 prefill이 생략되어 응답 시간이 ~50% 단축됨.
+- 첫 요청 27초 → 두 번째 요청 14초 (Llama-3.2-1B 기준).
+
+### 1.5 Metal GPU 자동 감지
+- `TQ_BUILD_METAL=ON`으로 빌드하면 Apple Silicon GPU를 자동 감지하여 활성화.
+- 별도 설정 없이 matmul 배치 디스패치가 Metal로 전환됨.
+
+### 1.6 SmolLM2-1.7B에서의 우수한 성능
+- vocab size가 작은 모델(49K)에서 ~12.5 tok/s 달성. 실시간 대화 가능 수준.
+- 출력 품질도 깨끗하고 정확함 (예: "The capital of South Korea is Seoul.").
+
+---
+
+## 2. 개선이 필요한 점
+
+### 2.1 pip 패키지에서 CLI가 누락 (v0.10.1)
+- **문제**: PyPI v0.10.1에는 `quantcpp` CLI entry point가 없었음. `zsh: command not found: quantcpp`.
+- **해결**: v0.11.0부터 `cli.py` + entry point 추가로 해결됨.
+- **제안**: PyPI에 최신 버전을 빠르게 배포하면 첫 경험이 크게 개선될 것.
+
+### 2.2 `quantcpp serve`에 quant-server 바이너리 필요
+- **문제**: `pip install quantcpp` 후 `quantcpp serve`를 실행하면 `quant-server binary not found` 에러.
+- 사용자가 직접 CMake로 `TQ_BUILD_SERVER=ON` 빌드 후 PATH에 복사해야 함.
+- **제안**: pip 패키지에 서버 바이너리를 포함하거나, 순수 Python fallback 서버를 제공.
+
+### 2.3 Llama-3.2-1B의 극심한 느린 속도
+- **문제**: Llama-3.2-1B (Q4_K_M)가 Apple M3에서 ~2.3 tok/s로 매우 느림.
+ - 60토큰 생성에 ~27초, 200토큰에 ~67초 소요.
+ - 대화형 사용이 사실상 불가능한 수준.
+- **원인 분석**: vocab size 128,256이 병목. 매 토큰마다 128K 차원의 output projection 필요.
+- **대비**: 동일 환경에서 SmolLM2-1.7B (Q8, vocab 49K)는 ~12.5 tok/s로 5배 빠름.
+- **제안**:
+ - 기본 추천 모델을 SmolLM2-1.7B로 변경 검토.
+ - 또는 모델 선택 가이드에 "vocab size가 클수록 느려진다"는 안내 추가.
+
+### 2.4 SmolLM2-135M의 출력 품질 문제
+- **문제**: SmolLM2-135M은 속도는 빠르지만(0.3초) 출력이 HTML 쓰레기 텍스트.
+- **제안**: 135M 모델은 "quantization 데모용"으로만 안내하고, 추론 품질 기대를 낮추는 문구 추가.
+
+### 2.5 Gemma-4-E2B 호환성 문제
+- **문제**: gemma-4-E2B-it-Q4_K_M.gguf 로딩은 성공하나, 추론 출력이 완전히 깨짐 (다국어 쓰레기 토큰).
+- 서버 로그에는 정상 로딩으로 표시되어 사용자가 원인을 파악하기 어려움.
+- **제안**: 지원되는 모델/아키텍처 목록을 명시하고, 미지원 모델 로딩 시 경고 표시.
+
+### 2.6 Phi-3.5-mini-instruct 아키텍처 미지원 (신규)
+- **문제**: `Phi-3.5-mini-instruct-Q8_0.gguf` (3.9GB) 로딩은 성공하나, attention 레이어 매핑 실패.
+ - 서버 로그: `loaded 32 layers (0 self_attn)` — self_attn이 0으로 인식됨.
+ - 출력: 완전한 쓰레기 토큰 (`uffrasspkeryensonisatcreteBUG...`).
+ - 속도 자체는 0.85초/80토큰으로 극도로 빠름 (vocab 32K 효과).
+- **영향**: Phi-3/Phi-3.5는 vocab 32K로 속도 면에서 최적의 모델이나 사용 불가.
+- **제안**:
+ - Phi-3 (`phi3`) 아키텍처의 attention 레이어 매핑 지원 추가.
+ - 이 모델이 지원되면 "속도 + 품질" 모두에서 최적의 추천 모델이 될 수 있음.
+ - `self_attn=0`으로 감지된 경우 사용자에게 경고 메시지 표시 필요.
+
+### 2.7 Qwen3.5-0.8B 출력 품질 문제 (신규)
+- **문제**: Qwen3.5-0.8B (Q4_K_M) 서버 로딩은 성공하나, 출력이 완전히 깨짐.
+ - DeltaNet hybrid 아키텍처 특성으로 인한 호환성 문제 추정.
+ - 33초/60토큰으로 속도도 느림 (vocab 248K).
+- **제안**: Qwen 계열의 지원 상태를 문서에 명시.
+
+### 2.8 Metal GPU 가속 효과 제한적 (소형 모델)
+- **문제**: 1B 모델에서 Metal GPU가 활성화되어 있으나 체감 속도 차이 없음.
+- 소스 코드 주석에도 "Metal Q4 batch → 38 tok/s vs CPU Q4 → 95 tok/s (SmolLM2)" 명시.
+- 소형 모델에서는 GPU 디스패치 오버헤드가 연산 시간보다 큼.
+- **제안**: 모델 크기에 따라 CPU/GPU 자동 전환 로직 추가, 또는 `--device cpu/gpu` 옵션 제공.
+
+### 2.9 서버 단일 요청 처리 (동시성 없음)
+- **문제**: 첫 번째 요청 처리 중 두 번째 요청이 완전히 블로킹됨.
+- 채팅 UI에서 연속 질문 시 두 번째 질문이 3분+ 대기.
+- **제안**: 요청 큐잉 + 처리 중 상태 반환 (429 or retry-after), 또는 요청 취소 API.
+
+### 2.10 chat template 잔여물
+- **문제**: 응답에 `<|im_start|>`, `<|im_end|>`, `assistant` 등 template 토큰이 노출됨.
+- Llama-3.2-1B에서 특히 빈번. SmolLM2-1.7B에서는 `<|im_ennd|>` 정도로 경미.
+- **제안**: 서버 측에서 stop tokens/template markers를 자동 strip.
+
+---
+
+## 3. 모델별 벤치마크 (Apple M3, 16GB RAM, Metal GPU 빌드)
+
+| Model | Quant | File Size | Vocab | tok/s | 60-token Time | Quality | Architecture |
+|-------|-------|-----------|------:|------:|--------------:|---------|-------------|
+| SmolLM2-135M | Q8 | 138MB | 49K | ~300 | 0.3s | Unusable (garbage) | llama |
+| Qwen3.5-0.8B | Q4_K_M | 508MB | 248K | ~1.8 | ~33s | Broken (garbage) | qwen/deltanet |
+| Llama-3.2-1B | Q4_K_M | 770MB | 128K | ~2.3 | ~27s | Usable (artifacts) | llama |
+| **SmolLM2-1.7B** | **Q8** | **1.7GB** | **49K** | **~12.5** | **~5s** | **Good (clean)** | **llama** |
+| Gemma-4-E2B | Q4_K_M | 2.9GB | 262K | ~10 | ~5s | Broken (compat) | gemma4 hybrid |
+| Phi-3.5-mini | Q8 | 3.9GB | 32K | ~94* | ~0.85s* | Broken (0 self_attn) | phi3 |
+
+*\* Phi-3.5 속도는 attention이 작동하지 않아 실제 추론이 아님. 정상 지원 시 예상 속도.*
+
+### Key Insights
+
+1. **vocab size가 속도에 가장 큰 영향을 미침.** 파라미터 수보다 vocab size와 양자화 방식이 실사용 속도를 결정.
+ - SmolLM2-1.7B (vocab 49K): 12.5 tok/s
+ - Llama-3.2-1B (vocab 128K): 2.3 tok/s — 2.6x vocab → 5.4x 느림
+2. **Q8이 Q4보다 빠를 수 있음.** Q4의 디퀀타이즈 오버헤드가 Q8보다 크며, NEON SIMD에서 Q8이 더 효율적.
+3. **llama 아키텍처만 안정적으로 동작.** phi3, gemma4, qwen/deltanet 아키텍처는 로딩은 되지만 추론이 깨짐.
+4. **Phi-3.5가 지원되면 게임 체인저.** vocab 32K + 3.8B params로 "속도 + 품질" 최적 조합 가능.
+
+---
+
+## 4. 아키텍처 호환성 매트릭스 (신규)
+
+| Architecture | GGUF Load | Tokenizer | Attention | Inference | Status |
+|-------------|-----------|-----------|-----------|-----------|--------|
+| llama (SmolLM2, Llama) | OK | OK | OK | OK | **Fully supported** |
+| llama (Llama-3.2 GQA) | OK | OK | OK | Slow | Supported (vocab bottleneck) |
+| phi3 (Phi-3.5-mini) | OK | OK | **FAIL (0 self_attn)** | Garbage | **Not supported** |
+| gemma4 (Gemma-4-E2B) | OK | OK | Partial | Garbage | **Not supported** |
+| qwen/deltanet (Qwen3.5) | OK | OK | Unknown | Garbage | **Not supported** |
+
+**제안**: 이 매트릭스를 README 또는 docs에 포함하여 사용자가 모델 선택 전에 호환성을 확인할 수 있게 해주세요.
+
+---
+
+## 5. 제안 우선순위
+
+| Priority | Item | Impact | Effort |
+|----------|------|--------|--------|
+| **P0** | Phi-3 (`phi3`) 아키텍처 attention 매핑 지원 | 최적 모델 활용 가능 | Medium |
+| **P0** | chat template 토큰 자동 strip | 출력 품질 즉시 개선 | Low |
+| **P0** | 기본 추천 모델을 SmolLM2-1.7B로 변경 | 첫 경험 대폭 개선 | Low |
+| P1 | pip 패키지에 서버 바이너리 포함 | 설치 → 서버 기동 원스텝 | Medium |
+| P1 | 미지원 아키텍처 로딩 시 경고/에러 | 디버깅 시간 절약 | Low |
+| P1 | `self_attn=0` 감지 시 경고 메시지 | 호환성 문제 즉시 인지 | Low |
+| P2 | 서버 동시 요청 처리 (또는 큐잉) | 다중 사용자/연속 대화 | High |
+| P2 | 아키텍처 호환성 매트릭스 문서화 | 모델 선택 가이드 | Low |
+| P2 | vocab size 기반 CPU/GPU 자동 전환 | 최적 성능 자동 선택 | Medium |
+| P3 | `--device cpu/gpu` CLI 옵션 | 사용자 제어권 | Low |
+
+---
+
+## 6. 테스트 환경 상세
+
+```
+Hardware: Apple M3, 8-core CPU, 10-core GPU, 16GB Unified Memory
+OS: macOS 15 (Darwin 24.5.0)
+Python: 3.14.3
+Compiler: AppleClang 16.0.0
+Xcode: installed (Metal shader compilation enabled)
+quantcpp: v0.10.1 (pip) → v0.12.0 (pip) → latest main (source)
+Build: cmake -DTQ_BUILD_METAL=ON -DTQ_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release
+```
+
+---
+
+## 7. 테스트한 모델 파일 목록
+
+```
+~/.cache/quantcpp/smollm2-135m-instruct-q8_0.gguf (138 MB)
+~/.cache/quantcpp/Qwen3.5-0.8B-Q4_K_M.gguf (508 MB)
+~/.cache/quantcpp/llama-3.2-1b-instruct-q4_k_m.gguf (770 MB)
+~/.cache/quantcpp/Phi-3.5-mini-instruct-Q8_0.gguf (3.9 GB) — NEW
+~/dev/projects/TurboQuant.cpp/models/SmolLM2-1.7B-Instruct-Q8_0.gguf (1.7 GB)
+~/dev/projects/TurboQuant.cpp/models/gemma-4-E2B-it-Q4_K_M.gguf (2.9 GB)
+```
+
+---
+
+*This feedback was generated based on a hands-on first-time user experience session on 2026-04-12.*
+*Updated with Phi-3.5-mini-instruct and Qwen3.5-0.8B architecture compatibility findings.*
diff --git a/docs/supported_models.md b/docs/supported_models.md
new file mode 100644
index 0000000..5e9600f
--- /dev/null
+++ b/docs/supported_models.md
@@ -0,0 +1,117 @@
+# Supported Models
+
+quant.cpp loads GGUF files from HuggingFace, but only some model
+architectures are fully wired through the inference path. This page
+tracks what works, what loads-but-fails, and how to pick a model.
+
+## TL;DR — Recommended models
+
+| Use case | Model | Why |
+|---|---|---|
+| **First-time install** | `SmolLM2-1.7B` (Q8) | Fastest end-to-end on a laptop. Vocab 49K keeps the lm_head matmul small (~12 tok/s on Apple M3). |
+| Smaller download | `Llama-3.2-1B` (Q4_K_M) | 750 MB vs 1.7 GB, but ~5x slower at inference time due to 128K vocab. |
+| Quick smoke test | `SmolLM2-135M` (Q8) | 138 MB download to verify the install path. Output quality is poor — not for real use. |
+
+```bash
+# CLI quickstart
+quantcpp run smollm2 # SmolLM2-1.7B (recommended)
+quantcpp run smollm2:135m # SmolLM2-135M (smoke test only)
+quantcpp run llama3.2:1b # smaller download, slower
+```
+
+```python
+# Python quickstart
+from quantcpp import Model
+m = Model.from_pretrained("SmolLM2-1.7B")
+print(m.ask("What is gravity?"))
+```
+
+## Architecture compatibility matrix
+
+| Architecture | GGUF Load | Tokenizer | Attention | Inference | Status |
+|---|:---:|:---:|:---:|:---:|---|
+| **llama** (SmolLM2, Llama-3.x, Mistral) | ✅ | ✅ | ✅ | ✅ | **Fully supported** |
+| llama with 128K vocab (Llama-3.2-1B) | ✅ | ✅ | ✅ | slow | Supported, vocab is the bottleneck |
+| **gemma** (Gemma 2) | ✅ | ✅ | ✅ | ✅ | Supported |
+| **gemma3** | ✅ | ✅ | ✅ | ✅ | Supported with hybrid sliding-window attention |
+| **gemma4** (Gemma-4-E2B / E4B) | ✅ | ✅ | ⚠️ | ⚠️ | Partial — some Q4_K_M variants produce garbage; report with file SHA256 |
+| **qwen** / **qwen2** | ✅ | ✅ | ✅ | ✅ | Supported |
+| **qwen3.5** (DeltaNet hybrid) | ✅ | ✅ | partial | ⚠️ | Partial — pure-attention layers work, DeltaNet hybrid still being validated |
+| **phi3** / **phi3.5** (fused QKV) | ❌ | — | — | — | **Not supported** — uses `attn_qkv`, see "Why phi3 is hard" below |
+
+✅ = works · ⚠️ = loads but inference is unreliable · ❌ = load fails fast with a clear error (since 2026-04-12)
+
+If you load an unsupported architecture, the loader now prints:
+
+```
+tq_load_gguf: ERROR — model architecture 'phi3' is not supported.
+ Detected 0 self_attn layers and no DeltaNet weights.
+ This usually means the model uses fused QKV projection
+ (e.g., Phi-3 `attn_qkv`) which quant.cpp does not yet handle.
+ See docs/supported_models.md for the architecture support matrix.
+```
+
+…and `tq_load_gguf` returns NULL, so callers can fail-fast instead of
+silently producing garbage tokens.
+
+## Why vocab size dominates speed
+
+quant.cpp generates one token at a time. Every token requires a
+`lm_head` matmul of shape `[hidden_dim, vocab_size]`. For a typical 1B
+model with `hidden_dim = 2048`:
+
+| Model | vocab_size | lm_head FLOPs/token |
+|---|---:|---:|
+| SmolLM2-1.7B | 49,152 | 100 M |
+| Llama-3.2-1B | 128,256 | 263 M |
+
+Llama-3.2-1B has fewer parameters (1.0B vs 1.7B) but its lm_head matmul
+is 2.6x bigger, and on CPU it dominates wall time. External user
+benchmarks on Apple M3 (8-core CPU, 16 GB RAM):
+
+| Model | tok/s | 60-token latency |
+|---|---:|---:|
+| SmolLM2-1.7B (Q8, vocab 49K) | ~12.5 | ~5 s |
+| Llama-3.2-1B (Q4_K_M, vocab 128K) | ~2.3 | ~27 s |
+
+**Take-away**: when picking a model for an embedded / laptop scenario,
+vocab size is a better predictor of interactive latency than parameter
+count. Pick the smallest vocab that produces output you're happy with.
+
+## Why phi3 is hard
+
+Phi-3 / Phi-3.5 uses a *fused* QKV projection: instead of three separate
+tensors `attn_q.weight`, `attn_k.weight`, `attn_v.weight`, it ships one
+`attn_qkv.weight` with all three projections concatenated along the
+output dimension.
+
+quant.cpp's GGUF loader currently looks for the three-tensor layout
+(`blk.N.attn_q.weight` etc.). When it loads a Phi-3 GGUF, none of those
+names match → 0 self_attn layers detected → forward pass runs against
+zero-initialized attention weights → garbage tokens.
+
+Adding Phi-3 support requires either:
+
+1. **Loader splits** `attn_qkv.weight` into the three views at load time
+ and writes them into the existing `wq`/`wk`/`wv` slots, OR
+2. **Forward path** learns to dispatch a fused QKV matmul when the
+ loader detects the fused tensor.
+
+Option (1) is simpler but doubles the working set during load. Option
+(2) is the right long-term answer. There's a tracking issue / spike in
+progress; until then Phi-3 is the highest-value missing architecture for
+quant.cpp's "speed + quality" target (Phi-3.5-mini has vocab 32K plus
+3.8B params — it would beat both SmolLM2-1.7B and Llama-3.2-1B at
+interactive use).
+
+## Reporting an unsupported model
+
+If you tried a model that's not in the matrix above, please open an
+issue with:
+
+- The HuggingFace repo + filename
+- The exact `tq_load_gguf:` log lines (including `architecture = '...'`)
+- The first ~50 generated tokens (so we can see whether it's garbage,
+ partial garbage, or just wrong-language)
+
+Don't include the model file itself — link to the HuggingFace page.
diff --git a/quant.h b/quant.h
index 36cbbb2..136d1e4 100644
--- a/quant.h
+++ b/quant.h
@@ -11940,6 +11940,39 @@ tq_model_t* tq_load_gguf(const char* path) {
n_attn_layers, c->n_layers);
}
+ /* Hard-fail when neither standard self_attn (`blk.N.attn_q.weight`) nor
+ * DeltaNet (`blk.N.ssm_a`) was detected on any layer. The GGUF loaded
+ * fine but every layer is missing its attention block — typically
+ * because the architecture uses fused QKV (Phi-3 `attn_qkv`) or some
+ * other naming convention we don't recognize yet.
+ *
+ * Without this check the load returns successfully, the forward pass
+ * runs against zero-initialized attention weights, and the user gets
+ * pages of garbage tokens with no clear error to debug. The previous
+ * behavior was reported by an external user (2026-04-12 feedback) as
+ * the worst part of the first-time experience: "loaded 32 layers
+ * (0 self_attn)" looked like a success log.
+ *
+ * Listed architectures that hit this path:
+ * - phi3 / phi3.5 (uses fused `blk.N.attn_qkv.weight`)
+ * - any future fused-QKV architecture we haven't ported yet
+ *
+ * Hybrid models with at least ONE self_attn layer (e.g., Qwen3.5
+ * DeltaNet) are NOT affected — they hit the branch above and proceed. */
+ if (n_attn_layers == 0 && c->delta_n_heads == 0) {
+ fprintf(stderr,
+ "tq_load_gguf: ERROR — model architecture '%s' is not supported.\n"
+ " Detected 0 self_attn layers and no DeltaNet weights.\n"
+ " This usually means the model uses fused QKV projection\n"
+ " (e.g., Phi-3 `attn_qkv`) which quant.cpp does not yet handle.\n"
+ " See docs/supported_models.md for the architecture support matrix.\n",
+ gguf->arch[0] ? gguf->arch : "unknown");
+ /* tq_free_model owns gguf_ctx (set above at line 11463) and will
+ * close it as part of the teardown — do not double-close. */
+ tq_free_model(model);
+ return NULL;
+ }
+
/* Set up layer_is_sliding for Gemma hybrid attention.
* Detect from K tensor shape: sliding and full layers have different K output dims.
* The MAJORITY of layers are sliding (e.g., 25/30 or 28/35). */
@@ -15874,36 +15907,197 @@ int tq_generate_continue(tq_model_t* model,
* Pass cached_text_io == NULL to disable text-prefix tracking.
* ============================================================================ */
+/* ChatML / template-marker filter ----------------------------------------
+ *
+ * The model can generate template tokens like `<|im_start|>`, `<|im_end|>`,
+ * `</s>`, etc. as REGULAR text bytes (not special tokens). When
+ * that happens the BPE tokenizer fragments them across multiple tokens,
+ * and a per-token strstr check (like the existing `should_stop` logic)
+ * never matches. The user sees the marker leak into their stream.
+ *
+ * This filter holds the most recent CHAT_LOOKAHEAD bytes of generated
+ * text in `pending` and only flushes bytes that are guaranteed to NOT
+ * be the start of a marker. When a full marker is matched:
+ * - `<|im_start|>` at the very beginning of the response → header
+ * skip mode (drop until next '\n'). The model is regurgitating the
+ * `<|im_start|>assistant\n` prefix that the prompt template already
+ * contains; we silently strip it.
+ * - any END marker → emit the prefix, drop the marker and everything
+ * after, set `stop_requested` so the generation loop can break.
+ *
+ * Cost: each token is delayed by ~CHAT_LOOKAHEAD bytes worth of stream.
+ * For typical English (3-4 chars/token), that's ~8-10 tokens of latency
+ * before the first token shows up. After that, streaming is steady-state
+ * with the same latency window.
+ * ----------------------------------------------------------------------- */
+#define CHAT_PENDING_CAP 128
+#define CHAT_LOOKAHEAD 32
+
typedef struct {
char* buf;
size_t len;
size_t cap;
- int tainted; /* 1 if accumulation ever failed → buf is incomplete */
+ int tainted; /* 1 if accumulation ever failed → buf incomplete */
+ /* Lookahead filter state */
+ char pending[CHAT_PENDING_CAP];
+ int pending_len;
+ int in_header; /* skipping <|im_start|>...\n */
+ int stop_requested; /* end marker hit → caller should break */
void (*user_cb)(const char*, void*);
void* user_data;
} chat_accum_t;
-static void chat_accum_callback(const char* tok, void* u) {
- chat_accum_t* ctx = (chat_accum_t*)u;
- if (!tok) return;
- /* Always pass through to the user's callback first — losing tokens
- * from the user's stream because of an INTERNAL realloc failure is
- * far worse than a stale cached_text on the next turn. */
- if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data);
+/* Emit n bytes from `p` to BOTH the user callback and accum.buf.
+ * Used after the marker filter has decided the bytes are safe. */
+static void chat_accum_emit(chat_accum_t* ctx, const char* p, int n) {
+ if (n <= 0) return;
+ /* User callback gets a NUL-terminated copy. */
+ char tmp[CHAT_PENDING_CAP + 1];
+ if (n > CHAT_PENDING_CAP) n = CHAT_PENDING_CAP;
+ memcpy(tmp, p, (size_t)n);
+ tmp[n] = '\0';
+ if (ctx->user_cb) ctx->user_cb(tmp, ctx->user_data);
if (ctx->tainted) return;
- size_t tlen = strlen(tok);
- if (ctx->len + tlen + 1 > ctx->cap) {
- size_t new_cap = (ctx->cap + tlen + 64) * 2;
+ if (ctx->len + (size_t)n + 1 > ctx->cap) {
+ size_t new_cap = (ctx->cap + (size_t)n + 64) * 2;
char* nb = (char*)realloc(ctx->buf, new_cap);
if (!nb) { ctx->tainted = 1; return; }
- ctx->buf = nb;
- ctx->cap = new_cap;
+ ctx->buf = nb; ctx->cap = new_cap;
}
- memcpy(ctx->buf + ctx->len, tok, tlen);
- ctx->len += tlen;
+ memcpy(ctx->buf + ctx->len, tmp, (size_t)n);
+ ctx->len += (size_t)n;
ctx->buf[ctx->len] = '\0';
}
+/* Drop n bytes from the front of pending. */
+static void chat_accum_drop(chat_accum_t* ctx, int n) {
+ if (n <= 0) return;
+ if (n > ctx->pending_len) n = ctx->pending_len;
+ memmove(ctx->pending, ctx->pending + n,
+ (size_t)(ctx->pending_len - n));
+ ctx->pending_len -= n;
+}
+
+/* Find first occurrence of marker `m` in haystack[0..hlen). -1 if none. */
+static int chat_find_marker(const char* h, int hlen, const char* m) {
+ int mlen = (int)strlen(m);
+ if (hlen < mlen) return -1;
+ for (int p = 0; p + mlen <= hlen; p++) {
+ if (h[p] == m[0] && memcmp(h + p, m, (size_t)mlen) == 0) return p;
+ }
+ return -1;
+}
+
+/* Markers that signal "stop generating now". <|im_start|> is included
+ * because if the model emits it MID-response (after generating real
+ * content), it's hallucinating a new chat turn and we should stop. */
+static const char* const CHAT_END_MARKERS[] = {
+ "<|im_end|>", "<|eot_id|>", "", "<|endoftext|>",
+ "<|im_start|>", "<|start_header_id|>", "<|eom_id|>",
+ NULL,
+};
+
+static void chat_accum_callback(const char* tok, void* u) {
+ chat_accum_t* ctx = (chat_accum_t*)u;
+ if (!tok || ctx->stop_requested) return;
+ int tlen = (int)strlen(tok);
+ if (tlen == 0) return;
+
+ /* Make room. If pending would overflow, flush the safe prefix
+ * (everything but the last LOOKAHEAD bytes) first. */
+ if (ctx->pending_len + tlen > CHAT_PENDING_CAP) {
+ int emit = ctx->pending_len - CHAT_LOOKAHEAD;
+ if (emit > 0) {
+ if (!ctx->in_header) chat_accum_emit(ctx, ctx->pending, emit);
+ chat_accum_drop(ctx, emit);
+ }
+ }
+    /* Pathological: pending + token together still exceed the buffer
+     * even after the flush above. Emit both raw and bail (no marker scan). */
+    if (ctx->pending_len + tlen > CHAT_PENDING_CAP) {
+ if (!ctx->in_header) {
+ chat_accum_emit(ctx, ctx->pending, ctx->pending_len);
+ chat_accum_emit(ctx, tok, tlen);
+ }
+ ctx->pending_len = 0;
+ return;
+ }
+ memcpy(ctx->pending + ctx->pending_len, tok, (size_t)tlen);
+ ctx->pending_len += tlen;
+
+ /* State machine: drain pending as far as possible. */
+ int progress = 1;
+ while (progress) {
+ progress = 0;
+ if (ctx->in_header) {
+ int nl = -1;
+ for (int i = 0; i < ctx->pending_len; i++) {
+ if (ctx->pending[i] == '\n') { nl = i; break; }
+ }
+ if (nl >= 0) {
+ chat_accum_drop(ctx, nl + 1);
+ ctx->in_header = 0;
+ progress = 1;
+ } else {
+ /* No newline yet — drop everything (it's all in header) */
+ ctx->pending_len = 0;
+ return;
+ }
+ }
+ /* Scan for the EARLIEST end marker in pending. */
+ int em_pos = -1;
+ const char* em_str = NULL;
+ for (int i = 0; CHAT_END_MARKERS[i]; i++) {
+ int p = chat_find_marker(ctx->pending, ctx->pending_len,
+ CHAT_END_MARKERS[i]);
+ if (p >= 0 && (em_pos < 0 || p < em_pos)) {
+ em_pos = p; em_str = CHAT_END_MARKERS[i];
+ }
+ }
+ if (em_pos >= 0) {
+ /* Special case: <|im_start|> at the very start of the
+ * response → strip the header (don't stop). The model is
+ * echoing the chat-template prefix. */
+ if (em_pos == 0 && ctx->len == 0 && em_str &&
+ strcmp(em_str, "<|im_start|>") == 0) {
+ chat_accum_drop(ctx, 12); /* len("<|im_start|>") */
+ ctx->in_header = 1;
+ progress = 1;
+ continue;
+ }
+ /* Otherwise: emit clean prefix, discard rest, request stop. */
+ if (em_pos > 0) {
+ chat_accum_emit(ctx, ctx->pending, em_pos);
+ }
+ ctx->pending_len = 0;
+ ctx->stop_requested = 1;
+ return;
+ }
+ }
+
+ /* Safe portion: keep the trailing LOOKAHEAD bytes (any in-flight
+ * marker is at most this long), flush the rest. */
+ if (!ctx->in_header && ctx->pending_len > CHAT_LOOKAHEAD) {
+ int emit = ctx->pending_len - CHAT_LOOKAHEAD;
+ chat_accum_emit(ctx, ctx->pending, emit);
+ chat_accum_drop(ctx, emit);
+ }
+}
+
+/* Generation finished — flush any leftover pending bytes. Called once
+ * before reading accum.buf for the cached_text update. */
+static void chat_accum_finish(chat_accum_t* ctx) {
+ if (ctx->in_header) {
+ /* Stuck mid-header (no '\n' arrived) → drop the rest. */
+ ctx->pending_len = 0;
+ return;
+ }
+ if (ctx->pending_len > 0) {
+ chat_accum_emit(ctx, ctx->pending, ctx->pending_len);
+ ctx->pending_len = 0;
+ }
+}
+
int tq_generate_chat_text(tq_model_t* model,
tq_tokenizer_t* tokenizer,
tq_state_t* state,
@@ -15929,9 +16123,10 @@ int tq_generate_chat_text(tq_model_t* model,
}
}
- chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0, .tainted = 0,
- .user_cb = config->on_token,
- .user_data = config->user_data };
+ chat_accum_t accum;
+ memset(&accum, 0, sizeof(accum));
+ accum.user_cb = config->on_token;
+ accum.user_data = config->user_data;
void (*orig_cb)(const char*, void*) = config->on_token;
void* orig_ud = config->user_data;
config->on_token = chat_accum_callback;
@@ -16052,6 +16247,9 @@ int tq_generate_chat_text(tq_model_t* model,
int piece_len = (int)strlen(piece ? piece : "");
if (config->on_token && piece) config->on_token(piece, config->user_data);
+ /* The chat_accum filter may have detected an end marker
+ * spanning multiple tokens — break before forwarding more. */
+ if (accum.stop_requested) break;
if (output && piece && output_pos + piece_len < output_size - 1) {
memcpy(output + output_pos, piece, piece_len);
output_pos += piece_len;
@@ -16100,6 +16298,11 @@ int tq_generate_chat_text(tq_model_t* model,
output, output_size);
}
+ /* Drain the marker filter's lookahead buffer before reading
+ * accum.buf for the cached_text update. Without this, the last
+ * ~32 bytes of clean output would be silently lost. */
+ chat_accum_finish(&accum);
+
config->on_token = orig_cb;
config->user_data = orig_ud;
diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c
index 0211a83..f3a69a4 100644
--- a/src/engine/tq_generate.c
+++ b/src/engine/tq_generate.c
@@ -834,36 +834,165 @@ int tq_generate_continue(tq_model_t* model,
* exactly like tq_generate_continue.
* ============================================================================ */
+/* ChatML / template-marker filter ----------------------------------------
+ *
+ * The model can generate template tokens like `<|im_start|>`, `<|im_end|>`,
+ * `</s>`, etc. as REGULAR text bytes (not special tokens). When
+ * that happens the BPE tokenizer fragments them across multiple tokens,
+ * and a per-token strstr check (like the existing `should_stop` logic)
+ * never matches. The user sees the marker leak into their stream.
+ *
+ * This filter holds the most recent CHAT_LOOKAHEAD bytes of generated
+ * text in `pending` and only flushes bytes that are guaranteed to NOT
+ * be the start of a marker. When a full marker is matched:
+ * - `<|im_start|>` at the very beginning of the response → header
+ * skip mode (drop until next '\n').
+ * - any END marker → emit prefix, drop the rest, set stop_requested.
+ *
+ * Mirrored byte-for-byte with the version in quant.h. ---------------------- */
+#define CHAT_PENDING_CAP 128
+#define CHAT_LOOKAHEAD 32
+
typedef struct {
char* buf;
size_t len;
size_t cap;
- int tainted; /* 1 if accumulation ever failed → buf is incomplete */
+ int tainted;
+ char pending[CHAT_PENDING_CAP];
+ int pending_len;
+ int in_header;
+ int stop_requested;
void (*user_cb)(const char*, void*);
void* user_data;
} chat_accum_t;
-static void chat_accum_callback(const char* tok, void* u) {
- chat_accum_t* ctx = (chat_accum_t*)u;
- if (!tok) return;
- /* Always pass through to the user's callback first — losing tokens
- * from the user's stream because of an INTERNAL realloc failure is
- * far worse than a stale cached_text on the next turn. */
- if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data);
+static void chat_accum_emit(chat_accum_t* ctx, const char* p, int n) {
+ if (n <= 0) return;
+ char tmp[CHAT_PENDING_CAP + 1];
+ if (n > CHAT_PENDING_CAP) n = CHAT_PENDING_CAP;
+ memcpy(tmp, p, (size_t)n);
+ tmp[n] = '\0';
+ if (ctx->user_cb) ctx->user_cb(tmp, ctx->user_data);
if (ctx->tainted) return;
- size_t tlen = strlen(tok);
- if (ctx->len + tlen + 1 > ctx->cap) {
- size_t new_cap = (ctx->cap + tlen + 64) * 2;
+ if (ctx->len + (size_t)n + 1 > ctx->cap) {
+ size_t new_cap = (ctx->cap + (size_t)n + 64) * 2;
char* nb = (char*)realloc(ctx->buf, new_cap);
if (!nb) { ctx->tainted = 1; return; }
- ctx->buf = nb;
- ctx->cap = new_cap;
+ ctx->buf = nb; ctx->cap = new_cap;
}
- memcpy(ctx->buf + ctx->len, tok, tlen);
- ctx->len += tlen;
+ memcpy(ctx->buf + ctx->len, tmp, (size_t)n);
+ ctx->len += (size_t)n;
ctx->buf[ctx->len] = '\0';
}
+static void chat_accum_drop(chat_accum_t* ctx, int n) {
+ if (n <= 0) return;
+ if (n > ctx->pending_len) n = ctx->pending_len;
+ memmove(ctx->pending, ctx->pending + n,
+ (size_t)(ctx->pending_len - n));
+ ctx->pending_len -= n;
+}
+
+/* Return the first offset of marker m in h[0..hlen), or -1. h is NOT
+ * NUL-terminated, so an empty pattern must be rejected up front: with
+ * mlen == 0 the loop would run p up to hlen and read h[hlen], one byte
+ * past the valid data (uninitialized, or out of bounds at full cap). */
+static int chat_find_marker(const char* h, int hlen, const char* m) {
+    int mlen = (int)strlen(m);
+    if (mlen == 0 || hlen < mlen) return -1;
+    for (int p = 0; p + mlen <= hlen; p++) {
+        if (h[p] == m[0] && memcmp(h + p, m, (size_t)mlen) == 0) return p;
+    }
+    return -1;
+}
+
+/* Markers that terminate the response. NOTE: every entry must be
+ * non-empty — an empty string would make chat_find_marker match at the
+ * first stray NUL (pending[] is zero-initialized via memset) and stop
+ * generation after the very first token. */
+static const char* const CHAT_END_MARKERS[] = {
+    "<|im_end|>", "<|eot_id|>", "</s>", "<|endoftext|>",
+    "<|im_start|>", "<|start_header_id|>", "<|eom_id|>",
+    NULL,
+};
+
+static void chat_accum_callback(const char* tok, void* u) {
+    chat_accum_t* ctx = (chat_accum_t*)u;
+    if (!tok || ctx->stop_requested) return;
+    int tlen = (int)strlen(tok);
+    if (tlen == 0) return;
+
+    if (ctx->pending_len + tlen > CHAT_PENDING_CAP) {
+        int emit = ctx->pending_len + tlen - CHAT_PENDING_CAP; /* min flush so tok fits */
+        if (emit < ctx->pending_len - CHAT_LOOKAHEAD) emit = ctx->pending_len - CHAT_LOOKAHEAD;
+        if (emit > ctx->pending_len) emit = ctx->pending_len; /* memcpy below can no longer overflow */
+        if (emit > 0 && !ctx->in_header) chat_accum_emit(ctx, ctx->pending, emit);
+        chat_accum_drop(ctx, emit); /* drop() ignores emit <= 0 */
+    }
+    if (tlen > CHAT_PENDING_CAP) { /* oversized token: bypass the buffer */
+        if (!ctx->in_header) {
+            chat_accum_emit(ctx, ctx->pending, ctx->pending_len);
+            for (int off = 0; off < tlen; off += CHAT_PENDING_CAP)
+                chat_accum_emit(ctx, tok + off, tlen - off); /* emit clamps each call to CAP */
+        }
+        ctx->pending_len = 0; return;
+    }
+    memcpy(ctx->pending + ctx->pending_len, tok, (size_t)tlen);
+    ctx->pending_len += tlen;
+
+    int progress = 1;
+    while (progress) {
+        progress = 0;
+        if (ctx->in_header) {
+            int nl = -1;
+            for (int i = 0; i < ctx->pending_len; i++) {
+                if (ctx->pending[i] == '\n') { nl = i; break; }
+            }
+            if (nl >= 0) {
+                chat_accum_drop(ctx, nl + 1);
+                ctx->in_header = 0;
+                progress = 1;
+            } else {
+                ctx->pending_len = 0;
+                return;
+            }
+        }
+        int em_pos = -1;
+        const char* em_str = NULL;
+        for (int i = 0; CHAT_END_MARKERS[i]; i++) {
+            int p = chat_find_marker(ctx->pending, ctx->pending_len,
+                                     CHAT_END_MARKERS[i]);
+            if (p >= 0 && (em_pos < 0 || p < em_pos)) {
+                em_pos = p; em_str = CHAT_END_MARKERS[i];
+            }
+        }
+        if (em_pos >= 0) {
+            if (em_pos == 0 && ctx->len == 0 && em_str &&
+                strcmp(em_str, "<|im_start|>") == 0) {
+                chat_accum_drop(ctx, (int)strlen("<|im_start|>"));
+                ctx->in_header = 1;
+                progress = 1;
+                continue;
+            }
+            if (em_pos > 0) {
+                chat_accum_emit(ctx, ctx->pending, em_pos);
+            }
+            ctx->pending_len = 0;
+            ctx->stop_requested = 1;
+            return;
+        }
+    }
+
+    if (!ctx->in_header && ctx->pending_len > CHAT_LOOKAHEAD) {
+        int emit = ctx->pending_len - CHAT_LOOKAHEAD;
+        chat_accum_emit(ctx, ctx->pending, emit);
+        chat_accum_drop(ctx, emit);
+    }
+}
+
+static void chat_accum_finish(chat_accum_t* ctx) {
+ if (ctx->in_header) {
+ ctx->pending_len = 0;
+ return;
+ }
+ if (ctx->pending_len > 0) {
+ chat_accum_emit(ctx, ctx->pending, ctx->pending_len);
+ ctx->pending_len = 0;
+ }
+}
+
int tq_generate_chat_text(tq_model_t* model,
tq_tokenizer_t* tokenizer,
tq_state_t* state,
@@ -905,9 +1034,10 @@ int tq_generate_chat_text(tq_model_t* model,
/* Wrap user callback to capture generated text into a buffer for the
* next call's cached_text update. */
- chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0, .tainted = 0,
- .user_cb = config->on_token,
- .user_data = config->user_data };
+ chat_accum_t accum;
+ memset(&accum, 0, sizeof(accum));
+ accum.user_cb = config->on_token;
+ accum.user_data = config->user_data;
void (*orig_cb)(const char*, void*) = config->on_token;
void* orig_ud = config->user_data;
config->on_token = chat_accum_callback;
@@ -1039,6 +1169,9 @@ int tq_generate_chat_text(tq_model_t* model,
int piece_len = (int)strlen(piece ? piece : "");
if (config->on_token && piece) config->on_token(piece, config->user_data);
+ /* The chat_accum filter may have detected an end marker
+ * spanning multiple tokens — break before forwarding more. */
+ if (accum.stop_requested) break;
if (output && piece && output_pos + piece_len < output_size - 1) {
memcpy(output + output_pos, piece, piece_len);
output_pos += piece_len;
@@ -1088,6 +1221,11 @@ int tq_generate_chat_text(tq_model_t* model,
output, output_size);
}
+ /* Drain the marker filter's lookahead buffer before reading
+ * accum.buf for the cached_text update. Without this, the last
+ * ~32 bytes of clean output would be silently lost. */
+ chat_accum_finish(&accum);
+
/* Restore the original callback before returning to caller */
config->on_token = orig_cb;
config->user_data = orig_ud;
diff --git a/wasm/quant.wasm b/wasm/quant.wasm
index f018484..477218d 100755
Binary files a/wasm/quant.wasm and b/wasm/quant.wasm differ