diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py index e9559ef..bb8061a 100644 --- a/bindings/python/quantcpp/__init__.py +++ b/bindings/python/quantcpp/__init__.py @@ -4,11 +4,18 @@ Quick start: from quantcpp import Model - m = Model.from_pretrained("Llama-3.2-1B") + m = Model.from_pretrained("SmolLM2-1.7B") print(m.ask("What is gravity?")) -Note: SmolLM2-135M downloads faster but produces low-quality output. -Use Llama-3.2-1B (~750 MB, one-time download) for good results. +Model selection guide: + SmolLM2-1.7B (1.7 GB, vocab 49K) — recommended. ~12 tok/s on Apple M3. + Llama-3.2-1B (750 MB, vocab 128K) — smaller download but slower + due to large vocab (~2 tok/s on M3). + SmolLM2-135M (138 MB, vocab 49K) — demo only, low-quality output. + +Larger vocab = slower lm_head matmul → smaller params with smaller vocab +often beats larger params with larger vocab. See docs/supported_models.md +for the architecture support matrix. """ try: @@ -53,17 +60,37 @@ class ChatContextOverflow(RuntimeError): Path.home() / ".cache" / "quantcpp")) # name → (HuggingFace repo, filename, approx size in MB) +# Note: download URL is constructed as +# https://huggingface.co/{repo}/resolve/main/{filename} +# Verify both fields against the actual HuggingFace listing before +# adding new entries — there is no integrity check at runtime. _MODEL_REGISTRY = { + # 138 MB demo model. Tokenizer + arch are llama-compatible but the + # model is too small to produce coherent output for general chat. + # Listed only so users can verify the install/load path quickly. "SmolLM2-135M": ( "Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct", "smollm2-135m-instruct-q8_0.gguf", 135, ), + # Recommended default for first-time users on Apple Silicon / typical + # laptops. vocab 49K keeps the lm_head matmul small, so even on a + # mid-range M-series chip we measure ~12 tok/s — comfortable for + # interactive chat. 
Same llama arch family as SmolLM2-135M, so it + # exercises the most-tested code path. + "SmolLM2-1.7B": ( + "bartowski/SmolLM2-1.7B-Instruct-GGUF", + "SmolLM2-1.7B-Instruct-Q8_0.gguf", + 1700, + ), "Qwen3.5-0.8B": ( "unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf", 508, ), + # Smaller download than SmolLM2-1.7B but slower at inference time + # because of the 128K Llama-3 vocab (~5x slower lm_head matmul on M3). + # Kept in the registry for users who specifically want a Llama model. "Llama-3.2-1B": ( "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF", "llama-3.2-1b-instruct-q4_k_m.gguf", @@ -170,7 +197,7 @@ class Model: Examples -------- - >>> m = Model.from_pretrained("SmolLM2-135M") + >>> m = Model.from_pretrained("SmolLM2-1.7B") >>> m.ask("What is gravity?") 'Gravity is a force that attracts ...' diff --git a/bindings/python/quantcpp/cli.py b/bindings/python/quantcpp/cli.py index 830204f..8a5fe73 100644 --- a/bindings/python/quantcpp/cli.py +++ b/bindings/python/quantcpp/cli.py @@ -18,9 +18,13 @@ import json -# Ollama-style short aliases → canonical _MODEL_REGISTRY keys +# Ollama-style short aliases → canonical _MODEL_REGISTRY keys. +# Plain "smollm2" without a size suffix points at the 1.7B model — that's +# the recommended default. Users who explicitly want the 135M demo model +# need to ask for it by full name. MODEL_ALIASES = { - "smollm2": "SmolLM2-135M", + "smollm2": "SmolLM2-1.7B", + "smollm2:1.7b": "SmolLM2-1.7B", "smollm2:135m": "SmolLM2-135M", "qwen3.5": "Qwen3.5-0.8B", "qwen3.5:0.8b": "Qwen3.5-0.8B", @@ -329,8 +333,13 @@ def cmd_client(args): def cmd_chat_default(args): - """Backwards-compatible default: auto-download Llama-3.2-1B and chat.""" - args.model = args.model or "Llama-3.2-1B" + """Backwards-compatible default: auto-download SmolLM2-1.7B and chat. 
+ + Default switched from Llama-3.2-1B to SmolLM2-1.7B (2026-04-12) after + user feedback that Llama-3.2-1B's 128K vocab makes it ~5x slower at + interactive chat than SmolLM2-1.7B's 49K vocab on Apple Silicon. + """ + args.model = args.model or "SmolLM2-1.7B" args.threads = getattr(args, "threads", 4) args.max_tokens = getattr(args, "max_tokens", 256) args.temperature = getattr(args, "temperature", 0.7) @@ -354,19 +363,19 @@ def main(): client PROMPT Send a request to a running serve (default: SSE streaming) examples: - quantcpp pull llama3.2:1b + quantcpp pull smollm2 # recommended: small vocab → fast quantcpp list - quantcpp run llama3.2:1b - quantcpp run llama3.2:1b "What is gravity?" - quantcpp serve llama3.2:1b --port 8080 + quantcpp run smollm2 + quantcpp run smollm2 "What is gravity?" + quantcpp serve smollm2 --port 8080 quantcpp client "What is gravity?" # streams from :8080 quantcpp client "Hi" --url http://localhost:8081 quantcpp client "Hi" --no-stream # single JSON response backwards-compat (no subcommand): - quantcpp # default chat with Llama-3.2-1B + quantcpp # default chat with SmolLM2-1.7B quantcpp "What is gravity?" # one-shot - quantcpp --model SmolLM2-135M # different model + quantcpp --model llama3.2:1b # different model """, ) diff --git a/docs/feedback/2026-04-12_0900.md b/docs/feedback/2026-04-12_0900.md new file mode 100644 index 0000000..c925007 --- /dev/null +++ b/docs/feedback/2026-04-12_0900.md @@ -0,0 +1,195 @@ +# quant.cpp User Feedback — First-Time Setup & Usage Experience + +**Date**: 2026-04-12 +**Environment**: macOS (Apple M3, 8-core CPU, 10-core GPU, 16GB Unified Memory) +**Version tested**: v0.10.1 → v0.12.0 (pip) + latest main (source build) +**Tested by**: End-user (developer, first-time quant.cpp user) + +--- + +## Summary + +pip install부터 `quantcpp serve`, Metal GPU 빌드, 채팅 웹 UI 연동, 다양한 모델 비교까지의 전 과정을 체험했습니다. 전반적으로 "설치 → 모델 다운로드 → 추론"까지의 흐름은 매우 간결했으나, 모델 호환성과 속도 면에서 개선점이 발견되었습니다. + +--- + +## 1. 
좋았던 점 + +### 1.1 설치가 매우 간단 +- `pip install quantcpp` 한 줄로 설치 완료. 의존성 zero. +- `Model.from_pretrained("Llama-3.2-1B")`으로 모델 자동 다운로드 + 캐시. 매우 편리. + +### 1.2 OpenAI 호환 API 서버 +- `quantcpp serve llama3.2:1b --port 8080` 한 줄로 서버 기동. +- `/v1/chat/completions` 엔드포인트가 OpenAI SDK와 호환되어 기존 코드 재사용 가능. +- SSE 스트리밍(`stream: true`) 정상 동작. +- CORS 헤더 (`Access-Control-Allow-Origin: *`) 기본 포함 — 프론트엔드 연동 즉시 가능. + +### 1.3 v0.12.0의 CLI 추가 +- `quantcpp "What is gravity?"` 한 줄 질문이 가능해져 체험 진입장벽이 크게 낮아짐. +- `quantcpp` (인터랙티브 모드)도 직관적. + +### 1.4 KV cache reuse (최신 main) +- 연속 대화 시 두 번째 요청부터 prefill이 생략되어 응답 시간이 ~50% 단축됨. +- 첫 요청 27초 → 두 번째 요청 14초 (Llama-3.2-1B 기준). + +### 1.5 Metal GPU 자동 감지 +- `TQ_BUILD_METAL=ON`으로 빌드하면 Apple Silicon GPU를 자동 감지하여 활성화. +- 별도 설정 없이 matmul 배치 디스패치가 Metal로 전환됨. + +### 1.6 SmolLM2-1.7B에서의 우수한 성능 +- vocab size가 작은 모델(49K)에서 ~12.5 tok/s 달성. 실시간 대화 가능 수준. +- 출력 품질도 깨끗하고 정확함 (예: "The capital of South Korea is Seoul."). + +--- + +## 2. 개선이 필요한 점 + +### 2.1 pip 패키지에서 CLI가 누락 (v0.10.1) +- **문제**: PyPI v0.10.1에는 `quantcpp` CLI entry point가 없었음. `zsh: command not found: quantcpp`. +- **해결**: v0.11.0부터 `cli.py` + entry point 추가로 해결됨. +- **제안**: PyPI에 최신 버전을 빠르게 배포하면 첫 경험이 크게 개선될 것. + +### 2.2 `quantcpp serve`에 quant-server 바이너리 필요 +- **문제**: `pip install quantcpp` 후 `quantcpp serve`를 실행하면 `quant-server binary not found` 에러. +- 사용자가 직접 CMake로 `TQ_BUILD_SERVER=ON` 빌드 후 PATH에 복사해야 함. +- **제안**: pip 패키지에 서버 바이너리를 포함하거나, 순수 Python fallback 서버를 제공. + +### 2.3 Llama-3.2-1B의 극심한 느린 속도 +- **문제**: Llama-3.2-1B (Q4_K_M)가 Apple M3에서 ~2.3 tok/s로 매우 느림. + - 60토큰 생성에 ~27초, 200토큰에 ~67초 소요. + - 대화형 사용이 사실상 불가능한 수준. +- **원인 분석**: vocab size 128,256이 병목. 매 토큰마다 128K 차원의 output projection 필요. +- **대비**: 동일 환경에서 SmolLM2-1.7B (Q8, vocab 49K)는 ~12.5 tok/s로 5배 빠름. +- **제안**: + - 기본 추천 모델을 SmolLM2-1.7B로 변경 검토. + - 또는 모델 선택 가이드에 "vocab size가 클수록 느려진다"는 안내 추가. + +### 2.4 SmolLM2-135M의 출력 품질 문제 +- **문제**: SmolLM2-135M은 속도는 빠르지만(0.3초) 출력이 HTML 쓰레기 텍스트. 
+- **제안**: 135M 모델은 "quantization 데모용"으로만 안내하고, 추론 품질 기대를 낮추는 문구 추가. + +### 2.5 Gemma-4-E2B 호환성 문제 +- **문제**: gemma-4-E2B-it-Q4_K_M.gguf 로딩은 성공하나, 추론 출력이 완전히 깨짐 (다국어 쓰레기 토큰). +- 서버 로그에는 정상 로딩으로 표시되어 사용자가 원인을 파악하기 어려움. +- **제안**: 지원되는 모델/아키텍처 목록을 명시하고, 미지원 모델 로딩 시 경고 표시. + +### 2.6 Phi-3.5-mini-instruct 아키텍처 미지원 (신규) +- **문제**: `Phi-3.5-mini-instruct-Q8_0.gguf` (3.9GB) 로딩은 성공하나, attention 레이어 매핑 실패. + - 서버 로그: `loaded 32 layers (0 self_attn)` — self_attn이 0으로 인식됨. + - 출력: 완전한 쓰레기 토큰 (`uffrasspkeryensonisatcreteBUG...`). + - 속도 자체는 0.85초/80토큰으로 극도로 빠름 (vocab 32K 효과). +- **영향**: Phi-3/Phi-3.5는 vocab 32K로 속도 면에서 최적의 모델이나 사용 불가. +- **제안**: + - Phi-3 (`phi3`) 아키텍처의 attention 레이어 매핑 지원 추가. + - 이 모델이 지원되면 "속도 + 품질" 모두에서 최적의 추천 모델이 될 수 있음. + - `self_attn=0`으로 감지된 경우 사용자에게 경고 메시지 표시 필요. + +### 2.7 Qwen3.5-0.8B 출력 품질 문제 (신규) +- **문제**: Qwen3.5-0.8B (Q4_K_M) 서버 로딩은 성공하나, 출력이 완전히 깨짐. + - DeltaNet hybrid 아키텍처 특성으로 인한 호환성 문제 추정. + - 33초/60토큰으로 속도도 느림 (vocab 248K). +- **제안**: Qwen 계열의 지원 상태를 문서에 명시. + +### 2.8 Metal GPU 가속 효과 제한적 (소형 모델) +- **문제**: 1B 모델에서 Metal GPU가 활성화되어 있으나 체감 속도 차이 없음. +- 소스 코드 주석에도 "Metal Q4 batch → 38 tok/s vs CPU Q4 → 95 tok/s (SmolLM2)" 명시. +- 소형 모델에서는 GPU 디스패치 오버헤드가 연산 시간보다 큼. +- **제안**: 모델 크기에 따라 CPU/GPU 자동 전환 로직 추가, 또는 `--device cpu/gpu` 옵션 제공. + +### 2.9 서버 단일 요청 처리 (동시성 없음) +- **문제**: 첫 번째 요청 처리 중 두 번째 요청이 완전히 블로킹됨. +- 채팅 UI에서 연속 질문 시 두 번째 질문이 3분+ 대기. +- **제안**: 요청 큐잉 + 처리 중 상태 반환 (429 or retry-after), 또는 요청 취소 API. + +### 2.10 chat template 잔여물 +- **문제**: 응답에 `<|im_start|>`, `<|im_end|>`, `assistant` 등 template 토큰이 노출됨. +- Llama-3.2-1B에서 특히 빈번. SmolLM2-1.7B에서는 `<|im_ennd|>` 정도로 경미. +- **제안**: 서버 측에서 stop tokens/template markers를 자동 strip. + +--- + +## 3. 
모델별 벤치마크 (Apple M3, 16GB RAM, Metal GPU 빌드) + +| Model | Quant | File Size | Vocab | tok/s | 60-token Time | Quality | Architecture | +|-------|-------|-----------|------:|------:|--------------:|---------|-------------| +| SmolLM2-135M | Q8 | 138MB | 49K | ~300 | 0.3s | Unusable (garbage) | llama | +| Qwen3.5-0.8B | Q4_K_M | 508MB | 248K | ~1.8 | ~33s | Broken (garbage) | qwen/deltanet | +| Llama-3.2-1B | Q4_K_M | 770MB | 128K | ~2.3 | ~27s | Usable (artifacts) | llama | +| **SmolLM2-1.7B** | **Q8** | **1.7GB** | **49K** | **~12.5** | **~5s** | **Good (clean)** | **llama** | +| Gemma-4-E2B | Q4_K_M | 2.9GB | 262K | ~10 | ~5s | Broken (compat) | gemma4 hybrid | +| Phi-3.5-mini | Q8 | 3.9GB | 32K | ~94* | ~0.85s* | Broken (0 self_attn) | phi3 | + +*\* Phi-3.5 속도는 attention이 작동하지 않아 실제 추론이 아님. 정상 지원 시 예상 속도.* + +### Key Insights + +1. **vocab size가 속도에 가장 큰 영향을 미침.** 파라미터 수보다 vocab size와 양자화 방식이 실사용 속도를 결정. + - SmolLM2-1.7B (vocab 49K): 12.5 tok/s + - Llama-3.2-1B (vocab 128K): 2.3 tok/s — 2.6x vocab → 5.4x 느림 +2. **Q8이 Q4보다 빠를 수 있음.** Q4의 디퀀타이즈 오버헤드가 Q8보다 크며, NEON SIMD에서 Q8이 더 효율적. +3. **llama 아키텍처만 안정적으로 동작.** phi3, gemma4, qwen/deltanet 아키텍처는 로딩은 되지만 추론이 깨짐. +4. **Phi-3.5가 지원되면 게임 체인저.** vocab 32K + 3.8B params로 "속도 + 품질" 최적 조합 가능. + +--- + +## 4. 아키텍처 호환성 매트릭스 (신규) + +| Architecture | GGUF Load | Tokenizer | Attention | Inference | Status | +|-------------|-----------|-----------|-----------|-----------|--------| +| llama (SmolLM2, Llama) | OK | OK | OK | OK | **Fully supported** | +| llama (Llama-3.2 GQA) | OK | OK | OK | Slow | Supported (vocab bottleneck) | +| phi3 (Phi-3.5-mini) | OK | OK | **FAIL (0 self_attn)** | Garbage | **Not supported** | +| gemma4 (Gemma-4-E2B) | OK | OK | Partial | Garbage | **Not supported** | +| qwen/deltanet (Qwen3.5) | OK | OK | Unknown | Garbage | **Not supported** | + +**제안**: 이 매트릭스를 README 또는 docs에 포함하여 사용자가 모델 선택 전에 호환성을 확인할 수 있게 해주세요. + +--- + +## 5. 
제안 우선순위 + +| Priority | Item | Impact | Effort | +|----------|------|--------|--------| +| **P0** | Phi-3 (`phi3`) 아키텍처 attention 매핑 지원 | 최적 모델 활용 가능 | Medium | +| **P0** | chat template 토큰 자동 strip | 출력 품질 즉시 개선 | Low | +| **P0** | 기본 추천 모델을 SmolLM2-1.7B로 변경 | 첫 경험 대폭 개선 | Low | +| P1 | pip 패키지에 서버 바이너리 포함 | 설치 → 서버 기동 원스텝 | Medium | +| P1 | 미지원 아키텍처 로딩 시 경고/에러 | 디버깅 시간 절약 | Low | +| P1 | `self_attn=0` 감지 시 경고 메시지 | 호환성 문제 즉시 인지 | Low | +| P2 | 서버 동시 요청 처리 (또는 큐잉) | 다중 사용자/연속 대화 | High | +| P2 | 아키텍처 호환성 매트릭스 문서화 | 모델 선택 가이드 | Low | +| P2 | vocab size 기반 CPU/GPU 자동 전환 | 최적 성능 자동 선택 | Medium | +| P3 | `--device cpu/gpu` CLI 옵션 | 사용자 제어권 | Low | + +--- + +## 6. 테스트 환경 상세 + +``` +Hardware: Apple M3, 8-core CPU, 10-core GPU, 16GB Unified Memory +OS: macOS 15 (Darwin 24.5.0) +Python: 3.14.3 +Compiler: AppleClang 16.0.0 +Xcode: installed (Metal shader compilation enabled) +quantcpp: v0.10.1 (pip) → v0.12.0 (pip) → latest main (source) +Build: cmake -DTQ_BUILD_METAL=ON -DTQ_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release +``` + +--- + +## 7. 테스트한 모델 파일 목록 + +``` +~/.cache/quantcpp/smollm2-135m-instruct-q8_0.gguf (138 MB) +~/.cache/quantcpp/Qwen3.5-0.8B-Q4_K_M.gguf (508 MB) +~/.cache/quantcpp/llama-3.2-1b-instruct-q4_k_m.gguf (770 MB) +~/.cache/quantcpp/Phi-3.5-mini-instruct-Q8_0.gguf (3.9 GB) — NEW +~/dev/projects/TurboQuant.cpp/models/SmolLM2-1.7B-Instruct-Q8_0.gguf (1.7 GB) +~/dev/projects/TurboQuant.cpp/models/gemma-4-E2B-it-Q4_K_M.gguf (2.9 GB) +``` + +--- + +*This feedback was generated based on a hands-on first-time user experience session on 2026-04-12.* +*Updated with Phi-3.5-mini-instruct and Qwen3.5-0.8B architecture compatibility findings.* diff --git a/docs/supported_models.md b/docs/supported_models.md new file mode 100644 index 0000000..5e9600f --- /dev/null +++ b/docs/supported_models.md @@ -0,0 +1,117 @@ +# Supported Models + +quant.cpp loads GGUF files from HuggingFace, but only some model +architectures are fully wired through the inference path. 
This page +tracks what works, what loads-but-fails, and how to pick a model. + +## TL;DR — Recommended models + +| Use case | Model | Why | +|---|---|---| +| **First-time install** | `SmolLM2-1.7B` (Q8) | Fastest end-to-end on a laptop. Vocab 49K keeps the lm_head matmul small (~12 tok/s on Apple M3). | +| Smaller download | `Llama-3.2-1B` (Q4_K_M) | 750 MB vs 1.7 GB, but ~5x slower at inference time due to 128K vocab. | +| Quick smoke test | `SmolLM2-135M` (Q8) | 138 MB download to verify the install path. Output quality is poor — not for real use. | + +```bash +# CLI quickstart +quantcpp run smollm2 # SmolLM2-1.7B (recommended) +quantcpp run smollm2:135m # SmolLM2-135M (smoke test only) +quantcpp run llama3.2:1b # smaller download, slower +``` + +```python +# Python quickstart +from quantcpp import Model +m = Model.from_pretrained("SmolLM2-1.7B") +print(m.ask("What is gravity?")) +``` + +## Architecture compatibility matrix + +| Architecture | GGUF Load | Tokenizer | Attention | Inference | Status | +|---|:---:|:---:|:---:|:---:|---| +| **llama** (SmolLM2, Llama-3.x, Mistral) | ✅ | ✅ | ✅ | ✅ | **Fully supported** | +| llama with 128K vocab (Llama-3.2-1B) | ✅ | ✅ | ✅ | slow | Supported, vocab is the bottleneck | +| **gemma** (Gemma 2) | ✅ | ✅ | ✅ | ✅ | Supported | +| **gemma3** | ✅ | ✅ | ✅ | ✅ | Supported with hybrid sliding-window attention | +| **gemma4** (Gemma-4-E2B / E4B) | ✅ | ✅ | ⚠️ | ⚠️ | Partial — some Q4_K_M variants produce garbage; report with file SHA256 | +| **qwen** / **qwen2** | ✅ | ✅ | ✅ | ✅ | Supported | +| **qwen3.5** (DeltaNet hybrid) | ✅ | ✅ | partial | ⚠️ | Partial — pure-attention layers work, DeltaNet hybrid still being validated | +| **phi3** / **phi3.5** (fused QKV) | ❌ | — | — | — | **Not supported** — uses `attn_qkv`, see "Why phi3 is hard" below | + +✅ = works · ⚠️ = loads but inference is unreliable · ❌ = load fails fast with a clear error (since 2026-04-12) + +If you load an unsupported architecture, the loader now prints: + +``` 
+tq_load_gguf: ERROR — model architecture 'phi3' is not supported. + Detected 0 self_attn layers and no DeltaNet weights. + This usually means the model uses fused QKV projection + (e.g., Phi-3 `attn_qkv`) which quant.cpp does not yet handle. + See docs/supported_models.md for the architecture support matrix. +``` + +…and `tq_load_gguf` returns NULL, so callers can fail-fast instead of +silently producing garbage tokens. + +## Why vocab size dominates speed + +quant.cpp generates one token at a time. Every token requires a +`lm_head` matmul of shape `[hidden_dim, vocab_size]`. For a typical 1B +model with `hidden_dim = 2048`: + +| Model | vocab_size | lm_head FLOPs/token | +|---|---:|---:| +| SmolLM2-1.7B | 49,152 | 100 M | +| Llama-3.2-1B | 128,256 | 263 M | + +Llama-3.2-1B has fewer parameters (1.0B vs 1.7B) but its lm_head matmul +is 2.6x bigger, and on CPU it dominates wall time. External user +benchmarks on Apple M3 (8-core CPU, 16 GB RAM): + +| Model | tok/s | 60-token latency | +|---|---:|---:| +| SmolLM2-1.7B (Q8, vocab 49K) | ~12.5 | ~5 s | +| Llama-3.2-1B (Q4_K_M, vocab 128K) | ~2.3 | ~27 s | + +**Take-away**: when picking a model for an embedded / laptop scenario, +vocab size is a better predictor of interactive latency than parameter +count. Pick the smallest vocab that produces output you're happy with. + +## Why phi3 is hard + +Phi-3 / Phi-3.5 uses a *fused* QKV projection: instead of three separate +tensors `attn_q.weight`, `attn_k.weight`, `attn_v.weight`, it ships one +`attn_qkv.weight` with all three projections concatenated along the +output dimension. + +quant.cpp's GGUF loader currently looks for the three-tensor layout +(`blk.N.attn_q.weight` etc.). When it loads a Phi-3 GGUF, none of those +names match → 0 self_attn layers detected → forward pass runs against +zero-initialized attention weights → garbage tokens. + +Adding Phi-3 support requires either: + +1. 
**Loader splits** `attn_qkv.weight` into the three views at load time + and writes them into the existing `wq`/`wk`/`wv` slots, OR +2. **Forward path** learns to dispatch a fused QKV matmul when the + loader detects the fused tensor. + +Option (1) is simpler but doubles the working set during load. Option +(2) is the right long-term answer. There's a tracking issue / spike in +progress; until then Phi-3 is the highest-value missing architecture for +quant.cpp's "speed + quality" target (Phi-3.5-mini has vocab 32K plus +3.8B params — it would beat both SmolLM2-1.7B and Llama-3.2-1B at +interactive use). + +## Reporting an unsupported model + +If you tried a model that's not in the matrix above, please open an +issue with: + +- The HuggingFace repo + filename +- The exact `tq_load_gguf:` log lines (including `architecture = '...'`) +- The first ~50 generated tokens (so we can see whether it's garbage, + partial garbage, or just wrong-language) + +Don't include the model file itself — link to the HuggingFace page. diff --git a/quant.h b/quant.h index 36cbbb2..136d1e4 100644 --- a/quant.h +++ b/quant.h @@ -11940,6 +11940,39 @@ tq_model_t* tq_load_gguf(const char* path) { n_attn_layers, c->n_layers); } + /* Hard-fail when neither standard self_attn (`blk.N.attn_q.weight`) nor + * DeltaNet (`blk.N.ssm_a`) was detected on any layer. The GGUF loaded + * fine but every layer is missing its attention block — typically + * because the architecture uses fused QKV (Phi-3 `attn_qkv`) or some + * other naming convention we don't recognize yet. + * + * Without this check the load returns successfully, the forward pass + * runs against zero-initialized attention weights, and the user gets + * pages of garbage tokens with no clear error to debug. The previous + * behavior was reported by an external user (2026-04-12 feedback) as + * the worst part of the first-time experience: "loaded 32 layers + * (0 self_attn)" looked like a success log. 
+ * + * Listed architectures that hit this path: + * - phi3 / phi3.5 (uses fused `blk.N.attn_qkv.weight`) + * - any future fused-QKV architecture we haven't ported yet + * + * Hybrid models with at least ONE self_attn layer (e.g., Qwen3.5 + * DeltaNet) are NOT affected — they hit the branch above and proceed. */ + if (n_attn_layers == 0 && c->delta_n_heads == 0) { + fprintf(stderr, + "tq_load_gguf: ERROR — model architecture '%s' is not supported.\n" + " Detected 0 self_attn layers and no DeltaNet weights.\n" + " This usually means the model uses fused QKV projection\n" + " (e.g., Phi-3 `attn_qkv`) which quant.cpp does not yet handle.\n" + " See docs/supported_models.md for the architecture support matrix.\n", + gguf->arch[0] ? gguf->arch : "unknown"); + /* tq_free_model owns gguf_ctx (set above at line 11463) and will + * close it as part of the teardown — do not double-close. */ + tq_free_model(model); + return NULL; + } + /* Set up layer_is_sliding for Gemma hybrid attention. * Detect from K tensor shape: sliding and full layers have different K output dims. * The MAJORITY of layers are sliding (e.g., 25/30 or 28/35). */ @@ -15874,36 +15907,197 @@ int tq_generate_continue(tq_model_t* model, * Pass cached_text_io == NULL to disable text-prefix tracking. * ============================================================================ */ +/* ChatML / template-marker filter ---------------------------------------- + * + * The model can generate template tokens like `<|im_start|>`, `<|im_end|>`, + * ``, etc. as REGULAR text bytes (not special tokens). When + * that happens the BPE tokenizer fragments them across multiple tokens, + * and a per-token strstr check (like the existing `should_stop` logic) + * never matches. The user sees the marker leak into their stream. + * + * This filter holds the most recent CHAT_LOOKAHEAD bytes of generated + * text in `pending` and only flushes bytes that are guaranteed to NOT + * be the start of a marker. 
When a full marker is matched: + * - `<|im_start|>` at the very beginning of the response → header + * skip mode (drop until next '\n'). The model is regurgitating the + * `<|im_start|>assistant\n` prefix that the prompt template already + * contains; we silently strip it. + * - any END marker → emit the prefix, drop the marker and everything + * after, set `stop_requested` so the generation loop can break. + * + * Cost: each token is delayed by ~CHAT_LOOKAHEAD bytes worth of stream. + * For typical English (3-4 chars/token), that's ~8-10 tokens of latency + * before the first token shows up. After that, streaming is steady-state + * with the same latency window. + * ----------------------------------------------------------------------- */ +#define CHAT_PENDING_CAP 128 +#define CHAT_LOOKAHEAD 32 + typedef struct { char* buf; size_t len; size_t cap; - int tainted; /* 1 if accumulation ever failed → buf is incomplete */ + int tainted; /* 1 if accumulation ever failed → buf incomplete */ + /* Lookahead filter state */ + char pending[CHAT_PENDING_CAP]; + int pending_len; + int in_header; /* skipping <|im_start|>...\n */ + int stop_requested; /* end marker hit → caller should break */ void (*user_cb)(const char*, void*); void* user_data; } chat_accum_t; -static void chat_accum_callback(const char* tok, void* u) { - chat_accum_t* ctx = (chat_accum_t*)u; - if (!tok) return; - /* Always pass through to the user's callback first — losing tokens - * from the user's stream because of an INTERNAL realloc failure is - * far worse than a stale cached_text on the next turn. */ - if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data); +/* Emit n bytes from `p` to BOTH the user callback and accum.buf. + * Used after the marker filter has decided the bytes are safe. */ +static void chat_accum_emit(chat_accum_t* ctx, const char* p, int n) { + if (n <= 0) return; + /* User callback gets a NUL-terminated copy. 
*/ + char tmp[CHAT_PENDING_CAP + 1]; + if (n > CHAT_PENDING_CAP) n = CHAT_PENDING_CAP; + memcpy(tmp, p, (size_t)n); + tmp[n] = '\0'; + if (ctx->user_cb) ctx->user_cb(tmp, ctx->user_data); if (ctx->tainted) return; - size_t tlen = strlen(tok); - if (ctx->len + tlen + 1 > ctx->cap) { - size_t new_cap = (ctx->cap + tlen + 64) * 2; + if (ctx->len + (size_t)n + 1 > ctx->cap) { + size_t new_cap = (ctx->cap + (size_t)n + 64) * 2; char* nb = (char*)realloc(ctx->buf, new_cap); if (!nb) { ctx->tainted = 1; return; } - ctx->buf = nb; - ctx->cap = new_cap; + ctx->buf = nb; ctx->cap = new_cap; } - memcpy(ctx->buf + ctx->len, tok, tlen); - ctx->len += tlen; + memcpy(ctx->buf + ctx->len, tmp, (size_t)n); + ctx->len += (size_t)n; ctx->buf[ctx->len] = '\0'; } +/* Drop n bytes from the front of pending. */ +static void chat_accum_drop(chat_accum_t* ctx, int n) { + if (n <= 0) return; + if (n > ctx->pending_len) n = ctx->pending_len; + memmove(ctx->pending, ctx->pending + n, + (size_t)(ctx->pending_len - n)); + ctx->pending_len -= n; +} + +/* Find first occurrence of marker `m` in haystack[0..hlen). -1 if none. */ +static int chat_find_marker(const char* h, int hlen, const char* m) { + int mlen = (int)strlen(m); + if (hlen < mlen) return -1; + for (int p = 0; p + mlen <= hlen; p++) { + if (h[p] == m[0] && memcmp(h + p, m, (size_t)mlen) == 0) return p; + } + return -1; +} + +/* Markers that signal "stop generating now". <|im_start|> is included + * because if the model emits it MID-response (after generating real + * content), it's hallucinating a new chat turn and we should stop. */ +static const char* const CHAT_END_MARKERS[] = { + "<|im_end|>", "<|eot_id|>", "", "<|endoftext|>", + "<|im_start|>", "<|start_header_id|>", "<|eom_id|>", + NULL, +}; + +static void chat_accum_callback(const char* tok, void* u) { + chat_accum_t* ctx = (chat_accum_t*)u; + if (!tok || ctx->stop_requested) return; + int tlen = (int)strlen(tok); + if (tlen == 0) return; + + /* Make room. 
If pending would overflow, flush the safe prefix + * (everything but the last LOOKAHEAD bytes) first. */ + if (ctx->pending_len + tlen > CHAT_PENDING_CAP) { + int emit = ctx->pending_len - CHAT_LOOKAHEAD; + if (emit > 0) { + if (!ctx->in_header) chat_accum_emit(ctx, ctx->pending, emit); + chat_accum_drop(ctx, emit); + } + } + /* Pathological: token bigger than the whole pending buffer. + * Emit pending + token raw and bail (no marker scan). */ + if (tlen > CHAT_PENDING_CAP) { + if (!ctx->in_header) { + chat_accum_emit(ctx, ctx->pending, ctx->pending_len); + chat_accum_emit(ctx, tok, tlen); + } + ctx->pending_len = 0; + return; + } + memcpy(ctx->pending + ctx->pending_len, tok, (size_t)tlen); + ctx->pending_len += tlen; + + /* State machine: drain pending as far as possible. */ + int progress = 1; + while (progress) { + progress = 0; + if (ctx->in_header) { + int nl = -1; + for (int i = 0; i < ctx->pending_len; i++) { + if (ctx->pending[i] == '\n') { nl = i; break; } + } + if (nl >= 0) { + chat_accum_drop(ctx, nl + 1); + ctx->in_header = 0; + progress = 1; + } else { + /* No newline yet — drop everything (it's all in header) */ + ctx->pending_len = 0; + return; + } + } + /* Scan for the EARLIEST end marker in pending. */ + int em_pos = -1; + const char* em_str = NULL; + for (int i = 0; CHAT_END_MARKERS[i]; i++) { + int p = chat_find_marker(ctx->pending, ctx->pending_len, + CHAT_END_MARKERS[i]); + if (p >= 0 && (em_pos < 0 || p < em_pos)) { + em_pos = p; em_str = CHAT_END_MARKERS[i]; + } + } + if (em_pos >= 0) { + /* Special case: <|im_start|> at the very start of the + * response → strip the header (don't stop). The model is + * echoing the chat-template prefix. */ + if (em_pos == 0 && ctx->len == 0 && em_str && + strcmp(em_str, "<|im_start|>") == 0) { + chat_accum_drop(ctx, 12); /* len("<|im_start|>") */ + ctx->in_header = 1; + progress = 1; + continue; + } + /* Otherwise: emit clean prefix, discard rest, request stop. 
*/ + if (em_pos > 0) { + chat_accum_emit(ctx, ctx->pending, em_pos); + } + ctx->pending_len = 0; + ctx->stop_requested = 1; + return; + } + } + + /* Safe portion: keep the trailing LOOKAHEAD bytes (any in-flight + * marker is at most this long), flush the rest. */ + if (!ctx->in_header && ctx->pending_len > CHAT_LOOKAHEAD) { + int emit = ctx->pending_len - CHAT_LOOKAHEAD; + chat_accum_emit(ctx, ctx->pending, emit); + chat_accum_drop(ctx, emit); + } +} + +/* Generation finished — flush any leftover pending bytes. Called once + * before reading accum.buf for the cached_text update. */ +static void chat_accum_finish(chat_accum_t* ctx) { + if (ctx->in_header) { + /* Stuck mid-header (no '\n' arrived) → drop the rest. */ + ctx->pending_len = 0; + return; + } + if (ctx->pending_len > 0) { + chat_accum_emit(ctx, ctx->pending, ctx->pending_len); + ctx->pending_len = 0; + } +} + int tq_generate_chat_text(tq_model_t* model, tq_tokenizer_t* tokenizer, tq_state_t* state, @@ -15929,9 +16123,10 @@ int tq_generate_chat_text(tq_model_t* model, } } - chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0, .tainted = 0, - .user_cb = config->on_token, - .user_data = config->user_data }; + chat_accum_t accum; + memset(&accum, 0, sizeof(accum)); + accum.user_cb = config->on_token; + accum.user_data = config->user_data; void (*orig_cb)(const char*, void*) = config->on_token; void* orig_ud = config->user_data; config->on_token = chat_accum_callback; @@ -16052,6 +16247,9 @@ int tq_generate_chat_text(tq_model_t* model, int piece_len = (int)strlen(piece ? piece : ""); if (config->on_token && piece) config->on_token(piece, config->user_data); + /* The chat_accum filter may have detected an end marker + * spanning multiple tokens — break before forwarding more. 
*/ + if (accum.stop_requested) break; if (output && piece && output_pos + piece_len < output_size - 1) { memcpy(output + output_pos, piece, piece_len); output_pos += piece_len; @@ -16100,6 +16298,11 @@ int tq_generate_chat_text(tq_model_t* model, output, output_size); } + /* Drain the marker filter's lookahead buffer before reading + * accum.buf for the cached_text update. Without this, the last + * ~32 bytes of clean output would be silently lost. */ + chat_accum_finish(&accum); + config->on_token = orig_cb; config->user_data = orig_ud; diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c index 0211a83..f3a69a4 100644 --- a/src/engine/tq_generate.c +++ b/src/engine/tq_generate.c @@ -834,36 +834,165 @@ int tq_generate_continue(tq_model_t* model, * exactly like tq_generate_continue. * ============================================================================ */ +/* ChatML / template-marker filter ---------------------------------------- + * + * The model can generate template tokens like `<|im_start|>`, `<|im_end|>`, + * ``, etc. as REGULAR text bytes (not special tokens). When + * that happens the BPE tokenizer fragments them across multiple tokens, + * and a per-token strstr check (like the existing `should_stop` logic) + * never matches. The user sees the marker leak into their stream. + * + * This filter holds the most recent CHAT_LOOKAHEAD bytes of generated + * text in `pending` and only flushes bytes that are guaranteed to NOT + * be the start of a marker. When a full marker is matched: + * - `<|im_start|>` at the very beginning of the response → header + * skip mode (drop until next '\n'). + * - any END marker → emit prefix, drop the rest, set stop_requested. + * + * Mirrored byte-for-byte with the version in quant.h. 
---------------------- */ +#define CHAT_PENDING_CAP 128 +#define CHAT_LOOKAHEAD 32 + typedef struct { char* buf; size_t len; size_t cap; - int tainted; /* 1 if accumulation ever failed → buf is incomplete */ + int tainted; + char pending[CHAT_PENDING_CAP]; + int pending_len; + int in_header; + int stop_requested; void (*user_cb)(const char*, void*); void* user_data; } chat_accum_t; -static void chat_accum_callback(const char* tok, void* u) { - chat_accum_t* ctx = (chat_accum_t*)u; - if (!tok) return; - /* Always pass through to the user's callback first — losing tokens - * from the user's stream because of an INTERNAL realloc failure is - * far worse than a stale cached_text on the next turn. */ - if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data); +static void chat_accum_emit(chat_accum_t* ctx, const char* p, int n) { + if (n <= 0) return; + char tmp[CHAT_PENDING_CAP + 1]; + if (n > CHAT_PENDING_CAP) n = CHAT_PENDING_CAP; + memcpy(tmp, p, (size_t)n); + tmp[n] = '\0'; + if (ctx->user_cb) ctx->user_cb(tmp, ctx->user_data); if (ctx->tainted) return; - size_t tlen = strlen(tok); - if (ctx->len + tlen + 1 > ctx->cap) { - size_t new_cap = (ctx->cap + tlen + 64) * 2; + if (ctx->len + (size_t)n + 1 > ctx->cap) { + size_t new_cap = (ctx->cap + (size_t)n + 64) * 2; char* nb = (char*)realloc(ctx->buf, new_cap); if (!nb) { ctx->tainted = 1; return; } - ctx->buf = nb; - ctx->cap = new_cap; + ctx->buf = nb; ctx->cap = new_cap; } - memcpy(ctx->buf + ctx->len, tok, tlen); - ctx->len += tlen; + memcpy(ctx->buf + ctx->len, tmp, (size_t)n); + ctx->len += (size_t)n; ctx->buf[ctx->len] = '\0'; } +static void chat_accum_drop(chat_accum_t* ctx, int n) { + if (n <= 0) return; + if (n > ctx->pending_len) n = ctx->pending_len; + memmove(ctx->pending, ctx->pending + n, + (size_t)(ctx->pending_len - n)); + ctx->pending_len -= n; +} + +static int chat_find_marker(const char* h, int hlen, const char* m) { + int mlen = (int)strlen(m); + if (hlen < mlen) return -1; + for (int p = 0; p + mlen <= 
hlen; p++) { + if (h[p] == m[0] && memcmp(h + p, m, (size_t)mlen) == 0) return p; + } + return -1; +} + +static const char* const CHAT_END_MARKERS[] = { + "<|im_end|>", "<|eot_id|>", "</s>", "<|endoftext|>", + "<|im_start|>", "<|start_header_id|>", "<|eom_id|>", + NULL, +}; + +static void chat_accum_callback(const char* tok, void* u) { + chat_accum_t* ctx = (chat_accum_t*)u; + if (!tok || ctx->stop_requested) return; + int tlen = (int)strlen(tok); + if (tlen == 0) return; + + if (ctx->pending_len + tlen > CHAT_PENDING_CAP) { + int emit = ctx->pending_len - CHAT_LOOKAHEAD; + if (emit > 0) { + if (!ctx->in_header) chat_accum_emit(ctx, ctx->pending, emit); + chat_accum_drop(ctx, emit); + } + } + if (tlen > CHAT_PENDING_CAP) { + if (!ctx->in_header) { + chat_accum_emit(ctx, ctx->pending, ctx->pending_len); + chat_accum_emit(ctx, tok, tlen); + } + ctx->pending_len = 0; + return; + } + memcpy(ctx->pending + ctx->pending_len, tok, (size_t)tlen); + ctx->pending_len += tlen; + + int progress = 1; + while (progress) { + progress = 0; + if (ctx->in_header) { + int nl = -1; + for (int i = 0; i < ctx->pending_len; i++) { + if (ctx->pending[i] == '\n') { nl = i; break; } + } + if (nl >= 0) { + chat_accum_drop(ctx, nl + 1); + ctx->in_header = 0; + progress = 1; + } else { + ctx->pending_len = 0; + return; + } + } + int em_pos = -1; + const char* em_str = NULL; + for (int i = 0; CHAT_END_MARKERS[i]; i++) { + int p = chat_find_marker(ctx->pending, ctx->pending_len, + CHAT_END_MARKERS[i]); + if (p >= 0 && (em_pos < 0 || p < em_pos)) { + em_pos = p; em_str = CHAT_END_MARKERS[i]; + } + } + if (em_pos >= 0) { + if (em_pos == 0 && ctx->len == 0 && em_str && + strcmp(em_str, "<|im_start|>") == 0) { + chat_accum_drop(ctx, 12); + ctx->in_header = 1; + progress = 1; + continue; + } + if (em_pos > 0) { + chat_accum_emit(ctx, ctx->pending, em_pos); + } + ctx->pending_len = 0; + ctx->stop_requested = 1; + return; + } + } + + if (!ctx->in_header && ctx->pending_len > CHAT_LOOKAHEAD) { + int emit = 
ctx->pending_len - CHAT_LOOKAHEAD; + chat_accum_emit(ctx, ctx->pending, emit); + chat_accum_drop(ctx, emit); + } +} + +static void chat_accum_finish(chat_accum_t* ctx) { + if (ctx->in_header) { + ctx->pending_len = 0; + return; + } + if (ctx->pending_len > 0) { + chat_accum_emit(ctx, ctx->pending, ctx->pending_len); + ctx->pending_len = 0; + } +} + int tq_generate_chat_text(tq_model_t* model, tq_tokenizer_t* tokenizer, tq_state_t* state, @@ -905,9 +1034,10 @@ int tq_generate_chat_text(tq_model_t* model, /* Wrap user callback to capture generated text into a buffer for the * next call's cached_text update. */ - chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0, .tainted = 0, - .user_cb = config->on_token, - .user_data = config->user_data }; + chat_accum_t accum; + memset(&accum, 0, sizeof(accum)); + accum.user_cb = config->on_token; + accum.user_data = config->user_data; void (*orig_cb)(const char*, void*) = config->on_token; void* orig_ud = config->user_data; config->on_token = chat_accum_callback; @@ -1039,6 +1169,9 @@ int tq_generate_chat_text(tq_model_t* model, int piece_len = (int)strlen(piece ? piece : ""); if (config->on_token && piece) config->on_token(piece, config->user_data); + /* The chat_accum filter may have detected an end marker + * spanning multiple tokens — break before forwarding more. */ + if (accum.stop_requested) break; if (output && piece && output_pos + piece_len < output_size - 1) { memcpy(output + output_pos, piece, piece_len); output_pos += piece_len; @@ -1088,6 +1221,11 @@ int tq_generate_chat_text(tq_model_t* model, output, output_size); } + /* Drain the marker filter's lookahead buffer before reading + * accum.buf for the cached_text update. Without this, the last + * ~32 bytes of clean output would be silently lost.
*/ + chat_accum_finish(&accum); + /* Restore the original callback before returning to caller */ config->on_token = orig_cb; config->user_data = orig_ud; diff --git a/wasm/quant.wasm b/wasm/quant.wasm index f018484..477218d 100755 Binary files a/wasm/quant.wasm and b/wasm/quant.wasm differ