diff --git a/bindings/python/quantcpp/__init__.py b/bindings/python/quantcpp/__init__.py index e9559ef..bb8061a 100644 --- a/bindings/python/quantcpp/__init__.py +++ b/bindings/python/quantcpp/__init__.py @@ -4,11 +4,18 @@ Quick start: from quantcpp import Model - m = Model.from_pretrained("Llama-3.2-1B") + m = Model.from_pretrained("SmolLM2-1.7B") print(m.ask("What is gravity?")) -Note: SmolLM2-135M downloads faster but produces low-quality output. -Use Llama-3.2-1B (~750 MB, one-time download) for good results. +Model selection guide: + SmolLM2-1.7B (1.7 GB, vocab 49K) — recommended. ~12 tok/s on Apple M3. + Llama-3.2-1B (750 MB, vocab 128K) — smaller download but slower + due to large vocab (~2 tok/s on M3). + SmolLM2-135M (138 MB, vocab 49K) — demo only, low-quality output. + +Larger vocab = slower lm_head matmul → smaller params with smaller vocab +often beats larger params with larger vocab. See docs/supported_models.md +for the architecture support matrix. """ try: @@ -53,17 +60,37 @@ class ChatContextOverflow(RuntimeError): Path.home() / ".cache" / "quantcpp")) # name → (HuggingFace repo, filename, approx size in MB) +# Note: download URL is constructed as +# https://huggingface.co/{repo}/resolve/main/{filename} +# Verify both fields against the actual HuggingFace listing before +# adding new entries — there is no integrity check at runtime. _MODEL_REGISTRY = { + # 138 MB demo model. Tokenizer + arch are llama-compatible but the + # model is too small to produce coherent output for general chat. + # Listed only so users can verify the install/load path quickly. "SmolLM2-135M": ( "Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct", "smollm2-135m-instruct-q8_0.gguf", 135, ), + # Recommended default for first-time users on Apple Silicon / typical + # laptops. vocab 49K keeps the lm_head matmul small, so even on a + # mid-range M-series chip we measure ~12 tok/s — comfortable for + # interactive chat. 
Same llama arch family as SmolLM2-135M, so it + # exercises the most-tested code path. + "SmolLM2-1.7B": ( + "bartowski/SmolLM2-1.7B-Instruct-GGUF", + "SmolLM2-1.7B-Instruct-Q8_0.gguf", + 1700, + ), "Qwen3.5-0.8B": ( "unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf", 508, ), + # Smaller download than SmolLM2-1.7B but slower at inference time + # because of the 128K Llama-3 vocab (~5x slower lm_head matmul on M3). + # Kept in the registry for users who specifically want a Llama model. "Llama-3.2-1B": ( "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF", "llama-3.2-1b-instruct-q4_k_m.gguf", @@ -170,7 +197,7 @@ class Model: Examples -------- - >>> m = Model.from_pretrained("SmolLM2-135M") + >>> m = Model.from_pretrained("SmolLM2-1.7B") >>> m.ask("What is gravity?") 'Gravity is a force that attracts ...' diff --git a/bindings/python/quantcpp/cli.py b/bindings/python/quantcpp/cli.py index 830204f..8a5fe73 100644 --- a/bindings/python/quantcpp/cli.py +++ b/bindings/python/quantcpp/cli.py @@ -18,9 +18,13 @@ import json -# Ollama-style short aliases → canonical _MODEL_REGISTRY keys +# Ollama-style short aliases → canonical _MODEL_REGISTRY keys. +# Plain "smollm2" without a size suffix points at the 1.7B model — that's +# the recommended default. Users who explicitly want the 135M demo model +# need to ask for it by full name. MODEL_ALIASES = { - "smollm2": "SmolLM2-135M", + "smollm2": "SmolLM2-1.7B", + "smollm2:1.7b": "SmolLM2-1.7B", "smollm2:135m": "SmolLM2-135M", "qwen3.5": "Qwen3.5-0.8B", "qwen3.5:0.8b": "Qwen3.5-0.8B", @@ -329,8 +333,13 @@ def cmd_client(args): def cmd_chat_default(args): - """Backwards-compatible default: auto-download Llama-3.2-1B and chat.""" - args.model = args.model or "Llama-3.2-1B" + """Backwards-compatible default: auto-download SmolLM2-1.7B and chat. 
+ + Default switched from Llama-3.2-1B to SmolLM2-1.7B (2026-04-12) after + user feedback that Llama-3.2-1B's 128K vocab makes it ~5x slower at + interactive chat than SmolLM2-1.7B's 49K vocab on Apple Silicon. + """ + args.model = args.model or "SmolLM2-1.7B" args.threads = getattr(args, "threads", 4) args.max_tokens = getattr(args, "max_tokens", 256) args.temperature = getattr(args, "temperature", 0.7) @@ -354,19 +363,19 @@ def main(): client PROMPT Send a request to a running serve (default: SSE streaming) examples: - quantcpp pull llama3.2:1b + quantcpp pull smollm2 # recommended: small vocab → fast quantcpp list - quantcpp run llama3.2:1b - quantcpp run llama3.2:1b "What is gravity?" - quantcpp serve llama3.2:1b --port 8080 + quantcpp run smollm2 + quantcpp run smollm2 "What is gravity?" + quantcpp serve smollm2 --port 8080 quantcpp client "What is gravity?" # streams from :8080 quantcpp client "Hi" --url http://localhost:8081 quantcpp client "Hi" --no-stream # single JSON response backwards-compat (no subcommand): - quantcpp # default chat with Llama-3.2-1B + quantcpp # default chat with SmolLM2-1.7B quantcpp "What is gravity?" # one-shot - quantcpp --model SmolLM2-135M # different model + quantcpp --model llama3.2:1b # different model """, ) diff --git a/docs/feedback/2026-04-12_0900.md b/docs/feedback/2026-04-12_0900.md new file mode 100644 index 0000000..c925007 --- /dev/null +++ b/docs/feedback/2026-04-12_0900.md @@ -0,0 +1,195 @@ +# quant.cpp User Feedback — First-Time Setup & Usage Experience + +**Date**: 2026-04-12 +**Environment**: macOS (Apple M3, 8-core CPU, 10-core GPU, 16GB Unified Memory) +**Version tested**: v0.10.1 → v0.12.0 (pip) + latest main (source build) +**Tested by**: End-user (developer, first-time quant.cpp user) + +--- + +## Summary + +pip install부터 `quantcpp serve`, Metal GPU 빌드, 채팅 웹 UI 연동, 다양한 모델 비교까지의 전 과정을 체험했습니다. 전반적으로 "설치 → 모델 다운로드 → 추론"까지의 흐름은 매우 간결했으나, 모델 호환성과 속도 면에서 개선점이 발견되었습니다. + +--- + +## 1. 
좋았던 점 + +### 1.1 설치가 매우 간단 +- `pip install quantcpp` 한 줄로 설치 완료. 의존성 zero. +- `Model.from_pretrained("Llama-3.2-1B")`으로 모델 자동 다운로드 + 캐시. 매우 편리. + +### 1.2 OpenAI 호환 API 서버 +- `quantcpp serve llama3.2:1b --port 8080` 한 줄로 서버 기동. +- `/v1/chat/completions` 엔드포인트가 OpenAI SDK와 호환되어 기존 코드 재사용 가능. +- SSE 스트리밍(`stream: true`) 정상 동작. +- CORS 헤더 (`Access-Control-Allow-Origin: *`) 기본 포함 — 프론트엔드 연동 즉시 가능. + +### 1.3 v0.12.0의 CLI 추가 +- `quantcpp "What is gravity?"` 한 줄 질문이 가능해져 체험 진입장벽이 크게 낮아짐. +- `quantcpp` (인터랙티브 모드)도 직관적. + +### 1.4 KV cache reuse (최신 main) +- 연속 대화 시 두 번째 요청부터 prefill이 생략되어 응답 시간이 ~50% 단축됨. +- 첫 요청 27초 → 두 번째 요청 14초 (Llama-3.2-1B 기준). + +### 1.5 Metal GPU 자동 감지 +- `TQ_BUILD_METAL=ON`으로 빌드하면 Apple Silicon GPU를 자동 감지하여 활성화. +- 별도 설정 없이 matmul 배치 디스패치가 Metal로 전환됨. + +### 1.6 SmolLM2-1.7B에서의 우수한 성능 +- vocab size가 작은 모델(49K)에서 ~12.5 tok/s 달성. 실시간 대화 가능 수준. +- 출력 품질도 깨끗하고 정확함 (예: "The capital of South Korea is Seoul."). + +--- + +## 2. 개선이 필요한 점 + +### 2.1 pip 패키지에서 CLI가 누락 (v0.10.1) +- **문제**: PyPI v0.10.1에는 `quantcpp` CLI entry point가 없었음. `zsh: command not found: quantcpp`. +- **해결**: v0.11.0부터 `cli.py` + entry point 추가로 해결됨. +- **제안**: PyPI에 최신 버전을 빠르게 배포하면 첫 경험이 크게 개선될 것. + +### 2.2 `quantcpp serve`에 quant-server 바이너리 필요 +- **문제**: `pip install quantcpp` 후 `quantcpp serve`를 실행하면 `quant-server binary not found` 에러. +- 사용자가 직접 CMake로 `TQ_BUILD_SERVER=ON` 빌드 후 PATH에 복사해야 함. +- **제안**: pip 패키지에 서버 바이너리를 포함하거나, 순수 Python fallback 서버를 제공. + +### 2.3 Llama-3.2-1B의 극심한 느린 속도 +- **문제**: Llama-3.2-1B (Q4_K_M)가 Apple M3에서 ~2.3 tok/s로 매우 느림. + - 60토큰 생성에 ~27초, 200토큰에 ~67초 소요. + - 대화형 사용이 사실상 불가능한 수준. +- **원인 분석**: vocab size 128,256이 병목. 매 토큰마다 128K 차원의 output projection 필요. +- **대비**: 동일 환경에서 SmolLM2-1.7B (Q8, vocab 49K)는 ~12.5 tok/s로 5배 빠름. +- **제안**: + - 기본 추천 모델을 SmolLM2-1.7B로 변경 검토. + - 또는 모델 선택 가이드에 "vocab size가 클수록 느려진다"는 안내 추가. + +### 2.4 SmolLM2-135M의 출력 품질 문제 +- **문제**: SmolLM2-135M은 속도는 빠르지만(0.3초) 출력이 HTML 쓰레기 텍스트. 
+- **제안**: 135M 모델은 "quantization 데모용"으로만 안내하고, 추론 품질 기대를 낮추는 문구 추가. + +### 2.5 Gemma-4-E2B 호환성 문제 +- **문제**: gemma-4-E2B-it-Q4_K_M.gguf 로딩은 성공하나, 추론 출력이 완전히 깨짐 (다국어 쓰레기 토큰). +- 서버 로그에는 정상 로딩으로 표시되어 사용자가 원인을 파악하기 어려움. +- **제안**: 지원되는 모델/아키텍처 목록을 명시하고, 미지원 모델 로딩 시 경고 표시. + +### 2.6 Phi-3.5-mini-instruct 아키텍처 미지원 (신규) +- **문제**: `Phi-3.5-mini-instruct-Q8_0.gguf` (3.9GB) 로딩은 성공하나, attention 레이어 매핑 실패. + - 서버 로그: `loaded 32 layers (0 self_attn)` — self_attn이 0으로 인식됨. + - 출력: 완전한 쓰레기 토큰 (`uffrasspkeryensonisatcreteBUG...`). + - 속도 자체는 0.85초/80토큰으로 극도로 빠름 (vocab 32K 효과). +- **영향**: Phi-3/Phi-3.5는 vocab 32K로 속도 면에서 최적의 모델이나 사용 불가. +- **제안**: + - Phi-3 (`phi3`) 아키텍처의 attention 레이어 매핑 지원 추가. + - 이 모델이 지원되면 "속도 + 품질" 모두에서 최적의 추천 모델이 될 수 있음. + - `self_attn=0`으로 감지된 경우 사용자에게 경고 메시지 표시 필요. + +### 2.7 Qwen3.5-0.8B 출력 품질 문제 (신규) +- **문제**: Qwen3.5-0.8B (Q4_K_M) 서버 로딩은 성공하나, 출력이 완전히 깨짐. + - DeltaNet hybrid 아키텍처 특성으로 인한 호환성 문제 추정. + - 33초/60토큰으로 속도도 느림 (vocab 248K). +- **제안**: Qwen 계열의 지원 상태를 문서에 명시. + +### 2.8 Metal GPU 가속 효과 제한적 (소형 모델) +- **문제**: 1B 모델에서 Metal GPU가 활성화되어 있으나 체감 속도 차이 없음. +- 소스 코드 주석에도 "Metal Q4 batch → 38 tok/s vs CPU Q4 → 95 tok/s (SmolLM2)" 명시. +- 소형 모델에서는 GPU 디스패치 오버헤드가 연산 시간보다 큼. +- **제안**: 모델 크기에 따라 CPU/GPU 자동 전환 로직 추가, 또는 `--device cpu/gpu` 옵션 제공. + +### 2.9 서버 단일 요청 처리 (동시성 없음) +- **문제**: 첫 번째 요청 처리 중 두 번째 요청이 완전히 블로킹됨. +- 채팅 UI에서 연속 질문 시 두 번째 질문이 3분+ 대기. +- **제안**: 요청 큐잉 + 처리 중 상태 반환 (429 or retry-after), 또는 요청 취소 API. + +### 2.10 chat template 잔여물 +- **문제**: 응답에 `<|im_start|>`, `<|im_end|>`, `assistant` 등 template 토큰이 노출됨. +- Llama-3.2-1B에서 특히 빈번. SmolLM2-1.7B에서는 `<|im_ennd|>` 정도로 경미. +- **제안**: 서버 측에서 stop tokens/template markers를 자동 strip. + +--- + +## 3. 
모델별 벤치마크 (Apple M3, 16GB RAM, Metal GPU 빌드) + +| Model | Quant | File Size | Vocab | tok/s | 60-token Time | Quality | Architecture | +|-------|-------|-----------|------:|------:|--------------:|---------|-------------| +| SmolLM2-135M | Q8 | 138MB | 49K | ~300 | 0.3s | Unusable (garbage) | llama | +| Qwen3.5-0.8B | Q4_K_M | 508MB | 248K | ~1.8 | ~33s | Broken (garbage) | qwen/deltanet | +| Llama-3.2-1B | Q4_K_M | 770MB | 128K | ~2.3 | ~27s | Usable (artifacts) | llama | +| **SmolLM2-1.7B** | **Q8** | **1.7GB** | **49K** | **~12.5** | **~5s** | **Good (clean)** | **llama** | +| Gemma-4-E2B | Q4_K_M | 2.9GB | 262K | ~10 | ~5s | Broken (compat) | gemma4 hybrid | +| Phi-3.5-mini | Q8 | 3.9GB | 32K | ~94* | ~0.85s* | Broken (0 self_attn) | phi3 | + +*\* Phi-3.5 속도는 attention이 작동하지 않아 실제 추론이 아님. 정상 지원 시 예상 속도.* + +### Key Insights + +1. **vocab size가 속도에 가장 큰 영향을 미침.** 파라미터 수보다 vocab size와 양자화 방식이 실사용 속도를 결정. + - SmolLM2-1.7B (vocab 49K): 12.5 tok/s + - Llama-3.2-1B (vocab 128K): 2.3 tok/s — 2.6x vocab → 5.4x 느림 +2. **Q8이 Q4보다 빠를 수 있음.** Q4의 디퀀타이즈 오버헤드가 Q8보다 크며, NEON SIMD에서 Q8이 더 효율적. +3. **llama 아키텍처만 안정적으로 동작.** phi3, gemma4, qwen/deltanet 아키텍처는 로딩은 되지만 추론이 깨짐. +4. **Phi-3.5가 지원되면 게임 체인저.** vocab 32K + 3.8B params로 "속도 + 품질" 최적 조합 가능. + +--- + +## 4. 아키텍처 호환성 매트릭스 (신규) + +| Architecture | GGUF Load | Tokenizer | Attention | Inference | Status | +|-------------|-----------|-----------|-----------|-----------|--------| +| llama (SmolLM2, Llama) | OK | OK | OK | OK | **Fully supported** | +| llama (Llama-3.2 GQA) | OK | OK | OK | Slow | Supported (vocab bottleneck) | +| phi3 (Phi-3.5-mini) | OK | OK | **FAIL (0 self_attn)** | Garbage | **Not supported** | +| gemma4 (Gemma-4-E2B) | OK | OK | Partial | Garbage | **Not supported** | +| qwen/deltanet (Qwen3.5) | OK | OK | Unknown | Garbage | **Not supported** | + +**제안**: 이 매트릭스를 README 또는 docs에 포함하여 사용자가 모델 선택 전에 호환성을 확인할 수 있게 해주세요. + +--- + +## 5. 
제안 우선순위 + +| Priority | Item | Impact | Effort | +|----------|------|--------|--------| +| **P0** | Phi-3 (`phi3`) 아키텍처 attention 매핑 지원 | 최적 모델 활용 가능 | Medium | +| **P0** | chat template 토큰 자동 strip | 출력 품질 즉시 개선 | Low | +| **P0** | 기본 추천 모델을 SmolLM2-1.7B로 변경 | 첫 경험 대폭 개선 | Low | +| P1 | pip 패키지에 서버 바이너리 포함 | 설치 → 서버 기동 원스텝 | Medium | +| P1 | 미지원 아키텍처 로딩 시 경고/에러 | 디버깅 시간 절약 | Low | +| P1 | `self_attn=0` 감지 시 경고 메시지 | 호환성 문제 즉시 인지 | Low | +| P2 | 서버 동시 요청 처리 (또는 큐잉) | 다중 사용자/연속 대화 | High | +| P2 | 아키텍처 호환성 매트릭스 문서화 | 모델 선택 가이드 | Low | +| P2 | vocab size 기반 CPU/GPU 자동 전환 | 최적 성능 자동 선택 | Medium | +| P3 | `--device cpu/gpu` CLI 옵션 | 사용자 제어권 | Low | + +--- + +## 6. 테스트 환경 상세 + +``` +Hardware: Apple M3, 8-core CPU, 10-core GPU, 16GB Unified Memory +OS: macOS 15 (Darwin 24.5.0) +Python: 3.14.3 +Compiler: AppleClang 16.0.0 +Xcode: installed (Metal shader compilation enabled) +quantcpp: v0.10.1 (pip) → v0.12.0 (pip) → latest main (source) +Build: cmake -DTQ_BUILD_METAL=ON -DTQ_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release +``` + +--- + +## 7. 테스트한 모델 파일 목록 + +``` +~/.cache/quantcpp/smollm2-135m-instruct-q8_0.gguf (138 MB) +~/.cache/quantcpp/Qwen3.5-0.8B-Q4_K_M.gguf (508 MB) +~/.cache/quantcpp/llama-3.2-1b-instruct-q4_k_m.gguf (770 MB) +~/.cache/quantcpp/Phi-3.5-mini-instruct-Q8_0.gguf (3.9 GB) — NEW +~/dev/projects/TurboQuant.cpp/models/SmolLM2-1.7B-Instruct-Q8_0.gguf (1.7 GB) +~/dev/projects/TurboQuant.cpp/models/gemma-4-E2B-it-Q4_K_M.gguf (2.9 GB) +``` + +--- + +*This feedback was generated based on a hands-on first-time user experience session on 2026-04-12.* +*Updated with Phi-3.5-mini-instruct and Qwen3.5-0.8B architecture compatibility findings.* diff --git a/docs/supported_models.md b/docs/supported_models.md new file mode 100644 index 0000000..5e9600f --- /dev/null +++ b/docs/supported_models.md @@ -0,0 +1,117 @@ +# Supported Models + +quant.cpp loads GGUF files from HuggingFace, but only some model +architectures are fully wired through the inference path. 
This page +tracks what works, what loads-but-fails, and how to pick a model. + +## TL;DR — Recommended models + +| Use case | Model | Why | +|---|---|---| +| **First-time install** | `SmolLM2-1.7B` (Q8) | Fastest end-to-end on a laptop. Vocab 49K keeps the lm_head matmul small (~12 tok/s on Apple M3). | +| Smaller download | `Llama-3.2-1B` (Q4_K_M) | 750 MB vs 1.7 GB, but ~5x slower at inference time due to 128K vocab. | +| Quick smoke test | `SmolLM2-135M` (Q8) | 138 MB download to verify the install path. Output quality is poor — not for real use. | + +```bash +# CLI quickstart +quantcpp run smollm2 # SmolLM2-1.7B (recommended) +quantcpp run smollm2:135m # SmolLM2-135M (smoke test only) +quantcpp run llama3.2:1b # smaller download, slower +``` + +```python +# Python quickstart +from quantcpp import Model +m = Model.from_pretrained("SmolLM2-1.7B") +print(m.ask("What is gravity?")) +``` + +## Architecture compatibility matrix + +| Architecture | GGUF Load | Tokenizer | Attention | Inference | Status | +|---|:---:|:---:|:---:|:---:|---| +| **llama** (SmolLM2, Llama-3.x, Mistral) | ✅ | ✅ | ✅ | ✅ | **Fully supported** | +| llama with 128K vocab (Llama-3.2-1B) | ✅ | ✅ | ✅ | slow | Supported, vocab is the bottleneck | +| **gemma** (Gemma 2) | ✅ | ✅ | ✅ | ✅ | Supported | +| **gemma3** | ✅ | ✅ | ✅ | ✅ | Supported with hybrid sliding-window attention | +| **gemma4** (Gemma-4-E2B / E4B) | ✅ | ✅ | ⚠️ | ⚠️ | Partial — some Q4_K_M variants produce garbage; report with file SHA256 | +| **qwen** / **qwen2** | ✅ | ✅ | ✅ | ✅ | Supported | +| **qwen3.5** (DeltaNet hybrid) | ✅ | ✅ | partial | ⚠️ | Partial — pure-attention layers work, DeltaNet hybrid still being validated | +| **phi3** / **phi3.5** (fused QKV) | ❌ | — | — | — | **Not supported** — uses `attn_qkv`, see "Why phi3 is hard" below | + +✅ = works · ⚠️ = loads but inference is unreliable · ❌ = load fails fast with a clear error (since 2026-04-12) + +If you load an unsupported architecture, the loader now prints: + +``` 
+tq_load_gguf: ERROR — model architecture 'phi3' is not supported. + Detected 0 self_attn layers and no DeltaNet weights. + This usually means the model uses fused QKV projection + (e.g., Phi-3 `attn_qkv`) which quant.cpp does not yet handle. + See docs/supported_models.md for the architecture support matrix. +``` + +…and `tq_load_gguf` returns NULL, so callers can fail-fast instead of +silently producing garbage tokens. + +## Why vocab size dominates speed + +quant.cpp generates one token at a time. Every token requires a +`lm_head` matmul of shape `[hidden_dim, vocab_size]`. For a typical 1B +model with `hidden_dim = 2048`: + +| Model | vocab_size | lm_head FLOPs/token | +|---|---:|---:| +| SmolLM2-1.7B | 49,152 | 100 M | +| Llama-3.2-1B | 128,256 | 263 M | + +Llama-3.2-1B has fewer parameters (1.0B vs 1.7B) but its lm_head matmul +is 2.6x bigger, and on CPU it dominates wall time. External user +benchmarks on Apple M3 (8-core CPU, 16 GB RAM): + +| Model | tok/s | 60-token latency | +|---|---:|---:| +| SmolLM2-1.7B (Q8, vocab 49K) | ~12.5 | ~5 s | +| Llama-3.2-1B (Q4_K_M, vocab 128K) | ~2.3 | ~27 s | + +**Take-away**: when picking a model for an embedded / laptop scenario, +vocab size is a better predictor of interactive latency than parameter +count. Pick the smallest vocab that produces output you're happy with. + +## Why phi3 is hard + +Phi-3 / Phi-3.5 uses a *fused* QKV projection: instead of three separate +tensors `attn_q.weight`, `attn_k.weight`, `attn_v.weight`, it ships one +`attn_qkv.weight` with all three projections concatenated along the +output dimension. + +quant.cpp's GGUF loader currently looks for the three-tensor layout +(`blk.N.attn_q.weight` etc.). When it loads a Phi-3 GGUF, none of those +names match → 0 self_attn layers detected → forward pass runs against +zero-initialized attention weights → garbage tokens. + +Adding Phi-3 support requires either: + +1. 
**Loader splits** `attn_qkv.weight` into the three views at load time + and writes them into the existing `wq`/`wk`/`wv` slots, OR +2. **Forward path** learns to dispatch a fused QKV matmul when the + loader detects the fused tensor. + +Option (1) is simpler but doubles the working set during load. Option +(2) is the right long-term answer. There's a tracking issue / spike in +progress; until then Phi-3 is the highest-value missing architecture for +quant.cpp's "speed + quality" target (Phi-3.5-mini has vocab 32K plus +3.8B params — it would beat both SmolLM2-1.7B and Llama-3.2-1B at +interactive use). + +## Reporting an unsupported model + +If you tried a model that's not in the matrix above, please open an +issue with: + +- The HuggingFace repo + filename +- The exact `tq_load_gguf:` log lines (including `architecture = '...'`) +- The first ~50 generated tokens (so we can see whether it's garbage, + partial garbage, or just wrong-language) + +Don't include the model file itself — link to the HuggingFace page. diff --git a/quant.h b/quant.h index 36cbbb2..136d1e4 100644 --- a/quant.h +++ b/quant.h @@ -11940,6 +11940,39 @@ tq_model_t* tq_load_gguf(const char* path) { n_attn_layers, c->n_layers); } + /* Hard-fail when neither standard self_attn (`blk.N.attn_q.weight`) nor + * DeltaNet (`blk.N.ssm_a`) was detected on any layer. The GGUF loaded + * fine but every layer is missing its attention block — typically + * because the architecture uses fused QKV (Phi-3 `attn_qkv`) or some + * other naming convention we don't recognize yet. + * + * Without this check the load returns successfully, the forward pass + * runs against zero-initialized attention weights, and the user gets + * pages of garbage tokens with no clear error to debug. The previous + * behavior was reported by an external user (2026-04-12 feedback) as + * the worst part of the first-time experience: "loaded 32 layers + * (0 self_attn)" looked like a success log. 
+ * + * Listed architectures that hit this path: + * - phi3 / phi3.5 (uses fused `blk.N.attn_qkv.weight`) + * - any future fused-QKV architecture we haven't ported yet + * + * Hybrid models with at least ONE self_attn layer (e.g., Qwen3.5 + * DeltaNet) are NOT affected — they hit the branch above and proceed. */ + if (n_attn_layers == 0 && c->delta_n_heads == 0) { + fprintf(stderr, + "tq_load_gguf: ERROR — model architecture '%s' is not supported.\n" + " Detected 0 self_attn layers and no DeltaNet weights.\n" + " This usually means the model uses fused QKV projection\n" + " (e.g., Phi-3 `attn_qkv`) which quant.cpp does not yet handle.\n" + " See docs/supported_models.md for the architecture support matrix.\n", + gguf->arch[0] ? gguf->arch : "unknown"); + /* tq_free_model owns gguf_ctx (set above at line 11463) and will + * close it as part of the teardown — do not double-close. */ + tq_free_model(model); + return NULL; + } + /* Set up layer_is_sliding for Gemma hybrid attention. * Detect from K tensor shape: sliding and full layers have different K output dims. * The MAJORITY of layers are sliding (e.g., 25/30 or 28/35). */ @@ -15874,36 +15907,197 @@ int tq_generate_continue(tq_model_t* model, * Pass cached_text_io == NULL to disable text-prefix tracking. * ============================================================================ */ +/* ChatML / template-marker filter ---------------------------------------- + * + * The model can generate template tokens like `<|im_start|>`, `<|im_end|>`, + * ``, etc. as REGULAR text bytes (not special tokens). When + * that happens the BPE tokenizer fragments them across multiple tokens, + * and a per-token strstr check (like the existing `should_stop` logic) + * never matches. The user sees the marker leak into their stream. + * + * This filter holds the most recent CHAT_LOOKAHEAD bytes of generated + * text in `pending` and only flushes bytes that are guaranteed to NOT + * be the start of a marker. 
When a full marker is matched: + * - `<|im_start|>` at the very beginning of the response → header + * skip mode (drop until next '\n'). The model is regurgitating the + * `<|im_start|>assistant\n` prefix that the prompt template already + * contains; we silently strip it. + * - any END marker → emit the prefix, drop the marker and everything + * after, set `stop_requested` so the generation loop can break. + * + * Cost: each token is delayed by ~CHAT_LOOKAHEAD bytes worth of stream. + * For typical English (3-4 chars/token), that's ~8-10 tokens of latency + * before the first token shows up. After that, streaming is steady-state + * with the same latency window. + * ----------------------------------------------------------------------- */ +#define CHAT_PENDING_CAP 128 +#define CHAT_LOOKAHEAD 32 + typedef struct { char* buf; size_t len; size_t cap; - int tainted; /* 1 if accumulation ever failed → buf is incomplete */ + int tainted; /* 1 if accumulation ever failed → buf incomplete */ + /* Lookahead filter state */ + char pending[CHAT_PENDING_CAP]; + int pending_len; + int in_header; /* skipping <|im_start|>...\n */ + int stop_requested; /* end marker hit → caller should break */ void (*user_cb)(const char*, void*); void* user_data; } chat_accum_t; -static void chat_accum_callback(const char* tok, void* u) { - chat_accum_t* ctx = (chat_accum_t*)u; - if (!tok) return; - /* Always pass through to the user's callback first — losing tokens - * from the user's stream because of an INTERNAL realloc failure is - * far worse than a stale cached_text on the next turn. */ - if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data); +/* Emit n bytes from `p` to BOTH the user callback and accum.buf. + * Used after the marker filter has decided the bytes are safe. */ +static void chat_accum_emit(chat_accum_t* ctx, const char* p, int n) { + if (n <= 0) return; + /* User callback gets a NUL-terminated copy. 
*/ + char tmp[CHAT_PENDING_CAP + 1]; + if (n > CHAT_PENDING_CAP) n = CHAT_PENDING_CAP; + memcpy(tmp, p, (size_t)n); + tmp[n] = '\0'; + if (ctx->user_cb) ctx->user_cb(tmp, ctx->user_data); if (ctx->tainted) return; - size_t tlen = strlen(tok); - if (ctx->len + tlen + 1 > ctx->cap) { - size_t new_cap = (ctx->cap + tlen + 64) * 2; + if (ctx->len + (size_t)n + 1 > ctx->cap) { + size_t new_cap = (ctx->cap + (size_t)n + 64) * 2; char* nb = (char*)realloc(ctx->buf, new_cap); if (!nb) { ctx->tainted = 1; return; } - ctx->buf = nb; - ctx->cap = new_cap; + ctx->buf = nb; ctx->cap = new_cap; } - memcpy(ctx->buf + ctx->len, tok, tlen); - ctx->len += tlen; + memcpy(ctx->buf + ctx->len, tmp, (size_t)n); + ctx->len += (size_t)n; ctx->buf[ctx->len] = '\0'; } +/* Drop n bytes from the front of pending. */ +static void chat_accum_drop(chat_accum_t* ctx, int n) { + if (n <= 0) return; + if (n > ctx->pending_len) n = ctx->pending_len; + memmove(ctx->pending, ctx->pending + n, + (size_t)(ctx->pending_len - n)); + ctx->pending_len -= n; +} + +/* Find first occurrence of marker `m` in haystack[0..hlen). -1 if none. */ +static int chat_find_marker(const char* h, int hlen, const char* m) { + int mlen = (int)strlen(m); + if (hlen < mlen) return -1; + for (int p = 0; p + mlen <= hlen; p++) { + if (h[p] == m[0] && memcmp(h + p, m, (size_t)mlen) == 0) return p; + } + return -1; +} + +/* Markers that signal "stop generating now". <|im_start|> is included + * because if the model emits it MID-response (after generating real + * content), it's hallucinating a new chat turn and we should stop. */ +static const char* const CHAT_END_MARKERS[] = { + "<|im_end|>", "<|eot_id|>", "", "<|endoftext|>", + "<|im_start|>", "<|start_header_id|>", "<|eom_id|>", + NULL, +}; + +static void chat_accum_callback(const char* tok, void* u) { + chat_accum_t* ctx = (chat_accum_t*)u; + if (!tok || ctx->stop_requested) return; + int tlen = (int)strlen(tok); + if (tlen == 0) return; + + /* Make room. 
If pending would overflow, flush the safe prefix + * (everything but the last LOOKAHEAD bytes) first. */ + if (ctx->pending_len + tlen > CHAT_PENDING_CAP) { + int emit = ctx->pending_len - CHAT_LOOKAHEAD; + if (emit > 0) { + if (!ctx->in_header) chat_accum_emit(ctx, ctx->pending, emit); + chat_accum_drop(ctx, emit); + } + } + /* Pathological: token bigger than the whole pending buffer. + * Emit pending + token raw and bail (no marker scan). */ + if (tlen > CHAT_PENDING_CAP) { + if (!ctx->in_header) { + chat_accum_emit(ctx, ctx->pending, ctx->pending_len); + chat_accum_emit(ctx, tok, tlen); + } + ctx->pending_len = 0; + return; + } + memcpy(ctx->pending + ctx->pending_len, tok, (size_t)tlen); + ctx->pending_len += tlen; + + /* State machine: drain pending as far as possible. */ + int progress = 1; + while (progress) { + progress = 0; + if (ctx->in_header) { + int nl = -1; + for (int i = 0; i < ctx->pending_len; i++) { + if (ctx->pending[i] == '\n') { nl = i; break; } + } + if (nl >= 0) { + chat_accum_drop(ctx, nl + 1); + ctx->in_header = 0; + progress = 1; + } else { + /* No newline yet — drop everything (it's all in header) */ + ctx->pending_len = 0; + return; + } + } + /* Scan for the EARLIEST end marker in pending. */ + int em_pos = -1; + const char* em_str = NULL; + for (int i = 0; CHAT_END_MARKERS[i]; i++) { + int p = chat_find_marker(ctx->pending, ctx->pending_len, + CHAT_END_MARKERS[i]); + if (p >= 0 && (em_pos < 0 || p < em_pos)) { + em_pos = p; em_str = CHAT_END_MARKERS[i]; + } + } + if (em_pos >= 0) { + /* Special case: <|im_start|> at the very start of the + * response → strip the header (don't stop). The model is + * echoing the chat-template prefix. */ + if (em_pos == 0 && ctx->len == 0 && em_str && + strcmp(em_str, "<|im_start|>") == 0) { + chat_accum_drop(ctx, 12); /* len("<|im_start|>") */ + ctx->in_header = 1; + progress = 1; + continue; + } + /* Otherwise: emit clean prefix, discard rest, request stop. 
*/ + if (em_pos > 0) { + chat_accum_emit(ctx, ctx->pending, em_pos); + } + ctx->pending_len = 0; + ctx->stop_requested = 1; + return; + } + } + + /* Safe portion: keep the trailing LOOKAHEAD bytes (any in-flight + * marker is at most this long), flush the rest. */ + if (!ctx->in_header && ctx->pending_len > CHAT_LOOKAHEAD) { + int emit = ctx->pending_len - CHAT_LOOKAHEAD; + chat_accum_emit(ctx, ctx->pending, emit); + chat_accum_drop(ctx, emit); + } +} + +/* Generation finished — flush any leftover pending bytes. Called once + * before reading accum.buf for the cached_text update. */ +static void chat_accum_finish(chat_accum_t* ctx) { + if (ctx->in_header) { + /* Stuck mid-header (no '\n' arrived) → drop the rest. */ + ctx->pending_len = 0; + return; + } + if (ctx->pending_len > 0) { + chat_accum_emit(ctx, ctx->pending, ctx->pending_len); + ctx->pending_len = 0; + } +} + int tq_generate_chat_text(tq_model_t* model, tq_tokenizer_t* tokenizer, tq_state_t* state, @@ -15929,9 +16123,10 @@ int tq_generate_chat_text(tq_model_t* model, } } - chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0, .tainted = 0, - .user_cb = config->on_token, - .user_data = config->user_data }; + chat_accum_t accum; + memset(&accum, 0, sizeof(accum)); + accum.user_cb = config->on_token; + accum.user_data = config->user_data; void (*orig_cb)(const char*, void*) = config->on_token; void* orig_ud = config->user_data; config->on_token = chat_accum_callback; @@ -16052,6 +16247,9 @@ int tq_generate_chat_text(tq_model_t* model, int piece_len = (int)strlen(piece ? piece : ""); if (config->on_token && piece) config->on_token(piece, config->user_data); + /* The chat_accum filter may have detected an end marker + * spanning multiple tokens — break before forwarding more. 
*/ + if (accum.stop_requested) break; if (output && piece && output_pos + piece_len < output_size - 1) { memcpy(output + output_pos, piece, piece_len); output_pos += piece_len; @@ -16100,6 +16298,11 @@ int tq_generate_chat_text(tq_model_t* model, output, output_size); } + /* Drain the marker filter's lookahead buffer before reading + * accum.buf for the cached_text update. Without this, the last + * ~32 bytes of clean output would be silently lost. */ + chat_accum_finish(&accum); + config->on_token = orig_cb; config->user_data = orig_ud; diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c index 0211a83..f3a69a4 100644 --- a/src/engine/tq_generate.c +++ b/src/engine/tq_generate.c @@ -834,36 +834,165 @@ int tq_generate_continue(tq_model_t* model, * exactly like tq_generate_continue. * ============================================================================ */ +/* ChatML / template-marker filter ---------------------------------------- + * + * The model can generate template tokens like `<|im_start|>`, `<|im_end|>`, + * ``, etc. as REGULAR text bytes (not special tokens). When + * that happens the BPE tokenizer fragments them across multiple tokens, + * and a per-token strstr check (like the existing `should_stop` logic) + * never matches. The user sees the marker leak into their stream. + * + * This filter holds the most recent CHAT_LOOKAHEAD bytes of generated + * text in `pending` and only flushes bytes that are guaranteed to NOT + * be the start of a marker. When a full marker is matched: + * - `<|im_start|>` at the very beginning of the response → header + * skip mode (drop until next '\n'). + * - any END marker → emit prefix, drop the rest, set stop_requested. + * + * Mirrored byte-for-byte with the version in quant.h. 
---------------------- */ +#define CHAT_PENDING_CAP 128 +#define CHAT_LOOKAHEAD 32 + typedef struct { char* buf; size_t len; size_t cap; - int tainted; /* 1 if accumulation ever failed → buf is incomplete */ + int tainted; + char pending[CHAT_PENDING_CAP]; + int pending_len; + int in_header; + int stop_requested; void (*user_cb)(const char*, void*); void* user_data; } chat_accum_t; -static void chat_accum_callback(const char* tok, void* u) { - chat_accum_t* ctx = (chat_accum_t*)u; - if (!tok) return; - /* Always pass through to the user's callback first — losing tokens - * from the user's stream because of an INTERNAL realloc failure is - * far worse than a stale cached_text on the next turn. */ - if (ctx->user_cb) ctx->user_cb(tok, ctx->user_data); +static void chat_accum_emit(chat_accum_t* ctx, const char* p, int n) { + if (n <= 0) return; + char tmp[CHAT_PENDING_CAP + 1]; + if (n > CHAT_PENDING_CAP) n = CHAT_PENDING_CAP; + memcpy(tmp, p, (size_t)n); + tmp[n] = '\0'; + if (ctx->user_cb) ctx->user_cb(tmp, ctx->user_data); if (ctx->tainted) return; - size_t tlen = strlen(tok); - if (ctx->len + tlen + 1 > ctx->cap) { - size_t new_cap = (ctx->cap + tlen + 64) * 2; + if (ctx->len + (size_t)n + 1 > ctx->cap) { + size_t new_cap = (ctx->cap + (size_t)n + 64) * 2; char* nb = (char*)realloc(ctx->buf, new_cap); if (!nb) { ctx->tainted = 1; return; } - ctx->buf = nb; - ctx->cap = new_cap; + ctx->buf = nb; ctx->cap = new_cap; } - memcpy(ctx->buf + ctx->len, tok, tlen); - ctx->len += tlen; + memcpy(ctx->buf + ctx->len, tmp, (size_t)n); + ctx->len += (size_t)n; ctx->buf[ctx->len] = '\0'; } +static void chat_accum_drop(chat_accum_t* ctx, int n) { + if (n <= 0) return; + if (n > ctx->pending_len) n = ctx->pending_len; + memmove(ctx->pending, ctx->pending + n, + (size_t)(ctx->pending_len - n)); + ctx->pending_len -= n; +} + +static int chat_find_marker(const char* h, int hlen, const char* m) { + int mlen = (int)strlen(m); + if (hlen < mlen) return -1; + for (int p = 0; p + mlen <= 
hlen; p++) { + if (h[p] == m[0] && memcmp(h + p, m, (size_t)mlen) == 0) return p; + } + return -1; +} + +static const char* const CHAT_END_MARKERS[] = { + "<|im_end|>", "<|eot_id|>", "</s>", "<|endoftext|>", + "<|im_start|>", "<|start_header_id|>", "<|eom_id|>", + NULL, +}; + +static void chat_accum_callback(const char* tok, void* u) { + chat_accum_t* ctx = (chat_accum_t*)u; + if (!tok || ctx->stop_requested) return; + int tlen = (int)strlen(tok); + if (tlen == 0) return; + + if (ctx->pending_len + tlen > CHAT_PENDING_CAP) { + int emit = ctx->pending_len - CHAT_LOOKAHEAD; + if (emit > 0) { + if (!ctx->in_header) chat_accum_emit(ctx, ctx->pending, emit); + chat_accum_drop(ctx, emit); + } + } + if (tlen > CHAT_PENDING_CAP) { + if (!ctx->in_header) { + chat_accum_emit(ctx, ctx->pending, ctx->pending_len); + chat_accum_emit(ctx, tok, tlen); + } + ctx->pending_len = 0; + return; + } + memcpy(ctx->pending + ctx->pending_len, tok, (size_t)tlen); + ctx->pending_len += tlen; + + int progress = 1; + while (progress) { + progress = 0; + if (ctx->in_header) { + int nl = -1; + for (int i = 0; i < ctx->pending_len; i++) { + if (ctx->pending[i] == '\n') { nl = i; break; } + } + if (nl >= 0) { + chat_accum_drop(ctx, nl + 1); + ctx->in_header = 0; + progress = 1; + } else { + ctx->pending_len = 0; + return; + } + } + int em_pos = -1; + const char* em_str = NULL; + for (int i = 0; CHAT_END_MARKERS[i]; i++) { + int p = chat_find_marker(ctx->pending, ctx->pending_len, + CHAT_END_MARKERS[i]); + if (p >= 0 && (em_pos < 0 || p < em_pos)) { + em_pos = p; em_str = CHAT_END_MARKERS[i]; + } + } + if (em_pos >= 0) { + if (em_pos == 0 && ctx->len == 0 && em_str && + strcmp(em_str, "<|im_start|>") == 0) { + chat_accum_drop(ctx, 12); + ctx->in_header = 1; + progress = 1; + continue; + } + if (em_pos > 0) { + chat_accum_emit(ctx, ctx->pending, em_pos); + } + ctx->pending_len = 0; + ctx->stop_requested = 1; + return; + } + } + + if (!ctx->in_header && ctx->pending_len > CHAT_LOOKAHEAD) { + int emit = 
ctx->pending_len - CHAT_LOOKAHEAD; + chat_accum_emit(ctx, ctx->pending, emit); + chat_accum_drop(ctx, emit); + } +} + +static void chat_accum_finish(chat_accum_t* ctx) { + if (ctx->in_header) { + ctx->pending_len = 0; + return; + } + if (ctx->pending_len > 0) { + chat_accum_emit(ctx, ctx->pending, ctx->pending_len); + ctx->pending_len = 0; + } +} + int tq_generate_chat_text(tq_model_t* model, tq_tokenizer_t* tokenizer, tq_state_t* state, @@ -905,9 +1034,10 @@ int tq_generate_chat_text(tq_model_t* model, /* Wrap user callback to capture generated text into a buffer for the * next call's cached_text update. */ - chat_accum_t accum = { .buf = NULL, .len = 0, .cap = 0, .tainted = 0, - .user_cb = config->on_token, - .user_data = config->user_data }; + chat_accum_t accum; + memset(&accum, 0, sizeof(accum)); + accum.user_cb = config->on_token; + accum.user_data = config->user_data; void (*orig_cb)(const char*, void*) = config->on_token; void* orig_ud = config->user_data; config->on_token = chat_accum_callback; @@ -1039,6 +1169,9 @@ int tq_generate_chat_text(tq_model_t* model, int piece_len = (int)strlen(piece ? piece : ""); if (config->on_token && piece) config->on_token(piece, config->user_data); + /* The chat_accum filter may have detected an end marker + * spanning multiple tokens — break before forwarding more. */ + if (accum.stop_requested) break; if (output && piece && output_pos + piece_len < output_size - 1) { memcpy(output + output_pos, piece, piece_len); output_pos += piece_len; @@ -1088,6 +1221,11 @@ int tq_generate_chat_text(tq_model_t* model, output, output_size); } + /* Drain the marker filter's lookahead buffer before reading + * accum.buf for the cached_text update. Without this, the last + * ~32 bytes of clean output would be silently lost.
*/ + chat_accum_finish(&accum); + /* Restore the original callback before returning to caller */ config->on_token = orig_cb; config->user_data = orig_ud; diff --git a/wasm/quant.wasm b/wasm/quant.wasm index f018484..477218d 100755 Binary files a/wasm/quant.wasm and b/wasm/quant.wasm differ