quant.cpp/bindings/python/quantcpp/cli.py at 1b7bec372fa7f29ae758febe5901d0f2062352ef · quantumaikr/quant.cpp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
"""
quantcpp CLI — chat with a local LLM in your terminal.

Ollama-style commands:
    quantcpp pull MODEL       Download a model from HuggingFace
    quantcpp list             List cached and available models
    quantcpp run MODEL [Q]    Chat with a model (auto-pulls if needed)
    quantcpp serve MODEL      Start OpenAI-compatible HTTP server

Backwards-compatible shortcut:
    quantcpp                  Auto-downloads Llama-3.2-1B, starts chat
    quantcpp "What is X?"     One-shot question with default model
    quantcpp --model NAME     Use a specific model
"""

import sys
import os
import json


# Ollama-style short aliases → canonical _MODEL_REGISTRY keys.
# Plain "smollm2" without a size suffix points at the 1.7B model — that's
# the recommended default. Users who explicitly want the 135M demo model
# need to ask for it by full name.
MODEL_ALIASES = {
    "smollm2":         "SmolLM2-1.7B",
    "smollm2:1.7b":    "SmolLM2-1.7B",
    "smollm2:135m":    "SmolLM2-135M",
    "qwen3.5":         "Qwen3.5-0.8B",
    "qwen3.5:0.8b":    "Qwen3.5-0.8B",
    "llama3.2":        "Llama-3.2-1B",
    "llama3.2:1b":     "Llama-3.2-1B",
    "phi3.5":          "Phi-3.5-mini",
    "phi3.5:mini":     "Phi-3.5-mini",
    "phi-3.5":         "Phi-3.5-mini",
    "phi-3.5-mini":    "Phi-3.5-mini",
}


def _resolve_name(name):
    """Resolve user input to canonical registry key or local path."""
    if name is None:
        return None
    if os.path.exists(name) and name.endswith(".gguf"):
        return name
    return MODEL_ALIASES.get(name.lower(), name)


def _registry():
    from quantcpp import _MODEL_REGISTRY, _CACHE_DIR
    return _MODEL_REGISTRY, _CACHE_DIR


def cmd_pull(args):
    """Download a model by alias or canonical name."""
    import quantcpp
    name = _resolve_name(args.model)

    if os.path.exists(name) and name.endswith(".gguf"):
        print(f"already local: {name}")
        return 0

    if name not in quantcpp._MODEL_REGISTRY:
        avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
        aliases = ", ".join(sorted(MODEL_ALIASES.keys()))
        print(f"unknown model: {args.model!r}", file=sys.stderr)
        print(f"  registry:  {avail}", file=sys.stderr)
        print(f"  aliases:   {aliases}", file=sys.stderr)
        return 1

    print(f"pulling {name}...", file=sys.stderr)
    try:
        path = quantcpp.download(name)
        size_mb = os.path.getsize(path) / (1024 * 1024)
        print(f"\u2713 {name} \u2192 {path} ({size_mb:.0f} MB)", file=sys.stderr)
        return 0
    except Exception as e:
        print(f"download failed: {e}", file=sys.stderr)
        return 1


def cmd_list(args):
    """List cached and available models."""
    registry, cache_dir = _registry()

    rows = []
    for name, (repo, filename, approx_mb) in sorted(registry.items()):
        path = cache_dir / filename
        if path.exists():
            size_mb = path.stat().st_size / (1024 * 1024)
            status = "cached"
            display_path = str(path)
        else:
            size_mb = approx_mb
            status = "remote"
            display_path = f"~{approx_mb} MB"
        alias = next((a for a, n in MODEL_ALIASES.items() if n == name and ":" in a), "")
        rows.append((status, name, alias, size_mb, display_path))

    if args.json_output:
        print(json.dumps([
            {"status": s, "name": n, "alias": a, "size_mb": round(sz, 1), "path": p}
            for (s, n, a, sz, p) in rows
        ], indent=2))
        return 0

    print(f"\n  Models  cache: {cache_dir}\n")
    print(f"  {'STATUS':<8} {'NAME':<16} {'ALIAS':<14} {'SIZE':>8}")
    print(f"  {'-'*8} {'-'*16} {'-'*14} {'-'*8}")
    for status, name, alias, size_mb, _ in rows:
        size_str = f"{size_mb:.0f} MB"
        print(f"  {status:<8} {name:<16} {alias:<14} {size_str:>8}")
    print()
    return 0


def _resolve_to_path(name_or_path):
    """Resolve alias/name to a local .gguf path, downloading if needed."""
    import quantcpp
    name = _resolve_name(name_or_path)

    if os.path.exists(name) and name.endswith(".gguf"):
        return name

    if name not in quantcpp._MODEL_REGISTRY:
        avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
        raise ValueError(
            f"unknown model: {name_or_path!r}. Available: {avail}"
        )

    repo, filename, _ = quantcpp._MODEL_REGISTRY[name]
    cached = quantcpp._CACHE_DIR / filename
    if cached.exists():
        return str(cached)

    print(f"model not cached \u2014 pulling {name}...", file=sys.stderr)
    return quantcpp.download(name)


def cmd_run(args):
    """Chat with a model (auto-pull if needed)."""
    try:
        model_path = _resolve_to_path(args.model)
    except ValueError as e:
        print(str(e), file=sys.stderr)
        return 1
    except Exception as e:
        print(f"pull failed: {e}", file=sys.stderr)
        return 1

    from quantcpp import Model
    print(f"loading {os.path.basename(model_path)}...", file=sys.stderr)
    m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature,
              n_threads=args.threads)

    if args.prompt:
        question = " ".join(args.prompt) if isinstance(args.prompt, list) else args.prompt
        for tok in m.generate(question):
            print(tok, end="", flush=True)
        print()
    else:
        from quantcpp import ChatContextOverflow
        print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
        # Multi-turn chat: accumulate history as ChatML so the model sees
        # prior turns. m.chat() reuses the KV cache via prefix-match, so
        # repeating the history is cheap (O(new tokens), not O(n^2)).
        # turns is a list of (user_msg, assistant_msg) pairs so we can
        # trim from the front when we hit context overflow.
        turns = []
        def _build_history(extra_user=None):
            parts = []
            for u, a in turns:
                parts.append(f"<|im_start|>user\n{u}<|im_end|>\n<|im_start|>assistant\n{a}<|im_end|>\n")
            if extra_user is not None:
                parts.append(f"<|im_start|>user\n{extra_user}<|im_end|>\n<|im_start|>assistant\n")
            return "".join(parts)

        try:
            while True:
                question = input("\nYou: ")
                if not question.strip():
                    continue
                print("AI: ", end="", flush=True)
                reply_buf = []
                # Retry loop: on context overflow, drop the oldest turn
                # and try again. Without this, the C side resets the KV
                # cache but Python's history still has the bloat, so
                # every subsequent turn would loop back into overflow.
                attempt = 0
                while True:
                    history = _build_history(extra_user=question)
                    try:
                        for tok in m.chat(history):
                            print(tok, end="", flush=True)
                            reply_buf.append(tok)
                        break
                    except ChatContextOverflow:
                        if not turns:
                            print("\n[chat] message alone exceeds context window — try a shorter question.",
                                  file=sys.stderr)
                            reply_buf = []  # nothing was emitted
                            break
                        dropped = turns.pop(0)
                        attempt += 1
                        print(f"\n[chat] context full \u2014 dropped oldest turn ({len(dropped[0])+len(dropped[1])} chars), retrying...",
                              file=sys.stderr)
                        # The session was already reset by the C side;
                        # retrying with the trimmed history will hit
                        # the slow path on this turn and the fast path
                        # again from the next turn onward.
                        if attempt > 8:
                            print("[chat] too many overflow retries, giving up on this turn.",
                                  file=sys.stderr)
                            reply_buf = []
                            break
                print()
                if reply_buf:
                    turns.append((question, "".join(reply_buf)))
        except (KeyboardInterrupt, EOFError):
            print("\nBye!", file=sys.stderr)

    m.close()
    return 0


def cmd_serve(args):
    """Start OpenAI-compatible HTTP server (requires quant-server binary)."""
    import shutil
    import subprocess

    try:
        model_path = _resolve_to_path(args.model)
    except Exception as e:
        print(f"error: {e}", file=sys.stderr)
        return 1

    binary = shutil.which("quant-server")
    if not binary:
        # Look in common build dirs relative to repo
        for guess in ("./build/quant-server", "./build_metal/quant-server"):
            if os.path.isfile(guess) and os.access(guess, os.X_OK):
                binary = guess
                break

    if not binary:
        print("quant-server binary not found.", file=sys.stderr)
        print("  Build with: cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
              file=sys.stderr)
        print("  Or install via your package manager.", file=sys.stderr)
        return 2

    cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
    print(f"quantcpp serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
    print("", file=sys.stderr)
    print("OpenAI-compatible endpoints:", file=sys.stderr)
    print(f"  POST http://localhost:{args.port}/v1/chat/completions", file=sys.stderr)
    print(f"  GET  http://localhost:{args.port}/v1/models", file=sys.stderr)
    print(f"  GET  http://localhost:{args.port}/health", file=sys.stderr)
    print("", file=sys.stderr)
    print("Streaming (SSE — token-by-token):", file=sys.stderr)
    print(f"  curl -N http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
    print("    -H 'Content-Type: application/json' \\", file=sys.stderr)
    print('    -d \'{"messages":[{"role":"user","content":"Hi"}],"stream":true}\'',
          file=sys.stderr)
    print("", file=sys.stderr)
    print("Non-streaming (single JSON response):", file=sys.stderr)
    print(f"  curl http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
    print("    -H 'Content-Type: application/json' \\", file=sys.stderr)
    print('    -d \'{"messages":[{"role":"user","content":"Hi"}]}\'',
          file=sys.stderr)
    print("", file=sys.stderr)
    print("OpenAI Python SDK works as-is:", file=sys.stderr)
    print(f"  client = OpenAI(base_url='http://localhost:{args.port}/v1', api_key='none')",
          file=sys.stderr)
    print("  client.chat.completions.create(model='quantcpp', messages=[...], stream=True)",
          file=sys.stderr)
    print("", file=sys.stderr)
    os.execvp(cmd[0], cmd)


def cmd_client(args):
    """Send a chat request to a running quantcpp serve endpoint.

    Default mode is streaming (SSE) — tokens print as they arrive.
    Use --no-stream for a single JSON response.
    """
    import json as _json
    import urllib.request

    url = args.url.rstrip("/") + "/v1/chat/completions"
    payload = {
        "model": args.model_name,
        "messages": [{"role": "user", "content": args.prompt}],
        "max_tokens": args.max_tokens,
        "temperature": args.temperature,
        "stream": not args.no_stream,
    }
    body = _json.dumps(payload).encode()
    req = urllib.request.Request(
        url, data=body,
        headers={
            "Content-Type": "application/json",
            "User-Agent": "quantcpp-client",
        },
    )

    try:
        with urllib.request.urlopen(req) as resp:
            if args.no_stream:
                data = _json.loads(resp.read())
                print(data["choices"][0]["message"]["content"])
                return 0

            # SSE stream — parse `data: {...}\n\n` chunks
            for line in resp:
                line = line.decode("utf-8", errors="replace").rstrip()
                if not line.startswith("data:"):
                    continue
                payload_str = line[5:].strip()
                if payload_str == "[DONE]":
                    break
                try:
                    chunk = _json.loads(payload_str)
                    delta = chunk["choices"][0]["delta"].get("content", "")
                    if delta:
                        print(delta, end="", flush=True)
                except Exception:
                    pass
            print()
            return 0
    except urllib.error.URLError as e:
        print(f"connection failed: {e}", file=sys.stderr)
        print(f"  Is the server running on {args.url}?", file=sys.stderr)
        print(f"  Start it with: quantcpp serve llama3.2:1b -p {args.url.rsplit(':', 1)[-1].rstrip('/')}",
              file=sys.stderr)
        return 1


def cmd_chat_default(args):
    """Backwards-compatible default: auto-download Phi-3.5-mini and chat.

    Default progression:
      Llama-3.2-1B → SmolLM2-1.7B (2026-04-12, vocab fix)
                   → Phi-3.5-mini (2026-04-12, after Phi-3 arch support
                     landed). Phi-3.5-mini has the smallest vocab in
                     the registry (32K) AND 3.8B params, giving the
                     best speed/quality combo we ship.
    """
    args.model = args.model or "Phi-3.5-mini"
    args.threads = getattr(args, "threads", 4)
    args.max_tokens = getattr(args, "max_tokens", 256)
    args.temperature = getattr(args, "temperature", 0.7)
    args.prompt = args.prompt or None
    return cmd_run(args)


def main():
    import argparse

    parser = argparse.ArgumentParser(
        prog="quantcpp",
        description="Chat with a local LLM. No API key, no GPU, no server.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
commands:
  pull MODEL            Download a model (e.g. llama3.2:1b)
  list                  List cached and available models
  run MODEL [PROMPT]    Chat with a model (auto-pulls if needed)
  serve MODEL           Start OpenAI-compatible HTTP server
  client PROMPT         Send a request to a running serve (default: SSE streaming)

examples:
  quantcpp pull phi-3.5-mini         # recommended default (32K vocab → fast)
  quantcpp list
  quantcpp run phi-3.5-mini
  quantcpp run phi-3.5-mini "What is gravity?"
  quantcpp serve phi-3.5-mini --port 8080
  quantcpp client "What is gravity?"                  # streams from :8080
  quantcpp client "Hi" --url http://localhost:8081
  quantcpp client "Hi" --no-stream                    # single JSON response

backwards-compat (no subcommand):
  quantcpp                          # default chat with Phi-3.5-mini
  quantcpp "What is gravity?"       # one-shot
  quantcpp --model smollm2          # lightweight alternative
  quantcpp --model llama3.2:1b      # smallest download
""",
    )

    sub = parser.add_subparsers(dest="command")

    # pull
    p_pull = sub.add_parser("pull", help="Download a model from HuggingFace")
    p_pull.add_argument("model", help="Model name or alias (e.g. llama3.2:1b)")

    # list
    p_list = sub.add_parser("list", help="List cached and available models")
    p_list.add_argument("--json", dest="json_output", action="store_true")

    # run
    p_run = sub.add_parser("run", help="Chat with a model (auto-pulls if needed)")
    p_run.add_argument("model", help="Model name, alias, or .gguf path")
    p_run.add_argument("prompt", nargs="*", default=None, help="Optional prompt")
    p_run.add_argument("-j", "--threads", type=int, default=4)
    p_run.add_argument("-n", "--max-tokens", type=int, default=256)
    p_run.add_argument("-t", "--temperature", type=float, default=0.7)

    # serve
    p_serve = sub.add_parser("serve", help="Start OpenAI-compatible HTTP server")
    p_serve.add_argument("model", help="Model name, alias, or .gguf path")
    p_serve.add_argument("-p", "--port", type=int, default=8080)
    p_serve.add_argument("-j", "--threads", type=int, default=4)

    # client
    p_client = sub.add_parser("client",
        help="Send a chat request to a running quantcpp serve endpoint")
    p_client.add_argument("prompt", help="Question to send")
    p_client.add_argument("--url", default="http://localhost:8080",
                          help="Server URL (default: http://localhost:8080)")
    p_client.add_argument("--model-name", "-m", default="quantcpp",
                          help="Model name in the request body (server ignores)")
    p_client.add_argument("-n", "--max-tokens", type=int, default=256)
    p_client.add_argument("-t", "--temperature", type=float, default=0.7)
    p_client.add_argument("--no-stream", action="store_true",
                          help="Disable SSE streaming (single JSON response)")

    # Backwards-compat: top-level args for direct chat
    parser.add_argument("prompt", nargs="*", default=None,
                        help="(default mode) question to ask")
    parser.add_argument("--model", "-m", default=None,
                        help="(default mode) model name or .gguf path")
    parser.add_argument("--max-tokens", "-n", type=int, default=256)
    parser.add_argument("--temperature", "-t", type=float, default=0.7)
    parser.add_argument("--threads", "-j", type=int, default=4)

    args = parser.parse_args()

    if args.command == "pull":
        return cmd_pull(args)
    if args.command == "list":
        return cmd_list(args)
    if args.command == "run":
        return cmd_run(args)
    if args.command == "serve":
        return cmd_serve(args)
    if args.command == "client":
        return cmd_client(args)

    # No subcommand → backwards-compat default chat
    return cmd_chat_default(args)


if __name__ == "__main__":
    sys.exit(main())