-
Notifications
You must be signed in to change notification settings - Fork 44
Expand file tree
/
Copy pathcli.py
More file actions
454 lines (391 loc) · 17.5 KB
/
cli.py
File metadata and controls
454 lines (391 loc) · 17.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
"""
quantcpp CLI — chat with a local LLM in your terminal.
Ollama-style commands:
quantcpp pull MODEL Download a model from HuggingFace
quantcpp list List cached and available models
quantcpp run MODEL [Q] Chat with a model (auto-pulls if needed)
quantcpp serve MODEL Start OpenAI-compatible HTTP server
Backwards-compatible shortcut:
quantcpp Auto-downloads Phi-3.5-mini, starts chat
quantcpp "What is X?" One-shot question with default model
quantcpp --model NAME Use a specific model
"""
import sys
import os
import json
# Short, Ollama-like spellings accepted on the command line, mapped to the
# canonical names used as _MODEL_REGISTRY keys.
#
# A bare "smollm2" (no size suffix) deliberately resolves to the 1.7B model;
# the 135M demo model is only reachable through "smollm2:135m" or its full
# canonical name.
#
# The table is grouped by canonical name. Insertion order is significant:
# cmd_list displays the first alias containing ":" for each model.
MODEL_ALIASES = {
    alias: canonical
    for canonical, aliases in (
        ("SmolLM2-1.7B", ("smollm2", "smollm2:1.7b")),
        ("SmolLM2-135M", ("smollm2:135m",)),
        ("Qwen3.5-0.8B", ("qwen3.5", "qwen3.5:0.8b")),
        ("Llama-3.2-1B", ("llama3.2", "llama3.2:1b")),
        ("Phi-3.5-mini", ("phi3.5", "phi3.5:mini", "phi-3.5", "phi-3.5-mini")),
    )
    for alias in aliases
}
def _resolve_name(name):
"""Resolve user input to canonical registry key or local path."""
if name is None:
return None
if os.path.exists(name) and name.endswith(".gguf"):
return name
return MODEL_ALIASES.get(name.lower(), name)
def _registry():
    """Return ``(_MODEL_REGISTRY, _CACHE_DIR)`` from the quantcpp package.

    The import happens inside the function, so quantcpp is only loaded when
    a command actually needs the registry.
    """
    import quantcpp

    return quantcpp._MODEL_REGISTRY, quantcpp._CACHE_DIR
def cmd_pull(args):
    """Fetch a model into the local cache, given an alias or canonical name.

    Reads ``args.model``. Returns 0 when the argument already points at a
    local ``.gguf`` file or the download succeeds, and 1 when the name is not
    in the registry or the download raises. The "already local" notice goes
    to stdout; everything else is written to stderr.
    """
    import quantcpp

    target = _resolve_name(args.model)

    # Nothing to download when the user handed us a model file directly.
    already_on_disk = os.path.exists(target) and target.endswith(".gguf")
    if already_on_disk:
        print(f"already local: {target}")
        return 0

    if target not in quantcpp._MODEL_REGISTRY:
        registry_names = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
        alias_names = ", ".join(sorted(MODEL_ALIASES.keys()))
        for message in (
            f"unknown model: {args.model!r}",
            f" registry: {registry_names}",
            f" aliases: {alias_names}",
        ):
            print(message, file=sys.stderr)
        return 1

    print(f"pulling {target}...", file=sys.stderr)
    try:
        local_path = quantcpp.download(target)
        megabytes = os.path.getsize(local_path) / (1024 * 1024)
        print(f"\u2713 {target} \u2192 {local_path} ({megabytes:.0f} MB)", file=sys.stderr)
    except Exception as e:
        # Command boundary: report any download problem and signal failure.
        print(f"download failed: {e}", file=sys.stderr)
        return 1
    return 0
def cmd_list(args):
    """Show every registry model and whether it is already in the cache.

    With ``args.json_output`` set, prints a JSON array of objects carrying
    ``status``, ``name``, ``alias``, ``size_mb`` and ``path``. Otherwise
    prints the cache directory followed by a fixed-width table. Always
    returns 0.
    """
    registry, cache_dir = _registry()

    def _tagged_alias(model_name):
        # First alias containing ":" that targets this model, or "" if none.
        for alias, target in MODEL_ALIASES.items():
            if target == model_name and ":" in alias:
                return alias
        return ""

    rows = []
    for name, (_repo, filename, approx_mb) in sorted(registry.items()):
        cached_file = cache_dir / filename
        if cached_file.exists():
            # Cached models report their real on-disk size and location.
            status = "cached"
            size_mb = cached_file.stat().st_size / (1024 * 1024)
            shown_path = str(cached_file)
        else:
            # Remote models only have the registry's approximate size.
            status = "remote"
            size_mb = approx_mb
            shown_path = f"~{approx_mb} MB"
        rows.append((status, name, _tagged_alias(name), size_mb, shown_path))

    if args.json_output:
        fields = ("status", "name", "alias", "size_mb", "path")
        records = [
            dict(zip(fields, (status, name, alias, round(size_mb, 1), shown_path)))
            for status, name, alias, size_mb, shown_path in rows
        ]
        print(json.dumps(records, indent=2))
        return 0

    print(f"\n Models cache: {cache_dir}\n")
    print(f" {'STATUS':<8} {'NAME':<16} {'ALIAS':<14} {'SIZE':>8}")
    print(f" {'-'*8} {'-'*16} {'-'*14} {'-'*8}")
    for status, name, alias, size_mb, _shown_path in rows:
        size_str = f"{size_mb:.0f} MB"
        print(f" {status:<8} {name:<16} {alias:<14} {size_str:>8}")
    print()
    return 0
def _resolve_to_path(name_or_path):
    """Turn an alias, registry name or file path into a local ``.gguf`` path.

    An existing ``.gguf`` path is returned unchanged. For a registry model
    already present in the cache directory, the cached file's path is
    returned as a string. Otherwise the model is fetched with
    ``quantcpp.download`` and that call's result is returned.

    Raises:
        ValueError: the resolved name is neither a local ``.gguf`` file nor
            a registry key.
    """
    import quantcpp

    resolved = _resolve_name(name_or_path)
    if os.path.exists(resolved) and resolved.endswith(".gguf"):
        return resolved

    registry = quantcpp._MODEL_REGISTRY
    if resolved not in registry:
        known = ", ".join(sorted(registry.keys()))
        raise ValueError(
            f"unknown model: {name_or_path!r}. Available: {known}"
        )

    _repo, filename, _approx_mb = registry[resolved]
    cache_hit = quantcpp._CACHE_DIR / filename
    if not cache_hit.exists():
        print(f"model not cached \u2014 pulling {resolved}...", file=sys.stderr)
        return quantcpp.download(resolved)
    return str(cache_hit)
def _chatml_prompt(turns, extra_user=None):
    """Render completed (user, assistant) turns, plus an optional open user turn, as ChatML."""
    parts = [
        f"<|im_start|>user\n{u}<|im_end|>\n<|im_start|>assistant\n{a}<|im_end|>\n"
        for u, a in turns
    ]
    if extra_user is not None:
        # Leave the assistant turn open so the model continues from here.
        parts.append(f"<|im_start|>user\n{extra_user}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)


def _chat_loop(m):
    """Run the interactive multi-turn chat on a loaded model until Ctrl+C or EOF."""
    from quantcpp import ChatContextOverflow

    print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
    # Multi-turn chat: history is accumulated as ChatML so the model sees
    # prior turns. turns is a list of (user_msg, assistant_msg) pairs so the
    # oldest ones can be dropped from the front on context overflow.
    # (The original comment states m.chat() reuses the KV cache via
    # prefix-match, which makes resending the history cheap.)
    turns = []
    try:
        while True:
            question = input("\nYou: ")
            if not question.strip():
                continue
            print("AI: ", end="", flush=True)
            # Retry loop: on context overflow, drop the oldest turn and try
            # again. Without this the Python-side history would stay too
            # large and every later turn would overflow as well.
            attempt = 0
            while True:
                # Start each attempt with an empty buffer so tokens from a
                # failed attempt never end up in the stored reply.
                reply_buf = []
                try:
                    for tok in m.chat(_chatml_prompt(turns, extra_user=question)):
                        print(tok, end="", flush=True)
                        reply_buf.append(tok)
                    break
                except ChatContextOverflow:
                    reply_buf = []
                    if not turns:
                        print("\n[chat] message alone exceeds context window — try a shorter question.",
                              file=sys.stderr)
                        break
                    dropped = turns.pop(0)
                    attempt += 1
                    print(f"\n[chat] context full \u2014 dropped oldest turn ({len(dropped[0])+len(dropped[1])} chars), retrying...",
                          file=sys.stderr)
                    if attempt > 8:
                        print("[chat] too many overflow retries, giving up on this turn.",
                              file=sys.stderr)
                        break
            print()
            # Only completed exchanges are remembered.
            if reply_buf:
                turns.append((question, "".join(reply_buf)))
    except (KeyboardInterrupt, EOFError):
        print("\nBye!", file=sys.stderr)


def cmd_run(args):
    """Chat with a model, pulling it first if it is not cached.

    Reads ``args.model``, ``args.prompt``, ``args.max_tokens``,
    ``args.temperature`` and ``args.threads``. With a prompt (a string, or a
    list of words that is joined with spaces) a single answer is streamed to
    stdout. Without one, an interactive multi-turn chat runs until Ctrl+C or
    EOF.

    Returns:
        0 once the answer or chat session has finished, 1 if the model could
        not be resolved or pulled.
    """
    try:
        model_path = _resolve_to_path(args.model)
    except ValueError as e:
        print(str(e), file=sys.stderr)
        return 1
    except Exception as e:
        print(f"pull failed: {e}", file=sys.stderr)
        return 1

    from quantcpp import Model

    print(f"loading {os.path.basename(model_path)}...", file=sys.stderr)
    m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature,
              n_threads=args.threads)
    # try/finally: the model is closed even when generation raises or a
    # one-shot answer is interrupted with Ctrl+C.
    try:
        if args.prompt:
            question = " ".join(args.prompt) if isinstance(args.prompt, list) else args.prompt
            for tok in m.generate(question):
                print(tok, end="", flush=True)
            print()
        else:
            _chat_loop(m)
    finally:
        m.close()
    return 0
def cmd_serve(args):
    """Start the OpenAI-compatible HTTP server by exec'ing ``quant-server``.

    Reads ``args.model``, ``args.port`` and ``args.threads``. The binary is
    looked up on PATH first, then in ``./build`` and ``./build_metal``. A
    usage banner with example requests is written to stderr before the
    current process is replaced by the server.

    Returns:
        1 if the model cannot be resolved or the binary cannot be executed,
        2 if no ``quant-server`` binary is found. On success this function
        does not return, because ``os.execvp`` replaces the process.
    """
    import shutil

    try:
        model_path = _resolve_to_path(args.model)
    except Exception as e:
        print(f"error: {e}", file=sys.stderr)
        return 1

    binary = shutil.which("quant-server")
    if not binary:
        # Look in common build dirs relative to the current directory.
        for guess in ("./build/quant-server", "./build_metal/quant-server"):
            if os.path.isfile(guess) and os.access(guess, os.X_OK):
                binary = guess
                break
    if not binary:
        print("quant-server binary not found.", file=sys.stderr)
        print(" Build with: cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
              file=sys.stderr)
        print(" Or install via your package manager.", file=sys.stderr)
        return 2

    cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]

    banner = (
        f"quantcpp serve {os.path.basename(model_path)} on :{args.port}",
        "",
        "OpenAI-compatible endpoints:",
        f" POST http://localhost:{args.port}/v1/chat/completions",
        f" GET http://localhost:{args.port}/v1/models",
        f" GET http://localhost:{args.port}/health",
        "",
        "Streaming (SSE — token-by-token):",
        f" curl -N http://localhost:{args.port}/v1/chat/completions \\",
        " -H 'Content-Type: application/json' \\",
        ' -d \'{"messages":[{"role":"user","content":"Hi"}],"stream":true}\'',
        "",
        "Non-streaming (single JSON response):",
        f" curl http://localhost:{args.port}/v1/chat/completions \\",
        " -H 'Content-Type: application/json' \\",
        ' -d \'{"messages":[{"role":"user","content":"Hi"}]}\'',
        "",
        "OpenAI Python SDK works as-is:",
        f" client = OpenAI(base_url='http://localhost:{args.port}/v1', api_key='none')",
        " client.chat.completions.create(model='quantcpp', messages=[...], stream=True)",
        "",
    )
    for line in banner:
        print(line, file=sys.stderr)

    # exec* replaces the process without flushing Python's file buffers, so
    # flush explicitly or the banner can be lost when output is redirected.
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        os.execvp(cmd[0], cmd)
    except OSError as e:
        # The binary was found but could not be started.
        print(f"error: could not execute {binary}: {e}", file=sys.stderr)
        return 1
def cmd_client(args):
    """Send a chat request to a running quantcpp serve endpoint.

    Default mode is streaming (SSE) — tokens print as they arrive.
    Use --no-stream for a single JSON response.

    Reads ``args.url``, ``args.model_name``, ``args.prompt``,
    ``args.max_tokens``, ``args.temperature`` and ``args.no_stream``. The
    request is POSTed to ``<url>/v1/chat/completions``.

    Returns:
        0 after the response has been printed, 1 if the request fails with
        ``urllib.error.URLError`` (a hint for starting the server is then
        written to stderr).
    """
    import json as _json
    import urllib.error
    import urllib.parse
    import urllib.request

    url = args.url.rstrip("/") + "/v1/chat/completions"
    payload = {
        "model": args.model_name,
        "messages": [{"role": "user", "content": args.prompt}],
        "max_tokens": args.max_tokens,
        "temperature": args.temperature,
        "stream": not args.no_stream,
    }
    body = _json.dumps(payload).encode()
    req = urllib.request.Request(
        url, data=body,
        headers={
            "Content-Type": "application/json",
            "User-Agent": "quantcpp-client",
        },
    )
    try:
        with urllib.request.urlopen(req) as resp:
            if args.no_stream:
                data = _json.loads(resp.read())
                print(data["choices"][0]["message"]["content"])
                return 0
            # SSE stream — parse `data: {...}\n\n` chunks
            for raw_line in resp:
                line = raw_line.decode("utf-8", errors="replace").rstrip()
                if not line.startswith("data:"):
                    continue
                payload_str = line[5:].strip()
                if payload_str == "[DONE]":
                    break
                try:
                    chunk = _json.loads(payload_str)
                    delta = chunk["choices"][0]["delta"].get("content", "")
                except (ValueError, LookupError, TypeError, AttributeError):
                    # Best effort: skip chunks that are not valid JSON or do
                    # not have the expected choices/delta shape.
                    continue
                if delta:
                    print(delta, end="", flush=True)
            print()
            return 0
    except urllib.error.URLError as e:
        # Take the port from the parsed URL. Splitting on ":" breaks for
        # URLs without an explicit port or with a path after it.
        try:
            parts = urllib.parse.urlsplit(args.url)
            port = parts.port
            if port is None:
                port = 443 if parts.scheme == "https" else 80
        except ValueError:
            port = 8080  # unparsable URL: fall back to the serve default
        print(f"connection failed: {e}", file=sys.stderr)
        print(f" Is the server running on {args.url}?", file=sys.stderr)
        print(f" Start it with: quantcpp serve llama3.2:1b -p {port}",
              file=sys.stderr)
        return 1
def cmd_chat_default(args):
    """Handle invocation without a subcommand: chat with the default model.

    Falls back to Phi-3.5-mini when ``args.model`` is empty, fills in
    ``threads`` / ``max_tokens`` / ``temperature`` if the namespace lacks
    them, normalises an empty prompt to ``None`` and delegates to
    ``cmd_run``, whose return value is passed through.

    History of the default model:
        Llama-3.2-1B → SmolLM2-1.7B (2026-04-12, vocab fix)
        → Phi-3.5-mini (2026-04-12, after Phi-3 arch support landed).
        Phi-3.5-mini has the smallest vocab in the registry (32K) and
        3.8B params, giving the best speed/quality combo shipped.
    """
    if not args.model:
        args.model = "Phi-3.5-mini"

    # Only supplies a value when the attribute is missing from the namespace.
    fallbacks = (("threads", 4), ("max_tokens", 256), ("temperature", 0.7))
    for attr, fallback in fallbacks:
        setattr(args, attr, getattr(args, attr, fallback))

    if not args.prompt:
        args.prompt = None
    return cmd_run(args)
def main():
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Subcommands: pull, list, run, serve, client. When no subcommand is
    selected, the top-level arguments are handed to ``cmd_chat_default``.
    Returns whatever the chosen handler returns.
    """
    import argparse

    epilog_text = """
commands:
pull MODEL Download a model (e.g. llama3.2:1b)
list List cached and available models
run MODEL [PROMPT] Chat with a model (auto-pulls if needed)
serve MODEL Start OpenAI-compatible HTTP server
client PROMPT Send a request to a running serve (default: SSE streaming)
examples:
quantcpp pull phi-3.5-mini # recommended default (32K vocab → fast)
quantcpp list
quantcpp run phi-3.5-mini
quantcpp run phi-3.5-mini "What is gravity?"
quantcpp serve phi-3.5-mini --port 8080
quantcpp client "What is gravity?" # streams from :8080
quantcpp client "Hi" --url http://localhost:8081
quantcpp client "Hi" --no-stream # single JSON response
backwards-compat (no subcommand):
quantcpp # default chat with Phi-3.5-mini
quantcpp "What is gravity?" # one-shot
quantcpp --model smollm2 # lightweight alternative
quantcpp --model llama3.2:1b # smallest download
"""
    parser = argparse.ArgumentParser(
        prog="quantcpp",
        description="Chat with a local LLM. No API key, no GPU, no server.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog_text,
    )
    commands = parser.add_subparsers(dest="command")

    # pull
    pull_parser = commands.add_parser("pull", help="Download a model from HuggingFace")
    pull_parser.add_argument("model", help="Model name or alias (e.g. llama3.2:1b)")

    # list
    list_parser = commands.add_parser("list", help="List cached and available models")
    list_parser.add_argument("--json", dest="json_output", action="store_true")

    # run
    run_parser = commands.add_parser("run", help="Chat with a model (auto-pulls if needed)")
    run_parser.add_argument("model", help="Model name, alias, or .gguf path")
    run_parser.add_argument("prompt", nargs="*", default=None, help="Optional prompt")
    run_parser.add_argument("-j", "--threads", type=int, default=4)
    run_parser.add_argument("-n", "--max-tokens", type=int, default=256)
    run_parser.add_argument("-t", "--temperature", type=float, default=0.7)

    # serve
    serve_parser = commands.add_parser("serve", help="Start OpenAI-compatible HTTP server")
    serve_parser.add_argument("model", help="Model name, alias, or .gguf path")
    serve_parser.add_argument("-p", "--port", type=int, default=8080)
    serve_parser.add_argument("-j", "--threads", type=int, default=4)

    # client
    client_parser = commands.add_parser(
        "client",
        help="Send a chat request to a running quantcpp serve endpoint")
    client_parser.add_argument("prompt", help="Question to send")
    client_parser.add_argument("--url", default="http://localhost:8080",
                               help="Server URL (default: http://localhost:8080)")
    client_parser.add_argument("--model-name", "-m", default="quantcpp",
                               help="Model name in the request body (server ignores)")
    client_parser.add_argument("-n", "--max-tokens", type=int, default=256)
    client_parser.add_argument("-t", "--temperature", type=float, default=0.7)
    client_parser.add_argument("--no-stream", action="store_true",
                               help="Disable SSE streaming (single JSON response)")

    # Backwards-compat: top-level args for direct chat.
    # NOTE(review): the subparsers action is registered before this top-level
    # `prompt` positional, so argparse may treat the first bare word as a
    # subcommand name — confirm that `quantcpp "question"` and
    # `quantcpp run MODEL "question"` still deliver the prompt as intended.
    parser.add_argument("prompt", nargs="*", default=None,
                        help="(default mode) question to ask")
    parser.add_argument("--model", "-m", default=None,
                        help="(default mode) model name or .gguf path")
    parser.add_argument("--max-tokens", "-n", type=int, default=256)
    parser.add_argument("--temperature", "-t", type=float, default=0.7)
    parser.add_argument("--threads", "-j", type=int, default=4)

    args = parser.parse_args()

    handlers = {
        "pull": cmd_pull,
        "list": cmd_list,
        "run": cmd_run,
        "serve": cmd_serve,
        "client": cmd_client,
    }
    # No subcommand → backwards-compat default chat
    handler = handlers.get(args.command, cmd_chat_default)
    return handler(args)
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())