Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ endif()

# OpenAI-compatible HTTP server (POSIX only — uses sys/socket.h)
if(TQ_BUILD_SERVER AND NOT MSVC)
# Legacy server (libturboquant-based) — kept for backwards compat.
add_executable(quant-server src/server/tq_server.c)
target_include_directories(quant-server PRIVATE
${CMAKE_SOURCE_DIR}/src/server
Expand All @@ -323,4 +324,17 @@ if(TQ_BUILD_SERVER AND NOT MSVC)
-Wall -Wextra -Wpedantic -Wno-unused-parameter)
endif()
message(STATUS "quant.cpp: HTTP server target enabled (quant-server)")

# Unified server (quant.h-based) — recommended, no sync divergence.
# Compiles quant.h directly (single-header amalgamation) so the
# inference path is guaranteed identical to Python/WASM/CLI.
add_executable(quant-server-unified tools/quant_server_unified.c)
target_include_directories(quant-server-unified PRIVATE ${CMAKE_SOURCE_DIR})
# PRIVATE: link dependencies are an implementation detail of the
# executable; nothing links against it.
target_link_libraries(quant-server-unified PRIVATE Threads::Threads)
if(NOT MSVC)
  target_link_libraries(quant-server-unified PRIVATE m)
  # quant.h is a single-header amalgamation treated as third-party code,
  # so its warning noise is suppressed wholesale with -w. The previous
  # flag list also passed -Wall/-Wextra/-Wpedantic/-Wno-unused-parameter,
  # but GCC/Clang's -w inhibits all warnings regardless, so those flags
  # were dead — keep only the one that takes effect.
  target_compile_options(quant-server-unified PRIVATE -w)
endif()
message(STATUS "quant.cpp: Unified server target enabled (quant-server-unified)")
endif()
25 changes: 12 additions & 13 deletions bindings/python/quantcpp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,10 @@
print(m.ask("What is gravity?"))

Model selection guide:
Phi-3.5-mini (2.4 GB, vocab 32K) — DEFAULT. 3.8B params with the
smallest lm_head in the registry,
producing the best speed/quality
combo. Coherent multi-paragraph
output even at Q4_K_M.
Phi-3.5-mini (3.8 GB, vocab 32K) — DEFAULT. 3.8B params, Q8_0.
2x faster than Q4_K_M on NEON
(3.0 vs 1.5 tok/s on M3).
Best speed/quality combo.
SmolLM2-1.7B (1.7 GB, vocab 49K) — lightweight all-rounder. ~12 tok/s
on Apple M3, smaller download.
Llama-3.2-1B (750 MB, vocab 128K) — smallest download but slower
Expand Down Expand Up @@ -72,16 +71,16 @@ class ChatContextOverflow(RuntimeError):
# adding new entries — there is no integrity check at runtime.
_MODEL_REGISTRY = {
# ── DEFAULT ──
# Phi-3.5-mini-instruct (3.8B params, vocab 32K). Set as default on
# 2026-04-12 after end-to-end Phi-3 architecture support landed
# (fused QKV / fused gate+up FFN / LongRoPE). The 32K vocab is the
# smallest of the registry, which makes the lm_head matmul the
# fastest per-token. Combined with 3.8B params it produces the
# best quality-per-token of any model we ship.
# Phi-3.5-mini-instruct Q8_0. Switched from Q4_K_M on 2026-04-12
# after benchmarking: Q8_0 is 2x faster on Apple Silicon NEON
# (3.0 vs 1.5 tok/s on M3). Q4_K_M's complex super-block dequant
# dominates compute at batch-1; Q8_0's simple int8 dequant is
# NEON-friendly. Quality is comparable — Q8_0 is near-lossless, so it
# matches or slightly beats Q4_K_M. The larger download (3.8 GB vs
# 2.4 GB) is a one-time cost.
"Phi-3.5-mini": (
"bartowski/Phi-3.5-mini-instruct-GGUF",
"Phi-3.5-mini-instruct-Q4_K_M.gguf",
2400,
"Phi-3.5-mini-instruct-Q8_0.gguf",
3800,
),
# Lightweight all-rounder for users who want a smaller download
# than Phi-3.5-mini. vocab 49K keeps the lm_head matmul small, so
Expand Down
29 changes: 22 additions & 7 deletions bindings/python/quantcpp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,12 @@ def _build_history(extra_user=None):


def cmd_serve(args):
"""Start OpenAI-compatible HTTP server (requires quant-server binary)."""
"""Start OpenAI-compatible HTTP server.

    Prefers `quant-server-unified` (compiled directly against quant.h, so its
    inference path cannot diverge from the library) over the legacy
    `quant-server` (built on libturboquant, which can drift out of sync).
Falls back to the legacy binary if unified is not found.
"""
import shutil
import subprocess

Expand All @@ -235,19 +240,29 @@ def cmd_serve(args):
print(f"error: {e}", file=sys.stderr)
return 1

binary = shutil.which("quant-server")
if not binary:
# Look in common build dirs relative to repo
for guess in ("./build/quant-server", "./build_metal/quant-server"):
# Prefer unified server (quant.h-based, fixes #77).
# Fall back to legacy libturboquant server if unified not found.
binary = None
for name in ("quant-server-unified", "quant-server"):
binary = shutil.which(name)
if binary:
break
for guess in (f"./build/{name}", f"./build_metal/{name}",
f"./build_cpu/{name}"):
if os.path.isfile(guess) and os.access(guess, os.X_OK):
binary = guess
break
if binary:
break

if not binary:
print("quant-server binary not found.", file=sys.stderr)
print(" Build with: cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
print(" Build with:", file=sys.stderr)
print(" cc -O2 -o quant-server-unified tools/quant_server_unified.c -lm -lpthread",
file=sys.stderr)
print(" Or via CMake:", file=sys.stderr)
print(" cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
file=sys.stderr)
print(" Or install via your package manager.", file=sys.stderr)
return 2

# Check if port is available before launching server
Expand Down
Loading