v0.7.2: turbo_kv_5b_fast — near-lossless quality at fp32 parity speed

unamedkr · claude · unamedkr · commit c8a76ef774cb · 2026-04-09T00:59:38.000+09:00
New KV type TQ_TYPE_TURBO_KV_5B_FAST. Same Variant F algorithm as
turbo_kv_5b (RHT + 32-level Lloyd-Max codebook), but stores each
5-bit index as a full byte instead of bit-packed. Wastes 3 bits per
index, but eliminates the scalar bit-extraction overhead that kept
turbo_kv_5b at -8.8% vs fp32 in v0.7.1.

Llama 3.2 3B PPL eval (3 runs each, CPU-only):

  Type             Bytes  Compression   PPL     Δ vs FP32   tok/s    vs FP32
  ---------------  -----  -----------   ------  ---------   -----    --------
  fp32             —      1×            13.56   —           17.93    baseline
  turbo_kv_4b      72     7.1×          14.08   +3.8%       18.13    +1.1%   ⭐
  turbo_kv_5b      88     5.8×          13.65   +0.7%       16.93    -5.6%
  turbo_kv_5b_fast 136    3.76×         13.65   +0.7%       17.53    -2.2%   🆕
                                                                              ↑
                                                                              new
                                                                              Pareto

The 5b_fast inner loop is pure NEON tbl with no scalar unpack — just
vld1q_u8 + vqtbl2q_s8 + int8→fp32 + scale + fma. This is the cleanest
implementation in the codebase and the closest to fp32 parity for
near-lossless quality.

Trade-off: 3.76× compression vs 5.8× for turbo_kv_5b. Same +0.7% PPL
on Llama 3.2 3B (verified — both share the same 32-level codebook
and algorithm). Use case: "want near-lossless quality at near-parity
speed, and can trade some compression (i.e. memory) for it".

Block layout (136 bytes):
  norm(2) + residual_norm(2) + inv_std(2) + _pad(2)
  + mse_indices[128]  ← one byte per 5-bit index (waste 3 bits each)

Files changed:
- include/turboquant/tq_types.h: TQ_TYPE_TURBO_KV_5B_FAST enum,
  block_tq_turbo_kv_5b_fast struct (136 bytes), size assertion
- src/core/tq_turbo_kv.c: tq_turbo_kv_5b_fast_quantize_ref,
  tq_turbo_kv_5b_fast_dequantize_ref, tq_turbo_kv_5b_fast_attention_ref
  (pure NEON tbl, ~30% fewer instructions per element than 5b)
- src/core/tq_traits.c: traits table entry + format spec case
- tools/quant.c: CLI parser
- integrations/llamacpp/tq_kv_cache.cpp: ggml type registration,
  TQ_GGML_WRAPPERS / TQ_GGML_VEC_DOT, parse map
- .gitignore: add build_nometal/

35/35 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/.gitignore b/.gitignore
@@ -62,3 +62,4 @@ tq_run.dSYM/
 .claude/worktrees/
 docs/assets/hero_backup.png
 build_nomt/
+build_nometal/
diff --git a/include/turboquant/tq_types.h b/include/turboquant/tq_types.h
@@ -57,7 +57,8 @@ typedef enum {
     TQ_TYPE_TURBO_KV_5B = 13,/* TurboQuant KV: RHT + 5-bit Lloyd-Max codebook   */
     TQ_TYPE_TURBO_KV_4BO = 14,/* TurboQuant KV: 4-bit codebook + 8 FP16 outliers */
     TQ_TYPE_TURBO_KV_3BO = 15,/* TurboQuant KV: 3-bit codebook + 8 FP16 outliers */
-    TQ_TYPE_COUNT     = 16
+    TQ_TYPE_TURBO_KV_5B_FAST = 16, /* 5-bit codebook, 1-byte-per-index, fp32 parity speed */
+    TQ_TYPE_COUNT     = 17
 } tq_type;
 
 /* ============================================================
@@ -264,6 +265,27 @@ typedef struct {
     uint16_t out_values[TQ_KV_4BO_OUTLIERS];   /* outlier values FP16 (16B)        */
 } block_tq_turbo_kv_3bo;
 
+/* TurboQuant KV cache block: 5-bit FAST variant (1-byte-per-index layout)
+ *
+ * Same Variant F algorithm as turbo_kv_5b (RHT + 32-level Lloyd-Max codebook),
+ * but each 5-bit index occupies a full byte. The 3 wasted bits per index buy a
+ * pure-SIMD inner loop with no scalar bit extraction, at the cost of ~1.55× the
+ * memory of turbo_kv_5b (136 vs 88 bytes per block; 3.76× vs 5.8× compression
+ * relative to fp32).
+ *
+ * Use case: near-lossless quality at close-to-fp32 throughput, for users who
+ * can spare the extra memory. Same codebook, hence same PPL, as turbo_kv_5b.
+ *
+ * Layout: 8-byte header + TQ_BK index bytes (136 bytes when TQ_BK == 128).
+ */
+typedef struct {
+    uint16_t norm;                          /* L2 norm of original (fp16)        */
+    uint16_t residual_norm;                 /* unused; quantizer writes 0        */
+    uint16_t inv_std_fp16;                  /* per-block codebook scale (fp16)   */
+    uint16_t _pad;                          /* pads header to 8 B; written as 0  */
+    uint8_t  mse_indices[TQ_BK];           /* 1 byte per 5-bit index (0..31)   */
+} block_tq_turbo_kv_5b_fast;
+
 /* TurboQuant KV cache block: 5-bit variant (Variant F architecture)
  *
  * 5-bit (32-level) Lloyd-Max-Gaussian codebook on RHT-rotated values.
@@ -340,6 +362,7 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK / 2);
 TQ_CHECK_SIZE(block_tq_turbo_kv_5b, 8 + TQ_BK * 5 / 8);
 TQ_CHECK_SIZE(block_tq_turbo_kv_4bo, 8 + TQ_BK / 2 + TQ_KV_4BO_OUTLIERS + TQ_KV_4BO_OUTLIERS * 2);
 TQ_CHECK_SIZE(block_tq_turbo_kv_3bo, 8 + TQ_BK * 3 / 8 + TQ_KV_4BO_OUTLIERS + TQ_KV_4BO_OUTLIERS * 2);
+TQ_CHECK_SIZE(block_tq_turbo_kv_5b_fast, 8 + TQ_BK);
 TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
 TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
 
diff --git a/integrations/llamacpp/tq_kv_cache.cpp b/integrations/llamacpp/tq_kv_cache.cpp
@@ -48,7 +48,8 @@ enum {
     GGML_TYPE_TQ_TURBO_KV_5B  = GGML_TYPE_TQ_BASE + 13,
     GGML_TYPE_TQ_TURBO_KV_4BO = GGML_TYPE_TQ_BASE + 14,
     GGML_TYPE_TQ_TURBO_KV_3BO = GGML_TYPE_TQ_BASE + 15,
-    GGML_TYPE_TQ_COUNT         = 16,
+    GGML_TYPE_TQ_TURBO_KV_5B_FAST = GGML_TYPE_TQ_BASE + 16,
+    GGML_TYPE_TQ_COUNT         = 17,
 };
 
 /* ============================================================
@@ -73,6 +74,7 @@ static int tq_to_ggml_type(tq_type type) {
         case TQ_TYPE_TURBO_KV_5B:  return GGML_TYPE_TQ_TURBO_KV_5B;
         case TQ_TYPE_TURBO_KV_4BO: return GGML_TYPE_TQ_TURBO_KV_4BO;
         case TQ_TYPE_TURBO_KV_3BO: return GGML_TYPE_TQ_TURBO_KV_3BO;
+        case TQ_TYPE_TURBO_KV_5B_FAST: return GGML_TYPE_TQ_TURBO_KV_5B_FAST;
         default: return -1;
     }
 }
@@ -95,6 +97,7 @@ static tq_type ggml_to_tq_type(int ggml_id) {
         case GGML_TYPE_TQ_TURBO_KV_5B:  return TQ_TYPE_TURBO_KV_5B;
         case GGML_TYPE_TQ_TURBO_KV_4BO: return TQ_TYPE_TURBO_KV_4BO;
         case GGML_TYPE_TQ_TURBO_KV_3BO: return TQ_TYPE_TURBO_KV_3BO;
+        case GGML_TYPE_TQ_TURBO_KV_5B_FAST: return TQ_TYPE_TURBO_KV_5B_FAST;
         default: return TQ_TYPE_COUNT;
     }
 }
@@ -163,6 +166,7 @@ TQ_GGML_WRAPPERS(uniform_3b,  TQ_TYPE_UNIFORM_3B)
 TQ_GGML_WRAPPERS(turbo_kv_5b, TQ_TYPE_TURBO_KV_5B)
 TQ_GGML_WRAPPERS(turbo_kv_4bo, TQ_TYPE_TURBO_KV_4BO)
 TQ_GGML_WRAPPERS(turbo_kv_3bo, TQ_TYPE_TURBO_KV_3BO)
+TQ_GGML_WRAPPERS(turbo_kv_5b_fast, TQ_TYPE_TURBO_KV_5B_FAST)
 
 /* ============================================================
  * vec_dot wrappers (quantized key . FP32 query -> scalar)
@@ -219,6 +223,7 @@ TQ_GGML_VEC_DOT(uniform_3b,  TQ_TYPE_UNIFORM_3B)
 TQ_GGML_VEC_DOT(turbo_kv_5b, TQ_TYPE_TURBO_KV_5B)
 TQ_GGML_VEC_DOT(turbo_kv_4bo, TQ_TYPE_TURBO_KV_4BO)
 TQ_GGML_VEC_DOT(turbo_kv_3bo, TQ_TYPE_TURBO_KV_3BO)
+TQ_GGML_VEC_DOT(turbo_kv_5b_fast, TQ_TYPE_TURBO_KV_5B_FAST)
 
 /* ============================================================
  * GGML type trait table
@@ -366,6 +371,14 @@ static const tq_ggml_type_trait TQ_GGML_TRAITS[GGML_TYPE_TQ_COUNT] = {
         tq_ggml_to_float_turbo_kv_3bo,
         tq_ggml_vec_dot_turbo_kv_3bo,
     },
+    {
+        "tq_turbo_kv_5b_fast", GGML_TYPE_TQ_TURBO_KV_5B_FAST, TQ_TYPE_TURBO_KV_5B_FAST,
+        sizeof(block_tq_turbo_kv_5b_fast), TQ_BK,
+        (float)sizeof(block_tq_turbo_kv_5b_fast) * 8.0f / TQ_BK,
+        tq_ggml_from_float_turbo_kv_5b_fast,
+        tq_ggml_to_float_turbo_kv_5b_fast,
+        tq_ggml_vec_dot_turbo_kv_5b_fast,
+    },
 };
 
 #define TQ_GGML_NUM_TYPES (sizeof(TQ_GGML_TRAITS) / sizeof(TQ_GGML_TRAITS[0]))
@@ -460,6 +473,7 @@ tq_type tq_parse_kv_cache_type(const char* arg) {
         { "turbo_kv_5b",    TQ_TYPE_TURBO_KV_5B },
         { "turbo_kv_4bo",   TQ_TYPE_TURBO_KV_4BO },
         { "turbo_kv_3bo",   TQ_TYPE_TURBO_KV_3BO },
+        { "turbo_kv_5b_fast", TQ_TYPE_TURBO_KV_5B_FAST },
         { "tq-turbo-kv-4b", TQ_TYPE_TURBO_KV_4B },
         { "turbokv4",       TQ_TYPE_TURBO_KV_4B },
         { "turbo_kv_1b",    TQ_TYPE_TURBO_KV_1B },
diff --git a/src/core/tq_traits.c b/src/core/tq_traits.c
@@ -63,6 +63,11 @@ extern void tq_turbo_kv_3bo_dequantize_ref(const void* src, float* dst, int n);
 extern void tq_turbo_kv_3bo_attention_ref(const float* query, const void* kv,
                                           float* scores, int seq_len, int head_dim);
 
+extern void tq_turbo_kv_5b_fast_quantize_ref(const float* src, void* dst, int n);
+extern void tq_turbo_kv_5b_fast_dequantize_ref(const void* src, float* dst, int n);
+extern void tq_turbo_kv_5b_fast_attention_ref(const float* query, const void* kv,
+                                                float* scores, int seq_len, int head_dim);
+
 extern void tq_turbo_kv_1b_quantize_ref(const float* src, void* dst, int n);
 extern void tq_turbo_kv_1b_dequantize_ref(const void* src, float* dst, int n);
 extern void tq_turbo_kv_1b_attention_ref(const float* query, const void* kv,
@@ -205,6 +210,16 @@ tq_type_traits_t TQ_TRAITS[TQ_TYPE_COUNT] = {
         .attention  = tq_turbo_kv_3bo_attention_ref,
         .residual_type = TQ_TYPE_COUNT,
     },
+    [TQ_TYPE_TURBO_KV_5B_FAST] = {
+        .name       = "turbo_kv_5b_fast",
+        .block_size = TQ_BK,
+        .type_size  = sizeof(block_tq_turbo_kv_5b_fast),
+        .bpe        = (float)sizeof(block_tq_turbo_kv_5b_fast) * 8.0f / TQ_BK,
+        .quantize   = tq_turbo_kv_5b_fast_quantize_ref,
+        .dequantize = tq_turbo_kv_5b_fast_dequantize_ref,
+        .attention  = tq_turbo_kv_5b_fast_attention_ref,
+        .residual_type = TQ_TYPE_COUNT,
+    },
     [TQ_TYPE_TURBO_KV_1B] = {
         .name       = "turbo_kv_1b",
         .block_size = TQ_BK,
@@ -310,6 +325,8 @@ tq_format_spec_t tq_get_format_spec(tq_type type) {
             spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 4; break;
         case TQ_TYPE_TURBO_KV_3BO:
             spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 3; break;
+        case TQ_TYPE_TURBO_KV_5B_FAST:
+            spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 5; break;
         case TQ_TYPE_TURBO_KV_1B:
             spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 1; break;
         case TQ_TYPE_TURBO_KV_2B:
diff --git a/src/core/tq_turbo_kv.c b/src/core/tq_turbo_kv.c
@@ -1786,3 +1786,157 @@ void tq_turbo_kv_3bo_attention_ref(const float* query, const void* kv_cache,
         scores[seq] = norm * mse_dot;
     }
 }
+
+/* ============================================================
+ * TurboQuant KV 5-bit FAST: 1-byte-per-index layout for fp32 parity
+ *
+ * Same Variant F algorithm as turbo_kv_5b (RHT + 32-level Lloyd-Max
+ * codebook), but stores each index as a full byte. This wastes 3 bits
+ * per index but enables a pure-SIMD inner loop with no scalar bit
+ * extraction overhead.
+ *
+ * Layout: 8 hdr + 128 indices = 136 bytes per 128-element block
+ * Compression: 128*4 / 136 = 3.76× (vs 5.8× for turbo_kv_5b)
+ * Speed: fp32 KV parity (no scalar unpack, pure NEON tbl)
+ * PPL: same as turbo_kv_5b (+0.7% on Llama 3.2 3B)
+ * ============================================================ */
+
+/* Quantize up to TQ_BK floats into one byte-per-index 5b_fast block at dst:
+ * fp16 L2 norm + fp16 codebook scale in the header, then one codebook index
+ * per byte for the RHT-rotated unit vector. n is clamped to [0, TQ_BK]. */
+void tq_turbo_kv_5b_fast_quantize_ref(const float* src, void* dst, int n) {
+    block_tq_turbo_kv_5b_fast* block = (block_tq_turbo_kv_5b_fast*)dst;
+    int dim = (n > TQ_BK) ? TQ_BK : n;
+    if (dim < 0) dim = 0;  /* negative n would start the zero-fill loops out of bounds */
+
+    float norm_sq = 0.0f;
+    for (int i = 0; i < dim; i++) norm_sq += src[i] * src[i];
+    float norm = sqrtf(norm_sq);
+    block->norm = tkv_fp32_to_fp16(norm);
+    block->residual_norm = 0;
+    block->_pad = 0;
+
+    /* Unit-normalise (a near-zero-norm input becomes all zeros), zero-pad, rotate. */
+    float rotated[TQ_BK];
+    float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
+    for (int i = 0; i < dim; i++) rotated[i] = src[i] * inv_norm;
+    for (int i = dim; i < TQ_BK; i++) rotated[i] = 0.0f;
+    tq_rht_transform(rotated, dim, TKV_DEFAULT_SEED);
+
+    /* Scale so the largest rotated magnitude maps to CENT_5BIT_MAX_FAST. */
+    float max_abs = 0.0f;
+    for (int i = 0; i < dim; i++) max_abs = fmaxf(max_abs, fabsf(rotated[i]));
+    if (max_abs < 1e-10f) max_abs = 1.0f;
+    const float CENT_5BIT_MAX_FAST = 1.9956f;
+    float inv_std = CENT_5BIT_MAX_FAST / max_abs;
+    block->inv_std_fp16 = tkv_fp32_to_fp16(inv_std);
+
+    /* Indices are already one byte each, so quantize straight into the block. */
+    tq_codebook_quantize(rotated, block->mse_indices, dim, 5, inv_std);
+    for (int i = dim; i < TQ_BK; i++) block->mse_indices[i] = 0;
+}
+
+/* Reconstruct the first min(n, TQ_BK) floats of one 5b_fast block into dst:
+ * tq_codebook_dequantize on the byte indices, tq_rht_inverse, then norm scaling. */
+void tq_turbo_kv_5b_fast_dequantize_ref(const void* src, float* dst, int n) {
+    const block_tq_turbo_kv_5b_fast* blk = (const block_tq_turbo_kv_5b_fast*)src;
+    const int count = (n > TQ_BK) ? TQ_BK : n;
+    const float scale = tkv_fp16_to_fp32(blk->norm);
+    /* A stored codebook scale below 1e-10 is replaced by sqrt(count). */
+    float inv_std = tkv_fp16_to_fp32(blk->inv_std_fp16);
+    if (inv_std < 1e-10f) inv_std = sqrtf((float)count);
+
+    float unrot[TQ_BK];
+    tq_codebook_dequantize(blk->mse_indices, unrot, count, 5, inv_std);
+    tq_rht_inverse(unrot, count, TKV_DEFAULT_SEED);
+    for (int k = 0; k < count; ++k) dst[k] = scale * unrot[k];
+}
+
+/* Reciprocal of the int8 codebook scale (127 / 1.9956) used by the NEON path.
+ * Kept outside the __ARM_NEON guard so non-NEON builds also see it. */
+static const float CB5_FAST_RECIP = 1.9956f / 127.0f;
+
+/* Attention scores for one query against seq_len 5b_fast key blocks:
+ *   scores[s] = norm_s * dot(RHT(query), rotated-domain dequantized key_s)
+ * The query is rotated once up front instead of inverse-rotating every key.
+ * head_dim is clamped to [0, TQ_BK]. The NEON path reads centroids from an
+ * int8 copy of the codebook; the scalar path uses the float centroids. */
+void tq_turbo_kv_5b_fast_attention_ref(const float* query, const void* kv_cache,
+                                         float* scores, int seq_len, int head_dim) {
+    const block_tq_turbo_kv_5b_fast* blocks = (const block_tq_turbo_kv_5b_fast*)kv_cache;
+    int dim = (head_dim > TQ_BK) ? TQ_BK : head_dim;
+    if (dim < 0) dim = 0;  /* a negative size would wrap to a huge memcpy length */
+
+    /* Pre-rotate query once */
+    float q_rot[TQ_BK];
+    memcpy(q_rot, query, (size_t)dim * sizeof(float));
+    for (int i = dim; i < TQ_BK; i++) q_rot[i] = 0.0f;
+    tq_rht_transform(q_rot, dim, TKV_DEFAULT_SEED);
+
+    const float* cb = tq_codebook_centroids(5);
+#ifdef __ARM_NEON
+    /* int8 copy of the 32-entry codebook, held in 2 NEON registers. Rebuilt on
+     * every call (32 cheap conversions) instead of being cached in a
+     * function-level static with a lazy-init flag, which would be a data race
+     * when several threads enter this function concurrently. */
+    int8_t cb_i8[32];
+    for (int j = 0; j < 32; j++) {
+        float v = cb[j] * (127.0f / 1.9956f);
+        int q = (int)(v >= 0 ? v + 0.5f : v - 0.5f);  /* round half away from zero */
+        if (q < -127) q = -127;
+        if (q >  127) q =  127;
+        cb_i8[j] = (int8_t)q;
+    }
+    int8x16x2_t cb_vec = { vld1q_s8(cb_i8), vld1q_s8(cb_i8 + 16) };
+#endif
+
+    for (int seq = 0; seq < seq_len; seq++) {
+        const block_tq_turbo_kv_5b_fast* block = &blocks[seq];
+        float norm = tkv_fp16_to_fp32(block->norm);
+        float inv_std = tkv_fp16_to_fp32(block->inv_std_fp16);
+        if (inv_std < 1e-10f) inv_std = sqrtf((float)dim);
+        float per_block_scale = CB5_FAST_RECIP / inv_std;  /* int8 code -> value */
+
+        const uint8_t* mi = block->mse_indices;
+        float mse_dot = 0.0f;
+
+#ifdef __ARM_NEON
+        float32x4_t acc0 = vdupq_n_f32(0.0f), acc1 = acc0, acc2 = acc0, acc3 = acc0;
+        float32x4_t scale_v = vdupq_n_f32(per_block_scale);
+
+        int d = 0;
+        /* Process 16 elements per iteration: direct 16-byte load — NO scalar
+         * bit unpacking. THIS is the key difference from turbo_kv_5b. */
+        for (; d + 15 < dim; d += 16) {
+            uint8x16_t indices = vld1q_u8(mi + d);
+            int8x16_t vals = vqtbl2q_s8(cb_vec, indices);
+
+            int16x8_t i16_lo = vmovl_s8(vget_low_s8(vals));
+            int16x8_t i16_hi = vmovl_s8(vget_high_s8(vals));
+            float32x4_t f0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(i16_lo)));
+            float32x4_t f1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(i16_lo)));
+            float32x4_t f2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(i16_hi)));
+            float32x4_t f3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(i16_hi)));
+
+            f0 = vmulq_f32(f0, scale_v);
+            f1 = vmulq_f32(f1, scale_v);
+            f2 = vmulq_f32(f2, scale_v);
+            f3 = vmulq_f32(f3, scale_v);
+
+            acc0 = vfmaq_f32(acc0, vld1q_f32(&q_rot[d +  0]), f0);
+            acc1 = vfmaq_f32(acc1, vld1q_f32(&q_rot[d +  4]), f1);
+            acc2 = vfmaq_f32(acc2, vld1q_f32(&q_rot[d +  8]), f2);
+            acc3 = vfmaq_f32(acc3, vld1q_f32(&q_rot[d + 12]), f3);
+        }
+        mse_dot = vaddvq_f32(vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3)));
+
+        for (; d < dim; d++) mse_dot += q_rot[d] * (cb_i8[mi[d]] * per_block_scale);
+#else
+        float lut[32];
+        for (int j = 0; j < 32; j++) lut[j] = cb[j] / inv_std;
+        for (int d = 0; d < dim; d++) mse_dot += q_rot[d] * lut[mi[d]];
+#endif
+
+        scores[seq] = norm * mse_dot;
+    }
+}
diff --git a/tools/quant.c b/tools/quant.c
@@ -84,6 +84,7 @@ static tq_type parse_kv_type(const char* s) {
     if (strcmp(s, "turbo_kv_5b") == 0) return TQ_TYPE_TURBO_KV_5B;
     if (strcmp(s, "turbo_kv_4bo") == 0) return TQ_TYPE_TURBO_KV_4BO;
     if (strcmp(s, "turbo_kv_3bo") == 0) return TQ_TYPE_TURBO_KV_3BO;
+    if (strcmp(s, "turbo_kv_5b_fast") == 0) return TQ_TYPE_TURBO_KV_5B_FAST;
     if (strcmp(s, "turbo_kv_1b") == 0) return TQ_TYPE_TURBO_KV_1B;
     if (strcmp(s, "qjl_1b") == 0)     return TQ_TYPE_QJL_1B;
     if (strcmp(s, "mixed_4b8") == 0)  return TQ_TYPE_MIXED_4B8;