Skip to content

Commit c8a76ef

Browse files
unamedkr and claude committed
v0.7.2: turbo_kv_5b_fast — near-lossless quality at fp32 parity speed
New KV type TQ_TYPE_TURBO_KV_5B_FAST. Same Variant F algorithm as turbo_kv_5b (RHT + 32-level Lloyd-Max codebook), but stores each 5-bit index as a full byte instead of bit-packed. Wastes 3 bits per index, but eliminates the scalar bit-extraction overhead that kept turbo_kv_5b at -8.8% vs fp32 in v0.7.1. Llama 3.2 3B PPL eval (3 runs each, CPU-only): Type Bytes Compression PPL Δ vs FP32 tok/s vs FP32 --------------- ----- ----------- ------ --------- ----- -------- fp32 — 1× 13.56 — 17.93 baseline turbo_kv_4b 72 7.1× 14.08 +3.8% 18.13 +1.1% ⭐ turbo_kv_5b 88 5.8× 13.65 +0.7% 16.93 -5.6% turbo_kv_5b_fast 136 3.76× 13.65 +0.7% 17.53 -2.2% 🆕 ↑ new Pareto The 5b_fast inner loop is pure NEON tbl with no scalar unpack — just vld1q_u8 + vqtbl2q_s8 + int8→fp32 + scale + fma. This is the cleanest implementation in the codebase and the closest to fp32 parity for near-lossless quality. Trade-off: 3.76× compression vs 5.8× for turbo_kv_5b. Same +0.7% PPL on Llama 3.2 3B (verified — both share the same 32-level codebook and algorithm). Use case: "want near-lossless quality + parity speed, have memory to spare for less compression". Block layout (136 bytes): norm(2) + residual_norm(2) + inv_std(2) + _pad(2) + mse_indices[128] ← one byte per 5-bit index (waste 3 bits each) Files changed: - include/turboquant/tq_types.h: TQ_TYPE_TURBO_KV_5B_FAST enum, block_tq_turbo_kv_5b_fast struct (136 bytes), size assertion - src/core/tq_turbo_kv.c: tq_turbo_kv_5b_fast_quantize_ref, tq_turbo_kv_5b_fast_dequantize_ref, tq_turbo_kv_5b_fast_attention_ref (pure NEON tbl, ~30% fewer instructions per element than 5b) - src/core/tq_traits.c: traits table entry + format spec case - tools/quant.c: CLI parser - integrations/llamacpp/tq_kv_cache.cpp: ggml type registration, TQ_GGML_WRAPPERS / TQ_GGML_VEC_DOT, parse map - .gitignore: add build_nometal/ 35/35 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7abf5be commit c8a76ef

File tree

6 files changed

+212
-2
lines changed

6 files changed

+212
-2
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,4 @@ tq_run.dSYM/
6262
.claude/worktrees/
6363
docs/assets/hero_backup.png
6464
build_nomt/
65+
build_nometal/

include/turboquant/tq_types.h

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ typedef enum {
5757
TQ_TYPE_TURBO_KV_5B = 13,/* TurboQuant KV: RHT + 5-bit Lloyd-Max codebook */
5858
TQ_TYPE_TURBO_KV_4BO = 14,/* TurboQuant KV: 4-bit codebook + 8 FP16 outliers */
5959
TQ_TYPE_TURBO_KV_3BO = 15,/* TurboQuant KV: 3-bit codebook + 8 FP16 outliers */
60-
TQ_TYPE_COUNT = 16
60+
TQ_TYPE_TURBO_KV_5B_FAST = 16, /* 5-bit codebook, 1-byte-per-index, fp32 parity speed */
61+
TQ_TYPE_COUNT = 17
6162
} tq_type;
6263

6364
/* ============================================================
@@ -264,6 +265,27 @@ typedef struct {
264265
uint16_t out_values[TQ_KV_4BO_OUTLIERS]; /* outlier values FP16 (16B) */
265266
} block_tq_turbo_kv_3bo;
266267

268+
/* TurboQuant KV cache block: 5-bit FAST variant (1-byte-per-index layout)
269+
*
270+
* Same Variant F algorithm as turbo_kv_5b (RHT + 32-level Lloyd-Max codebook),
271+
* but stores each index as a full byte. This wastes 3 bits per index but
272+
* enables a pure-SIMD inner loop with no scalar bit extraction overhead —
273+
* gets fp32 KV speed parity at the cost of 1.55× more memory than turbo_kv_5b
274+
* (3.76× vs 5.8× compression).
275+
*
276+
* Use case: "near-lossless quality at parity speed", for users who can spare
277+
* the extra memory but need fp32 throughput. Same PPL as turbo_kv_5b.
278+
*
279+
* Layout: 8 hdr + 128 indices = 136 bytes per 128-element block
280+
*/
281+
typedef struct {
282+
uint16_t norm; /* L2 norm of original (fp16) */
283+
uint16_t residual_norm; /* unused */
284+
uint16_t inv_std_fp16; /* per-block inv_std */
285+
uint16_t _pad; /* alignment */
286+
uint8_t mse_indices[TQ_BK]; /* 1 byte per 5-bit index (0..31) */
287+
} block_tq_turbo_kv_5b_fast;
288+
267289
/* TurboQuant KV cache block: 5-bit variant (Variant F architecture)
268290
*
269291
* 5-bit (32-level) Lloyd-Max-Gaussian codebook on RHT-rotated values.
@@ -340,6 +362,7 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK / 2);
340362
TQ_CHECK_SIZE(block_tq_turbo_kv_5b, 8 + TQ_BK * 5 / 8);
341363
TQ_CHECK_SIZE(block_tq_turbo_kv_4bo, 8 + TQ_BK / 2 + TQ_KV_4BO_OUTLIERS + TQ_KV_4BO_OUTLIERS * 2);
342364
TQ_CHECK_SIZE(block_tq_turbo_kv_3bo, 8 + TQ_BK * 3 / 8 + TQ_KV_4BO_OUTLIERS + TQ_KV_4BO_OUTLIERS * 2);
365+
TQ_CHECK_SIZE(block_tq_turbo_kv_5b_fast, 8 + TQ_BK);
343366
TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
344367
TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
345368

integrations/llamacpp/tq_kv_cache.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ enum {
4848
GGML_TYPE_TQ_TURBO_KV_5B = GGML_TYPE_TQ_BASE + 13,
4949
GGML_TYPE_TQ_TURBO_KV_4BO = GGML_TYPE_TQ_BASE + 14,
5050
GGML_TYPE_TQ_TURBO_KV_3BO = GGML_TYPE_TQ_BASE + 15,
51-
GGML_TYPE_TQ_COUNT = 16,
51+
GGML_TYPE_TQ_TURBO_KV_5B_FAST = GGML_TYPE_TQ_BASE + 16,
52+
GGML_TYPE_TQ_COUNT = 17,
5253
};
5354

5455
/* ============================================================
@@ -73,6 +74,7 @@ static int tq_to_ggml_type(tq_type type) {
7374
case TQ_TYPE_TURBO_KV_5B: return GGML_TYPE_TQ_TURBO_KV_5B;
7475
case TQ_TYPE_TURBO_KV_4BO: return GGML_TYPE_TQ_TURBO_KV_4BO;
7576
case TQ_TYPE_TURBO_KV_3BO: return GGML_TYPE_TQ_TURBO_KV_3BO;
77+
case TQ_TYPE_TURBO_KV_5B_FAST: return GGML_TYPE_TQ_TURBO_KV_5B_FAST;
7678
default: return -1;
7779
}
7880
}
@@ -95,6 +97,7 @@ static tq_type ggml_to_tq_type(int ggml_id) {
9597
case GGML_TYPE_TQ_TURBO_KV_5B: return TQ_TYPE_TURBO_KV_5B;
9698
case GGML_TYPE_TQ_TURBO_KV_4BO: return TQ_TYPE_TURBO_KV_4BO;
9799
case GGML_TYPE_TQ_TURBO_KV_3BO: return TQ_TYPE_TURBO_KV_3BO;
100+
case GGML_TYPE_TQ_TURBO_KV_5B_FAST: return TQ_TYPE_TURBO_KV_5B_FAST;
98101
default: return TQ_TYPE_COUNT;
99102
}
100103
}
@@ -163,6 +166,7 @@ TQ_GGML_WRAPPERS(uniform_3b, TQ_TYPE_UNIFORM_3B)
163166
TQ_GGML_WRAPPERS(turbo_kv_5b, TQ_TYPE_TURBO_KV_5B)
164167
TQ_GGML_WRAPPERS(turbo_kv_4bo, TQ_TYPE_TURBO_KV_4BO)
165168
TQ_GGML_WRAPPERS(turbo_kv_3bo, TQ_TYPE_TURBO_KV_3BO)
169+
TQ_GGML_WRAPPERS(turbo_kv_5b_fast, TQ_TYPE_TURBO_KV_5B_FAST)
166170

167171
/* ============================================================
168172
* vec_dot wrappers (quantized key . FP32 query -> scalar)
@@ -219,6 +223,7 @@ TQ_GGML_VEC_DOT(uniform_3b, TQ_TYPE_UNIFORM_3B)
219223
TQ_GGML_VEC_DOT(turbo_kv_5b, TQ_TYPE_TURBO_KV_5B)
220224
TQ_GGML_VEC_DOT(turbo_kv_4bo, TQ_TYPE_TURBO_KV_4BO)
221225
TQ_GGML_VEC_DOT(turbo_kv_3bo, TQ_TYPE_TURBO_KV_3BO)
226+
TQ_GGML_VEC_DOT(turbo_kv_5b_fast, TQ_TYPE_TURBO_KV_5B_FAST)
222227

223228
/* ============================================================
224229
* GGML type trait table
@@ -366,6 +371,14 @@ static const tq_ggml_type_trait TQ_GGML_TRAITS[GGML_TYPE_TQ_COUNT] = {
366371
tq_ggml_to_float_turbo_kv_3bo,
367372
tq_ggml_vec_dot_turbo_kv_3bo,
368373
},
374+
{
375+
"tq_turbo_kv_5b_fast", GGML_TYPE_TQ_TURBO_KV_5B_FAST, TQ_TYPE_TURBO_KV_5B_FAST,
376+
sizeof(block_tq_turbo_kv_5b_fast), TQ_BK,
377+
(float)sizeof(block_tq_turbo_kv_5b_fast) * 8.0f / TQ_BK,
378+
tq_ggml_from_float_turbo_kv_5b_fast,
379+
tq_ggml_to_float_turbo_kv_5b_fast,
380+
tq_ggml_vec_dot_turbo_kv_5b_fast,
381+
},
369382
};
370383

371384
#define TQ_GGML_NUM_TYPES (sizeof(TQ_GGML_TRAITS) / sizeof(TQ_GGML_TRAITS[0]))
@@ -460,6 +473,7 @@ tq_type tq_parse_kv_cache_type(const char* arg) {
460473
{ "turbo_kv_5b", TQ_TYPE_TURBO_KV_5B },
461474
{ "turbo_kv_4bo", TQ_TYPE_TURBO_KV_4BO },
462475
{ "turbo_kv_3bo", TQ_TYPE_TURBO_KV_3BO },
476+
{ "turbo_kv_5b_fast", TQ_TYPE_TURBO_KV_5B_FAST },
463477
{ "tq-turbo-kv-4b", TQ_TYPE_TURBO_KV_4B },
464478
{ "turbokv4", TQ_TYPE_TURBO_KV_4B },
465479
{ "turbo_kv_1b", TQ_TYPE_TURBO_KV_1B },

src/core/tq_traits.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ extern void tq_turbo_kv_3bo_dequantize_ref(const void* src, float* dst, int n);
6363
extern void tq_turbo_kv_3bo_attention_ref(const float* query, const void* kv,
6464
float* scores, int seq_len, int head_dim);
6565

66+
extern void tq_turbo_kv_5b_fast_quantize_ref(const float* src, void* dst, int n);
67+
extern void tq_turbo_kv_5b_fast_dequantize_ref(const void* src, float* dst, int n);
68+
extern void tq_turbo_kv_5b_fast_attention_ref(const float* query, const void* kv,
69+
float* scores, int seq_len, int head_dim);
70+
6671
extern void tq_turbo_kv_1b_quantize_ref(const float* src, void* dst, int n);
6772
extern void tq_turbo_kv_1b_dequantize_ref(const void* src, float* dst, int n);
6873
extern void tq_turbo_kv_1b_attention_ref(const float* query, const void* kv,
@@ -205,6 +210,16 @@ tq_type_traits_t TQ_TRAITS[TQ_TYPE_COUNT] = {
205210
.attention = tq_turbo_kv_3bo_attention_ref,
206211
.residual_type = TQ_TYPE_COUNT,
207212
},
213+
[TQ_TYPE_TURBO_KV_5B_FAST] = {
214+
.name = "turbo_kv_5b_fast",
215+
.block_size = TQ_BK,
216+
.type_size = sizeof(block_tq_turbo_kv_5b_fast),
217+
.bpe = (float)sizeof(block_tq_turbo_kv_5b_fast) * 8.0f / TQ_BK,
218+
.quantize = tq_turbo_kv_5b_fast_quantize_ref,
219+
.dequantize = tq_turbo_kv_5b_fast_dequantize_ref,
220+
.attention = tq_turbo_kv_5b_fast_attention_ref,
221+
.residual_type = TQ_TYPE_COUNT,
222+
},
208223
[TQ_TYPE_TURBO_KV_1B] = {
209224
.name = "turbo_kv_1b",
210225
.block_size = TQ_BK,
@@ -310,6 +325,8 @@ tq_format_spec_t tq_get_format_spec(tq_type type) {
310325
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 4; break;
311326
case TQ_TYPE_TURBO_KV_3BO:
312327
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 3; break;
328+
case TQ_TYPE_TURBO_KV_5B_FAST:
329+
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 5; break;
313330
case TQ_TYPE_TURBO_KV_1B:
314331
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 1; break;
315332
case TQ_TYPE_TURBO_KV_2B:

src/core/tq_turbo_kv.c

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1786,3 +1786,157 @@ void tq_turbo_kv_3bo_attention_ref(const float* query, const void* kv_cache,
17861786
scores[seq] = norm * mse_dot;
17871787
}
17881788
}
1789+
1790+
/* ============================================================
1791+
* TurboQuant KV 5-bit FAST: 1-byte-per-index layout for fp32 parity
1792+
*
1793+
* Same Variant F algorithm as turbo_kv_5b (RHT + 32-level Lloyd-Max
1794+
* codebook), but stores each index as a full byte. This wastes 3 bits
1795+
* per index but enables a pure-SIMD inner loop with no scalar bit
1796+
* extraction overhead.
1797+
*
1798+
* Layout: 8 hdr + 128 indices = 136 bytes per 128-element block
1799+
* Compression: 128*4 / 136 = 3.76× (vs 5.8× for turbo_kv_5b)
1800+
* Speed: fp32 KV parity (no scalar unpack, pure NEON tbl)
1801+
* PPL: same as turbo_kv_5b (+0.7% on Llama 3.2 3B)
1802+
* ============================================================ */
1803+
1804+
void tq_turbo_kv_5b_fast_quantize_ref(const float* src, void* dst, int n) {
1805+
block_tq_turbo_kv_5b_fast* block = (block_tq_turbo_kv_5b_fast*)dst;
1806+
int dim = n;
1807+
if (dim > TQ_BK) dim = TQ_BK;
1808+
1809+
float norm_sq = 0.0f;
1810+
for (int i = 0; i < dim; i++) norm_sq += src[i] * src[i];
1811+
float norm = sqrtf(norm_sq);
1812+
block->norm = tkv_fp32_to_fp16(norm);
1813+
block->residual_norm = 0;
1814+
block->_pad = 0;
1815+
1816+
float rotated[TQ_BK];
1817+
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
1818+
for (int i = 0; i < dim; i++) rotated[i] = src[i] * inv_norm;
1819+
for (int i = dim; i < TQ_BK; i++) rotated[i] = 0.0f;
1820+
tq_rht_transform(rotated, dim, TKV_DEFAULT_SEED);
1821+
1822+
float max_abs = 0.0f;
1823+
for (int i = 0; i < dim; i++) {
1824+
float a = fabsf(rotated[i]);
1825+
if (a > max_abs) max_abs = a;
1826+
}
1827+
if (max_abs < 1e-10f) max_abs = 1.0f;
1828+
const float CENT_5BIT_MAX_FAST = 1.9956f;
1829+
float inv_std = CENT_5BIT_MAX_FAST / max_abs;
1830+
block->inv_std_fp16 = tkv_fp32_to_fp16(inv_std);
1831+
1832+
/* Quantize directly to byte-aligned indices (0..31, no packing) */
1833+
uint8_t indices[TQ_BK];
1834+
tq_codebook_quantize(rotated, indices, dim, 5, inv_std);
1835+
for (int i = 0; i < dim; i++) block->mse_indices[i] = indices[i];
1836+
for (int i = dim; i < TQ_BK; i++) block->mse_indices[i] = 0;
1837+
}
1838+
1839+
void tq_turbo_kv_5b_fast_dequantize_ref(const void* src, float* dst, int n) {
1840+
const block_tq_turbo_kv_5b_fast* block = (const block_tq_turbo_kv_5b_fast*)src;
1841+
int dim = n;
1842+
if (dim > TQ_BK) dim = TQ_BK;
1843+
1844+
float norm = tkv_fp16_to_fp32(block->norm);
1845+
float inv_std = tkv_fp16_to_fp32(block->inv_std_fp16);
1846+
if (inv_std < 1e-10f) inv_std = sqrtf((float)dim);
1847+
1848+
float rotated[TQ_BK];
1849+
/* Direct byte-indexed dequant (no bit unpacking) */
1850+
tq_codebook_dequantize(block->mse_indices, rotated, dim, 5, inv_std);
1851+
tq_rht_inverse(rotated, dim, TKV_DEFAULT_SEED);
1852+
for (int i = 0; i < dim; i++) dst[i] = rotated[i] * norm;
1853+
}
1854+
1855+
/* Constant pulled out of __ARM_NEON guard so non-NEON builds also see it */
1856+
static const float CB5_FAST_RECIP = 1.9956f / 127.0f;
1857+
1858+
void tq_turbo_kv_5b_fast_attention_ref(const float* query, const void* kv_cache,
1859+
float* scores, int seq_len, int head_dim) {
1860+
const block_tq_turbo_kv_5b_fast* blocks = (const block_tq_turbo_kv_5b_fast*)kv_cache;
1861+
int dim = head_dim;
1862+
if (dim > TQ_BK) dim = TQ_BK;
1863+
1864+
/* Pre-rotate query once */
1865+
float q_rot[TQ_BK];
1866+
memcpy(q_rot, query, (size_t)dim * sizeof(float));
1867+
for (int i = dim; i < TQ_BK; i++) q_rot[i] = 0.0f;
1868+
tq_rht_transform(q_rot, dim, TKV_DEFAULT_SEED);
1869+
1870+
const float* cb = tq_codebook_centroids(5);
1871+
#ifdef __ARM_NEON
1872+
/* Same int8 codebook as turbo_kv_5b — 32 entries in 2 NEON registers */
1873+
static int8_t s_cb5fast_i8[32] = {0};
1874+
static int s_cb5fast_init = 0;
1875+
if (!s_cb5fast_init) {
1876+
for (int j = 0; j < 32; j++) {
1877+
float v = cb[j] * (127.0f / 1.9956f);
1878+
int q = (int)(v >= 0 ? v + 0.5f : v - 0.5f);
1879+
if (q < -127) q = -127;
1880+
if (q > 127) q = 127;
1881+
s_cb5fast_i8[j] = (int8_t)q;
1882+
}
1883+
s_cb5fast_init = 1;
1884+
}
1885+
int8x16x2_t cb_vec = { vld1q_s8(s_cb5fast_i8), vld1q_s8(s_cb5fast_i8 + 16) };
1886+
#endif
1887+
1888+
for (int seq = 0; seq < seq_len; seq++) {
1889+
const block_tq_turbo_kv_5b_fast* block = &blocks[seq];
1890+
float norm = tkv_fp16_to_fp32(block->norm);
1891+
float inv_std = tkv_fp16_to_fp32(block->inv_std_fp16);
1892+
if (inv_std < 1e-10f) inv_std = sqrtf((float)dim);
1893+
float per_block_scale = CB5_FAST_RECIP / inv_std;
1894+
1895+
const uint8_t* mi = block->mse_indices;
1896+
float mse_dot = 0.0f;
1897+
1898+
#ifdef __ARM_NEON
1899+
float32x4_t acc0 = vdupq_n_f32(0.0f);
1900+
float32x4_t acc1 = vdupq_n_f32(0.0f);
1901+
float32x4_t acc2 = vdupq_n_f32(0.0f);
1902+
float32x4_t acc3 = vdupq_n_f32(0.0f);
1903+
float32x4_t scale_v = vdupq_n_f32(per_block_scale);
1904+
1905+
int d = 0;
1906+
/* Process 16 elements per iteration: direct 16-byte load — NO scalar
1907+
* bit unpacking. THIS is the key difference from turbo_kv_5b. */
1908+
for (; d + 15 < dim; d += 16) {
1909+
uint8x16_t indices = vld1q_u8(mi + d);
1910+
int8x16_t vals = vqtbl2q_s8(cb_vec, indices);
1911+
1912+
int16x8_t i16_lo = vmovl_s8(vget_low_s8(vals));
1913+
int16x8_t i16_hi = vmovl_s8(vget_high_s8(vals));
1914+
float32x4_t f0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(i16_lo)));
1915+
float32x4_t f1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(i16_lo)));
1916+
float32x4_t f2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(i16_hi)));
1917+
float32x4_t f3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(i16_hi)));
1918+
1919+
f0 = vmulq_f32(f0, scale_v);
1920+
f1 = vmulq_f32(f1, scale_v);
1921+
f2 = vmulq_f32(f2, scale_v);
1922+
f3 = vmulq_f32(f3, scale_v);
1923+
1924+
acc0 = vfmaq_f32(acc0, vld1q_f32(&q_rot[d + 0]), f0);
1925+
acc1 = vfmaq_f32(acc1, vld1q_f32(&q_rot[d + 4]), f1);
1926+
acc2 = vfmaq_f32(acc2, vld1q_f32(&q_rot[d + 8]), f2);
1927+
acc3 = vfmaq_f32(acc3, vld1q_f32(&q_rot[d + 12]), f3);
1928+
}
1929+
mse_dot = vaddvq_f32(vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3)));
1930+
1931+
for (; d < dim; d++) {
1932+
mse_dot += q_rot[d] * (s_cb5fast_i8[mi[d]] * per_block_scale);
1933+
}
1934+
#else
1935+
float lut[32];
1936+
for (int j = 0; j < 32; j++) lut[j] = cb[j] / inv_std;
1937+
for (int d = 0; d < dim; d++) mse_dot += q_rot[d] * lut[mi[d]];
1938+
#endif
1939+
1940+
scores[seq] = norm * mse_dot;
1941+
}
1942+
}

tools/quant.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ static tq_type parse_kv_type(const char* s) {
8484
if (strcmp(s, "turbo_kv_5b") == 0) return TQ_TYPE_TURBO_KV_5B;
8585
if (strcmp(s, "turbo_kv_4bo") == 0) return TQ_TYPE_TURBO_KV_4BO;
8686
if (strcmp(s, "turbo_kv_3bo") == 0) return TQ_TYPE_TURBO_KV_3BO;
87+
if (strcmp(s, "turbo_kv_5b_fast") == 0) return TQ_TYPE_TURBO_KV_5B_FAST;
8788
if (strcmp(s, "turbo_kv_1b") == 0) return TQ_TYPE_TURBO_KV_1B;
8889
if (strcmp(s, "qjl_1b") == 0) return TQ_TYPE_QJL_1B;
8990
if (strcmp(s, "mixed_4b8") == 0) return TQ_TYPE_MIXED_4B8;

0 commit comments

Comments
 (0)