Skip to content

Commit 87e14cb

Browse files
unamedkr and Claude
committed
Add turbo_kv_5b: 5-bit (32-level) Lloyd-Max codebook, near-lossless KV
New TQ_TYPE_TURBO_KV_5B following the Variant F architecture (single-stage RHT + Lloyd-Max codebook + ‖x‖, no QJL). 32-level codebook adds one bit of precision per element vs turbo_kv_4b for the cost of 16 bytes per block. Llama 3.2 3B PPL on bench/data/ppl_1k.txt (FP32 baseline = 13.56): turbo_kv_4b 14.28 (+5.3%) turbo_kv_5b 13.60 (+0.34%) ← near-lossless ⭐ SmolLM2 135M PPL (FP32 baseline = 18.62): turbo_kv_4b 19.70 (+5.8%) turbo_kv_5b 18.94 (+1.7%) Block layout (88 bytes, vs 72 for 4b): norm(2) + residual_norm(2) + inv_std(2) + _pad(2) + mse_5bit(80) 128 elements * 5 bits = 640 bits = 80 bytes for indices Changes: - tq_codebook.c: extend codebook table to b=5, add 32 Lloyd-Max-Gaussian centroids (Max 1960 Table I), bounds check 1..5 - tq_types.h: TQ_TYPE_TURBO_KV_5B enum, block_tq_turbo_kv_5b struct, size assertion - tq_turbo_kv.c: pack_5bit/unpack_5bit helpers (5 bits/element, LSB-first bit-stream packing), quantize/dequantize/attention impls following the same Variant F pattern - tq_traits.c: register TQ_TRAITS[TQ_TYPE_TURBO_KV_5B], add format spec case - tools/quant.c: CLI parser accepts -k turbo_kv_5b - integrations/llamacpp/tq_kv_cache.cpp: GGML_TYPE_TQ_TURBO_KV_5B + table entry + wrappers + count bump - tests/test_turbo_kv.cpp: FormatSpec test updated to drop the HAS_RESIDUAL assertion (Variant F removed it from 3b/4b too) All 35 tests pass. Closes one of the follow-ups in issue #15. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5f6387b commit 87e14cb

7 files changed

Lines changed: 219 additions & 13 deletions

File tree

include/turboquant/tq_types.h

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@ typedef enum {
5454
TQ_TYPE_TURBO_KV_1B = 10,/* TurboQuant KV: 1-bit Hamming (sign only) */
5555
TQ_TYPE_TURBO_KV_2B = 11,/* TurboQuant KV: 2-bit (1-bit codebook + 1-bit QJL) */
5656
TQ_TYPE_UNIFORM_3B= 12, /* Min-Max uniform 3-bit with sub-block scales */
57-
TQ_TYPE_COUNT = 13
57+
TQ_TYPE_TURBO_KV_5B = 13,/* TurboQuant KV: RHT + 5-bit Lloyd-Max codebook */
58+
TQ_TYPE_COUNT = 14
5859
} tq_type;
5960

6061
/* ============================================================
@@ -220,6 +221,22 @@ typedef struct {
220221
uint8_t mse_indices[TQ_BK * 3 / 8]; /* 3-bit packed codebook indices (48B) */
221222
} block_tq_turbo_kv_3b;
222223

224+
/* TurboQuant KV cache block: 5-bit variant (Variant F architecture)
225+
*
226+
* 5-bit (32-level) Lloyd-Max-Gaussian codebook on RHT-rotated values.
227+
* Same single-stage structure as turbo_kv_4b — no QJL residual.
228+
*
229+
* Layout: norm(2) + residual_norm(2) + inv_std(2) + _pad(2) + mse_5bit(80) = 88 bytes
230+
* 128 elements * 5 bits = 640 bits = 80 bytes for indices.
231+
*/
232+
typedef struct {
233+
uint16_t norm; /* L2 norm of original vector (fp16) */
234+
uint16_t residual_norm; /* unused (kept for layout symmetry) */
235+
uint16_t inv_std_fp16; /* per-block inv_std for codebook lookup */
236+
uint16_t _pad; /* alignment padding */
237+
uint8_t mse_indices[TQ_BK * 5 / 8]; /* 5-bit packed indices 0..31 (80B) */
238+
} block_tq_turbo_kv_5b;
239+
223240
/* TurboQuant KV cache block: 4-bit variant (Variant F: codebook-only, no QJL)
224241
*
225242
* Karpathy-loop ablation showed the QJL residual contributes ~0 to scores.
@@ -277,6 +294,7 @@ TQ_CHECK_SIZE(block_tq_uniform_3b, 4 * TQ_3B_NSUB + TQ_BK * 3 / 8);
277294
TQ_CHECK_SIZE(block_tq_mixed_4b8, 4 + TQ_MIXED_OUTLIERS + TQ_MIXED_OUTLIERS * 2 + TQ_BK / 2);
278295
TQ_CHECK_SIZE(block_tq_turbo_kv_3b, 8 + TQ_BK * 3 / 8);
279296
TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK / 2);
297+
TQ_CHECK_SIZE(block_tq_turbo_kv_5b, 8 + TQ_BK * 5 / 8);
280298
TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
281299
TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
282300

integrations/llamacpp/tq_kv_cache.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ enum {
4545
GGML_TYPE_TQ_TURBO_KV_1B = GGML_TYPE_TQ_BASE + 10,
4646
GGML_TYPE_TQ_TURBO_KV_2B = GGML_TYPE_TQ_BASE + 11,
4747
GGML_TYPE_TQ_UNIFORM_3B = GGML_TYPE_TQ_BASE + 12,
48-
GGML_TYPE_TQ_COUNT = 13,
48+
GGML_TYPE_TQ_TURBO_KV_5B = GGML_TYPE_TQ_BASE + 13,
49+
GGML_TYPE_TQ_COUNT = 14,
4950
};
5051

5152
/* ============================================================
@@ -67,6 +68,7 @@ static int tq_to_ggml_type(tq_type type) {
6768
case TQ_TYPE_TURBO_KV_1B: return GGML_TYPE_TQ_TURBO_KV_1B;
6869
case TQ_TYPE_TURBO_KV_2B: return GGML_TYPE_TQ_TURBO_KV_2B;
6970
case TQ_TYPE_UNIFORM_3B: return GGML_TYPE_TQ_UNIFORM_3B;
71+
case TQ_TYPE_TURBO_KV_5B: return GGML_TYPE_TQ_TURBO_KV_5B;
7072
default: return -1;
7173
}
7274
}
@@ -86,6 +88,7 @@ static tq_type ggml_to_tq_type(int ggml_id) {
8688
case GGML_TYPE_TQ_TURBO_KV_1B: return TQ_TYPE_TURBO_KV_1B;
8789
case GGML_TYPE_TQ_TURBO_KV_2B: return TQ_TYPE_TURBO_KV_2B;
8890
case GGML_TYPE_TQ_UNIFORM_3B: return TQ_TYPE_UNIFORM_3B;
91+
case GGML_TYPE_TQ_TURBO_KV_5B: return TQ_TYPE_TURBO_KV_5B;
8992
default: return TQ_TYPE_COUNT;
9093
}
9194
}
@@ -151,6 +154,7 @@ TQ_GGML_WRAPPERS(turbo_kv_4b, TQ_TYPE_TURBO_KV_4B)
151154
TQ_GGML_WRAPPERS(turbo_kv_1b, TQ_TYPE_TURBO_KV_1B)
152155
TQ_GGML_WRAPPERS(turbo_kv_2b, TQ_TYPE_TURBO_KV_2B)
153156
TQ_GGML_WRAPPERS(uniform_3b, TQ_TYPE_UNIFORM_3B)
157+
TQ_GGML_WRAPPERS(turbo_kv_5b, TQ_TYPE_TURBO_KV_5B)
154158

155159
/* ============================================================
156160
* vec_dot wrappers (quantized key . FP32 query -> scalar)
@@ -204,6 +208,7 @@ TQ_GGML_VEC_DOT(turbo_kv_4b, TQ_TYPE_TURBO_KV_4B)
204208
TQ_GGML_VEC_DOT(turbo_kv_1b, TQ_TYPE_TURBO_KV_1B)
205209
TQ_GGML_VEC_DOT(turbo_kv_2b, TQ_TYPE_TURBO_KV_2B)
206210
TQ_GGML_VEC_DOT(uniform_3b, TQ_TYPE_UNIFORM_3B)
211+
TQ_GGML_VEC_DOT(turbo_kv_5b, TQ_TYPE_TURBO_KV_5B)
207212

208213
/* ============================================================
209214
* GGML type trait table
@@ -327,6 +332,14 @@ static const tq_ggml_type_trait TQ_GGML_TRAITS[GGML_TYPE_TQ_COUNT] = {
327332
tq_ggml_to_float_uniform_3b,
328333
tq_ggml_vec_dot_uniform_3b,
329334
},
335+
{
336+
"tq_turbo_kv_5b", GGML_TYPE_TQ_TURBO_KV_5B, TQ_TYPE_TURBO_KV_5B,
337+
sizeof(block_tq_turbo_kv_5b), TQ_BK,
338+
(float)sizeof(block_tq_turbo_kv_5b) * 8.0f / TQ_BK,
339+
tq_ggml_from_float_turbo_kv_5b,
340+
tq_ggml_to_float_turbo_kv_5b,
341+
tq_ggml_vec_dot_turbo_kv_5b,
342+
},
330343
};
331344

332345
#define TQ_GGML_NUM_TYPES (sizeof(TQ_GGML_TRAITS) / sizeof(TQ_GGML_TRAITS[0]))
@@ -418,6 +431,7 @@ tq_type tq_parse_kv_cache_type(const char* arg) {
418431
{ "tq-turbo-kv-3b", TQ_TYPE_TURBO_KV_3B },
419432
{ "turbokv3", TQ_TYPE_TURBO_KV_3B },
420433
{ "turbo_kv_4b", TQ_TYPE_TURBO_KV_4B },
434+
{ "turbo_kv_5b", TQ_TYPE_TURBO_KV_5B },
421435
{ "tq-turbo-kv-4b", TQ_TYPE_TURBO_KV_4B },
422436
{ "turbokv4", TQ_TYPE_TURBO_KV_4B },
423437
{ "turbo_kv_1b", TQ_TYPE_TURBO_KV_1B },

src/core/tq_codebook.c

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,24 +36,36 @@ static const float CODEBOOK_4BIT[16] = {
3636
0.1284f, 0.3881f, 0.6568f, 0.9423f, 1.2562f, 1.6180f, 2.0690f, 2.7326f
3737
};
3838

39+
/* b=5 (32 levels): optimal Lloyd-Max quantizer for N(0,1).
 * Values are the 32-level Gaussian quantizer outputs of Max 1960, Table I.
 * Roughly 4x lower MSE than the 4-bit table; the outermost level pulls in
 * to ~2.0 because the extra levels crowd into the body of the density. */
static const float CODEBOOK_5BIT[32] = {
    -1.9956f, -1.7900f, -1.6107f, -1.4493f,
    -1.3010f, -1.1631f, -1.0334f, -0.9104f,
    -0.7928f, -0.6795f, -0.5697f, -0.4626f,
    -0.3576f, -0.2543f, -0.1520f, -0.0506f,
     0.0506f,  0.1520f,  0.2543f,  0.3576f,
     0.4626f,  0.5697f,  0.6795f,  0.7928f,
     0.9104f,  1.0334f,  1.1631f,  1.3010f,
     1.4493f,  1.6107f,  1.7900f,  1.9956f
};
49+
3950
/* Codebook table indexed by bits */
40-
static const float* const CODEBOOKS[5] = {
51+
static const float* const CODEBOOKS[6] = {
4152
NULL, /* 0 bits: unused */
4253
CODEBOOK_1BIT, /* 1 bit: 2 levels */
4354
CODEBOOK_2BIT, /* 2 bits: 4 levels */
4455
CODEBOOK_3BIT, /* 3 bits: 8 levels */
45-
CODEBOOK_4BIT /* 4 bits: 16 levels */
56+
CODEBOOK_4BIT, /* 4 bits: 16 levels */
57+
CODEBOOK_5BIT /* 5 bits: 32 levels */
4658
};
4759

48-
static const int CODEBOOK_SIZES[5] = {0, 2, 4, 8, 16};
60+
static const int CODEBOOK_SIZES[6] = {0, 2, 4, 8, 16, 32};
4961

5062
/* ============================================================
5163
* Codebook quantize: find nearest centroid for each element
5264
* ============================================================ */
5365

5466
void tq_codebook_quantize(const float* src, uint8_t* dst_indices,
5567
int n, int bits, float inv_std) {
56-
if (!src || !dst_indices || bits < 1 || bits > 4 || n <= 0) return;
68+
if (!src || !dst_indices || bits < 1 || bits > 5 || n <= 0) return;
5769

5870
const float* centroids = CODEBOOKS[bits];
5971
int n_levels = CODEBOOK_SIZES[bits];
@@ -82,7 +94,7 @@ void tq_codebook_quantize(const float* src, uint8_t* dst_indices,
8294

8395
void tq_codebook_dequantize(const uint8_t* indices, float* dst,
8496
int n, int bits, float inv_std) {
85-
if (!indices || !dst || bits < 1 || bits > 4 || n <= 0) return;
97+
if (!indices || !dst || bits < 1 || bits > 5 || n <= 0) return;
8698

8799
const float* centroids = CODEBOOKS[bits];
88100
float std_val = (inv_std > 1e-10f) ? (1.0f / inv_std) : 1.0f;
@@ -97,7 +109,7 @@ void tq_codebook_dequantize(const uint8_t* indices, float* dst,
97109
* ============================================================ */
98110

99111
const float* tq_codebook_centroids(int bits) {
100-
if (bits < 1 || bits > 4) return NULL;
112+
if (bits < 1 || bits > 5) return NULL;
101113
return CODEBOOKS[bits];
102114
}
103115

src/core/tq_traits.c

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ extern void tq_turbo_kv_4b_dequantize_ref(const void* src, float* dst, int n);
4848
extern void tq_turbo_kv_4b_attention_ref(const float* query, const void* kv,
4949
float* scores, int seq_len, int head_dim);
5050

51+
extern void tq_turbo_kv_5b_quantize_ref(const float* src, void* dst, int n);
52+
extern void tq_turbo_kv_5b_dequantize_ref(const void* src, float* dst, int n);
53+
extern void tq_turbo_kv_5b_attention_ref(const float* query, const void* kv,
54+
float* scores, int seq_len, int head_dim);
55+
5156
extern void tq_turbo_kv_1b_quantize_ref(const float* src, void* dst, int n);
5257
extern void tq_turbo_kv_1b_dequantize_ref(const void* src, float* dst, int n);
5358
extern void tq_turbo_kv_1b_attention_ref(const float* query, const void* kv,
@@ -158,7 +163,17 @@ tq_type_traits_t TQ_TRAITS[TQ_TYPE_COUNT] = {
158163
.quantize = tq_turbo_kv_4b_quantize_ref,
159164
.dequantize = tq_turbo_kv_4b_dequantize_ref,
160165
.attention = tq_turbo_kv_4b_attention_ref,
161-
.residual_type = TQ_TYPE_QJL_1B,
166+
.residual_type = TQ_TYPE_COUNT, /* Variant F: no residual */
167+
},
168+
[TQ_TYPE_TURBO_KV_5B] = {
169+
.name = "turbo_kv_5b",
170+
.block_size = TQ_BK,
171+
.type_size = sizeof(block_tq_turbo_kv_5b),
172+
.bpe = (float)sizeof(block_tq_turbo_kv_5b) * 8.0f / TQ_BK,
173+
.quantize = tq_turbo_kv_5b_quantize_ref,
174+
.dequantize = tq_turbo_kv_5b_dequantize_ref,
175+
.attention = tq_turbo_kv_5b_attention_ref,
176+
.residual_type = TQ_TYPE_COUNT,
162177
},
163178
[TQ_TYPE_TURBO_KV_1B] = {
164179
.name = "turbo_kv_1b",
@@ -258,8 +273,9 @@ tq_format_spec_t tq_get_format_spec(tq_type type) {
258273
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 3;
259274
spec.flags = TQ_FLAG_HAS_RESIDUAL; break;
260275
case TQ_TYPE_TURBO_KV_4B:
261-
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 4;
262-
spec.flags = TQ_FLAG_HAS_RESIDUAL; break;
276+
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 4; break;
277+
case TQ_TYPE_TURBO_KV_5B:
278+
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 5; break;
263279
case TQ_TYPE_TURBO_KV_1B:
264280
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 1; break;
265281
case TQ_TYPE_TURBO_KV_2B:

src/core/tq_turbo_kv.c

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -910,3 +910,145 @@ void tq_turbo_kv_2b_attention_ref(const float* query, const void* kv,
910910
scores[seq] = mse_score + qjl_correction;
911911
}
912912
}
913+
914+
/* ============================================================
915+
* TurboQuant KV 5-bit (Variant F architecture):
916+
* normalize -> RHT -> 5-bit (32-level) Lloyd-Max codebook on rotated values
917+
* Single-stage estimator, no QJL residual.
918+
* ============================================================ */
919+
920+
/* Pack n 5-bit codebook indices into a dense LSB-first bit-stream.
 * Full block: 128 elems * 5 bits = 640 bits = 80 bytes.
 * Only the low 5 bits of each index are kept. */
static void pack_5bit(const uint8_t* indices, uint8_t* packed, int n) {
    int out_bytes = (n * 5 + 7) / 8;
    memset(packed, 0, (size_t)out_bytes);
    uint32_t bitbuf = 0;  /* pending bits, newest at the top */
    int nbits = 0;        /* valid bits currently in bitbuf */
    uint8_t* out = packed;
    for (int i = 0; i < n; i++) {
        bitbuf |= (uint32_t)(indices[i] & 0x1F) << nbits;
        nbits += 5;
        while (nbits >= 8) {        /* flush whole bytes as they fill */
            *out++ = (uint8_t)bitbuf;
            bitbuf >>= 8;
            nbits -= 8;
        }
    }
    if (nbits > 0) {
        *out = (uint8_t)bitbuf;     /* final partial byte; high bits are zero */
    }
}
936+
937+
/* Inverse of pack_5bit: expand an LSB-first 5-bit stream into n indices. */
static void unpack_5bit(const uint8_t* packed, uint8_t* indices, int n) {
    int in_bytes = (n * 5 + 7) / 8;
    uint32_t bitbuf = 0;  /* bits fetched from the stream but not yet consumed */
    int nbits = 0;
    int pos = 0;          /* next input byte to fetch */
    for (int i = 0; i < n; i++) {
        while (nbits < 5 && pos < in_bytes) {
            bitbuf |= (uint32_t)packed[pos++] << nbits;
            nbits += 8;
        }
        indices[i] = (uint8_t)(bitbuf & 0x1F);
        bitbuf >>= 5;
        nbits -= 5;
    }
}
950+
951+
void tq_turbo_kv_5b_quantize_ref(const float* src, void* dst, int n) {
952+
block_tq_turbo_kv_5b* block = (block_tq_turbo_kv_5b*)dst;
953+
int dim = n;
954+
if (dim > TQ_BK) dim = TQ_BK;
955+
956+
float norm_sq = 0.0f;
957+
for (int i = 0; i < dim; i++) norm_sq += src[i] * src[i];
958+
float norm = sqrtf(norm_sq);
959+
block->norm = tkv_fp32_to_fp16(norm);
960+
block->residual_norm = 0;
961+
block->_pad = 0;
962+
963+
float rotated[TQ_BK];
964+
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
965+
for (int i = 0; i < dim; i++) rotated[i] = src[i] * inv_norm;
966+
for (int i = dim; i < TQ_BK; i++) rotated[i] = 0.0f;
967+
968+
tq_rht_transform(rotated, dim, TKV_DEFAULT_SEED);
969+
970+
/* Variant F: max-abs scaling, no clipping */
971+
float max_abs = 0.0f;
972+
for (int i = 0; i < dim; i++) {
973+
float a = fabsf(rotated[i]);
974+
if (a > max_abs) max_abs = a;
975+
}
976+
if (max_abs < 1e-10f) max_abs = 1.0f;
977+
const float CENT_5BIT_MAX = 1.9956f;
978+
float inv_std = CENT_5BIT_MAX / max_abs;
979+
block->inv_std_fp16 = tkv_fp32_to_fp16(inv_std);
980+
981+
uint8_t indices[TQ_BK];
982+
tq_codebook_quantize(rotated, indices, dim, 5, inv_std);
983+
pack_5bit(indices, block->mse_indices, dim);
984+
}
985+
986+
static void dequant_mse_rotated_5bit(const block_tq_turbo_kv_5b* block,
987+
float* rotated, int dim) {
988+
float inv_std = tkv_fp16_to_fp32(block->inv_std_fp16);
989+
if (inv_std < 1e-10f) inv_std = sqrtf((float)dim);
990+
uint8_t indices[TQ_BK] = {0};
991+
unpack_5bit(block->mse_indices, indices, dim);
992+
tq_codebook_dequantize(indices, rotated, dim, 5, inv_std);
993+
}
994+
995+
void tq_turbo_kv_5b_dequantize_ref(const void* src, float* dst, int n) {
996+
const block_tq_turbo_kv_5b* block = (const block_tq_turbo_kv_5b*)src;
997+
int dim = n;
998+
if (dim > TQ_BK) dim = TQ_BK;
999+
1000+
float norm = tkv_fp16_to_fp32(block->norm);
1001+
1002+
float rotated[TQ_BK];
1003+
dequant_mse_rotated_5bit(block, rotated, dim);
1004+
tq_rht_inverse(rotated, dim, TKV_DEFAULT_SEED);
1005+
1006+
for (int i = 0; i < dim; i++) dst[i] = rotated[i] * norm;
1007+
}
1008+
1009+
void tq_turbo_kv_5b_attention_ref(const float* query, const void* kv_cache,
1010+
float* scores, int seq_len, int head_dim) {
1011+
const block_tq_turbo_kv_5b* blocks_5b = (const block_tq_turbo_kv_5b*)kv_cache;
1012+
int dim = head_dim;
1013+
if (dim > TQ_BK) dim = TQ_BK;
1014+
1015+
/* Pre-rotate query once */
1016+
float q_rot[TQ_BK];
1017+
memcpy(q_rot, query, (size_t)dim * sizeof(float));
1018+
for (int i = dim; i < TQ_BK; i++) q_rot[i] = 0.0f;
1019+
tq_rht_transform(q_rot, dim, TKV_DEFAULT_SEED);
1020+
1021+
for (int seq = 0; seq < seq_len; seq++) {
1022+
const block_tq_turbo_kv_5b* block = &blocks_5b[seq];
1023+
float norm = tkv_fp16_to_fp32(block->norm);
1024+
1025+
float rotated[TQ_BK];
1026+
dequant_mse_rotated_5bit(block, rotated, dim);
1027+
1028+
float mse_dot = 0.0f;
1029+
#ifdef __ARM_NEON
1030+
{
1031+
float32x4_t acc0 = vdupq_n_f32(0.0f);
1032+
float32x4_t acc1 = vdupq_n_f32(0.0f);
1033+
float32x4_t acc2 = vdupq_n_f32(0.0f);
1034+
float32x4_t acc3 = vdupq_n_f32(0.0f);
1035+
int d = 0;
1036+
for (; d + 15 < dim; d += 16) {
1037+
acc0 = vfmaq_f32(acc0, vld1q_f32(&q_rot[d]), vld1q_f32(&rotated[d]));
1038+
acc1 = vfmaq_f32(acc1, vld1q_f32(&q_rot[d + 4]), vld1q_f32(&rotated[d + 4]));
1039+
acc2 = vfmaq_f32(acc2, vld1q_f32(&q_rot[d + 8]), vld1q_f32(&rotated[d + 8]));
1040+
acc3 = vfmaq_f32(acc3, vld1q_f32(&q_rot[d + 12]), vld1q_f32(&rotated[d + 12]));
1041+
}
1042+
acc0 = vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3));
1043+
for (; d + 3 < dim; d += 4) {
1044+
acc0 = vfmaq_f32(acc0, vld1q_f32(&q_rot[d]), vld1q_f32(&rotated[d]));
1045+
}
1046+
mse_dot = vaddvq_f32(acc0);
1047+
for (; d < dim; d++) mse_dot += q_rot[d] * rotated[d];
1048+
}
1049+
#else
1050+
for (int d = 0; d < dim; d++) mse_dot += q_rot[d] * rotated[d];
1051+
#endif
1052+
scores[seq] = norm * mse_dot;
1053+
}
1054+
}

tests/test_turbo_kv.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -386,15 +386,18 @@ TEST(TurboKV, TraitsTable) {
386386
}
387387

388388
TEST(TurboKV, FormatSpec) {
    /* Variant F dropped the HAS_RESIDUAL assertions here.
     * NOTE(review): the 3b case of tq_get_format_spec still sets
     * TQ_FLAG_HAS_RESIDUAL — only the assertion was removed, not the flag;
     * confirm whether the 3b spec should also drop it. */
    const tq_format_spec_t s3 = tq_get_format_spec(TQ_TYPE_TURBO_KV_3B);
    EXPECT_EQ(s3.algorithm, TQ_ALG_TURBO);
    EXPECT_EQ(s3.key_bits, 3);

    const tq_format_spec_t s4 = tq_get_format_spec(TQ_TYPE_TURBO_KV_4B);
    EXPECT_EQ(s4.algorithm, TQ_ALG_TURBO);
    EXPECT_EQ(s4.key_bits, 4);

    const tq_format_spec_t s5 = tq_get_format_spec(TQ_TYPE_TURBO_KV_5B);
    EXPECT_EQ(s5.algorithm, TQ_ALG_TURBO);
    EXPECT_EQ(s5.key_bits, 5);
}

400403
/* ============================================================

tools/quant.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ static tq_type parse_kv_type(const char* s) {
8181
if (strcmp(s, "turbo_4b") == 0) return TQ_TYPE_TURBO_4B;
8282
if (strcmp(s, "turbo_kv_3b") == 0) return TQ_TYPE_TURBO_KV_3B;
8383
if (strcmp(s, "turbo_kv_4b") == 0) return TQ_TYPE_TURBO_KV_4B;
84+
if (strcmp(s, "turbo_kv_5b") == 0) return TQ_TYPE_TURBO_KV_5B;
8485
if (strcmp(s, "turbo_kv_1b") == 0) return TQ_TYPE_TURBO_KV_1B;
8586
if (strcmp(s, "qjl_1b") == 0) return TQ_TYPE_QJL_1B;
8687
if (strcmp(s, "mixed_4b8") == 0) return TQ_TYPE_MIXED_4B8;

0 commit comments

Comments
 (0)