Skip to content

Commit d3be4cc

Browse files
committed
Issue/253: feat: support custom KV cache dtype for quantization
1 parent 6ae4832 commit d3be4cc

8 files changed

Lines changed: 74 additions & 28 deletions

File tree

csrc/cache/kv_cache.cpp

Lines changed: 20 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -118,6 +118,16 @@ StaticKVCache::update(size_t layer_idx,
118118
// ==========================
119119
// PagedKVCacheConfig
120120
// ==========================
121+
PagedKVCacheConfig::PagedKVCacheConfig(
122+
size_t num_blocks,
123+
std::string kv_cache_dtype,
124+
size_t block_size)
125+
: num_blocks_(num_blocks),
126+
block_size_(block_size),
127+
kv_cache_dtype_(parse_dtype(kv_cache_dtype)) {
128+
kv_cache_dtype_set_ = true;
129+
}
130+
121131
PagedKVCacheConfig::PagedKVCacheConfig(
122132
size_t num_blocks,
123133
size_t block_size)
@@ -140,6 +150,15 @@ PagedKVCacheConfig::block_size() const {
140150
return block_size_;
141151
}
142152

153+
infinicore::DataType
154+
PagedKVCacheConfig::kv_cache_dtype() const {
155+
return kv_cache_dtype_;
156+
}
157+
158+
void PagedKVCacheConfig::set_kv_cache_dtype(infinicore::DataType dtype) const {
159+
kv_cache_dtype_ = dtype;
160+
}
161+
143162
// ==========================
144163
// PagedKVCache
145164
// ==========================
@@ -149,7 +168,6 @@ PagedKVCache::PagedKVCache(
149168
infinicore::Size num_k_heads,
150169
infinicore::Size num_v_heads,
151170
infinicore::Size num_layers,
152-
infinicore::DataType dtype,
153171
const PagedKVCacheConfig &config,
154172
const engine::distributed::RankInfo &rank_info)
155173
: Cache(),
@@ -158,7 +176,7 @@ PagedKVCache::PagedKVCache(
158176
num_rank_k_heads_(num_k_heads / rank_info.tp_size),
159177
num_rank_v_heads_(num_v_heads / rank_info.tp_size),
160178
rank_num_layers_(num_layers),
161-
dtype_(dtype),
179+
dtype_(config.kv_cache_dtype()),
162180
num_blocks_per_layer_(config.num_blocks()),
163181
block_size_(config.block_size()) {
164182
// [num_layers, num_blocks, num_rank_k_heads, block_size, k_dim]

csrc/cache/kv_cache.hpp

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
#include "base_cache.hpp"
44

5+
#include "../utils.hpp"
56
#include "infinicore/context/context.hpp"
67
#include "infinicore/device.hpp"
78
#include "infinicore/tensor.hpp"
@@ -88,13 +89,24 @@ class PagedKVCacheConfig final : public CacheConfig {
8889
size_t num_blocks,
8990
size_t block_size = 16);
9091

92+
PagedKVCacheConfig(
93+
size_t num_blocks,
94+
std::string kv_cache_dtype,
95+
size_t block_size = 16);
96+
9197
std::unique_ptr<CacheConfig> unique_copy() const override;
9298
size_t num_blocks() const;
9399
size_t block_size() const;
100+
infinicore::DataType kv_cache_dtype() const;
101+
void set_kv_cache_dtype(infinicore::DataType dtype) const;
102+
bool kv_cache_dtype_set() const { return kv_cache_dtype_set_; }
94103

95104
private:
96105
size_t num_blocks_;
97106
size_t block_size_;
107+
108+
bool kv_cache_dtype_set_ = false;
109+
mutable infinicore::DataType kv_cache_dtype_;
98110
};
99111

100112
class PagedKVCache final : public Cache {
@@ -106,7 +118,6 @@ class PagedKVCache final : public Cache {
106118
infinicore::Size num_k_heads,
107119
infinicore::Size num_v_heads,
108120
infinicore::Size num_layers,
109-
infinicore::DataType dtype,
110121
const PagedKVCacheConfig &config,
111122
const engine::distributed::RankInfo &rank_info);
112123

csrc/config/model_config.cpp

Lines changed: 3 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -66,23 +66,8 @@ ModelConfig::get_rope_scaling() const {
6666
}
6767
}
6868

69-
infinicore::DataType
70-
ModelConfig::get_dtype() const {
71-
try {
72-
std::string dtype_str = this->get<std::string>("torch_dtype");
73-
if (dtype_str == "float32") {
74-
return infinicore::DataType::F32;
75-
} else if (dtype_str == "float16") {
76-
return infinicore::DataType::F16;
77-
} else if (dtype_str == "bfloat16") {
78-
return infinicore::DataType::BF16;
79-
} else if (dtype_str == "int8") {
80-
return infinicore::DataType::I8;
81-
} else {
82-
throw std::runtime_error("Unsupported dtype string: " + dtype_str);
83-
}
84-
} catch (const std::exception &e) {
85-
throw std::runtime_error("Error getting dtype from config: " + std::string(e.what()));
86-
}
69+
infinicore::DataType ModelConfig::get_dtype() const {
70+
std::string dtype_str = this->get<std::string>("torch_dtype");
71+
return parse_dtype(dtype_str);
8772
}
8873
} // namespace infinilm::config

csrc/config/model_config.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
#pragma once
22

3+
#include "../utils.hpp"
34
#include "infinicore/nn/rope.hpp"
45
#include "infinicore/ops.hpp"
56
#include "quant_config.hpp"

csrc/models/llama/llama_model.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) {
143143
config_.num_key_value_heads,
144144
config_.num_key_value_heads,
145145
config_.num_hidden_layers,
146-
config_.dtype,
147146
*paged_kv_cache_config,
148147
rank_info_);
149148
} else if (auto kv_cache_config = dynamic_cast<const cache::StaticKVCacheConfig *>(cache_config)) {
@@ -158,13 +157,15 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) {
158157
*kv_cache_config,
159158
rank_info_);
160159
} else if (auto paged_kv_cache_config = dynamic_cast<const cache::PagedKVCacheConfig *>(cache_config)) {
160+
if (!paged_kv_cache_config->kv_cache_dtype_set()) {
161+
paged_kv_cache_config->set_kv_cache_dtype(model_config_->get_dtype());
162+
}
161163
kv_cache_ = std::make_shared<cache::PagedKVCache>(
162164
model_config_->get_head_dim(),
163165
model_config_->get_head_dim(),
164166
model_config_->get<size_t>("num_key_value_heads"),
165167
model_config_->get<size_t>("num_key_value_heads"),
166168
model_config_->get<size_t>("num_hidden_layers"),
167-
model_config_->get_dtype(),
168169
*paged_kv_cache_config,
169170
rank_info_);
170171
} else {

csrc/pybind11/cache/cache.hpp

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
#include "../../cache/cache.hpp"
2+
#include "infinicore/dtype.hpp"
23
#include "infinicore/tensor.hpp"
34
#include <pybind11/pybind11.h>
45
#include <pybind11/stl.h>
@@ -38,12 +39,19 @@ inline void bind_cache(py::module &m) {
3839
py::init<size_t, size_t>(),
3940
py::arg("num_blocks"),
4041
py::arg("block_size") = 16)
42+
.def(
43+
py::init<size_t, std::string, size_t>(),
44+
py::arg("num_blocks"),
45+
py::arg("kv_cache_dtype"),
46+
py::arg("block_size") = 16)
4147
.def(
4248
"num_blocks",
4349
&infinilm::cache::PagedKVCacheConfig::num_blocks)
4450
.def(
4551
"block_size",
4652
&infinilm::cache::PagedKVCacheConfig::block_size)
53+
.def("kv_cache_dtype",
54+
&infinilm::cache::PagedKVCacheConfig::kv_cache_dtype)
4755
.def("__repr__", [](const infinilm::cache::PagedKVCacheConfig &) {
4856
return "<PagedKVCacheConfig>";
4957
});

csrc/utils.hpp

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
#pragma once
2+
#include <infinicore/dtype.hpp>
23
#include <infinirt.h>
34

45
#include <cstring>
@@ -123,3 +124,22 @@ inline uint16_t f32_to_bf16(float val) {
123124
inline void hash_combine(size_t &seed, size_t value) {
124125
seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
125126
}
127+
128+
inline infinicore::DataType parse_dtype(const std::string &dtype_str) {
129+
static const std::unordered_map<std::string, infinicore::DataType> dtype_map = {
130+
{"float32", infinicore::DataType::F32},
131+
{"float16", infinicore::DataType::F16},
132+
{"bfloat16", infinicore::DataType::BF16},
133+
{"int8", infinicore::DataType::I8},
134+
// extend as needed
135+
{"int32", infinicore::DataType::I32},
136+
{"int64", infinicore::DataType::I64},
137+
};
138+
139+
auto it = dtype_map.find(dtype_str);
140+
if (it != dtype_map.end()) {
141+
return it->second;
142+
}
143+
144+
throw std::runtime_error("Unsupported dtype string: " + dtype_str);
145+
}

python/infinilm/cache/cache.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -18,9 +18,11 @@ def __init__(
1818
self,
1919
num_blocks: int,
2020
block_size: int = 16,
21+
kv_cache_dtype: str | None = None,
2122
):
22-
_infinilm.PagedKVCacheConfig.__init__(
23-
self,
24-
num_blocks,
25-
block_size,
26-
)
23+
if kv_cache_dtype is None:
24+
_infinilm.PagedKVCacheConfig.__init__(self, num_blocks, block_size)
25+
else:
26+
_infinilm.PagedKVCacheConfig.__init__(
27+
self, num_blocks, kv_cache_dtype, block_size
28+
)

0 commit comments

Comments (0)