issue/253 refine static kv cache init

PanZezhong1725 · PanZezhong1725 · commit 51d0a81533c8 · 2026-03-23T01:08:52.000Z
diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp
@@ -10,22 +10,13 @@ namespace infinilm::cache {
 // StaticKVCacheConfig
 // ==========================
 
-StaticKVCacheConfig::StaticKVCacheConfig(
-    infinicore::Size _max_batch_size,
-    infinicore::Size _max_cache_len)
-    : max_batch_size_(_max_batch_size),
-      max_cache_len_(_max_cache_len) {
-}
-
 StaticKVCacheConfig::StaticKVCacheConfig(
     infinicore::Size _max_batch_size,
     infinicore::Size _max_cache_len,
-    std::string kv_cache_dtype)
+    std::optional<infinicore::DataType> kv_cache_dtype)
     : max_batch_size_(_max_batch_size),
-      max_cache_len_(_max_cache_len) {
-    if (!kv_cache_dtype.empty()) {
-        this->kv_cache_dtype_ = std::make_optional(parse_dtype(kv_cache_dtype));
-    }
+      max_cache_len_(_max_cache_len),
+      kv_cache_dtype_(kv_cache_dtype) {
 }
 
 std::unique_ptr<CacheConfig>
@@ -143,20 +134,11 @@ void StaticKVCacheConfig::set_kv_cache_dtype(infinicore::DataType dtype) {
 // ==========================
 PagedKVCacheConfig::PagedKVCacheConfig(
     size_t num_blocks,
-    std::string kv_cache_dtype,
-    size_t block_size)
-    : num_blocks_(num_blocks),
-      block_size_(block_size) {
-    if (!kv_cache_dtype.empty()) {
-        this->kv_cache_dtype_ = std::make_optional(parse_dtype(kv_cache_dtype));
-    }
-}
-
-PagedKVCacheConfig::PagedKVCacheConfig(
-    size_t num_blocks,
-    size_t block_size)
+    size_t block_size,
+    std::optional<infinicore::DataType> kv_cache_dtype)
     : num_blocks_(num_blocks),
-      block_size_(block_size) {
+      block_size_(block_size),
+      kv_cache_dtype_(kv_cache_dtype) {
 }
 
 std::unique_ptr<CacheConfig>
diff --git a/csrc/cache/kv_cache.hpp b/csrc/cache/kv_cache.hpp
@@ -22,12 +22,8 @@ class StaticKVCacheConfig final : public CacheConfig {
 public:
     StaticKVCacheConfig(
         infinicore::Size _max_batch_size = 1,
-        infinicore::Size _max_cache_len = std::numeric_limits<infinicore::Size>::max());
-
-    StaticKVCacheConfig(
-        infinicore::Size _max_batch_size,
-        infinicore::Size _max_cache_len,
-        std::string kv_cache_dtype);
+        infinicore::Size _max_cache_len = std::numeric_limits<infinicore::Size>::max(),
+        std::optional<infinicore::DataType> kv_cache_dtype = std::nullopt);
 
     std::unique_ptr<CacheConfig> unique_copy() const override;
     infinicore::Size max_batch_size() const;
@@ -40,7 +36,7 @@ class StaticKVCacheConfig final : public CacheConfig {
     infinicore::Size max_batch_size_;
     infinicore::Size max_cache_len_;
 
-    std::optional<infinicore::DataType> kv_cache_dtype_ = std::nullopt;
+    std::optional<infinicore::DataType> kv_cache_dtype_;
 };
 
 class StaticKVCache final : public Cache {
@@ -97,12 +93,8 @@ class PagedKVCacheConfig final : public CacheConfig {
 public:
     PagedKVCacheConfig(
         size_t num_blocks,
-        size_t block_size = 256);
-
-    PagedKVCacheConfig(
-        size_t num_blocks,
-        std::string kv_cache_dtype,
-        size_t block_size = 16);
+        size_t block_size = 256,
+        std::optional<infinicore::DataType> kv_cache_dtype = std::nullopt);
 
     std::unique_ptr<CacheConfig> unique_copy() const override;
     size_t num_blocks() const;
diff --git a/csrc/config/model_config.hpp b/csrc/config/model_config.hpp
@@ -64,7 +64,7 @@ class ModelConfig {
     infinicore::DataType get_dtype() const;
     infinicore::quantization::QuantScheme get_quant_scheme() const;
     std::shared_ptr<infinicore::nn::RoPE::ScalingConfig> get_rope_scaling() const;
-    void set_kv_quant_scheme(std::string kv_cache_dtype) {
+    void set_kv_quant_scheme(infinicore::DataType kv_cache_dtype) {
         this->quant_config.set_kv_quant_scheme(kv_cache_dtype);
     }
     infinicore::quantization::KVQuantAlgo get_kv_quant_scheme() const {
diff --git a/csrc/config/quant_config.hpp b/csrc/config/quant_config.hpp
@@ -23,27 +23,21 @@ class QuantConfig {
         }
     }
 
-    void set_kv_quant_scheme(std::string kv_cache_dtype) {
-        if (kv_cache_dtype.empty()) {
-            this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;
-            // spdlog::debug("kv_cache_dtype is empty, using default NONE");
-            return;
-        }
-
+    void set_kv_quant_scheme(infinicore::DataType kv_cache_dtype) {
         try {
-            switch (parse_dtype(kv_cache_dtype)) {
+            switch (kv_cache_dtype) {
             case infinicore::DataType::I8: {
                 this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::INT8;
                 break;
             }
             default: {
-                spdlog::warn("Unsupported kv_cache_dtype: '{}', fallback to NONE", kv_cache_dtype);
+                spdlog::warn("Unsupported kv_cache_dtype: '{}', fallback to NONE", infinicore::toString(kv_cache_dtype));
                 this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;
                 break;
             }
             }
         } catch (const std::exception &e) {
-            spdlog::error("Failed to parse kv_cache_dtype '{}': {}", kv_cache_dtype, e.what());
+            spdlog::error("Failed to parse kv_cache_dtype '{}': {}", infinicore::toString(kv_cache_dtype), e.what());
             this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;
         }
     }
diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp
@@ -56,7 +56,7 @@ InferEngine::InferEngine(
     const cache::CacheConfig *cache_config,
     bool enable_graph_compiling,
     backends::AttentionBackend attention_backend,
-    const std::string &kv_cache_dtype) // Changed parameter
+    std::optional<infinicore::DataType> kv_cache_dtype) // Changed parameter
     : communication_group_(distributed_config, device_type), attention_backend_(attention_backend) {
     if (cache_config != nullptr) {
         cache_config_ = cache_config->unique_copy();
@@ -65,7 +65,9 @@ InferEngine::InferEngine(
     // Load model config if model_path is provided, model_path must be valid, and config.json exists
     this->model_config_ = std::make_shared<infinilm::config::ModelConfig>(model_path + "/config.json");
     // Only support offline int8 kv cache quantization in this version
-    this->model_config_->set_kv_quant_scheme(kv_cache_dtype);
+    if (kv_cache_dtype.has_value()) {
+        this->model_config_->set_kv_quant_scheme(kv_cache_dtype.value());
+    }
     // Create one RankWorker per rank
     int world_size = communication_group_.get_world_size();
     barrier_ = std::make_unique<RankBarrier>((size_t)world_size);
diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp
@@ -47,7 +47,7 @@ class InferEngine {
         const cache::CacheConfig *cache_config = nullptr,
         bool enable_graph_compiling = false,
         backends::AttentionBackend attention_backend = backends::AttentionBackend::Default,
-        const std::string &kv_cache_dtype = "");
+        std::optional<infinicore::DataType> kv_cache_dtype = std::nullopt);
 
     // Load a parameter to all workers (each can extract its shard inside RankWorker)
     void load_param(const std::string &name, const infinicore::Tensor &param);
diff --git a/csrc/pybind11/cache/cache.hpp b/csrc/pybind11/cache/cache.hpp
@@ -19,14 +19,10 @@ inline void bind_cache(py::module &m) {
                infinilm::cache::CacheConfig,
                std::shared_ptr<infinilm::cache::StaticKVCacheConfig>>(m, "StaticKVCacheConfig")
         .def(
-            py::init<infinicore::Size, infinicore::Size>(),
-            py::arg("max_batch_size") = 1,
-            py::arg("max_cache_len") = std::numeric_limits<infinicore::Size>::max())
-        .def(
-            py::init<infinicore::Size, infinicore::Size, std::string>(),
+            py::init<infinicore::Size, infinicore::Size, std::optional<infinicore::DataType>>(),
             py::arg("max_batch_size") = 1,
             py::arg("max_cache_len") = std::numeric_limits<infinicore::Size>::max(),
-            py::arg("kv_cache_dtype"))
+            py::arg("kv_cache_dtype") = std::nullopt)
         .def(
             "max_batch_size",
             &infinilm::cache::StaticKVCacheConfig::max_batch_size)
@@ -43,14 +39,10 @@ inline void bind_cache(py::module &m) {
                infinilm::cache::CacheConfig,
                std::shared_ptr<infinilm::cache::PagedKVCacheConfig>>(m, "PagedKVCacheConfig")
         .def(
-            py::init<size_t, size_t>(),
-            py::arg("num_blocks"),
-            py::arg("block_size") = 256)
-        .def(
-            py::init<size_t, std::string, size_t>(),
+            py::init<size_t, size_t, std::optional<infinicore::DataType>>(),
             py::arg("num_blocks"),
-            py::arg("kv_cache_dtype"),
-            py::arg("block_size") = 16)
+            py::arg("block_size") = 256,
+            py::arg("kv_cache_dtype") = std::nullopt)
         .def(
             "num_blocks",
             &infinilm::cache::PagedKVCacheConfig::num_blocks)
diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp
@@ -66,8 +66,10 @@ inline void bind_infer_engine(py::module &m) {
             }
             return state_dict_tp_all;
         })
-        .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
-        .def("reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
+        .def(
+            "forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
+        .def(
+            "reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
         .def("get_cache_config", [](const InferEngine &self) -> std::shared_ptr<cache::CacheConfig> {
             auto cfg = self.get_cache_config();
             return cfg ? std::shared_ptr<cache::CacheConfig>(cfg->unique_copy()) : nullptr; })
@@ -81,7 +83,7 @@ inline void bind_infer_engine(py::module &m) {
                           std::shared_ptr<infinilm::cache::CacheConfig> cache_cfg,
                           bool enable_graph_compiling,
                           const std::string &attention_backend,
-                          const std::string &kv_cache_dtype) {
+                          std::optional<infinicore::DataType> kv_cache_dtype) {
                  return std::make_shared<InferEngine>(
                      model_path,
                      dist,
@@ -97,7 +99,7 @@ inline void bind_infer_engine(py::module &m) {
              py::arg("cache_config") = py::none(),
              py::arg("enable_graph_compiling") = false,
              py::arg("attention_backend") = "default",
-             py::arg("kv_cache_dtype") = "")
+             py::arg("kv_cache_dtype") = py::none())
         .def("load_param", &InferEngine::load_param,
              py::arg("name"), py::arg("param"),
              "Load a parameter tensor into all workers (each worker picks its shard)")
@@ -112,8 +114,10 @@ inline void bind_infer_engine(py::module &m) {
             }
             return state_dict_tp_all;
         })
-        .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
-        .def("reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
+        .def(
+            "forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
+        .def(
+            "reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
         .def("get_cache_config", [](const InferEngine &self) {
             auto cfg = self.get_cache_config();
             return std::shared_ptr<cache::CacheConfig>(std::move(cfg->unique_copy())); })
diff --git a/examples/bench.py b/examples/bench.py
@@ -262,8 +262,8 @@ def get_args():
     parser.add_argument(
         "--kv-cache-dtype",
         type=str,
-        default="",
-        choices=["", "int8"],
+        default=None,
+        choices=["int8"],
     )
 
     return parser.parse_args()
@@ -305,7 +305,7 @@ def __init__(
             cache_config=cache_config,
             enable_graph_compiling=enable_graph,
             attention_backend=attn_backend,
-            kv_cache_dtype=args.kv_cache_dtype
+            kv_cache_dtype=args.kv_cache_dtype,
         )
 
         # ---------------------------------------------------------------------------- #
@@ -544,7 +544,9 @@ def run(
             initial_capacity = input_len + output_len
             test.model.reset_cache(
                 StaticKVCacheConfig(
-                    max_batch_size=batch_size, max_cache_len=initial_capacity, kv_cache_dtype=args.kv_cache_dtype
+                    max_batch_size=batch_size,
+                    max_cache_len=initial_capacity,
+                    kv_cache_dtype=args.kv_cache_dtype,
                 )
             )
 
diff --git a/python/infinilm/cache/cache.py b/python/infinilm/cache/cache.py
@@ -1,4 +1,6 @@
 from infinilm.lib import _infinilm
+import infinicore
+from ..modeling_utils import parse_dtype
 
 
 class CacheConfig(_infinilm.CacheConfig):
@@ -9,22 +11,45 @@ def __init__(self):
 
 
 class StaticKVCacheConfig(CacheConfig, _infinilm.StaticKVCacheConfig):
-    def __init__(self, max_batch_size: int = 1, max_cache_len: int = 0, kv_cache_dtype: str | None = None):
-        if kv_cache_dtype is None:
-            _infinilm.StaticKVCacheConfig.__init__(self, max_batch_size, max_cache_len)
+    def __init__(
+        self,
+        max_batch_size: int = 1,
+        max_cache_len: int = 0,
+        kv_cache_dtype=None,
+    ):
+        if isinstance(kv_cache_dtype, str):
+            _infinilm.StaticKVCacheConfig.__init__(
+                self,
+                max_batch_size,
+                max_cache_len,
+                parse_dtype(kv_cache_dtype)._underlying,
+            )
+        elif isinstance(kv_cache_dtype, infinicore.dtype):
+            _infinilm.StaticKVCacheConfig.__init__(
+                self, max_batch_size, max_cache_len, kv_cache_dtype._underlying
+            )
         else:
-            _infinilm.StaticKVCacheConfig.__init__(self, max_batch_size, max_cache_len, kv_cache_dtype)
+            _infinilm.StaticKVCacheConfig.__init__(
+                self, max_batch_size, max_cache_len, kv_cache_dtype
+            )
+
 
 class PagedKVCacheConfig(CacheConfig, _infinilm.PagedKVCacheConfig):
     def __init__(
         self,
         num_blocks: int,
         block_size: int = 256,
-        kv_cache_dtype: str | None = None,
+        kv_cache_dtype=None,
     ):
-        if kv_cache_dtype is None:
-            _infinilm.PagedKVCacheConfig.__init__(self, num_blocks, block_size)
+        if isinstance(kv_cache_dtype, str):
+            _infinilm.PagedKVCacheConfig.__init__(
+                self, num_blocks, block_size, parse_dtype(kv_cache_dtype)._underlying
+            )
+        elif isinstance(kv_cache_dtype, infinicore.dtype):
+            _infinilm.PagedKVCacheConfig.__init__(
+                self, num_blocks, block_size, kv_cache_dtype._underlying
+            )
         else:
             _infinilm.PagedKVCacheConfig.__init__(
-                self, num_blocks, kv_cache_dtype, block_size
+                self, num_blocks, block_size, kv_cache_dtype
             )
diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py
@@ -8,6 +8,8 @@
 from infinilm.distributed import DistConfig
 from infinilm.lib import _infinilm
 
+from .modeling_utils import parse_dtype
+
 
 @dataclass
 class GenerationConfig:
@@ -30,7 +32,7 @@ def __init__(
         cache_config=None,
         enable_graph_compiling=False,
         attention_backend="default",
-        kv_cache_dtype="",
+        kv_cache_dtype=None,
     ):
         self.config = AutoConfig.from_pretrained(model_path)
 
@@ -44,7 +46,9 @@ def __init__(
             cache_config,
             enable_graph_compiling,
             attention_backend,
-            kv_cache_dtype,
+            parse_dtype(kv_cache_dtype)._underlying
+            if kv_cache_dtype is not None
+            else None,
         )
         self.use_cache = False
 
diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py
@@ -7,6 +7,24 @@
 from tqdm import tqdm
 import infinicore
 
+
+def parse_dtype(dtype_str: str):
+    """Resolve a dtype name to the matching ``infinicore`` dtype.
+
+    An ``infinicore.dtype`` instance is returned unchanged so callers may pass
+    either form. Raises ``ValueError`` for any other input.
+    """
+    names = ("float32", "float16", "bfloat16", "int8", "int32", "int64")
+    if isinstance(dtype_str, str):
+        if dtype_str in names:
+            # Every supported name doubles as the attribute name on infinicore.
+            return getattr(infinicore, dtype_str)
+    elif isinstance(dtype_str, infinicore.dtype):
+        # Already a dtype object: hand it back untouched.
+        return dtype_str
+    raise ValueError(f"Unknown dtype string: {dtype_str}")
+
+
 str_to_torch_dtype = {
     "BOOL": torch.bool,
     "U8": torch.uint8,

Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@ class ModelConfig {`
`64`	`64`	`infinicore::DataType get_dtype() const;`
`65`	`65`	`infinicore::quantization::QuantScheme get_quant_scheme() const;`
`66`	`66`	`std::shared_ptr<infinicore::nn::RoPE::ScalingConfig> get_rope_scaling() const;`
`67`		`- void set_kv_quant_scheme(std::string kv_cache_dtype) {`
	`67`	`+ void set_kv_quant_scheme(infinicore::DataType kv_cache_dtype) {`
`68`	`68`	`this->quant_config.set_kv_quant_scheme(kv_cache_dtype);`
`69`	`69`	`}`
`70`	`70`	`infinicore::quantization::KVQuantAlgo get_kv_quant_scheme() const {`
Original file line number	Diff line number	Diff line change
`@@ -23,27 +23,21 @@ class QuantConfig {`
`23`	`23`	`}`
`24`	`24`	`}`
`25`	`25`
`26`		`- void set_kv_quant_scheme(std::string kv_cache_dtype) {`
`27`		`- if (kv_cache_dtype.empty()) {`
`28`		`- this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;`
`29`		`- // spdlog::debug("kv_cache_dtype is empty, using default NONE");`
`30`		`- return;`
`31`		`- }`
`32`		`-`
	`26`	`+ void set_kv_quant_scheme(infinicore::DataType kv_cache_dtype) {`
`33`	`27`	`try {`
`34`		`- switch (parse_dtype(kv_cache_dtype)) {`
	`28`	`+ switch (kv_cache_dtype) {`
`35`	`29`	`case infinicore::DataType::I8: {`
`36`	`30`	`this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::INT8;`
`37`	`31`	`break;`
`38`	`32`	`}`
`39`	`33`	`default: {`
`40`		`- spdlog::warn("Unsupported kv_cache_dtype: '{}', fallback to NONE", kv_cache_dtype);`
	`34`	`+ spdlog::warn("Unsupported kv_cache_dtype: '{}', fallback to NONE", infinicore::toString(kv_cache_dtype));`
`41`	`35`	`this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;`
`42`	`36`	`break;`
`43`	`37`	`}`
`44`	`38`	`}`
`45`	`39`	`} catch (const std::exception &e) {`
`46`		`- spdlog::error("Failed to parse kv_cache_dtype '{}': {}", kv_cache_dtype, e.what());`
	`40`	`+ spdlog::error("Failed to parse kv_cache_dtype '{}': {}", infinicore::toString(kv_cache_dtype), e.what());`
`47`	`41`	`this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;`
`48`	`42`	`}`
`49`	`43`	`}`