
Commit 4aa8c3e

Issue/253: (1) Refactor attention KV cache quantization into layers/kv_quant.cpp; (2) update kv_cache_dtype handling; (3) update Python test scripts
1 parent a2a2dac commit 4aa8c3e

20 files changed

Lines changed: 164 additions & 80 deletions

csrc/cache/kv_cache.cpp

Lines changed: 21 additions & 15 deletions
@@ -2,6 +2,7 @@
 
 #include "../utils.hpp"
 #include "infinicore/ops.hpp"
+#include <iostream>
 #include <stdexcept>
 
 namespace infinilm::cache {
@@ -22,11 +23,8 @@ StaticKVCacheConfig::StaticKVCacheConfig(
     std::string kv_cache_dtype)
     : max_batch_size_(_max_batch_size),
       max_cache_len_(_max_cache_len) {
-    if (kv_cache_dtype.empty()) {
-        kv_cache_dtype_set_ = false;
-    } else {
-        this->kv_cache_dtype_ = parse_dtype(kv_cache_dtype);
-        kv_cache_dtype_set_ = true;
+    if (!kv_cache_dtype.empty()) {
+        this->kv_cache_dtype_ = std::make_optional(parse_dtype(kv_cache_dtype));
     }
 }
 
@@ -130,11 +128,14 @@ StaticKVCache::update(size_t layer_idx,
 
 infinicore::DataType
 StaticKVCacheConfig::kv_cache_dtype() const {
-    return kv_cache_dtype_;
+    return kv_cache_dtype_.value();
 }
-
-void StaticKVCacheConfig::set_kv_cache_dtype(infinicore::DataType dtype) const {
-    kv_cache_dtype_ = dtype;
+void StaticKVCacheConfig::set_kv_cache_dtype(infinicore::DataType dtype) {
+    if (!this->kv_cache_dtype_.has_value()) {
+        this->kv_cache_dtype_ = std::make_optional(dtype);
+    } else {
+        return;
+    }
 }
 
 // ==========================
@@ -145,9 +146,10 @@ PagedKVCacheConfig::PagedKVCacheConfig(
     std::string kv_cache_dtype,
     size_t block_size)
     : num_blocks_(num_blocks),
-      block_size_(block_size),
-      kv_cache_dtype_(parse_dtype(kv_cache_dtype)) {
-    kv_cache_dtype_set_ = true;
+      block_size_(block_size) {
+    if (!kv_cache_dtype.empty()) {
+        this->kv_cache_dtype_ = std::make_optional(parse_dtype(kv_cache_dtype));
+    }
 }
 
 PagedKVCacheConfig::PagedKVCacheConfig(
@@ -174,11 +176,15 @@ PagedKVCacheConfig::block_size() const {
 
 infinicore::DataType
 PagedKVCacheConfig::kv_cache_dtype() const {
-    return kv_cache_dtype_;
+    return kv_cache_dtype_.value();
 }
 
-void PagedKVCacheConfig::set_kv_cache_dtype(infinicore::DataType dtype) const {
-    kv_cache_dtype_ = dtype;
+void PagedKVCacheConfig::set_kv_cache_dtype(infinicore::DataType dtype) {
+    if (!this->kv_cache_dtype_.has_value()) {
+        this->kv_cache_dtype_ = std::make_optional(dtype);
+    } else {
+        return;
+    }
 }
 
 // ==========================

csrc/cache/kv_cache.hpp

Lines changed: 5 additions & 8 deletions
@@ -11,6 +11,7 @@
 #include <limits>
 #include <memory>
 #include <numeric>
+#include <optional>
 #include <stdexcept>
 #include <utility>
 
@@ -33,15 +34,13 @@ class StaticKVCacheConfig final : public CacheConfig {
     infinicore::Size max_cache_len() const;
 
     infinicore::DataType kv_cache_dtype() const;
-    void set_kv_cache_dtype(infinicore::DataType dtype) const;
-    bool kv_cache_dtype_is_set() const { return kv_cache_dtype_set_; }
+    void set_kv_cache_dtype(infinicore::DataType dtype);
 
 private:
     infinicore::Size max_batch_size_;
     infinicore::Size max_cache_len_;
 
-    bool kv_cache_dtype_set_ = false;
-    mutable infinicore::DataType kv_cache_dtype_;
+    std::optional<infinicore::DataType> kv_cache_dtype_ = std::nullopt;
 };
 
 class StaticKVCache final : public Cache {
@@ -109,15 +108,13 @@ class PagedKVCacheConfig final : public CacheConfig {
     size_t num_blocks() const;
     size_t block_size() const;
     infinicore::DataType kv_cache_dtype() const;
-    void set_kv_cache_dtype(infinicore::DataType dtype) const;
-    bool kv_cache_dtype_set() const { return kv_cache_dtype_set_; }
+    void set_kv_cache_dtype(infinicore::DataType dtype);
 
 private:
     size_t num_blocks_;
     size_t block_size_;
 
-    bool kv_cache_dtype_set_ = false;
-    mutable infinicore::DataType kv_cache_dtype_;
+    std::optional<infinicore::DataType> kv_cache_dtype_ = std::nullopt;
 };
 
 class PagedKVCache final : public Cache {
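
Note: the two kv_cache files above replace the bool kv_cache_dtype_set_ flag plus mutable member with a set-once std::optional. The resulting behavior reduces to a small standalone pattern, sketched below with illustrative stand-in names (not the project's API): the first assignment wins, later assignments are ignored, and reading before anything is set makes optional::value() throw std::bad_optional_access.

#include <iostream>
#include <optional>
#include <string>

// Illustrative stand-in for the config classes above: the dtype is held in a
// std::optional and only the first assignment takes effect.
class DtypeHolder {
public:
    void set_dtype(std::string dtype) {
        if (!dtype_.has_value()) { // later calls are silently ignored
            dtype_ = std::move(dtype);
        }
    }

    const std::string &dtype() const {
        return dtype_.value(); // throws std::bad_optional_access if never set
    }

private:
    std::optional<std::string> dtype_;
};

int main() {
    DtypeHolder cfg;
    cfg.set_dtype("int8");
    cfg.set_dtype("float16");         // ignored: dtype is already set
    std::cout << cfg.dtype() << "\n"; // prints "int8"
}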

csrc/engine/infer_engine.cpp

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ const distributed::DistConfig &InferEngine::get_dist_config() const {
 //------------------------------------------------------
 // reset_cache (overloaded with CacheConfig)
 //------------------------------------------------------
-void InferEngine::reset_cache(const cache::CacheConfig *new_config) {
+void InferEngine::reset_cache(cache::CacheConfig *new_config) {
     for (auto &worker : workers_) {
         worker->reset_cache(new_config);
     }

csrc/engine/infer_engine.hpp

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ class InferEngine {
 
     void compile();
 
-    void reset_cache(const cache::CacheConfig *new_config);
+    void reset_cache( cache::CacheConfig *new_config);
 
     ~InferEngine();
 
csrc/engine/rank_worker.cpp

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ void RankWorker::wait() {
     }
 }
 
-void RankWorker::reset_cache(const cache::CacheConfig *new_config) {
+void RankWorker::reset_cache(cache::CacheConfig *new_config) {
     std::lock_guard<std::mutex> lock(mutex_);
     if (should_exit_) {
         throw std::runtime_error("RankWorker is closing; cannot reset_cache");

csrc/engine/rank_worker.hpp

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ class RankWorker {
     void run(const Input &args);
 
     // Reset the internal cache with a new configuration
-    void reset_cache(const cache::CacheConfig *new_config);
+    void reset_cache(cache::CacheConfig *new_config);
 
     // Compile the model graph if enabled.
     void compile();

csrc/layers/kv_quant.cpp

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+#include "kv_quant.hpp"
+#include "infinicore/ops/per_tensor_dequant_i8.hpp"
+#include "infinicore/ops/per_tensor_quant_i8.hpp"
+
+namespace infinilm {
+
+void KVQuantUtils::quantize(
+    infinicore::Tensor &k,
+    infinicore::Tensor &v,
+    infinicore::quantization::KVQuantAlgo algo,
+    const infinicore::Tensor &k_scale,
+    const infinicore::Tensor &v_scale) {
+
+    if (algo == infinicore::quantization::KVQuantAlgo::NONE) {
+        return;
+    }
+
+    auto device = k->device();
+    auto dtype = k->dtype();
+    auto zero_point = infinicore::Tensor::zeros({1}, dtype, device);
+
+    k = infinicore::op::per_tensor_quant_i8(k, k_scale, zero_point, true);
+    v = infinicore::op::per_tensor_quant_i8(v, v_scale, zero_point, true);
+}
+
+void KVQuantUtils::dequantize(
+    infinicore::Tensor &k,
+    infinicore::Tensor &v,
+    infinicore::quantization::KVQuantAlgo algo,
+    const infinicore::Tensor &k_scale,
+    const infinicore::Tensor &v_scale,
+    const infinicore::Tensor &reference) {
+
+    if (algo == infinicore::quantization::KVQuantAlgo::NONE) {
+        return; // no dequantization needed
+    }
+
+    auto zero_point = infinicore::Tensor::zeros({1}, reference->dtype(), reference->device());
+
+    auto k_dequant = infinicore::Tensor::strided_empty(
+        k->shape(), k->strides(), reference->dtype(), reference->device());
+    auto v_dequant = infinicore::Tensor::strided_empty(
+        v->shape(), v->strides(), reference->dtype(), reference->device());
+
+    infinicore::op::per_tensor_dequant_i8_(k_dequant, k, k_scale, zero_point);
+    infinicore::op::per_tensor_dequant_i8_(v_dequant, v, v_scale, zero_point);
+
+    k = std::move(k_dequant);
+    v = std::move(v_dequant);
+}
+
+} // namespace infinilm
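
The numerics of per_tensor_quant_i8 / per_tensor_dequant_i8_ live in infinicore and are not part of this commit. The standalone sketch below only illustrates the per-tensor INT8 round trip that such ops typically perform (one scale and one zero point for the whole tensor); it is an assumption for orientation, not the library's documented behavior.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical scalar versions of a per-tensor INT8 quant/dequant pair:
// q = clamp(round(x / scale) + zero_point, -128, 127); x' = (q - zero_point) * scale
int8_t quant_i8(float x, float scale, int zero_point) {
    int q = static_cast<int>(std::lround(x / scale)) + zero_point;
    return static_cast<int8_t>(std::clamp(q, -128, 127));
}

float dequant_i8(int8_t q, float scale, int zero_point) {
    return (static_cast<int>(q) - zero_point) * scale;
}

int main() {
    std::vector<float> k = {0.02f, -0.75f, 1.30f};
    float scale = 0.01f;  // one scale shared by the whole tensor ("per tensor")
    int zero_point = 0;   // the commit passes an all-zero tensor as the zero point

    for (float x : k) {
        int8_t q = quant_i8(x, scale, zero_point);
        // 1.30 saturates at q = 127 and comes back as 1.27
        std::printf("%6.2f -> %4d -> %6.2f\n", x, q, dequant_i8(q, scale, zero_point));
    }
}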

csrc/layers/kv_quant.hpp

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "infinicore/quantization.hpp"
+#include "infinicore/tensor.hpp"
+#include <utility>
+
+namespace infinilm {
+
+class KVQuantUtils {
+public:
+    /**
+     * @brief Quantize K/V (before writing to the cache); modifies k and v in place
+     * @param k original K tensor
+     * @param v original V tensor
+     * @param algo quantization algorithm
+     * @param k_scale scale for K
+     * @param v_scale scale for V
+     */
+    static void quantize(
+        infinicore::Tensor &k,
+        infinicore::Tensor &v,
+        infinicore::quantization::KVQuantAlgo algo,
+        const infinicore::Tensor &k_scale,
+        const infinicore::Tensor &v_scale);
+
+    /**
+     * @brief Dequantize K/V (after reading from the cache); modifies k and v in place
+     * @param k quantized K tensor
+     * @param v quantized V tensor
+     * @param algo quantization algorithm
+     * @param k_scale scale for K
+     * @param v_scale scale for V
+     * @param reference reference tensor (provides the target dtype/device)
+     */
+    static void dequantize(
+        infinicore::Tensor &k,
+        infinicore::Tensor &v,
+        infinicore::quantization::KVQuantAlgo algo,
+        const infinicore::Tensor &k_scale,
+        const infinicore::Tensor &v_scale,
+        const infinicore::Tensor &reference);
+};
+
+} // namespace infinilm
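
Taken together, the intended call pattern is to quantize K/V just before they enter the KV cache and dequantize just after they are read back, which is what the llama_attention.cpp hunks below do. A rough call-site sketch follows; the function and its tensor arguments are placeholders, not the actual LlamaAttention code.

#include "kv_quant.hpp"

// Hypothetical call site: k, v, k_scale, v_scale and reference are assumed to be
// valid infinicore::Tensor handles prepared by the attention layer.
void write_then_read(infinicore::Tensor &k, infinicore::Tensor &v,
                     const infinicore::Tensor &k_scale,
                     const infinicore::Tensor &v_scale,
                     const infinicore::Tensor &reference) {
    using infinicore::quantization::KVQuantAlgo;

    // Before writing to the KV cache: k/v are replaced by their INT8 forms.
    infinilm::KVQuantUtils::quantize(k, v, KVQuantAlgo::INT8, k_scale, v_scale);

    // ... k and v would be stored in, then fetched from, the cache here ...

    // After reading from the cache: k/v are replaced by dequantized copies that
    // take their dtype/device from `reference` (e.g. the query tensor).
    infinilm::KVQuantUtils::dequantize(k, v, KVQuantAlgo::INT8,
                                       k_scale, v_scale, reference);
}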

csrc/models/infinilm_model.hpp

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ class InfinilmModel : public infinicore::nn::Module {
     virtual ~InfinilmModel() = default;
     virtual Output forward(const Input &input) const = 0;
 
-    virtual void reset_cache(const cache::CacheConfig *cache_config) = 0;
+    virtual void reset_cache(cache::CacheConfig *cache_config) = 0;
     virtual const cache::CacheConfig *get_cache_config() const = 0;
 };
 } // namespace infinilm

csrc/models/llama/llama_attention.cpp

Lines changed: 12 additions & 24 deletions
@@ -198,16 +198,11 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta
     rotary_emb_->forward(q_rope, q_reshaped, pos_ids_for_rope); // [bs, seq_len, n_q_head, head_dim]
     rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true); // [bs, seq_len, n_kv_head, head_dim]
 
-    switch (this->model_config_->get_kv_quant_scheme()) {
-    case (infinicore::quantization::KVQuantAlgo::INT8): {
-        k_reshaped = infinicore::op::per_tensor_quant_i8(k_reshaped, this->kv_cache_k_scale(), infinicore::Tensor::zeros({1}, k_reshaped->dtype(), k_reshaped->device()), true);
-        v_reshaped = infinicore::op::per_tensor_quant_i8(v_reshaped, this->kv_cache_v_scale(), infinicore::Tensor::zeros({1}, k_reshaped->dtype(), k_reshaped->device()), true);
-        break;
-    }
-    default: {
-        break;
-    }
-    }
+    infinilm::KVQuantUtils::quantize(
+        k_reshaped, v_reshaped,
+        this->model_config_->get_kv_quant_scheme(),
+        this->kv_cache_k_scale(),
+        this->kv_cache_v_scale());
 
     // 5. Prepare KV caches
     // Convert to [batch, n_head, seq_len, head_dim] for cache
@@ -238,20 +233,13 @@
     } else {
         size_t total_seq_len = reinterpret_cast<int32_t *>(total_sequence_lengths.value()->to(infinicore::Device::cpu())->data())[0];
 
-        switch (this->model_config_->get_kv_quant_scheme()) {
-        case (infinicore::quantization::KVQuantAlgo::INT8): {
-            auto k_total_dequant = infinicore::Tensor::strided_empty(k_total->shape(), k_total->strides(), q_reshaped->dtype(), q_reshaped->device());
-            auto v_total_dequant = infinicore::Tensor::strided_empty(v_total->shape(), v_total->strides(), q_reshaped->dtype(), q_reshaped->device());
-            infinicore::op::per_tensor_dequant_i8_(k_total_dequant, k_total, this->kv_cache_k_scale(), infinicore::Tensor::zeros({1}, k_reshaped->dtype(), k_reshaped->device()));
-            infinicore::op::per_tensor_dequant_i8_(v_total_dequant, v_total, this->kv_cache_v_scale(), infinicore::Tensor::zeros({1}, k_reshaped->dtype(), k_reshaped->device()));
-            k_total = k_total_dequant;
-            v_total = v_total_dequant;
-            break;
-        }
-        default: {
-            break;
-        }
-        }
+        infinilm::KVQuantUtils::dequantize(
+            k_total, v_total,
+            this->model_config_->get_kv_quant_scheme(),
+            this->kv_cache_k_scale(),
+            this->kv_cache_v_scale(),
+            q_reshaped);
+
         k_total = k_total->narrow({{2, 0, total_seq_len}}); // [bs, n_kv_head, total_seq_len, head_dim]
         v_total = v_total->narrow({{2, 0, total_seq_len}}); // [bs, n_kv_head, total_seq_len, head_dim]
 
