Commit a2a2dac

Issue/253: Support offline int8 inference with calibrated models
1 parent: 1fc301f

15 files changed

Lines changed: 169 additions & 30 deletions

csrc/cache/kv_cache.cpp

Lines changed: 24 additions & 2 deletions
@@ -16,6 +16,20 @@ StaticKVCacheConfig::StaticKVCacheConfig(
       max_cache_len_(_max_cache_len) {
 }
 
+StaticKVCacheConfig::StaticKVCacheConfig(
+    infinicore::Size _max_batch_size,
+    infinicore::Size _max_cache_len,
+    std::string kv_cache_dtype)
+    : max_batch_size_(_max_batch_size),
+      max_cache_len_(_max_cache_len) {
+    if (kv_cache_dtype.empty()) {
+        kv_cache_dtype_set_ = false;
+    } else {
+        this->kv_cache_dtype_ = parse_dtype(kv_cache_dtype);
+        kv_cache_dtype_set_ = true;
+    }
+}
+
 std::unique_ptr<CacheConfig>
 StaticKVCacheConfig::unique_copy() const {
     return std::make_unique<StaticKVCacheConfig>(*this);
@@ -42,7 +56,6 @@ StaticKVCache::StaticKVCache(
     infinicore::Size num_v_heads,
     infinicore::Size num_layers,
     infinicore::Size max_positional_embedding,
-    infinicore::DataType dtype,
     const StaticKVCacheConfig &config,
     const engine::distributed::RankInfo &rank_info)
     : Cache(),
@@ -53,7 +66,7 @@ StaticKVCache::StaticKVCache(
       rank_batch_size_(config.max_batch_size()),
       cache_len_(config.max_cache_len() == std::numeric_limits<infinicore::Size>::max() || config.max_cache_len() == 0 ? max_positional_embedding : config.max_cache_len()),
       rank_num_layers_(num_layers),
-      dtype_(dtype) {
+      dtype_(config.kv_cache_dtype()) {
 
     // Allocate K cache
     k_caches_ = infinicore::Tensor::empty(
@@ -115,6 +128,15 @@ StaticKVCache::update(size_t layer_idx,
     return {k_cache_layer, v_cache_layer};
 }
 
+infinicore::DataType
+StaticKVCacheConfig::kv_cache_dtype() const {
+    return kv_cache_dtype_;
+}
+
+void StaticKVCacheConfig::set_kv_cache_dtype(infinicore::DataType dtype) const {
+    kv_cache_dtype_ = dtype;
+}
+
 // ==========================
 // PagedKVCacheConfig
 // ==========================
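
For orientation, here is a hedged usage sketch of the new three-argument constructor. The batch and cache sizes are illustrative, and it assumes parse_dtype maps "int8" to infinicore::DataType::I8, as the QuantConfig change below implies.

// Illustrative values only; not part of this commit.
infinilm::cache::StaticKVCacheConfig fp_cfg(/*max_batch_size=*/1, /*max_cache_len=*/4096);
// fp_cfg.kv_cache_dtype_is_set() == false; LlamaModel::reset_cache later backfills the model dtype.

infinilm::cache::StaticKVCacheConfig int8_cfg(/*max_batch_size=*/1, /*max_cache_len=*/4096, "int8");
// int8_cfg.kv_cache_dtype_is_set() == true; StaticKVCache allocates its K/V buffers in that dtype.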

csrc/cache/kv_cache.hpp

Lines changed: 12 additions & 1 deletion
@@ -23,13 +23,25 @@ class StaticKVCacheConfig final : public CacheConfig {
         infinicore::Size _max_batch_size = 1,
         infinicore::Size _max_cache_len = std::numeric_limits<infinicore::Size>::max());
 
+    StaticKVCacheConfig(
+        infinicore::Size _max_batch_size,
+        infinicore::Size _max_cache_len,
+        std::string kv_cache_dtype);
+
     std::unique_ptr<CacheConfig> unique_copy() const override;
     infinicore::Size max_batch_size() const;
     infinicore::Size max_cache_len() const;
 
+    infinicore::DataType kv_cache_dtype() const;
+    void set_kv_cache_dtype(infinicore::DataType dtype) const;
+    bool kv_cache_dtype_is_set() const { return kv_cache_dtype_set_; }
+
 private:
     infinicore::Size max_batch_size_;
     infinicore::Size max_cache_len_;
+
+    bool kv_cache_dtype_set_ = false;
+    mutable infinicore::DataType kv_cache_dtype_;
 };
 
 class StaticKVCache final : public Cache {
@@ -42,7 +54,6 @@ class StaticKVCache final : public Cache {
         infinicore::Size num_v_heads,
         infinicore::Size num_layers,
         infinicore::Size max_positional_embedding,
-        infinicore::DataType dtype,
         const StaticKVCacheConfig &config,
         const engine::distributed::RankInfo &rank_info);
 
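The combination of a mutable member and a const setter is deliberate: LlamaModel::reset_cache (changed later in this commit) receives the config through a const reference and still has to backfill the model dtype when the caller never picked one. A minimal sketch of that pattern, with a hypothetical helper name:

// Hypothetical helper; kv_cache_dtype_ is mutable, so the default can be
// backfilled on a config that is otherwise treated as read-only.
void backfill_kv_cache_dtype(const infinilm::cache::StaticKVCacheConfig &cfg,
                             infinicore::DataType model_dtype) {
    if (!cfg.kv_cache_dtype_is_set()) {
        cfg.set_kv_cache_dtype(model_dtype);
    }
}
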
csrc/config/model_config.hpp

Lines changed: 8 additions & 0 deletions
@@ -64,6 +64,14 @@ class ModelConfig {
     infinicore::DataType get_dtype() const;
     infinicore::quantization::QuantScheme get_quant_scheme() const;
     std::shared_ptr<infinicore::nn::RoPE::ScalingConfig> get_rope_scaling() const;
+    void set_kv_quant_scheme(std::string kv_cache_dtype) {
+        if (kv_cache_dtype == "int8") {
+            this->quant_config.set_kv_quant_scheme(kv_cache_dtype);
+        }
+    }
+    infinicore::quantization::KVQuantAlgo get_kv_quant_scheme() const {
+        return quant_config.get_kv_quant_scheme();
+    }
 
 private:
     nlohmann::json config_json;

csrc/config/quant_config.hpp

Lines changed: 19 additions & 1 deletion
@@ -1,5 +1,5 @@
 #pragma once
-// #include "../quantization/quantization.hpp"
+#include "../utils.hpp"
 #include "infinicore/quantization.hpp"
 #include "nlohmann/json.hpp"
 
@@ -22,9 +22,27 @@ class QuantConfig {
         }
     }
 
+    void set_kv_quant_scheme(std::string kv_cache_dtype) {
+        switch (parse_dtype(kv_cache_dtype)) {
+        case infinicore::DataType::I8: {
+            this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::INT8;
+            break;
+        }
+        default: {
+            this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;
+            break;
+        }
+        }
+    }
+
+    infinicore::quantization::KVQuantAlgo get_kv_quant_scheme() const {
+        return kv_quant_scheme;
+    }
+
 private:
     nlohmann::json quantization_config;
     std::shared_ptr<infinicore::quantization::BaseQuantization> quantization_method;
+    infinicore::quantization::KVQuantAlgo kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE;
 };
 
 } // namespace infinilm::config
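
Together with the ModelConfig change above, this threads the engine-level dtype string down to an enum the attention layers can branch on. A hedged sketch of that chain; the config path is a placeholder, and ModelConfig is assumed constructible from a config.json path, as InferEngine does below:

infinilm::config::ModelConfig cfg("/path/to/model/config.json"); // placeholder path
// Default: KVQuantAlgo::NONE (member initializer in QuantConfig).
cfg.set_kv_quant_scheme("float16"); // ignored: ModelConfig only forwards the exact string "int8"
cfg.set_kv_quant_scheme("int8");    // QuantConfig parses it via parse_dtype to DataType::I8
// cfg.get_kv_quant_scheme() == infinicore::quantization::KVQuantAlgo::INT8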

csrc/engine/infer_engine.cpp

Lines changed: 4 additions & 1 deletion
@@ -55,14 +55,17 @@ InferEngine::InferEngine(
     infinicore::Device::Type device_type,
     const cache::CacheConfig *cache_config,
     bool enable_graph_compiling,
-    backends::AttentionBackend attention_backend) // Changed parameter
+    backends::AttentionBackend attention_backend,
+    const std::string &kv_cache_dtype) // Changed parameter
     : communication_group_(distributed_config, device_type), attention_backend_(attention_backend) {
     if (cache_config != nullptr) {
         cache_config_ = cache_config->unique_copy();
     }
 
     // Load model config if model_path is provided, model_path must be valid, and config.json exists
     this->model_config_ = std::make_shared<infinilm::config::ModelConfig>(model_path + "/config.json");
+    // Only support offline int8 kv cache quantization in this version
+    this->model_config_->set_kv_quant_scheme(kv_cache_dtype);
     // Create one RankWorker per rank
     int world_size = communication_group_.get_world_size();
     barrier_ = std::make_unique<RankBarrier>((size_t)world_size);

csrc/engine/infer_engine.hpp

Lines changed: 2 additions & 1 deletion
@@ -46,7 +46,8 @@ class InferEngine {
         infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
         const cache::CacheConfig *cache_config = nullptr,
         bool enable_graph_compiling = false,
-        backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
+        backends::AttentionBackend attention_backend = backends::AttentionBackend::Default,
+        const std::string &kv_cache_dtype = "");
 
     // Load a parameter to all workers (each can extract its shard inside RankWorker)
     void load_param(const std::string &name, const infinicore::Tensor &param);

csrc/models/llama/llama_attention.cpp

Lines changed: 43 additions & 4 deletions
@@ -7,10 +7,13 @@
 #include "infinicore/ops/mha_kvcache.hpp"
 #include "infinicore/ops/mha_varlen.hpp"
 #include "infinicore/ops/mul.hpp"
+#include "infinicore/ops/per_tensor_dequant_i8.hpp"
+#include "infinicore/ops/per_tensor_quant_i8.hpp"
 
 #include <algorithm>
 #include <cmath>
 #include <cstring>
+#include <iostream>
 #include <optional>
 #include <spdlog/spdlog.h>
 #include <stdexcept>
@@ -137,6 +140,17 @@ LlamaAttention::LlamaAttention(std::shared_ptr<infinilm::config::ModelConfig> mo
         INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get<double>("rms_norm_eps"), dtype, device);
         INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get<double>("rms_norm_eps"), dtype, device);
     }
+
+    switch (this->model_config_->get_kv_quant_scheme()) {
+    case (infinicore::quantization::KVQuantAlgo::INT8): {
+        INFINICORE_NN_PARAMETER_INIT(kv_cache_k_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1));
+        INFINICORE_NN_PARAMETER_INIT(kv_cache_v_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1));
+        break;
+    }
+    default: {
+        break;
+    }
+    }
 }
 
 infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_states,
@@ -184,6 +198,17 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta
     rotary_emb_->forward(q_rope, q_reshaped, pos_ids_for_rope); // [bs, seq_len, n_q_head, head_dim]
     rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true);   // [bs, seq_len, n_kv_head, head_dim]
 
+    switch (this->model_config_->get_kv_quant_scheme()) {
+    case (infinicore::quantization::KVQuantAlgo::INT8): {
+        k_reshaped = infinicore::op::per_tensor_quant_i8(k_reshaped, this->kv_cache_k_scale(), infinicore::Tensor::zeros({1}, k_reshaped->dtype(), k_reshaped->device()), true);
+        v_reshaped = infinicore::op::per_tensor_quant_i8(v_reshaped, this->kv_cache_v_scale(), infinicore::Tensor::zeros({1}, k_reshaped->dtype(), k_reshaped->device()), true);
+        break;
+    }
+    default: {
+        break;
+    }
+    }
+
     // 5. Prepare KV caches
     // Convert to [batch, n_head, seq_len, head_dim] for cache
     // Ensure contiguous after permute for F16 compatibility with cache operations
@@ -212,6 +237,21 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta
             ->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); // [bs, seq_len, n_q_head * head_dim]
     } else {
         size_t total_seq_len = reinterpret_cast<int32_t *>(total_sequence_lengths.value()->to(infinicore::Device::cpu())->data())[0];
+
+        switch (this->model_config_->get_kv_quant_scheme()) {
+        case (infinicore::quantization::KVQuantAlgo::INT8): {
+            auto k_total_dequant = infinicore::Tensor::strided_empty(k_total->shape(), k_total->strides(), q_reshaped->dtype(), q_reshaped->device());
+            auto v_total_dequant = infinicore::Tensor::strided_empty(v_total->shape(), v_total->strides(), q_reshaped->dtype(), q_reshaped->device());
+            infinicore::op::per_tensor_dequant_i8_(k_total_dequant, k_total, this->kv_cache_k_scale(), infinicore::Tensor::zeros({1}, k_reshaped->dtype(), k_reshaped->device()));
+            infinicore::op::per_tensor_dequant_i8_(v_total_dequant, v_total, this->kv_cache_v_scale(), infinicore::Tensor::zeros({1}, k_reshaped->dtype(), k_reshaped->device()));
+            k_total = k_total_dequant;
+            v_total = v_total_dequant;
+            break;
+        }
+        default: {
+            break;
+        }
+        }
         k_total = k_total->narrow({{2, 0, total_seq_len}}); // [bs, n_kv_head, total_seq_len, head_dim]
         v_total = v_total->narrow({{2, 0, total_seq_len}}); // [bs, n_kv_head, total_seq_len, head_dim]
 
@@ -342,10 +382,10 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
     auto q_for_fa = q_reshaped->view({seq_len, 1, num_attention_heads_, head_dim_});
     auto attn_out_4d = infinicore::op::mha_kvcache(
         q_for_fa,
-        k_total->permute({0, 2, 1, 3}), // [num_blocks, block_size, num_kv_heads, head_dim]
+        k_total->permute({0, 2, 1, 3}),  // [num_blocks, block_size, num_kv_heads, head_dim]
         v_total->permute({0, 2, 1, 3}),
-        total_sequence_lengths.value(), // [seq_len] int32 (one entry per sequence)
-        block_tables.value(), // [seq_len, max_num_blocks_per_seq] int32
+        total_sequence_lengths.value(),  // [seq_len] int32 (one entry per sequence)
+        block_tables.value(),            // [seq_len, max_num_blocks_per_seq] int32
         std::nullopt,
         scaling_);
     attn_output = attn_out_4d->view({seq_len, num_attention_heads_, head_dim_});
@@ -361,7 +401,6 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
             scaling_);
     }
     }
-
 
     // 7. Project output
     attn_output
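
The kernels behind per_tensor_quant_i8 and per_tensor_dequant_i8_ are not part of this diff. As a mental model only (an assumption, not the InfiniCore implementation), the calls above amount to symmetric per-tensor int8 quantization with an offline-calibrated scale and a zero point of 0; the zeros tensor passed alongside the scale plays the zero-point role.

// Assumed numerics, plain C++ for illustration; the scale comes from offline
// calibration (kv_cache_k_scale / kv_cache_v_scale), nothing is measured at runtime.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<int8_t> quant_per_tensor_i8(const std::vector<float> &x, float scale) {
    std::vector<int8_t> q(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        float v = std::round(x[i] / scale);                         // map onto the integer grid
        q[i] = static_cast<int8_t>(std::clamp(v, -128.0f, 127.0f)); // saturate to the int8 range
    }
    return q;
}

std::vector<float> dequant_per_tensor_i8(const std::vector<int8_t> &q, float scale) {
    std::vector<float> x(q.size());
    for (size_t i = 0; i < q.size(); ++i) {
        x[i] = static_cast<float>(q[i]) * scale; // approximate reconstruction
    }
    return x;
}

In the decode branch above, the cache therefore holds int8 K/V, and the dequantized copies (k_total_dequant / v_total_dequant) are materialized in the query dtype just before attention.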

csrc/models/llama/llama_attention.hpp

Lines changed: 11 additions & 0 deletions
@@ -112,6 +112,13 @@ class LlamaAttention : public infinicore::nn::Module {
         std::optional<infinicore::Tensor> block_tables,
         std::optional<infinicore::Tensor> slot_mapping) const;
 
+    infinicore::Tensor kv_cache_k_scale() const {
+        return kv_cache_k_scale_;
+    }
+    infinicore::Tensor kv_cache_v_scale() const {
+        return kv_cache_v_scale_;
+    }
+
 protected:
     // Projection layers
     INFINICORE_NN_MODULE(infinilm::layers::QKVParallelLinear, qkv_proj);
@@ -123,6 +130,10 @@ class LlamaAttention : public infinicore::nn::Module {
     // Shared Rotary Position Embeddings (RoPE)
     std::shared_ptr<infinicore::nn::RoPE> rotary_emb_;
 
+    // For off-line kv cache quantization
+    INFINICORE_NN_PARAMETER(kv_cache_k_scale);
+    INFINICORE_NN_PARAMETER(kv_cache_v_scale);
+
 private:
     std::shared_ptr<infinilm::config::ModelConfig> model_config_ = std::make_shared<infinilm::config::ModelConfig>();
     size_t layer_idx_; // Layer index for cache access

csrc/models/llama/llama_model.cpp

Lines changed: 3 additions & 2 deletions
@@ -136,7 +136,6 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) {
             config_.num_key_value_heads,
             config_.num_hidden_layers,
             config_.max_position_embeddings,
-            config_.dtype,
             *kv_cache_config,
             rank_info_);
     } else if (auto paged_kv_cache_config = dynamic_cast<const cache::PagedKVCacheConfig *>(cache_config);
@@ -150,14 +149,16 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) {
             *paged_kv_cache_config,
             rank_info_);
     } else if (auto kv_cache_config = dynamic_cast<const cache::StaticKVCacheConfig *>(cache_config)) {
+        if (!kv_cache_config->kv_cache_dtype_is_set()) {
+            kv_cache_config->set_kv_cache_dtype(model_config_->get_dtype());
+        }
         kv_cache_ = std::make_shared<cache::StaticKVCache>(
             model_config_->get_head_dim(),
             model_config_->get_head_dim(),
             model_config_->get<size_t>("num_key_value_heads"),
             model_config_->get<size_t>("num_key_value_heads"),
             model_config_->get<size_t>("num_hidden_layers"),
             model_config_->get<size_t>("max_position_embeddings"),
-            model_config_->get_dtype(),
             *kv_cache_config,
             rank_info_);
     } else if (auto paged_kv_cache_config = dynamic_cast<const cache::PagedKVCacheConfig *>(cache_config)) {

csrc/pybind11/cache/cache.hpp

Lines changed: 7 additions & 0 deletions
@@ -22,12 +22,19 @@ inline void bind_cache(py::module &m) {
             py::init<infinicore::Size, infinicore::Size>(),
             py::arg("max_batch_size") = 1,
             py::arg("max_cache_len") = std::numeric_limits<infinicore::Size>::max())
+        .def(
+            py::init<infinicore::Size, infinicore::Size, std::string>(),
+            py::arg("max_batch_size") = 1,
+            py::arg("max_cache_len") = std::numeric_limits<infinicore::Size>::max(),
+            py::arg("kv_cache_dtype"))
         .def(
             "max_batch_size",
             &infinilm::cache::StaticKVCacheConfig::max_batch_size)
         .def(
             "max_cache_len",
             &infinilm::cache::StaticKVCacheConfig::max_cache_len)
+        .def("kv_cache_dtype",
+             &infinilm::cache::StaticKVCacheConfig::kv_cache_dtype)
         .def("__repr__", [](const infinilm::cache::StaticKVCacheConfig &) {
             return "<StaticKVCacheConfig>";
         });
