From 0bf9d33d3eec53c57b43235604092bd115e8d7d6 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Tue, 12 May 2026 02:37:50 +0000 Subject: [PATCH] refactor: unify linear/quantization architecture and remove deprecated interfaces - Move linear module from InfiniCore to InfiniLM with quantization-based dispatch - Add GPTQ->GPTQ_QY weight conversion gated by QY device type - Implement fused linear weight splitting and re-registration - Fix TP split dimensions for all quantization schemes - Add alpha scaling parameter and logical dim size delegation - Move set_zeros/set_minus_one to utils.hpp as shared utilities --- csrc/cache/kv_cache.cpp | 4 + csrc/config/model_config.cpp | 6 +- csrc/config/model_config.hpp | 6 +- csrc/config/quant_config.cpp | 15 +- csrc/config/quant_config.hpp | 20 +- csrc/engine/compiler/paged_compiler.cpp | 16 +- csrc/engine/infer_engine.cpp | 43 -- csrc/engine/infer_engine.hpp | 22 - csrc/engine/rank_worker.cpp | 60 +-- csrc/engine/rank_worker.hpp | 8 - csrc/layers/attention/attention.cpp | 88 ++-- csrc/layers/attention/attention.hpp | 16 +- .../layers/attention/backends/static_attn.cpp | 4 +- .../layers/attention/backends/static_attn.hpp | 2 +- .../causal_lm_templates/text_causal_lm.hpp | 4 +- .../text_decoder_layer.hpp | 8 +- .../layers/causal_lm_templates/text_model.hpp | 6 +- csrc/layers/linear/base_linear.cpp | 133 +++++++ csrc/layers/linear/base_linear.hpp | 70 ++++ csrc/layers/linear/fused_linear.cpp | 376 +++--------------- csrc/layers/linear/fused_linear.hpp | 269 ++----------- csrc/layers/linear/linear.cpp | 100 +++++ csrc/layers/linear/linear.hpp | 90 ++++- csrc/layers/mlp/mlp.cpp | 49 +-- csrc/layers/mlp/mlp.hpp | 8 +- csrc/layers/mlp/moe_mlp.cpp | 8 +- csrc/layers/mlp/moe_mlp.hpp | 6 +- csrc/layers/quantization/awq.cpp | 96 +++++ csrc/layers/quantization/awq.hpp | 48 +++ .../layers/quantization/base_quantization.hpp | 122 ++++++ .../quantization/compressed_tensors.cpp | 76 ++++ .../quantization/compressed_tensors.hpp | 35 ++ csrc/layers/quantization/gptq.cpp | 91 +++++ csrc/layers/quantization/gptq.hpp | 48 +++ csrc/layers/quantization/gptq_qy.cpp | 259 ++++++++++++ csrc/layers/quantization/gptq_qy.hpp | 139 +++++++ csrc/layers/quantization/kv_quant.cpp | 8 +- csrc/layers/quantization/kv_quant.hpp | 6 +- .../layers/quantization/none_quantization.cpp | 65 +++ .../layers/quantization/none_quantization.hpp | 35 ++ csrc/layers/quantization/quantization.hpp | 9 + .../quantization/quantization_scheme.hpp | 18 + csrc/models/infinilm_model.cpp | 17 +- csrc/models/infinilm_model.hpp | 2 +- csrc/models/llama_legacy/llama_attention.cpp | 119 +----- csrc/models/llama_legacy/llama_attention.hpp | 32 +- .../llama_legacy/llama_decoder_layer.cpp | 39 +- .../llama_legacy/llama_decoder_layer.hpp | 27 +- .../llama_legacy/llama_for_causal_lm.cpp | 42 +- .../llama_legacy/llama_for_causal_lm.hpp | 21 +- csrc/models/llama_legacy/llama_mlp.cpp | 73 +--- csrc/models/llama_legacy/llama_mlp.hpp | 22 +- csrc/models/llama_legacy/llama_model.cpp | 87 +--- csrc/models/llama_legacy/llama_model.hpp | 31 +- .../minicpm_sala/minicpm_sala_attention.cpp | 38 +- .../minicpm_sala/minicpm_sala_attention.hpp | 22 +- .../minicpm_sala_decoderLayer.cpp | 6 +- .../minicpm_sala_decoderLayer.hpp | 8 +- .../minicpm_sala_for_causal_lm.cpp | 4 +- .../minicpm_sala_for_causal_lm.hpp | 4 +- csrc/models/model_factory.cpp | 53 --- csrc/models/model_factory.hpp | 29 -- csrc/models/qwen3/qwen3_attention.cpp | 57 +-- csrc/models/qwen3/qwen3_attention.hpp | 12 +- .../qwen3_next/qwen3_next_attention.cpp | 40 +- 
.../qwen3_next/qwen3_next_attention.hpp | 12 +- .../qwen3_next/qwen3_next_decoderLayer.cpp | 10 +- .../qwen3_next/qwen3_next_decoderLayer.hpp | 10 +- .../qwen3_next/qwen3_next_for_causal_lm.cpp | 4 +- .../qwen3_next/qwen3_next_for_causal_lm.hpp | 4 +- .../qwen3_next/qwen3_next_gated_deltanet.cpp | 19 +- .../qwen3_next/qwen3_next_gated_deltanet.hpp | 16 +- .../qwen3_vl_for_conditional_generation.cpp | 7 +- .../qwen3_vl_for_conditional_generation.hpp | 6 +- csrc/pybind11/engine/engine.hpp | 48 +-- csrc/utils.hpp | 12 + 76 files changed, 1832 insertions(+), 1593 deletions(-) create mode 100644 csrc/layers/linear/base_linear.cpp create mode 100644 csrc/layers/linear/base_linear.hpp create mode 100644 csrc/layers/linear/linear.cpp create mode 100644 csrc/layers/quantization/awq.cpp create mode 100644 csrc/layers/quantization/awq.hpp create mode 100644 csrc/layers/quantization/base_quantization.hpp create mode 100644 csrc/layers/quantization/compressed_tensors.cpp create mode 100644 csrc/layers/quantization/compressed_tensors.hpp create mode 100644 csrc/layers/quantization/gptq.cpp create mode 100644 csrc/layers/quantization/gptq.hpp create mode 100644 csrc/layers/quantization/gptq_qy.cpp create mode 100644 csrc/layers/quantization/gptq_qy.hpp create mode 100644 csrc/layers/quantization/none_quantization.cpp create mode 100644 csrc/layers/quantization/none_quantization.hpp create mode 100644 csrc/layers/quantization/quantization.hpp create mode 100644 csrc/layers/quantization/quantization_scheme.hpp diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp index d3bcd170..70a71e00 100644 --- a/csrc/cache/kv_cache.cpp +++ b/csrc/cache/kv_cache.cpp @@ -211,6 +211,7 @@ PagedKVCache::PagedKVCache( k_dim_}, dtype_, rank_info.device); + set_zeros(k_caches_); // [num_layers, num_blocks, num_rank_v_heads, block_size, v_dim] v_caches_ = infinicore::Tensor::empty( @@ -221,6 +222,9 @@ PagedKVCache::PagedKVCache( v_dim_}, dtype_, rank_info.device); + set_zeros(v_caches_); + + infinicore::context::syncStream(); } infinicore::Tensor PagedKVCache::create_layer_kv_cache( diff --git a/csrc/config/model_config.cpp b/csrc/config/model_config.cpp index d037f3c0..e8bb6f5b 100644 --- a/csrc/config/model_config.cpp +++ b/csrc/config/model_config.cpp @@ -16,12 +16,12 @@ ModelConfig::ModelConfig(const std::string &path) { this->quant_config = QuantConfig(config_json["quantization_config"]); } -infinicore::quantization::QuantScheme +infinilm::quantization::QuantScheme ModelConfig::get_quant_scheme() const { - if (quant_config.get_quant_scheme() != infinicore::quantization::QuantScheme::NONE) { + if (quant_config.get_quant_scheme() != infinilm::quantization::QuantScheme::NONE) { return quant_config.get_quant_scheme(); } else { - return infinicore::quantization::QuantScheme::NONE; + return infinilm::quantization::QuantScheme::NONE; } } diff --git a/csrc/config/model_config.hpp b/csrc/config/model_config.hpp index bfa0ed18..3752e218 100644 --- a/csrc/config/model_config.hpp +++ b/csrc/config/model_config.hpp @@ -62,17 +62,17 @@ class ModelConfig { return quant_config; } - std::shared_ptr get_quantization_method() const { + std::shared_ptr get_quantization_method() const { return quant_config.get_quantization_method(); } infinicore::DataType get_dtype() const; - infinicore::quantization::QuantScheme get_quant_scheme() const; + infinilm::quantization::QuantScheme get_quant_scheme() const; std::shared_ptr get_rope_scaling() const; void set_kv_quant_scheme(infinicore::DataType kv_cache_dtype) { 
this->quant_config.set_kv_quant_scheme(kv_cache_dtype); } - infinicore::quantization::KVQuantAlgo get_kv_quant_scheme() const { + infinilm::quantization::KVQuantAlgo get_kv_quant_scheme() const { return quant_config.get_kv_quant_scheme(); } infinicore::DataType get_kv_cache_dtype() const { diff --git a/csrc/config/quant_config.cpp b/csrc/config/quant_config.cpp index c7fdce34..87375d30 100644 --- a/csrc/config/quant_config.cpp +++ b/csrc/config/quant_config.cpp @@ -5,25 +5,24 @@ QuantConfig::QuantConfig(const nlohmann::json &json) : quantization_config(json) this->quantization_method = get_quantization_method(); } -std::shared_ptr +std::shared_ptr QuantConfig::get_quantization_method() const { if (quantization_config.is_null()) { - return std::make_shared(quantization_config); // Default case if no matching scheme + return std::make_shared(quantization_config); // Default case if no matching scheme } // Determine the quantization scheme from the JSON config if (quantization_config["quant_method"] == "compressed-tensors") { - return std::make_shared(quantization_config); + return std::make_shared(quantization_config); } else if (quantization_config["quant_method"] == "awq") { - return std::make_shared(quantization_config); + return std::make_shared(quantization_config); } else if (quantization_config["quant_method"] == "gptq") { - // return std::make_shared(quantization_config); - return std::make_shared(quantization_config); + return std::make_shared(quantization_config); } else { - return std::make_shared(quantization_config); + return std::make_shared(quantization_config); } // Add other schemes as needed - return std::make_shared(quantization_config); // Default case if no matching scheme + return std::make_shared(quantization_config); // Default case if no matching scheme } } // namespace infinilm::config diff --git a/csrc/config/quant_config.hpp b/csrc/config/quant_config.hpp index d0b8cb24..fb0b8abf 100644 --- a/csrc/config/quant_config.hpp +++ b/csrc/config/quant_config.hpp @@ -1,6 +1,6 @@ #pragma once #include "../utils.hpp" -#include "infinicore/quantization.hpp" +#include "../layers/quantization/quantization.hpp" #include "nlohmann/json.hpp" #include #include @@ -14,13 +14,13 @@ class QuantConfig { QuantConfig() = default; QuantConfig(const nlohmann::json &json); - std::shared_ptr get_quantization_method() const; + std::shared_ptr get_quantization_method() const; - infinicore::quantization::QuantScheme get_quant_scheme() const { + infinilm::quantization::QuantScheme get_quant_scheme() const { if (quantization_method != nullptr) { return quantization_method->get_quant_scheme(); } else { - return infinicore::quantization::QuantScheme::NONE; + return infinilm::quantization::QuantScheme::NONE; } } @@ -29,22 +29,22 @@ class QuantConfig { this->kv_cache_dtype_ = std::make_optional(kv_cache_dtype); switch (kv_cache_dtype) { case infinicore::DataType::I8: { - this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::INT8; + this->kv_quant_scheme = infinilm::quantization::KVQuantAlgo::INT8; break; } default: { spdlog::warn("Unsupported kv_cache_dtype: '{}', fallback to NONE", infinicore::toString(kv_cache_dtype)); - this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE; + this->kv_quant_scheme = infinilm::quantization::KVQuantAlgo::NONE; break; } } } catch (const std::exception &e) { spdlog::error("Failed to parse kv_cache_dtype '{}': {}", infinicore::toString(kv_cache_dtype), e.what()); - this->kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE; + 
this->kv_quant_scheme = infinilm::quantization::KVQuantAlgo::NONE; } } - infinicore::quantization::KVQuantAlgo get_kv_quant_scheme() const { + infinilm::quantization::KVQuantAlgo get_kv_quant_scheme() const { return kv_quant_scheme; } @@ -57,9 +57,9 @@ class QuantConfig { private: nlohmann::json quantization_config; - std::shared_ptr quantization_method; + std::shared_ptr quantization_method; - infinicore::quantization::KVQuantAlgo kv_quant_scheme = infinicore::quantization::KVQuantAlgo::NONE; + infinilm::quantization::KVQuantAlgo kv_quant_scheme = infinilm::quantization::KVQuantAlgo::NONE; std::optional kv_cache_dtype_ = std::nullopt; }; diff --git a/csrc/engine/compiler/paged_compiler.cpp b/csrc/engine/compiler/paged_compiler.cpp index 29907d74..1a73c8fd 100644 --- a/csrc/engine/compiler/paged_compiler.cpp +++ b/csrc/engine/compiler/paged_compiler.cpp @@ -1,20 +1,7 @@ #include "paged_compiler.hpp" #include "../../global_state/global_state.hpp" +#include "../../utils.hpp" -namespace { -// Todo: replace with Tensor::zeros when it is available -inline void set_zeros(infinicore::Tensor &tensor) { - std::vector zeros(tensor->nbytes(), 0); - infinicore::context::memcpyH2D(tensor->data(), zeros.data(), tensor->nbytes(), false); -} - -inline void set_minus_one(infinicore::Tensor &tensor) { - // For int32 tensors, 0xFF bytes correspond to -1 in two's complement. - std::vector minus_one(tensor->nbytes(), 0xFF); - infinicore::context::memcpyH2D(tensor->data(), minus_one.data(), tensor->nbytes(), false); -} - -} // namespace namespace infinilm::engine { PagedCompiler::PagedCompiler(const std::shared_ptr &model, RankBarrier *barrier) : GraphCompiler(model, barrier) { @@ -61,7 +48,6 @@ void PagedCompiler::compile() { const size_t block_per_req = nblocks; input.block_tables = block_tables_holder_->as_strided({b, block_per_req}, {(ptrdiff_t)block_per_req, 1}); input.slot_mapping = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice()); - set_zeros(input.slot_mapping.value()); // Attention reads attn_metadata from thread-local forward context. infinilm::global_state::get_forward_context().attn_metadata = { diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index db0dfdd4..2e34c6cc 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -7,49 +7,6 @@ namespace infinilm::engine { //------------------------------------------------------ // Constructor //------------------------------------------------------ -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ -InferEngine::InferEngine( - const InfinilmModel::Config &config, - const distributed::DistConfig &distributed_config, - infinicore::Device::Type device_type, - const cache::CacheConfig *cache_config, - bool enable_graph_compiling, - backends::AttentionBackend attention_backend) // Changed parameter - : communication_group_(distributed_config, device_type), - legacy_model_config_(config), - attention_backend_(attention_backend) { - if (cache_config != nullptr) { - cache_config_ = cache_config->unique_copy(); - } - // Create one RankWorker per rank - int world_size = communication_group_.get_world_size(); - barrier_ = std::make_unique((size_t)world_size); - workers_.reserve(world_size); - for (int r = 0; r < world_size; ++r) { - workers_.emplace_back(std::make_unique( - legacy_model_config_, - communication_group_.get_rank_info(r), - cache_config_ != nullptr ? cache_config_.get() : nullptr, - barrier_.get(), - enable_graph_compiling, - attention_backend_)); - } - - // Compile the model on all workers - this->compile(); -} - InferEngine::InferEngine( const std::string &config_str, const distributed::DistConfig &distributed_config, diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp index e36ec369..4bafffa2 100644 --- a/csrc/engine/infer_engine.hpp +++ b/csrc/engine/infer_engine.hpp @@ -3,7 +3,6 @@ #include "../config/model_config.hpp" #include "../global_state/global_state.hpp" #include "../models/infinilm_model.hpp" -#include "../models/llama_legacy/llama_config.hpp" #include "distributed/distributed.hpp" #include "infinicore/tensor.hpp" #include "rank_barrier.hpp" @@ -21,26 +20,6 @@ class InferEngine { using Output = RankWorker::Output; // Updated constructor: accept CacheConfig instead of CacheType - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ - InferEngine( - const InfinilmModel::Config &config, - const distributed::DistConfig &distributed_config = distributed::DistConfig(), - infinicore::Device::Type device_type = infinicore::context::getDevice().getType(), - const cache::CacheConfig *cache_config = nullptr, - bool enable_graph_compiling = false, - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - InferEngine( const std::string &config_str, const distributed::DistConfig &distributed_config = distributed::DistConfig(), @@ -78,7 +57,6 @@ class InferEngine { std::unique_ptr barrier_; distributed::CommunicationGroup communication_group_; std::unique_ptr cache_config_; - const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config(); std::shared_ptr model_config_; backends::AttentionBackend attention_backend_ = backends::AttentionBackend::Default; }; diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 8a94c441..bc51f422 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -10,46 +10,6 @@ namespace infinilm::engine { -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). 
- * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ -RankWorker::RankWorker(const InfinilmModel::Config &model_config, - const distributed::RankInfo &rank_info, - const cache::CacheConfig *cache_config, - RankBarrier *barrier, - bool enable_graph_compiling, - backends::AttentionBackend attention_backend) - : legacy_model_config_(model_config), - rank_info_(rank_info), - attention_backend_(attention_backend), - enable_graph_compiling_(enable_graph_compiling), - job_cmd_(Command::INIT), - has_job_(false), - job_done_(false), - should_exit_(false), - init_done_(false), - rng_(std::random_device{}()), - barrier_(barrier) { - if (cache_config != nullptr) { - pending_cache_config_ = cache_config->unique_copy(); - } - // start the thread - thread_ = std::thread(&RankWorker::thread_loop, this); - - // Wait until the worker thread finishes initialization (model created) - std::unique_lock lk(mutex_); - cv_.wait(lk, [&] { return init_done_; }); -} - RankWorker::RankWorker( std::shared_ptr infinilm_config, const distributed::RankInfo &rank_info, @@ -269,15 +229,6 @@ void RankWorker::thread_loop() { infinilm::global_state::initialize_infinilm_config(infinilm_config_); // Create model using factory (may be expensive) - if (model_config_ == nullptr) { - // model_ = InfinilmModelFactory::createModel( - // legacy_model_config_, - // rank_info_, - // pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr, - // attention_backend_); - throw std::runtime_error("RankWorker::thread_loop(): the way of creating models using LlamaConfig is no longer supported !!!"); - } - const std::string &model_type = model_config_->get("model_type"); const auto &model_map = models::get_causal_lm_model_map(); auto it = model_map.find(model_type); @@ -287,16 +238,7 @@ void RankWorker::thread_loop() { rank_info_.device, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; - if ((std::find(classic_models.begin(), classic_models.end(), model_type) != classic_models.end())) { - model_ = InfinilmModelFactory::createModel( - model_config_, - rank_info_, - pending_cache_config_ != nullptr ? 
pending_cache_config_.get() : nullptr, - attention_backend_); - } else { - throw std::runtime_error("RankWorker::thread_loop(): Unsupported model config type: " + model_type); - } + throw std::runtime_error("RankWorker::thread_loop(): Unsupported model config type: " + model_type); } if (!model_) { diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp index f6adcf47..027ebc1c 100644 --- a/csrc/engine/rank_worker.hpp +++ b/csrc/engine/rank_worker.hpp @@ -70,13 +70,6 @@ class RankWorker { infinicore::Tensor output_ids; }; - RankWorker(const InfinilmModel::Config &model_config, - const distributed::RankInfo &rank_info, - const cache::CacheConfig *cache_config, - RankBarrier *barrier, - bool enable_graph_compiling, - backends::AttentionBackend attention_backend); - RankWorker(std::shared_ptr infinilm_config, const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, @@ -118,7 +111,6 @@ class RankWorker { private: // Worker properties - const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config(); std::shared_ptr infinilm_config_; std::shared_ptr model_config_; engine::distributed::RankInfo rank_info_; diff --git a/csrc/layers/attention/attention.cpp b/csrc/layers/attention/attention.cpp index 7cadb81b..1b87f6fb 100644 --- a/csrc/layers/attention/attention.cpp +++ b/csrc/layers/attention/attention.cpp @@ -26,52 +26,15 @@ Attention::Attention(std::shared_ptr model_config num_attention_heads_ = total_num_heads / tp_size; num_key_value_heads_ = total_num_kv_heads < tp_size ? 1 : total_num_kv_heads / tp_size; - auto quant_scheme = model_config->get_quant_scheme(); auto quantization_method = model_config->get_quantization_method(); - switch (quant_scheme) { - case infinicore::quantization::QuantScheme::NONE: { - INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads, total_num_kv_heads, - quantization_method, use_bias, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, - use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { - INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads, total_num_kv_heads, - quantization_method, use_bias, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, - use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::AWQ_W4A16: { - INFINILM_QKV_LINEAR_W4A16AWQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads, total_num_kv_heads, - quantization_method, use_bias, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, - use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { - INFINILM_QKV_LINEAR_W4A16GPTQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads, total_num_kv_heads, quantization_method, use_bias, - dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, use_output_bias, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::GPTQ_W4A16: { - - 
INFINILM_QKV_LINEAR_W4A16GPTQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads, total_num_kv_heads, quantization_method, use_bias, - dtype, device, rank_info); - - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, use_output_bias, - dtype, device, tp_rank, tp_size, rank_info.comm); - - break; - } - default: { - throw std::runtime_error("infinilm::layers::attention::Attention: unsupported quantization scheme"); - break; - } - } + auto register_fn = [this](const std::string &n, infinicore::nn::Parameter p) { this->register_parameter(n, std::move(p)); }; + qkv_proj_ = std::make_shared( + hidden_size_, head_dim_, total_num_heads, total_num_kv_heads, + "q_proj", "k_proj", "v_proj", register_fn, + quantization_method, use_bias, dtype, device, rank_info); + o_proj_ = this->register_module( + "o_proj", total_num_heads * head_dim_, hidden_size_, quantization_method, + use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm); rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config, device); @@ -79,21 +42,7 @@ Attention::Attention(std::shared_ptr model_config attn_ = std::make_shared(num_attention_heads_, head_dim_, scaling, num_key_value_heads_, layer_idx_, kv_cache_k_scale_, kv_cache_v_scale_, attention_backend_); - auto kv_quant_scheme = infinilm::global_state::get_infinilm_config().model_config->get_kv_quant_scheme(); - switch (kv_quant_scheme) { - case (infinicore::quantization::KVQuantAlgo::NONE): { - break; - } - case (infinicore::quantization::KVQuantAlgo::INT8): { - INFINICORE_NN_PARAMETER_INIT(kv_cache_k_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1)); - INFINICORE_NN_PARAMETER_INIT(kv_cache_v_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1)); - break; - } - default: { - throw std::runtime_error("infinilm::layers::attention: unsupported kv_quant_scheme"); - break; - } - } + init_kv_cache_quant_params(register_fn, device, kv_cache_k_scale_, kv_cache_v_scale_); } infinicore::Tensor Attention::forward(const infinicore::Tensor &positions, @@ -188,4 +137,23 @@ infinicore::Tensor Attention::forward_paged_(const infinicore::Tensor &position_ return output; } +void init_kv_cache_quant_params(std::function register_fn, + const infinicore::Device &device, + infinicore::nn::Parameter &kv_cache_k_scale, + infinicore::nn::Parameter &kv_cache_v_scale) { + auto kv_quant_scheme = infinilm::global_state::get_infinilm_config().model_config->get_kv_quant_scheme(); + switch (kv_quant_scheme) { + case infinilm::quantization::KVQuantAlgo::NONE: + break; + case infinilm::quantization::KVQuantAlgo::INT8: + kv_cache_k_scale = infinicore::nn::Parameter({1}, infinicore::DataType::F32, device, 0, 0, 1); + register_fn("kv_cache_k_scale", kv_cache_k_scale); + kv_cache_v_scale = infinicore::nn::Parameter({1}, infinicore::DataType::F32, device, 0, 0, 1); + register_fn("kv_cache_v_scale", kv_cache_v_scale); + break; + default: + throw std::runtime_error("unsupported kv_quant_scheme"); + } +} + } // namespace infinilm::layers::attention diff --git a/csrc/layers/attention/attention.hpp b/csrc/layers/attention/attention.hpp index ac46e547..55622393 100644 --- a/csrc/layers/attention/attention.hpp +++ b/csrc/layers/attention/attention.hpp @@ -20,6 +20,10 @@ class Attention : public infinicore::nn::Module { infinicore::Tensor forward(const infinicore::Tensor &positions, const infinicore::Tensor &hidden_states) const; + void process_fused_weights_after_loading() { + 
qkv_proj_->process_weights_after_loading(); + } + size_t layer_idx() const { return layer_idx_; } size_t num_heads() const { return num_attention_heads_; } size_t num_kv_heads() const { return num_key_value_heads_; } @@ -34,8 +38,8 @@ class Attention : public infinicore::nn::Module { const infinicore::Tensor &hidden_states) const; protected: - INFINICORE_NN_MODULE(infinilm::layers::linear::QKVParallelLinear, qkv_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::RowParallelLinear, o_proj); + std::shared_ptr qkv_proj_; + std::shared_ptr o_proj_; std::shared_ptr rotary_emb_; std::shared_ptr attn_; @@ -47,7 +51,11 @@ class Attention : public infinicore::nn::Module { size_t head_dim_; // For off-line kv cache quantization - INFINICORE_NN_PARAMETER(kv_cache_k_scale); - INFINICORE_NN_PARAMETER(kv_cache_v_scale); + infinicore::nn::Parameter kv_cache_k_scale_; + infinicore::nn::Parameter kv_cache_v_scale_; }; +void init_kv_cache_quant_params(std::function register_fn, + const infinicore::Device &device, + infinicore::nn::Parameter &kv_cache_k_scale, + infinicore::nn::Parameter &kv_cache_v_scale); } // namespace infinilm::layers::attention diff --git a/csrc/layers/attention/backends/static_attn.cpp b/csrc/layers/attention/backends/static_attn.cpp index 3f779a54..2d1b7e11 100644 --- a/csrc/layers/attention/backends/static_attn.cpp +++ b/csrc/layers/attention/backends/static_attn.cpp @@ -30,7 +30,7 @@ infinicore::Tensor StaticAttentionImpl::forward(const AttentionLayer &layer, auto k_scale = layer.get_k_scale(); auto v_scale = layer.get_v_scale(); - if (infinicore::quantization::KVQuantAlgo::NONE != this->kv_quant_scheme_) { + if (infinilm::quantization::KVQuantAlgo::NONE != this->kv_quant_scheme_) { infinilm::KVQuantUtils::quantize( k_reshaped, v_reshaped, this->kv_quant_scheme_, @@ -65,7 +65,7 @@ infinicore::Tensor StaticAttentionImpl::forward(const AttentionLayer &layer, } else { size_t total_seq_len = reinterpret_cast(total_sequence_lengths.value()->to(infinicore::Device::cpu())->data())[0]; - if (infinicore::quantization::KVQuantAlgo::NONE != this->kv_quant_scheme_) { + if (infinilm::quantization::KVQuantAlgo::NONE != this->kv_quant_scheme_) { infinilm::KVQuantUtils::dequantize( k_total, v_total, this->kv_quant_scheme_, diff --git a/csrc/layers/attention/backends/static_attn.hpp b/csrc/layers/attention/backends/static_attn.hpp index 7c453f00..849d8792 100644 --- a/csrc/layers/attention/backends/static_attn.hpp +++ b/csrc/layers/attention/backends/static_attn.hpp @@ -41,6 +41,6 @@ class StaticAttentionImpl { size_t layer_idx_; size_t head_dim_; // Note: head_dim equals to head_size - infinicore::quantization::KVQuantAlgo kv_quant_scheme_; + infinilm::quantization::KVQuantAlgo kv_quant_scheme_; }; } // namespace infinilm::layers::attention::backends diff --git a/csrc/layers/causal_lm_templates/text_causal_lm.hpp b/csrc/layers/causal_lm_templates/text_causal_lm.hpp index eb4f2b47..14ad5473 100644 --- a/csrc/layers/causal_lm_templates/text_causal_lm.hpp +++ b/csrc/layers/causal_lm_templates/text_causal_lm.hpp @@ -55,8 +55,8 @@ class TextCausalLM : public InfinilmModel { Model &model() { return *model_; } protected: - INFINICORE_NN_MODULE(Model, model); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head); + std::shared_ptr model_; + std::shared_ptr lm_head_; }; } // namespace infinilm::layers::causal_lm_templates diff --git a/csrc/layers/causal_lm_templates/text_decoder_layer.hpp b/csrc/layers/causal_lm_templates/text_decoder_layer.hpp index 8d70a041..7f7e46d0 100644 --- 
a/csrc/layers/causal_lm_templates/text_decoder_layer.hpp +++ b/csrc/layers/causal_lm_templates/text_decoder_layer.hpp @@ -62,10 +62,10 @@ class TextDecoderLayer : public infinicore::nn::Module { size_t layer_idx() const { return layer_idx_; } protected: - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, post_attention_layernorm); - INFINICORE_NN_MODULE(Attention, self_attn); - INFINICORE_NN_MODULE(MLP, mlp); + std::shared_ptr input_layernorm_; + std::shared_ptr post_attention_layernorm_; + std::shared_ptr self_attn_; + std::shared_ptr mlp_; size_t layer_idx_; }; diff --git a/csrc/layers/causal_lm_templates/text_model.hpp b/csrc/layers/causal_lm_templates/text_model.hpp index 62a52798..f0f9b373 100644 --- a/csrc/layers/causal_lm_templates/text_model.hpp +++ b/csrc/layers/causal_lm_templates/text_model.hpp @@ -99,9 +99,9 @@ class TextModel : public infinicore::nn::Module { } protected: - INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens); - INFINICORE_NN_MODULE_VEC(DecoderLayer, layers); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm); + std::shared_ptr embed_tokens_; + std::vector> layers_; + std::shared_ptr norm_; }; } // namespace infinilm::layers::causal_lm_templates diff --git a/csrc/layers/linear/base_linear.cpp b/csrc/layers/linear/base_linear.cpp new file mode 100644 index 00000000..eebc482c --- /dev/null +++ b/csrc/layers/linear/base_linear.cpp @@ -0,0 +1,133 @@ +#include "base_linear.hpp" +#include "infinicore/ops.hpp" +#include + +namespace infinilm::nn { + +BaseLinear::BaseLinear(size_t in_features, size_t out_features, + std::shared_ptr quantization, + bool bias, + const infinicore::DataType &dtype, const infinicore::Device &device, + int split_dim, int tp_rank, int tp_size, + int tp_num_heads) + : in_features_(in_features), + out_features_(out_features), + has_bias_(bias), + dtype_(dtype), + split_dim_(split_dim), + quantization_(quantization) { + + device_ = device; + + auto layout = quantization_->get_param_layout( + in_features, out_features, split_dim, tp_rank, tp_size, + tp_num_heads, dtype, bias); + + for (const auto &desc : layout) { + infinicore::nn::Parameter param( + desc.shape, desc.dtype, device, + desc.split_dim, desc.tp_rank, desc.tp_size, + desc.tp_num_heads >= 0 ? desc.tp_num_heads : 0); + this->register_parameter(desc.name, param); + } +} + +infinicore::Tensor BaseLinear::compute_linear(infinicore::Tensor &input) const { + // Build params map from direct parameters only (not state_dict which uses a + // static local and is not thread-safe across RankWorker threads). 
+ infinilm::quantization::ParamsMap params; + for (const auto &[name, param] : parameters_) { + params[name] = static_cast(param); + } + + return quantization_->forward(params, input, has_bias_, alpha_); +} + +infinicore::Tensor BaseLinear::forward(infinicore::Tensor &input) const { + return compute_linear(input); +} + +infinicore::Tensor BaseLinear::forward(infinicore::Tensor &input, infinicore::Tensor &residual) const { + auto output = compute_linear(input); + infinicore::op::add_(output, output, residual); + return output; +} + +void BaseLinear::process_weights_after_loading() { + infinilm::quantization::ParamsMap params; + for (const auto &[name, param] : parameters_) { + params[name] = static_cast(param); + } + + auto new_quant = quantization_->process_weights_after_loading(params, device_); + if (!new_quant) return; + + for (auto &[name, param] : parameters_) { + param = infinicore::nn::Parameter(); + } + + for (const auto &[name, tensor] : params) { + auto it = parameters_.find(name); + if (it == parameters_.end()) continue; + it->second = infinicore::nn::Parameter(tensor); + } + + quantization_ = std::move(new_quant); +} + +// Backward compatible accessors + +infinicore::Tensor BaseLinear::weight() const { + auto it = parameters_.find("weight"); + if (it != parameters_.end()) return it->second; + it = parameters_.find("qweight"); + if (it != parameters_.end()) return it->second; + return infinicore::Tensor(); +} + +infinicore::Tensor BaseLinear::bias() const { + auto it = parameters_.find("bias"); + if (it != parameters_.end()) return it->second; + return infinicore::Tensor(); +} + +infinicore::Tensor BaseLinear::weight_scale() const { + auto it = parameters_.find("weight_scale"); + if (it != parameters_.end()) return it->second; + it = parameters_.find("scales"); + if (it != parameters_.end()) return it->second; + return infinicore::Tensor(); +} + +infinicore::Tensor BaseLinear::weight_zeros() const { + auto it = parameters_.find("weight_zeros"); + if (it != parameters_.end()) return it->second; + it = parameters_.find("qzeros"); + if (it != parameters_.end()) return it->second; + return infinicore::Tensor(); +} + +infinicore::Tensor BaseLinear::gidx() const { + auto it = parameters_.find("g_idx"); + if (it != parameters_.end()) return it->second; + return infinicore::Tensor(); +} + +infinicore::Tensor BaseLinear::get_param(const std::string &name) const { + auto it = parameters_.find(name); + if (it != parameters_.end()) return it->second; + return infinicore::Tensor(); +} + +const infinicore::nn::Parameter &BaseLinear::get_parameter_ref(const std::string &name) const { + return parameters_.at(name); +} + +std::vector BaseLinear::split_params( + const std::vector &splits, + int tp_rank, int tp_size, int tp_num_heads) const { + return quantization_->split_params( + parameters_, splits, split_dim_, tp_rank, tp_size, tp_num_heads); +} + +} // namespace infinilm::nn diff --git a/csrc/layers/linear/base_linear.hpp b/csrc/layers/linear/base_linear.hpp new file mode 100644 index 00000000..a304f457 --- /dev/null +++ b/csrc/layers/linear/base_linear.hpp @@ -0,0 +1,70 @@ +#pragma once + +#include "infinicore/ops.hpp" +#include "../quantization/quantization.hpp" +#include "infinicore/nn/module.hpp" +#include +#include + +namespace infinilm::nn { + +using namespace infinicore::nn; + +class BaseLinear : public infinicore::nn::Module { +public: + BaseLinear(size_t in_features, size_t out_features, + std::shared_ptr quantization = std::make_shared(nullptr), + bool bias = true, + const 
infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + int split_dim = -1, int tp_rank = 0, int tp_size = 1, + int tp_num_heads = -1); + + // Forward pass: output = input @ weight.T + bias + infinicore::Tensor forward(infinicore::Tensor &input) const; + + // Forward pass with residual connection + infinicore::Tensor forward(infinicore::Tensor &input, infinicore::Tensor &residual) const; + + // Module information + size_t in_features() const { return in_features_; } + size_t out_features() const { return out_features_; } + bool has_bias() const { return has_bias_; } + infinicore::DataType dtype() const { return dtype_; } + float alpha() const { return alpha_; } + void set_alpha(float alpha) { alpha_ = alpha; } + + // Accessors for parameters (backward compatible) + infinicore::Tensor weight() const; + infinicore::Tensor bias() const; + infinicore::Tensor weight_scale() const; + infinicore::Tensor weight_zeros() const; + infinicore::Tensor gidx() const; + + // Get parameter by name + infinicore::Tensor get_param(const std::string &name) const; + + std::shared_ptr get_quantization() const { return quantization_; } + virtual void process_weights_after_loading(); + + // Split fused linear parameters into named sub-parameters + std::vector split_params( + const std::vector &splits, + int tp_rank, int tp_size, int tp_num_heads) const; + + // Allow subclasses to access the raw parameters map + const infinicore::nn::Parameter &get_parameter_ref(const std::string &name) const; + +protected: + infinicore::Tensor compute_linear(infinicore::Tensor &input) const; + + size_t in_features_; + size_t out_features_; + bool has_bias_; + infinicore::DataType dtype_; + int split_dim_ = -1; + float alpha_ = 1.0f; + std::shared_ptr quantization_; +}; + +} // namespace infinilm::nn diff --git a/csrc/layers/linear/fused_linear.cpp b/csrc/layers/linear/fused_linear.cpp index bb734aac..04d0ad31 100644 --- a/csrc/layers/linear/fused_linear.cpp +++ b/csrc/layers/linear/fused_linear.cpp @@ -6,74 +6,11 @@ namespace infinilm::layers::linear { // --------------------------------------------------------- // QKV Parallel Linear // --------------------------------------------------------- -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, size_t head_dim, size_t num_q_head, size_t num_kv_head, - bool bias, - const infinicore::DataType &dtype, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : QKVParallelLinear(hidden_size, - head_dim, head_dim, head_dim, - num_q_head, num_kv_head, num_kv_head, - bias, bias, bias, - dtype, device, rank_info) {} - -QKVParallelLinear::QKVParallelLinear(size_t hidden_size, - size_t q_dim, size_t k_dim, size_t v_dim, - size_t num_q_head, size_t num_k_head, size_t num_v_head, - bool q_bias, bool k_bias, bool v_bias, - const infinicore::DataType &dtype, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : infinicore::nn::ColumnParallelLinear( - hidden_size, - num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, - (q_bias || k_bias || v_bias), - dtype, - device, - rank_info.tp_rank, - rank_info.tp_size), - q_dim_(q_dim), - k_dim_(k_dim), - v_dim_(v_dim), - num_q_head_(num_q_head), - num_k_head_(num_k_head), - num_v_head_(num_v_head), - q_bias_(q_bias), - k_bias_(k_bias), - v_bias_(v_bias) { - if (num_q_head % tp_size_ != 0 || num_k_head % tp_size_ != 0 || num_v_head % tp_size_ != 0) { - throw std::runtime_error("QKVParallelLinear: num_[q|k|v]_head must be divisible by tp_size"); - } - - if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) { - throw std::runtime_error("q_bias, k_bias, v_bias must all match"); - } - - q_out_size_ = num_q_head_ * q_dim_ / tp_size_; - k_out_size_ = num_k_head_ * k_dim_ / tp_size_; - v_out_size_ = num_v_head_ * v_dim_ / tp_size_; -} - -QKVParallelLinear::QKVParallelLinear(size_t hidden_size, - size_t head_dim, - size_t num_q_head, - size_t num_kv_head, - std::shared_ptr quantization, + std::shared_ptr quantization, bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, @@ -89,11 +26,11 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, size_t q_dim, size_t k_dim, size_t v_dim, size_t num_q_head, size_t num_k_head, size_t num_v_head, bool q_bias, bool k_bias, bool v_bias, - std::shared_ptr quantization, + std::shared_ptr quantization, const infinicore::DataType &dtype, const infinicore::Device &device, engine::distributed::RankInfo rank_info) - : infinicore::nn::ColumnParallelLinear( + : infinilm::nn::ColumnParallelLinear( hidden_size, calculate_out_feature_size(num_q_head, q_dim, num_k_head, k_dim, num_v_head, v_dim, rank_info), quantization, @@ -133,194 +70,57 @@ QKVParallelLinear::forward_split(infinicore::Tensor &input) { return std::make_tuple(q_out, k_out, v_out); } -infinicore::nn::Parameter QKVParallelLinear::get_q_weight() const { - return infinicore::nn::Parameter( - weight_->narrow({{0, 0, q_out_size_}}), - 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_k_weight() const { - return infinicore::nn::Parameter( - weight_->narrow({{0, q_out_size_, k_out_size_}}), - 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_v_weight() const { - return infinicore::nn::Parameter( - weight_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), - 0, tp_rank_, tp_size_, num_v_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_q_weight_scale() const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_k_weight_scale() const { - return infinicore::nn::Parameter( - 
weight_scale_->narrow({{0, q_out_size_, k_out_size_}}), - 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_v_weight_scale() const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), - 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_q_weight_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_->narrow({{1, 0, q_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_k_weight_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_->narrow({{1, q_out_size_ / scaling_factor, k_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_v_weight_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_->narrow({{1, (q_out_size_ + k_out_size_) / scaling_factor, v_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_q_weight_scale_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{1, 0, q_out_size_ / scaling_factor}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_k_weight_scale_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{1, q_out_size_ / scaling_factor, k_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_v_weight_scale_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{1, (q_out_size_ + k_out_size_) / scaling_factor, v_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_q_weight_zeros_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{1, 0, q_out_size_ / scaling_factor}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_k_weight_zeros_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{1, q_out_size_ / scaling_factor, k_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_v_weight_zeros_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{1, (q_out_size_ + k_out_size_) / scaling_factor, v_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_q_weight_zeros() const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_k_weight_zeros() const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{0, q_out_size_, k_out_size_}}), - 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_v_weight_zeros() const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), - 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_q_bias() const { - if (!q_bias_) { - return infinicore::nn::Parameter(); - } - return infinicore::nn::Parameter( - bias_->narrow({{0, 0, q_out_size_}}), - 0, tp_rank_, tp_size_); -} +bool QKVParallelLinear::has_q_bias() const { return 
q_bias_; } +bool QKVParallelLinear::has_k_bias() const { return k_bias_; } +bool QKVParallelLinear::has_v_bias() const { return v_bias_; } -infinicore::nn::Parameter QKVParallelLinear::get_k_bias() const { - if (!k_bias_) { - return infinicore::nn::Parameter(); +QKVParallelLinear::QKVParallelLinear(size_t hidden_size, + size_t head_dim, + size_t num_q_head, size_t num_kv_head, + const std::string &q_name, const std::string &k_name, const std::string &v_name, + RegisterParamFn register_fn, + std::shared_ptr quantization, + bool bias, + const infinicore::DataType &dtype, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : QKVParallelLinear(hidden_size, head_dim, num_q_head, num_kv_head, quantization, bias, dtype, device, rank_info) { + register_fn_ = register_fn; + split_infos_ = { + {q_name, 0, q_out_size_, 0}, + {k_name, q_out_size_, k_out_size_, num_k_head_}, + {v_name, q_out_size_ + k_out_size_, v_out_size_, num_v_head_}, + }; + auto params = this->split_params(split_infos_, tp_rank_, tp_size_, num_k_head_); + for (auto &sp : params) { + register_fn_(sp.full_name, std::move(sp.param)); } - return infinicore::nn::Parameter( - bias_->narrow({{0, q_out_size_, k_out_size_}}), - 0, tp_rank_, tp_size_); } -infinicore::nn::Parameter QKVParallelLinear::get_v_bias() const { - if (!v_bias_) { - return infinicore::nn::Parameter(); +void QKVParallelLinear::process_weights_after_loading() { + BaseLinear::process_weights_after_loading(); + if (register_fn_ && !split_infos_.empty()) { + auto params = this->split_params(split_infos_, tp_rank_, tp_size_, num_k_head_); + for (auto &sp : params) { + register_fn_(sp.full_name, std::move(sp.param)); + } } - return infinicore::nn::Parameter( - bias_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), - 0, tp_rank_, tp_size_); } -infinicore::nn::Parameter QKVParallelLinear::get_q_g_idx_gptq() const { - return infinicore::nn::Parameter(gidx_->narrow({{0, 0, in_features_ / tp_size_}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_k_g_idx_gptq() const { - return infinicore::nn::Parameter(gidx_->narrow({{0, 0, in_features_ / tp_size_}}), 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter QKVParallelLinear::get_v_g_idx_gptq() const { - return infinicore::nn::Parameter(gidx_->narrow({{0, 0, in_features_ / tp_size_}}), 0, tp_rank_, tp_size_, num_k_head_); -} - -bool QKVParallelLinear::has_q_bias() const { return q_bias_; } -bool QKVParallelLinear::has_k_bias() const { return k_bias_; } -bool QKVParallelLinear::has_v_bias() const { return v_bias_; } - // --------------------------------------------------------- // Gate-Up Parallel Linear // --------------------------------------------------------- -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ -GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias, - const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info) { -} - -GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, - const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { - if (gate_bias_ != up_bias_) { - throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); - } -} - -GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr quantization, bool bias, +GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr quantization, bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, engine::distributed::RankInfo rank_info) : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, quantization, dtype, device, rank_info) { } GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, - std::shared_ptr quantization, + std::shared_ptr quantization, const infinicore::DataType &dtype, const infinicore::Device &device, engine::distributed::RankInfo rank_info) - : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quantization, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { + : infinilm::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quantization, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { if (gate_bias_ != up_bias_) { throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); } @@ -334,85 +134,41 @@ std::tuple GateUpParallelLinear::forward return std::make_tuple(gate_output, up_output); } -infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight() const { - return infinicore::nn::Parameter(weight_->narrow({{0, 0, weight_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} +bool GateUpParallelLinear::has_gate_bias() const { return gate_bias_; } +bool GateUpParallelLinear::has_up_bias() const { return up_bias_; } -infinicore::nn::Parameter GateUpParallelLinear::get_gate_bias() const { - if (!gate_bias_) { - return infinicore::nn::Parameter(); - } else { - return infinicore::nn::Parameter(bias_->narrow({{0, 0, bias_->size(0) / 2}}), 0, tp_rank_, tp_size_); +GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, + const std::string &gate_name, const std::string &up_name, + RegisterParamFn register_fn, + std::shared_ptr quantization, + bool bias, + const infinicore::DataType &dtype, const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : GateUpParallelLinear(hidden_size, intermediate_size, quantization, bias, dtype, device, rank_info) { + const std::string &key_name = parameters_.count("qweight") ? 
"qweight" : "weight"; + const auto &key_param = get_parameter_ref(key_name); + int fused_dim = this->get_quantization()->get_fused_split_dim(); + size_t logical_output = this->get_quantization()->get_logical_dim_size(key_param->size(fused_dim)); + size_t half_size = logical_output / 2; + register_fn_ = register_fn; + split_infos_ = { + {gate_name, 0, half_size}, + {up_name, half_size, half_size}, + }; + auto params = this->split_params(split_infos_, tp_rank_, tp_size_, -1); + for (auto &sp : params) { + register_fn_(sp.full_name, std::move(sp.param)); } } -infinicore::nn::Parameter GateUpParallelLinear::get_up_weight() const { - return infinicore::nn::Parameter(weight_->narrow({{0, weight_->size(0) / 2, weight_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_up_bias() const { - if (!up_bias_) { - return infinicore::nn::Parameter(); - } else { - return infinicore::nn::Parameter(bias_->narrow({{0, bias_->size(0) / 2, bias_->size(0) / 2}}), - 0, tp_rank_, tp_size_); +void GateUpParallelLinear::process_weights_after_loading() { + BaseLinear::process_weights_after_loading(); + if (register_fn_ && !split_infos_.empty()) { + auto params = this->split_params(split_infos_, tp_rank_, tp_size_, -1); + for (auto &sp : params) { + register_fn_(sp.full_name, std::move(sp.param)); + } } } -infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_scale() const { - return infinicore::nn::Parameter(weight_scale_->narrow({{0, 0, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_scale() const { - return infinicore::nn::Parameter(weight_scale_->narrow({{0, weight_scale_->size(0) / 2, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_zeros() const { - return infinicore::nn::Parameter(weight_zeros_->narrow({{0, 0, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_zeros() const { - return infinicore::nn::Parameter(weight_zeros_->narrow({{0, weight_zeros_->size(0) / 2, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -bool GateUpParallelLinear::has_gate_bias() const { - return gate_bias_; -} - -bool GateUpParallelLinear::has_up_bias() const { - return up_bias_; -} - -infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_awq() const { - return infinicore::nn::Parameter(weight_->narrow({{1, 0, weight_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_awq() const { - return infinicore::nn::Parameter(weight_->narrow({{1, weight_->size(1) / 2, weight_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_scale_awq() const { - return infinicore::nn::Parameter(weight_scale_->narrow({{1, 0, weight_scale_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_scale_awq() const { - return infinicore::nn::Parameter(weight_scale_->narrow({{1, weight_scale_->size(1) / 2, weight_scale_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_zeros_awq() const { - return infinicore::nn::Parameter(weight_zeros_->narrow({{1, 0, weight_zeros_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_zeros_awq() const { - return 
infinicore::nn::Parameter(weight_zeros_->narrow({{1, weight_zeros_->size(1) / 2, weight_zeros_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_gate_g_idx_gptq() const { - return infinicore::nn::Parameter(gidx_->narrow({{0, 0, gidx_->size(0)}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter GateUpParallelLinear::get_up_g_idx_gptq() const { - return infinicore::nn::Parameter(gidx_->narrow({{0, 0, gidx_->size(0)}}), 0, tp_rank_, tp_size_); -} - } // namespace infinilm::layers::linear diff --git a/csrc/layers/linear/fused_linear.hpp b/csrc/layers/linear/fused_linear.hpp index 260c680a..92ec3b90 100644 --- a/csrc/layers/linear/fused_linear.hpp +++ b/csrc/layers/linear/fused_linear.hpp @@ -1,86 +1,48 @@ #pragma once #include "../../engine/distributed/communication_group.hpp" -#include "infinicore/nn/linear.hpp" -#include "infinicore/quantization.hpp" -#include +#include "linear.hpp" +#include "../quantization/quantization.hpp" +#include namespace infinilm::layers::linear { -class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { +using RegisterParamFn = std::function; + +class QKVParallelLinear : public infinilm::nn::ColumnParallelLinear { public: explicit QKVParallelLinear(size_t hidden_size, size_t q_dim, size_t k_dim, size_t v_dim, size_t num_q_head, size_t num_k_head, size_t num_v_head, bool q_bias, bool k_bias, bool v_bias, + std::shared_ptr quantization, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - // A more common case where all heads have the same dimension explicit QKVParallelLinear(size_t hidden_size, size_t head_dim, size_t num_q_head, size_t num_kv_head, + std::shared_ptr quantization, bool bias = false, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - explicit QKVParallelLinear(size_t hidden_size, - size_t q_dim, size_t k_dim, size_t v_dim, - size_t num_q_head, size_t num_k_head, size_t num_v_head, - bool q_bias, bool k_bias, bool v_bias, - std::shared_ptr quantization, - const infinicore::DataType &dtype = infinicore::DataType::F32, - const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + QKVParallelLinear(size_t hidden_size, + size_t head_dim, + size_t num_q_head, size_t num_kv_head, + const std::string &q_name, const std::string &k_name, const std::string &v_name, + RegisterParamFn register_fn, + std::shared_ptr quantization, + bool bias = false, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - // A more common case where all heads have the same dimension - explicit QKVParallelLinear(size_t hidden_size, - size_t head_dim, - size_t num_q_head, size_t num_kv_head, - std::shared_ptr quantization, - bool bias = false, - const infinicore::DataType &dtype = infinicore::DataType::F32, - const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + void process_weights_after_loading() override; std::tuple forward_split(infinicore::Tensor &input); - infinicore::nn::Parameter get_q_weight() const; - 
infinicore::nn::Parameter get_k_weight() const; - infinicore::nn::Parameter get_v_weight() const; - - infinicore::nn::Parameter get_q_weight_scale() const; - infinicore::nn::Parameter get_k_weight_scale() const; - infinicore::nn::Parameter get_v_weight_scale() const; - - infinicore::nn::Parameter get_q_weight_zeros() const; - infinicore::nn::Parameter get_k_weight_zeros() const; - infinicore::nn::Parameter get_v_weight_zeros() const; - - // For computing the packing factor in awq quantization: - // Returns the number of low-bit elements packed into a single high-bit container element. - // For example: int4 → int32 yields a packing factor of 8 (32 bits / 4 bits = 8 int4 values per int32). - infinicore::nn::Parameter get_q_weight_awq(int scaling_factor) const; - infinicore::nn::Parameter get_k_weight_awq(int scaling_factor) const; - infinicore::nn::Parameter get_v_weight_awq(int scaling_factor) const; - - infinicore::nn::Parameter get_q_weight_scale_awq(int scaling_factor) const; - infinicore::nn::Parameter get_k_weight_scale_awq(int scaling_factor) const; - infinicore::nn::Parameter get_v_weight_scale_awq(int scaling_factor) const; - - infinicore::nn::Parameter get_q_weight_zeros_awq(int scaling_factor) const; - infinicore::nn::Parameter get_k_weight_zeros_awq(int scaling_factor) const; - infinicore::nn::Parameter get_v_weight_zeros_awq(int scaling_factor) const; - - infinicore::nn::Parameter get_q_bias() const; - infinicore::nn::Parameter get_k_bias() const; - infinicore::nn::Parameter get_v_bias() const; - - infinicore::nn::Parameter get_q_g_idx_gptq() const; - infinicore::nn::Parameter get_k_g_idx_gptq() const; - infinicore::nn::Parameter get_v_g_idx_gptq() const; - bool has_q_bias() const; bool has_k_bias() const; bool has_v_bias() const; @@ -111,206 +73,49 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { bool q_bias_; bool k_bias_; bool v_bias_; - size_t q_out_size_; // num_q_head * q_dim / tp_size - size_t k_out_size_; // num_k_head * k_dim / tp_size - size_t v_out_size_; // num_v_head * v_dim / tp_size + size_t q_out_size_; + size_t k_out_size_; + size_t v_out_size_; size_t num_kv_head_replicas_ = 1; + RegisterParamFn register_fn_; + std::vector split_infos_; }; -class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { +class GateUpParallelLinear : public infinilm::nn::ColumnParallelLinear { public: - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ - GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false, - const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), + GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr quantization, + bool bias = false, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, + std::shared_ptr quantization, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr quantization, + GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, + const std::string &gate_name, const std::string &up_name, + RegisterParamFn register_fn, + std::shared_ptr quantization, bool bias = false, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, - std::shared_ptr quantization, - const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + void process_weights_after_loading() override; std::tuple forward_split(infinicore::Tensor &input); - infinicore::nn::Parameter get_gate_weight() const; - - infinicore::nn::Parameter get_gate_weight_scale() const; - - infinicore::nn::Parameter get_gate_weight_zeros() const; - - infinicore::nn::Parameter get_gate_bias() const; - - infinicore::nn::Parameter get_up_weight() const; - - infinicore::nn::Parameter get_up_weight_scale() const; - - infinicore::nn::Parameter get_up_weight_zeros() const; - - infinicore::nn::Parameter get_up_bias() const; - - infinicore::nn::Parameter get_gate_weight_awq() const; - - infinicore::nn::Parameter get_up_weight_awq() const; - - infinicore::nn::Parameter get_up_weight_scale_awq() const; - - infinicore::nn::Parameter get_up_weight_zeros_awq() const; - - infinicore::nn::Parameter get_gate_weight_scale_awq() const; - - infinicore::nn::Parameter get_gate_weight_zeros_awq() const; - - infinicore::nn::Parameter get_gate_g_idx_gptq() const; - - infinicore::nn::Parameter get_up_g_idx_gptq() const; - bool has_gate_bias() const; - bool has_up_bias() const; private: bool gate_bias_; bool up_bias_; + RegisterParamFn register_fn_; + std::vector split_infos_; }; -#define INFINILM_QKV_LINEAR_INIT(name, q_name, k_name, v_name, ...) 
\ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight()); \ - this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight()); \ - this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight()); \ - if (name##_->has_q_bias()) \ - this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ - if (name##_->has_k_bias()) \ - this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ - if (name##_->has_v_bias()) \ - this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); - -#define INFINILM_GATE_UP_LINEAR_INIT(name, gate_name, up_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(gate_name) + ".weight", name##_->get_gate_weight()); \ - this->register_parameter(std::string(up_name) + ".weight", name##_->get_up_weight()); \ - if (name##_->has_gate_bias()) \ - this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ - if (name##_->has_up_bias()) \ - this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); - -// ========================= QKV Quantization ================================== -#define INFINILM_QKV_LINEAR_W8A8_INIT(name, q_name, k_name, v_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight()); \ - this->register_parameter(std::string(q_name) + ".weight_scale", name##_->get_q_weight_scale()); \ - this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight()); \ - this->register_parameter(std::string(k_name) + ".weight_scale", name##_->get_k_weight_scale()); \ - this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight()); \ - this->register_parameter(std::string(v_name) + ".weight_scale", name##_->get_v_weight_scale()); \ - if (name##_->has_q_bias()) \ - this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ - if (name##_->has_k_bias()) \ - this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ - if (name##_->has_v_bias()) \ - this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); - -#define INFINILM_QKV_LINEAR_W4A16AWQ_INIT(name, q_name, k_name, v_name, ...) 
\ - name##_ = std::make_shared(__VA_ARGS__); \ - auto awq_ptr = std::static_pointer_cast(name##_->get_quantization()); \ - int packing_num = awq_ptr->get_packing_num(); \ - this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight_awq(packing_num)); \ - this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros_awq(packing_num)); \ - this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale_awq(1)); \ - this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight_awq(packing_num)); \ - this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros_awq(packing_num)); \ - this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale_awq(1)); \ - this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight_awq(packing_num)); \ - this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros_awq(packing_num)); \ - this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale_awq(1)); \ - if (name##_->has_q_bias()) \ - this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ - if (name##_->has_k_bias()) \ - this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ - if (name##_->has_v_bias()) \ - this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); - -// ========================= Gate-Up Quantization ============================== -#define INFINILM_GATE_UP_LINEAR_W8A8_INIT(name, gate_name, up_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(gate_name) + ".weight", name##_->get_gate_weight()); \ - this->register_parameter(std::string(gate_name) + ".weight_scale", name##_->get_gate_weight_scale()); \ - this->register_parameter(std::string(up_name) + ".weight", name##_->get_up_weight()); \ - this->register_parameter(std::string(up_name) + ".weight_scale", name##_->get_up_weight_scale()); \ - if (name##_->has_gate_bias()) \ - this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ - if (name##_->has_up_bias()) \ - this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); - -#define INFINILM_GATE_UP_LINEAR_W4A16AWQ_INIT(name, gate_name, up_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight_awq()); \ - this->register_parameter(std::string(gate_name) + ".qzeros", name##_->get_gate_weight_zeros_awq()); \ - this->register_parameter(std::string(gate_name) + ".scales", name##_->get_gate_weight_scale_awq()); \ - this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight_awq()); \ - this->register_parameter(std::string(up_name) + ".qzeros", name##_->get_up_weight_zeros_awq()); \ - this->register_parameter(std::string(up_name) + ".scales", name##_->get_up_weight_scale_awq()); \ - if (name##_->has_gate_bias()) \ - this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ - if (name##_->has_up_bias()) \ - this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); - -#define INFINILM_QKV_LINEAR_W4A16GPTQ_INIT(name, q_name, k_name, v_name, ...) 
\ - name##_ = std::make_shared(__VA_ARGS__); \ - auto gptq_ptr = std::static_pointer_cast(name##_->get_quantization()); \ - int packing_num = gptq_ptr->get_packing_num(); \ - this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight_awq(1)); \ - this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros_awq(8)); \ - this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale_awq(1)); \ - this->register_parameter(std::string(q_name) + ".g_idx", name##_->get_q_g_idx_gptq()); \ - this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight_awq(1)); \ - this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros_awq(8)); \ - this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale_awq(1)); \ - this->register_parameter(std::string(k_name) + ".g_idx", name##_->get_k_g_idx_gptq()); \ - this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight_awq(1)); \ - this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros_awq(8)); \ - this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale_awq(1)); \ - this->register_parameter(std::string(v_name) + ".g_idx", name##_->get_v_g_idx_gptq()); \ - if (name##_->has_q_bias()) \ - this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ - if (name##_->has_k_bias()) \ - this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ - if (name##_->has_v_bias()) \ - this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); - -#define INFINILM_GATE_UP_LINEAR_W4A16GPTQ_INIT(name, gate_name, up_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight_awq()); \ - this->register_parameter(std::string(gate_name) + ".qzeros", name##_->get_gate_weight_zeros_awq()); \ - this->register_parameter(std::string(gate_name) + ".scales", name##_->get_gate_weight_scale_awq()); \ - this->register_parameter(std::string(gate_name) + ".g_idx", name##_->get_gate_g_idx_gptq()); \ - this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight_awq()); \ - this->register_parameter(std::string(up_name) + ".qzeros", name##_->get_up_weight_zeros_awq()); \ - this->register_parameter(std::string(up_name) + ".scales", name##_->get_up_weight_scale_awq()); \ - this->register_parameter(std::string(up_name) + ".g_idx", name##_->get_up_g_idx_gptq()); \ - if (name##_->has_gate_bias()) \ - this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ - if (name##_->has_up_bias()) \ - this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); } // namespace infinilm::layers::linear diff --git a/csrc/layers/linear/linear.cpp b/csrc/layers/linear/linear.cpp new file mode 100644 index 00000000..84982409 --- /dev/null +++ b/csrc/layers/linear/linear.cpp @@ -0,0 +1,100 @@ +#include "linear.hpp" +#include "infinicore/ops.hpp" +#include "infinicore/ops/distributed/allreduce.hpp" +#include + +namespace infinilm::nn { + +// ---- Linear ---- + +Linear::Linear(size_t in_features, size_t out_features, bool bias, + const infinicore::DataType &dtype, const infinicore::Device &device) + : BaseLinear(in_features, out_features, + std::make_shared(nullptr), + bias, dtype, device, -1, 0, 1) { +} + +Linear::Linear(size_t in_features, size_t out_features, + std::shared_ptr 
quantization, + bool bias, const infinicore::DataType &dtype, const infinicore::Device &device) + : BaseLinear(in_features, out_features, quantization, bias, dtype, device, -1, 0, 1) { +} + +infinicore::Tensor Linear::forward(infinicore::Tensor &input) const { + return BaseLinear::forward(input); +} + +std::string Linear::extra_repr() const { + return "Linear(in_features=" + std::to_string(in_features_) + ", out_features=" + std::to_string(out_features_) + ", bias=" + (has_bias_ ? "true" : "false") + ", dtype=" + std::to_string(static_cast(dtype_)) + ")"; +} + +// ---- ColumnParallelLinear ---- + +ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_features, bool bias, + const infinicore::DataType &dtype, const infinicore::Device &device, + infinicore::Size tp_rank, infinicore::Size tp_size, + int tp_num_heads) + : BaseLinear(in_features, out_features, + std::make_shared(nullptr), + bias, dtype, device, 0, tp_rank, tp_size, tp_num_heads), + tp_rank_(tp_rank), + tp_size_(tp_size) { +} + +ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_features, + std::shared_ptr quantization, + bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, + infinicore::Size tp_rank, infinicore::Size tp_size, + int tp_num_heads) + : BaseLinear(in_features, out_features, quantization, bias, dtype, device, + 0, tp_rank, tp_size, tp_num_heads), + tp_rank_(tp_rank), + tp_size_(tp_size) { +} + +infinicore::Tensor ColumnParallelLinear::forward(infinicore::Tensor &input) const { + return BaseLinear::forward(input); +} + +std::string ColumnParallelLinear::extra_repr() const { + return "ColumnParallelLinear(in_features=" + std::to_string(in_features_) + ", out_features=" + std::to_string(out_features_) + ", bias=" + (has_bias_ ? "true" : "false") + ", dtype=" + std::to_string(static_cast(dtype_)) + ")"; +} + +// ---- RowParallelLinear ---- + +RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, bool bias, + const infinicore::DataType &dtype, const infinicore::Device &device, + infinicore::Size tp_rank, infinicore::Size tp_size, + infinicclComm_t communicator) + : BaseLinear(in_features, out_features, + std::make_shared(nullptr), + bias, dtype, device, 1, tp_rank, tp_size), + tp_rank_(tp_rank), + tp_size_(tp_size), communicator_(communicator) { +} + +RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, + std::shared_ptr quantization, + bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, + infinicore::Size tp_rank, infinicore::Size tp_size, + infinicclComm_t communicator) + : BaseLinear(in_features, out_features, quantization, bias, dtype, device, + 1, tp_rank, tp_size), + tp_rank_(tp_rank), + tp_size_(tp_size), communicator_(communicator) { +} + +infinicore::Tensor RowParallelLinear::forward(infinicore::Tensor &input) const { + auto output = BaseLinear::forward(input); + + if ((tp_size_ > 1) && (communicator_ != nullptr)) { + infinicore::op::distributed::allreduce_(output, output, INFINICCL_SUM, communicator_); + } + return output; +} + +std::string RowParallelLinear::extra_repr() const { + return "RowParallelLinear(in_features=" + std::to_string(in_features_) + ", out_features=" + std::to_string(out_features_) + ", bias=" + (has_bias_ ? 
"true" : "false") + ", dtype=" + std::to_string(static_cast(dtype_)) + ")"; +} + +} // namespace infinilm::nn diff --git a/csrc/layers/linear/linear.hpp b/csrc/layers/linear/linear.hpp index 4cfab257..3687eb09 100644 --- a/csrc/layers/linear/linear.hpp +++ b/csrc/layers/linear/linear.hpp @@ -1,12 +1,92 @@ #pragma once + +#include "base_linear.hpp" +#include "infinicore/ops.hpp" +#include "../quantization/quantization.hpp" +#include "infinicore/nn/module.hpp" +#include +#include + +namespace infinilm::nn { + +class Linear : public BaseLinear { +public: + // Without quantization (backward compat) + Linear(size_t in_features, size_t out_features, bool bias, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device()); + + // With quantization + Linear(size_t in_features, size_t out_features, + std::shared_ptr quantization, + bool bias = true, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device()); + + infinicore::Tensor forward(infinicore::Tensor &input) const; + std::string extra_repr() const; +}; + +class ColumnParallelLinear : public BaseLinear { +public: + // Without quantization (backward compat) + ColumnParallelLinear(size_t in_features, size_t out_features, bool bias, + const infinicore::DataType &dtype, const infinicore::Device &device, + infinicore::Size tp_rank = 0, infinicore::Size tp_size = 1, + int tp_num_heads = -1); + + // With quantization + ColumnParallelLinear(size_t in_features, size_t out_features, + std::shared_ptr quantization, + bool bias = true, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + infinicore::Size tp_rank = 0, infinicore::Size tp_size = 1, + int tp_num_heads = -1); + + infinicore::Tensor forward(infinicore::Tensor &input) const; + std::string extra_repr() const; + +protected: + infinicore::Size tp_rank_ = 0; + infinicore::Size tp_size_ = 1; +}; + +class RowParallelLinear : public BaseLinear { +public: + // Without quantization (backward compat) + RowParallelLinear(size_t in_features, size_t out_features, bool bias, + const infinicore::DataType &dtype, const infinicore::Device &device, + infinicore::Size tp_rank = 0, infinicore::Size tp_size = 1, + infinicclComm_t communicator = nullptr); + + // With quantization + RowParallelLinear(size_t in_features, size_t out_features, + std::shared_ptr quantization, + bool bias = true, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + infinicore::Size tp_rank = 0, infinicore::Size tp_size = 1, + infinicclComm_t communicator = nullptr); + + infinicore::Tensor forward(infinicore::Tensor &input) const; + std::string extra_repr() const; + +protected: + infinicore::Size tp_rank_ = 0; + infinicore::Size tp_size_ = 1; + infinicclComm_t communicator_; +}; + +} // namespace infinilm::nn + #include "fused_linear.hpp" namespace infinilm::layers::linear { -using QKVParallelLinear = infinilm::layers::linear::QKVParallelLinear; -using ReplicatedLinear = infinicore::nn::Linear; -using ColumnParallelLinear = infinicore::nn::ColumnParallelLinear; -using RowParallelLinear = infinicore::nn::RowParallelLinear; -using GateUpParallelLinear = infinilm::layers::linear::GateUpParallelLinear; +using ReplicatedLinear = infinilm::nn::Linear; +using ColumnParallelLinear = infinilm::nn::ColumnParallelLinear; +using RowParallelLinear = infinilm::nn::RowParallelLinear; 
+using BaseLinear = infinilm::nn::BaseLinear; } // namespace infinilm::layers::linear diff --git a/csrc/layers/mlp/mlp.cpp b/csrc/layers/mlp/mlp.cpp index 893062d8..f7604c50 100644 --- a/csrc/layers/mlp/mlp.cpp +++ b/csrc/layers/mlp/mlp.cpp @@ -16,49 +16,14 @@ MLP::MLP(std::shared_ptr model_config, int tp_rank = rank_info.tp_rank; int tp_size = rank_info.tp_size; - auto quant_scheme = model_config->get_quant_scheme(); auto quantization_method = model_config->get_quantization_method(); - switch (quant_scheme) { - case infinicore::quantization::QuantScheme::NONE: { - INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, quantization_method, - use_bias_, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, quantization_method, - use_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { - INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, quantization_method, - use_bias_, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, quantization_method, - use_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::GPTQ_W4A16: { - INFINILM_GATE_UP_LINEAR_W4A16GPTQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, quantization_method, use_bias_, - dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, quantization_method, use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { - INFINILM_GATE_UP_LINEAR_W4A16GPTQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, quantization_method, use_bias_, - dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, quantization_method, use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::AWQ_W4A16: { - INFINILM_GATE_UP_LINEAR_W4A16AWQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, quantization_method, - use_bias_, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, quantization_method, - use_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - default: { - throw std::runtime_error("infinilm::layers::mlp::MLP: unsupported quantization scheme"); - break; - } - } + auto register_fn = [this](const std::string &n, infinicore::nn::Parameter p) { this->register_parameter(n, std::move(p)); }; + gate_up_proj_ = std::make_shared( + hidden_size_, intermediate_size_, "gate_proj", "up_proj", register_fn, + quantization_method, use_bias_, dtype, device, rank_info); + down_proj_ = this->register_module( + "down_proj", intermediate_size_, hidden_size_, quantization_method, + use_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); } infinicore::Tensor MLP::forward(const infinicore::Tensor &hidden_states) const { diff --git a/csrc/layers/mlp/mlp.hpp b/csrc/layers/mlp/mlp.hpp index d0ba5c09..91349fe9 100644 --- a/csrc/layers/mlp/mlp.hpp +++ b/csrc/layers/mlp/mlp.hpp @@ -36,13 +36,17 @@ class MLP : public infinicore::nn::Module { */ infinicore::Tensor forward(const infinicore::Tensor &hidden_states) const; + void process_fused_weights_after_loading() { + 
gate_up_proj_->process_weights_after_loading(); + } + // Module information size_t hidden_size() const { return hidden_size_; } size_t intermediate_size() const { return intermediate_size_; } protected: - INFINICORE_NN_MODULE(infinilm::layers::linear::GateUpParallelLinear, gate_up_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::RowParallelLinear, down_proj); + std::shared_ptr gate_up_proj_; + std::shared_ptr down_proj_; size_t hidden_size_; size_t intermediate_size_; diff --git a/csrc/layers/mlp/moe_mlp.cpp b/csrc/layers/mlp/moe_mlp.cpp index 2d338404..c3021dab 100644 --- a/csrc/layers/mlp/moe_mlp.cpp +++ b/csrc/layers/mlp/moe_mlp.cpp @@ -19,12 +19,12 @@ MoeMLP::MoeMLP(std::shared_ptr model_config, auto quant_scheme = model_config->get_quant_scheme(); auto quantization_method = model_config->get_quantization_method(); switch (quant_scheme) { - case infinicore::quantization::QuantScheme::NONE: { - INFINICORE_NN_MODULE_INIT(gate_proj, hidden_size_, moe_intermediate_size_, false, + case infinilm::quantization::QuantScheme::NONE: { + gate_proj_ = this->register_module("gate_proj", hidden_size_, moe_intermediate_size_, false, dtype, device, tp_rank, tp_size); - INFINICORE_NN_MODULE_INIT(up_proj, hidden_size_, moe_intermediate_size_, false, + up_proj_ = this->register_module("up_proj", hidden_size_, moe_intermediate_size_, false, dtype, device, tp_rank, tp_size); - INFINICORE_NN_MODULE_INIT(down_proj, moe_intermediate_size_, hidden_size_, false, + down_proj_ = this->register_module("down_proj", moe_intermediate_size_, hidden_size_, false, dtype, device, tp_rank, tp_size, rank_info.comm); break; } diff --git a/csrc/layers/mlp/moe_mlp.hpp b/csrc/layers/mlp/moe_mlp.hpp index c0835909..970ea3c5 100644 --- a/csrc/layers/mlp/moe_mlp.hpp +++ b/csrc/layers/mlp/moe_mlp.hpp @@ -18,9 +18,9 @@ class MoeMLP : public infinicore::nn::Module { void set_alpha(float alpha) { down_proj_->set_alpha(alpha); } protected: - INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, gate_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, up_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::RowParallelLinear, down_proj); + std::shared_ptr gate_proj_; + std::shared_ptr up_proj_; + std::shared_ptr down_proj_; size_t hidden_size_; size_t moe_intermediate_size_; diff --git a/csrc/layers/quantization/awq.cpp b/csrc/layers/quantization/awq.cpp new file mode 100644 index 00000000..50e830f4 --- /dev/null +++ b/csrc/layers/quantization/awq.cpp @@ -0,0 +1,96 @@ +#include "awq.hpp" +#include "infinicore/ops/linear_w4a16_awq.hpp" +#include + +namespace infinilm::quantization { + +std::vector AWQ::get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int /*tp_num_heads*/, + const infinicore::DataType &dtype, + bool bias) const { + + // AWQ weight layout is transposed relative to the standard [out, in]: + // qweight: [in_features, out_features / packing_num] + // So the TP split dimension for AWQ is the "other" dimension: + // ColumnParallel (split_dim=0, split output) → AWQ tp_dim=1 + // RowParallel (split_dim=1, split input) → AWQ tp_dim=0 + int awq_tp_dim = (split_dim >= 0) ? 
(1 - split_dim) : -1; + int group_size = get_group_size(); + int packing_num = get_packing_num(); + + std::vector descs; + descs.push_back({"qweight", {in_features, out_features / packing_num}, + infinicore::DataType::I32, awq_tp_dim, tp_rank, tp_size}); + descs.push_back({"scales", {in_features / group_size, out_features}, + dtype, awq_tp_dim, tp_rank, tp_size}); + descs.push_back({"qzeros", {in_features / group_size, out_features / packing_num}, + infinicore::DataType::I32, awq_tp_dim, tp_rank, tp_size}); + if (bias) { + descs.push_back({"bias", {out_features}, dtype, -1, 0, 1}); + } + return descs; +} + +infinicore::Tensor AWQ::forward( + const ParamsMap ¶ms, + const infinicore::Tensor &input, + bool has_bias, + float /*alpha*/) const { + + auto input_contiguous = input->is_contiguous() ? input : input->contiguous(); + auto qweight = params.at("qweight"); + auto scales = params.at("scales"); + auto qzeros = params.at("qzeros"); + + std::optional bias_opt; + if (has_bias) { + bias_opt = params.at("bias"); + } + + return infinicore::op::linear_w4a16_awq(input_contiguous->contiguous(), qweight, scales, qzeros, bias_opt); +} + +std::vector AWQ::split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int /*narrow_dim*/, + int tp_rank, int tp_size, int tp_num_heads) const { + + // AWQ parameters have output dimension on dim1, so fused split is on dim1. + int fused_dim = get_fused_split_dim(); + int packing_num = get_packing_num(); + std::vector result; + auto qw_it = params.find("qweight"); + auto sc_it = params.find("scales"); + auto qz_it = params.find("qzeros"); + auto bias_it = params.find("bias"); + + for (const auto &s : splits) { + // qweight: narrow along fused_dim, divide size by packing_num + result.push_back({s.prefix + ".qweight", + infinicore::nn::Parameter( + qw_it->second->narrow({{static_cast(fused_dim), s.start / packing_num, s.size / packing_num}}), + fused_dim, tp_rank, tp_size, s.num_shards)}); + // scales: narrow along fused_dim + result.push_back({s.prefix + ".scales", + infinicore::nn::Parameter( + sc_it->second->narrow({{static_cast(fused_dim), s.start, s.size}}), + fused_dim, tp_rank, tp_size, s.num_shards)}); + // qzeros: narrow along fused_dim, divide size by packing_num + result.push_back({s.prefix + ".qzeros", + infinicore::nn::Parameter( + qz_it->second->narrow({{static_cast(fused_dim), s.start / packing_num, s.size / packing_num}}), + fused_dim, tp_rank, tp_size, s.num_shards)}); + if (bias_it != params.end()) { + result.push_back({s.prefix + ".bias", + infinicore::nn::Parameter( + bias_it->second->narrow({{0, s.start, s.size}}), + 0, tp_rank, tp_size, s.num_shards)}); + } + } + return result; +} + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/awq.hpp b/csrc/layers/quantization/awq.hpp new file mode 100644 index 00000000..383e574a --- /dev/null +++ b/csrc/layers/quantization/awq.hpp @@ -0,0 +1,48 @@ +#pragma once +#include "base_quantization.hpp" +namespace infinilm::quantization { + +class AWQ : public BaseQuantization { +public: + explicit AWQ(const nlohmann::json &quant_config) + : BaseQuantization(quant_config){}; + + QuantScheme get_quant_scheme() const override { + return QuantScheme::AWQ_W4A16; + }; + + int get_packing_num() const { + return 32 / get_or("bits", 4); + } + + int get_group_size() const { + return get_or("group_size", 128); + } + + std::vector get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int tp_num_heads, + const 
infinicore::DataType &dtype, + bool bias) const override; + + int get_fused_split_dim() const override { return 1; } + + size_t get_logical_dim_size(size_t raw_size) const override { + return raw_size * get_packing_num(); + } + + infinicore::Tensor forward( + const ParamsMap ¶ms, + const infinicore::Tensor &input, + bool has_bias, + float alpha = 1.0f) const override; + + std::vector split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int narrow_dim, + int tp_rank, int tp_size, int tp_num_heads) const override; +}; + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/base_quantization.hpp b/csrc/layers/quantization/base_quantization.hpp new file mode 100644 index 00000000..1fd261bd --- /dev/null +++ b/csrc/layers/quantization/base_quantization.hpp @@ -0,0 +1,122 @@ +#pragma once +#include "infinicore/nn/module.hpp" +#include "infinicore/tensor.hpp" +#include "nlohmann/json.hpp" +#include "quantization_scheme.hpp" +#include +#include +#include + +namespace infinilm::quantization { + +struct ParamDescriptor { + std::string name; + std::vector shape; + infinicore::DataType dtype; + int split_dim = -1; + int tp_rank = 0; + int tp_size = 1; + int tp_num_heads = -1; +}; + +using ParamsMap = std::unordered_map; + +// Describes one shard of a fused linear (e.g., Q, K, V or gate, up) +struct SplitInfo { + std::string prefix; // "q_proj", "k_proj", "v_proj" or "gate_proj", "up_proj" + size_t start; // start offset along narrow_dim + size_t size; // size of this shard along narrow_dim + size_t num_shards = 0; // number of logical shards for KV replication (0 = standard TP split) +}; + +// A named parameter produced by splitting a fused linear +struct SplitParam { + std::string full_name; // "q_proj.weight", "gate_proj.qweight", etc. + infinicore::nn::Parameter param; +}; + +class BaseQuantization : public std::enable_shared_from_this { +public: + explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; + virtual ~BaseQuantization() = default; + + const nlohmann::json &get_config() const { return quant_config_; } + + virtual QuantScheme get_quant_scheme() const = 0; + + // Return the list of parameters this quantization scheme needs + virtual std::vector get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int tp_num_heads, + const infinicore::DataType &dtype, + bool bias) const = 0; + + // Forward pass using the registered parameters + virtual infinicore::Tensor forward( + const ParamsMap ¶ms, + const infinicore::Tensor &input, + bool has_bias, + float alpha = 1.0f) const = 0; + + // Dimension for fused-split (gate/up, q/k/v) of a column-parallel weight. + // For NoneQuantization weight [out, in], split is on dim0. + // For AWQ qweight [in, out/pack], split is on dim1. + virtual int get_fused_split_dim() const { return 0; } + + // Logical output size along fused_split_dim from a parameter's raw dimension size. + // For packed formats (AWQ, GPTQ), raw size needs to be multiplied by packing_num. + // Default: raw size is already logical size. 
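A small worked example of the AWQ packing arithmetic used by get_param_layout() and get_logical_dim_size() above: packing_num = 32 / bits, the packed qweight stores out_features / packing_num int32 columns, and the logical output width is recovered by multiplying the packed width back by packing_num. The layer dimensions (4096 x 11008, 4-bit, group size 128) are illustrative, only the formulas come from the patch.

// Standalone sketch of the AWQ shape arithmetic; compiles on its own.
#include <cassert>
#include <cstddef>

int main() {
    const int bits = 4;
    const int packing_num = 32 / bits;   // 8 int4 values per int32 container
    const int group_size = 128;

    const std::size_t in_features  = 4096;
    const std::size_t out_features = 11008;

    // Shapes reported by AWQ::get_param_layout():
    //   qweight: [in_features, out_features / packing_num]              (I32)
    //   scales : [in_features / group_size, out_features]
    //   qzeros : [in_features / group_size, out_features / packing_num] (I32)
    const std::size_t packed_out = out_features / packing_num;  // 1376
    const std::size_t groups     = in_features / group_size;    // 32

    // get_logical_dim_size() undoes the packing along the fused split dim (dim 1).
    const std::size_t logical_out = packed_out * packing_num;
    assert(logical_out == out_features);

    (void)groups;
    return 0;
}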
+ virtual size_t get_logical_dim_size(size_t raw_size) const { return raw_size; } + + // Split fused linear parameters into named sub-parameters (for QKV/GateUp) + // params: the fused linear's registered parameters (by name) + // splits: description of each shard + // narrow_dim: 0=narrow dim0, 1=narrow dim1 (for weight-like params) + // Returns a list of (full_name, Parameter) pairs + virtual std::vector split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int narrow_dim, + int tp_rank, int tp_size, int tp_num_heads) const = 0; + + // Post-loading weight processing (e.g., GPTQ->GPTQ_QY conversion). + // Returns a replacement quantization object if the scheme changed (e.g. GPTQ -> GPTQ_QY), + // or nullptr if no replacement is needed. + virtual std::shared_ptr process_weights_after_loading( + ParamsMap ¶ms, + const infinicore::Device &device) const { + (void)params; + (void)device; + return nullptr; + } + + template + T get(const std::string &key) const { + if (!quant_config_.contains(key)) { + throw std::out_of_range("Key '" + key + "' not found in config."); + } + try { + return quant_config_.at(key).get(); + } catch (const nlohmann::json::type_error &e) { + throw std::runtime_error("Type conversion failed for key '" + key + "': " + std::string(e.what())); + } + } + + template + T get_or(const std::string &key, const T &default_value) const { + if (!quant_config_.contains(key) || quant_config_.at(key).is_null()) { + return default_value; + } + try { + return quant_config_.at(key).get(); + } catch (const nlohmann::json::type_error &) { + return default_value; + } + } + +protected: + nlohmann::json quant_config_; +}; + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/compressed_tensors.cpp b/csrc/layers/quantization/compressed_tensors.cpp new file mode 100644 index 00000000..66a4a3ef --- /dev/null +++ b/csrc/layers/quantization/compressed_tensors.cpp @@ -0,0 +1,76 @@ +#include "compressed_tensors.hpp" +#include "infinicore/ops/linear_w8a8i8.hpp" +#include + +namespace infinilm::quantization { + +std::vector CompressedTensors::get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int /*tp_num_heads*/, + const infinicore::DataType &dtype, + bool bias) const { + + std::vector descs; + descs.push_back({"weight", {out_features, in_features}, infinicore::DataType::I8, split_dim, tp_rank, tp_size}); + // weight_scale is per-output-channel [out_features, 1]; always split on + // dim0 (output dimension) for ColumnParallel, and don't split for RowParallel. + int scale_split_dim = (split_dim == 0) ? 0 : -1; + int scale_tp_size = (split_dim == 0) ? tp_size : 1; + int scale_tp_rank = (split_dim == 0) ? tp_rank : 0; + descs.push_back({"weight_scale", {out_features, 1}, infinicore::DataType::F32, scale_split_dim, scale_tp_rank, scale_tp_size}); + if (bias) { + descs.push_back({"bias", {out_features}, dtype, -1, 0, 1}); + } + return descs; +} + +infinicore::Tensor CompressedTensors::forward( + const ParamsMap ¶ms, + const infinicore::Tensor &input, + bool has_bias, + float /*alpha*/) const { + + auto input_contiguous = input->is_contiguous() ? 
input : input->contiguous(); + auto weight = params.at("weight"); + auto weight_scale = params.at("weight_scale"); + + std::optional bias_opt; + if (has_bias) { + bias_opt = params.at("bias"); + } + + return infinicore::op::linear_w8a8i8(input_contiguous->contiguous(), weight, weight_scale, bias_opt); +} + +std::vector CompressedTensors::split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int narrow_dim, + int tp_rank, int tp_size, int /*tp_num_heads*/) const { + + std::vector result; + auto weight_it = params.find("weight"); + auto scale_it = params.find("weight_scale"); + auto bias_it = params.find("bias"); + + for (const auto &s : splits) { + result.push_back({s.prefix + ".weight", + infinicore::nn::Parameter( + weight_it->second->narrow({{static_cast(narrow_dim), s.start, s.size}}), + narrow_dim, tp_rank, tp_size, s.num_shards)}); + result.push_back({s.prefix + ".weight_scale", + infinicore::nn::Parameter( + scale_it->second->narrow({{static_cast(narrow_dim), s.start, s.size}}), + narrow_dim, tp_rank, tp_size, s.num_shards)}); + if (bias_it != params.end()) { + result.push_back({s.prefix + ".bias", + infinicore::nn::Parameter( + bias_it->second->narrow({{0, s.start, s.size}}), + 0, tp_rank, tp_size, s.num_shards)}); + } + } + return result; +} + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/compressed_tensors.hpp b/csrc/layers/quantization/compressed_tensors.hpp new file mode 100644 index 00000000..dcf65c2e --- /dev/null +++ b/csrc/layers/quantization/compressed_tensors.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "base_quantization.hpp" +namespace infinilm::quantization { + +class CompressedTensors : public BaseQuantization { +public: + explicit CompressedTensors(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + QuantScheme get_quant_scheme() const override { + return QuantScheme::COMPRESSED_TENSOR_W8A8I8; + }; + + std::vector get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int tp_num_heads, + const infinicore::DataType &dtype, + bool bias) const override; + + infinicore::Tensor forward( + const ParamsMap ¶ms, + const infinicore::Tensor &input, + bool has_bias, + float alpha = 1.0f) const override; + + std::vector split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int narrow_dim, + int tp_rank, int tp_size, int tp_num_heads) const override; +}; + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/gptq.cpp b/csrc/layers/quantization/gptq.cpp new file mode 100644 index 00000000..e7688be5 --- /dev/null +++ b/csrc/layers/quantization/gptq.cpp @@ -0,0 +1,91 @@ +#include "gptq.hpp" +#include "gptq_qy.hpp" + +namespace infinilm::quantization { + +std::vector GPTQ::get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int /*tp_num_heads*/, + const infinicore::DataType &dtype, + bool bias) const { + + // GPTQ weight layout is transposed: qweight [in_features/8, out_features] + // ColumnParallel (split_dim=0, split output) → GPTQ tp_dim=1 + // RowParallel (split_dim=1, split input) → GPTQ tp_dim=0 + int gptq_tp_dim = (split_dim >= 0) ? 
(1 - split_dim) : -1; + int group_size = get_group_size(); + + std::vector descs; + descs.push_back({"qweight", {in_features / 8, out_features}, infinicore::DataType::I32, gptq_tp_dim, tp_rank, tp_size}); + descs.push_back({"qzeros", {in_features / group_size, out_features / 8}, infinicore::DataType::I32, gptq_tp_dim, tp_rank, tp_size}); + descs.push_back({"scales", {in_features / group_size, out_features}, dtype, gptq_tp_dim, tp_rank, tp_size}); + descs.push_back({"g_idx", {in_features}, infinicore::DataType::I32, 0, tp_rank, tp_size}); + if (bias) { + descs.push_back({"bias", {out_features}, dtype, -1, 0, 1}); + } + return descs; +} + +infinicore::Tensor GPTQ::forward( + const ParamsMap & /*params*/, + const infinicore::Tensor & /*input*/, + bool /*has_bias*/, + float /*alpha*/) const { + throw std::runtime_error("GPTQ_W4A16 must be converted to GPTQ_QY before forward pass. " + "Call process_weights_after_loading() first."); +} + +std::shared_ptr GPTQ::process_weights_after_loading( + ParamsMap ¶ms, + const infinicore::Device &device) const { + + if (device.getType() == infinicore::Device::Type::QY) { + return GPTQ_QY::convert_from_gptq(params, device, get_config()); + } + return std::const_pointer_cast(shared_from_this()); +} + +std::vector GPTQ::split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int /*narrow_dim*/, + int tp_rank, int tp_size, int tp_num_heads) const { + + // GPTQ parameters have output dimension on dim1. + int fused_dim = get_fused_split_dim(); + std::vector result; + auto qw_it = params.find("qweight"); + auto qz_it = params.find("qzeros"); + auto sc_it = params.find("scales"); + auto gidx_it = params.find("g_idx"); + auto bias_it = params.find("bias"); + + for (const auto &s : splits) { + result.push_back({s.prefix + ".qweight", + infinicore::nn::Parameter( + qw_it->second->narrow({{static_cast(fused_dim), s.start, s.size}}), + fused_dim, tp_rank, tp_size, s.num_shards)}); + result.push_back({s.prefix + ".qzeros", + infinicore::nn::Parameter( + qz_it->second->narrow({{static_cast(fused_dim), s.start / 8, s.size / 8}}), + fused_dim, tp_rank, tp_size, s.num_shards)}); + result.push_back({s.prefix + ".scales", + infinicore::nn::Parameter( + sc_it->second->narrow({{static_cast(fused_dim), s.start, s.size}}), + fused_dim, tp_rank, tp_size, s.num_shards)}); + result.push_back({s.prefix + ".g_idx", + infinicore::nn::Parameter( + gidx_it->second->narrow({{0, 0, gidx_it->second->size(0)}}), + 0, 0, 1, 0)}); + if (bias_it != params.end()) { + result.push_back({s.prefix + ".bias", + infinicore::nn::Parameter( + bias_it->second->narrow({{0, s.start, s.size}}), + 0, tp_rank, tp_size, s.num_shards)}); + } + } + return result; +} + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/gptq.hpp b/csrc/layers/quantization/gptq.hpp new file mode 100644 index 00000000..455dde2c --- /dev/null +++ b/csrc/layers/quantization/gptq.hpp @@ -0,0 +1,48 @@ +#pragma once +#include "base_quantization.hpp" +namespace infinilm::quantization { + +class GPTQ : public BaseQuantization { +public: + explicit GPTQ(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + QuantScheme get_quant_scheme() const override { + return QuantScheme::GPTQ_W4A16; + }; + + int get_packing_num() const { + return 32 / get_or("bits", 4); + } + + int get_group_size() const { + return get_or("group_size", 128); + } + + std::vector get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int tp_num_heads, 
+ const infinicore::DataType &dtype, + bool bias) const override; + + int get_fused_split_dim() const override { return 1; } + + infinicore::Tensor forward( + const ParamsMap ¶ms, + const infinicore::Tensor &input, + bool has_bias, + float alpha = 1.0f) const override; + + std::vector split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int narrow_dim, + int tp_rank, int tp_size, int tp_num_heads) const override; + + std::shared_ptr process_weights_after_loading( + ParamsMap ¶ms, + const infinicore::Device &device) const override; +}; + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/gptq_qy.cpp b/csrc/layers/quantization/gptq_qy.cpp new file mode 100644 index 00000000..4098e452 --- /dev/null +++ b/csrc/layers/quantization/gptq_qy.cpp @@ -0,0 +1,259 @@ +#include "gptq_qy.hpp" +#include "infinicore/ops.hpp" +#include "infinicore/ops/linear_w4a16_gptq_qy.hpp" +#include + +namespace infinilm::quantization { + +std::vector GPTQ_QY::get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int /*tp_num_heads*/, + const infinicore::DataType &dtype, + bool bias) const { + + // GPTQ_QY weight layout is transposed: qweight [in_features/2, out_features] + // ColumnParallel (split_dim=0, split output) → tp_dim=1 + // RowParallel (split_dim=1, split input) → tp_dim=0 + int gptq_tp_dim = (split_dim >= 0) ? (1 - split_dim) : -1; + int group_size = get_group_size(); + + std::vector descs; + descs.push_back({"qweight", {in_features / 2, out_features}, infinicore::DataType::U8, gptq_tp_dim, tp_rank, tp_size}); + descs.push_back({"qzeros", {in_features / group_size, out_features}, dtype, gptq_tp_dim, tp_rank, tp_size}); + descs.push_back({"scales", {in_features / group_size, out_features}, dtype, gptq_tp_dim, tp_rank, tp_size}); + descs.push_back({"g_idx", {in_features}, infinicore::DataType::I32, -1, 0, 1}); + if (bias) { + descs.push_back({"bias", {out_features}, dtype, -1, 0, 1}); + } + return descs; +} + +infinicore::Tensor GPTQ_QY::forward( + const ParamsMap ¶ms, + const infinicore::Tensor &input, + bool has_bias, + float /*alpha*/) const { + auto input_contiguous = input->is_contiguous() ? input : input->contiguous(); + auto qweight = params.at("qweight"); + auto qzeros = params.at("qzeros"); + auto scales = params.at("scales"); + + auto output = infinicore::op::linear_w4a16_gptq_qy(input_contiguous->contiguous(), qweight, qzeros, scales, 0, 4); + + if (has_bias) { + auto bias = params.at("bias"); + infinicore::op::add_(output, output, bias->as_strided(output->shape(), {0, 0, 1})); + } + return output; +} + +std::vector GPTQ_QY::split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int /*narrow_dim*/, + int tp_rank, int tp_size, int tp_num_heads) const { + + // GPTQ_QY parameters have output dimension on dim1. 
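A minimal, self-contained illustration of the bit-level primitives used by the GPTQ to GPTQ_QY conversion defined later in this file: extracting 4-bit values from a packed int32 (unpack_int32_to_nibbles_3d_), re-packing adjacent 4-bit values into one byte (combine_nibbles_last_dim_), and restoring the offset-by-one zero points when expanding qzeros (unpack_zeros_to_fp32_2d_). The constants are made up for the demonstration; only the bit manipulation mirrors the patch.

// Standalone sketch of the nibble handling; compiles on its own.
#include <cassert>
#include <cstdint>

int main() {
    const int bits = 4;
    const int mask = (1 << bits) - 1;   // 0xF

    // One packed GPTQ word holding the nibbles 0..7 (value k at bit offset k*4).
    const int32_t packed = 0x76543210;
    for (int k = 0; k < 32 / bits; ++k) {
        uint8_t nibble = static_cast<uint8_t>((packed >> (k * bits)) & mask);
        assert(nibble == k);
    }

    // Re-packing two adjacent 4-bit values into one byte, second value in the
    // high nibble, as combine_nibbles_last_dim_ does.
    uint8_t low = 0x3, high = 0xA;
    uint8_t byte = static_cast<uint8_t>((high << 4) | low);
    assert(byte == 0xA3);

    // GPTQ stores zero points biased by -1; the conversion adds the one back
    // (modulo 16) when expanding qzeros to floating point.
    uint8_t stored_zero = 0x7;
    int restored = (static_cast<int>(stored_zero) + 1) & mask;
    assert(restored == 8);
    return 0;
}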
+ int fused_dim = get_fused_split_dim(); + std::vector result; + auto qw_it = params.find("qweight"); + auto qz_it = params.find("qzeros"); + auto sc_it = params.find("scales"); + auto gidx_it = params.find("g_idx"); + auto bias_it = params.find("bias"); + + for (const auto &s : splits) { + result.push_back({s.prefix + ".qweight", + infinicore::nn::Parameter( + qw_it->second->narrow({{static_cast(fused_dim), s.start, s.size}}), + fused_dim, tp_rank, tp_size, s.num_shards)}); + result.push_back({s.prefix + ".qzeros", + infinicore::nn::Parameter( + qz_it->second->narrow({{static_cast(fused_dim), s.start, s.size}}), + fused_dim, tp_rank, tp_size, s.num_shards)}); + result.push_back({s.prefix + ".scales", + infinicore::nn::Parameter( + sc_it->second->narrow({{static_cast(fused_dim), s.start, s.size}}), + fused_dim, tp_rank, tp_size, s.num_shards)}); + result.push_back({s.prefix + ".g_idx", + infinicore::nn::Parameter( + gidx_it->second->narrow({{0, 0, gidx_it->second->size(0)}}), + 0, 0, 1, 0)}); + if (bias_it != params.end()) { + result.push_back({s.prefix + ".bias", + infinicore::nn::Parameter( + bias_it->second->narrow({{0, s.start, s.size}}), + 0, tp_rank, tp_size, s.num_shards)}); + } + } + return result; +} + +// ---- Conversion from GPTQ_W4A16 ---- + +std::shared_ptr GPTQ_QY::convert_from_gptq( + ParamsMap ¶ms, + const infinicore::Device &device, + const nlohmann::json &quant_config) { + + auto gptq_qy = std::make_shared(quant_config); + const int bits = gptq_qy->weight_bits(); + const int values_per_int32 = 32 / bits; + + const auto &original_qweight = params.at("qweight"); + const auto &original_qzeros = params.at("qzeros"); + const auto &original_scales = params.at("scales"); + const auto &g_idx = params.at("g_idx"); + + { + const auto &shape = original_qweight->shape(); + assert(shape.size() == 2); + size_t M = shape[0], N = shape[1]; + + auto weight_unpacked = unpack_int32_to_nibbles_3d_(original_qweight, bits); + auto weight_packed = combine_nibbles_last_dim_(weight_unpacked, M, values_per_int32, N); + + size_t dimY = N; + size_t total_bytes = M * values_per_int32 * (N / 2); + size_t dimX = total_bytes / dimY; + + assert(dimX * dimY == total_bytes && "Weight shape calculation mismatch"); + + params["qweight"] = make_tensor_from_host_( + weight_packed.data(), total_bytes * sizeof(uint8_t), + {dimX, dimY}, infinicore::DataType::U8, device); + } + + { + const auto &shape = original_qzeros->shape(); + assert(shape.size() == 2); + size_t P = shape[0], Q = shape[1]; + + auto zeros_fp32 = unpack_zeros_to_fp32_2d_(original_qzeros, bits); + auto zeros_fp16 = infinilm::detail::float_to_fp16_bits(zeros_fp32); + + params["qzeros"] = make_tensor_from_host_( + zeros_fp16.data(), zeros_fp16.size() * sizeof(uint16_t), + {P, Q * static_cast(values_per_int32)}, + infinicore::DataType::F16, device); + } + + { + auto scales_cpu = original_scales->to(infinicore::Device::Type::CPU); + size_t num_elements = scales_cpu->numel(); + const void *raw_data = scales_cpu->data(); + + std::vector scales_fp16(num_elements); + if (scales_cpu->dtype() == infinicore::DataType::F16) { + std::memcpy(scales_fp16.data(), raw_data, num_elements * sizeof(uint16_t)); + } else if (scales_cpu->dtype() == infinicore::DataType::F32) { + std::vector scales_fp32(num_elements); + std::memcpy(scales_fp32.data(), raw_data, num_elements * sizeof(float)); + scales_fp16 = infinilm::detail::float_to_fp16_bits(scales_fp32); + } else { + spdlog::error("Unsupported scales dtype, expected F16 or F32"); + assert(false && "Unsupported scales 
dtype"); + } + + params["scales"] = make_tensor_from_host_( + scales_fp16.data(), scales_fp16.size() * sizeof(uint16_t), + original_scales->shape(), infinicore::DataType::F16, device); + } + + if (g_idx->numel() > 0) { + params["g_idx"] = g_idx->to(device); + } + + return gptq_qy; +} + +// ---- Private helpers ---- + +std::vector GPTQ_QY::unpack_int32_to_nibbles_3d_(const infinicore::Tensor &packed, int bits) { + assert(bits == 4 || bits == 8); + const int values_per_int32 = 32 / bits; + + auto packed_cpu = packed->to(infinicore::Device::Type::CPU); + const int32_t *packed_host = reinterpret_cast(packed_cpu->data()); + + const auto &shape = packed->shape(); + assert(shape.size() == 2); + size_t M = shape[0], N = shape[1]; + + std::vector unpacked(M * values_per_int32 * N); + + for (size_t i = 0; i < M; ++i) { + for (int k = 0; k < values_per_int32; ++k) { + for (size_t j = 0; j < N; ++j) { + int32_t val = packed_host[i * N + j]; + uint8_t extracted = static_cast((val >> (k * bits)) & ((1 << bits) - 1)); + size_t idx = i * (values_per_int32 * N) + k * N + j; + unpacked[idx] = extracted; + } + } + } + return unpacked; +} + +std::vector GPTQ_QY::combine_nibbles_last_dim_( + const std::vector &nibbles, size_t M, size_t K, size_t N) { + assert(N % 2 == 0 && "Last dimension must be even for nibble pairing"); + + std::vector combined(M * K * (N / 2)); + size_t out_idx = 0; + + for (size_t i = 0; i < M; ++i) { + for (size_t k = 0; k < K; ++k) { + size_t row_base = i * (K * N) + k * N; + for (size_t j = 0; j < N; j += 2) { + uint8_t low = nibbles[row_base + j] & 0x0F; + uint8_t high = nibbles[row_base + j + 1] & 0x0F; + combined[out_idx++] = static_cast((high << 4) | low); + } + } + } + return combined; +} + +std::vector GPTQ_QY::unpack_zeros_to_fp32_2d_(const infinicore::Tensor &packed_zeros, int bits) { + assert(bits == 4 || bits == 8); + const int values_per_int32 = 32 / bits; + const int mask = (1 << bits) - 1; + + auto packed_cpu = packed_zeros->to(infinicore::Device::Type::CPU); + const int32_t *packed_host = reinterpret_cast(packed_cpu->data()); + + const auto &shape = packed_zeros->shape(); + assert(shape.size() == 2); + size_t P = shape[0], Q = shape[1]; + + std::vector result(P * Q * values_per_int32); + size_t out_idx = 0; + + for (size_t p = 0; p < P; ++p) { + for (size_t q = 0; q < Q; ++q) { + int32_t val = packed_host[p * Q + q]; + for (int k = 0; k < values_per_int32; ++k) { + uint8_t extracted = static_cast((val >> (k * bits)) & mask); + int dequant_val = (static_cast(extracted) + 1) & mask; + result[out_idx++] = static_cast(dequant_val); + } + } + } + return result; +} + +infinicore::Tensor GPTQ_QY::make_tensor_from_host_(const void *data, size_t bytes, + const std::vector &shape, + infinicore::DataType dtype, const infinicore::Device &device) { + auto tensor = infinicore::Tensor::empty(shape, dtype, infinicore::Device::Type::CPU); + std::memcpy(reinterpret_cast(tensor->data()), data, bytes); + + if (device != infinicore::Device::Type::CPU) { + return tensor->to(device); + } + return tensor; +} + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/gptq_qy.hpp b/csrc/layers/quantization/gptq_qy.hpp new file mode 100644 index 00000000..634b4aaf --- /dev/null +++ b/csrc/layers/quantization/gptq_qy.hpp @@ -0,0 +1,139 @@ +#pragma once + +#include "base_quantization.hpp" +#include "infinicore/tensor.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace infinilm::detail { + +inline uint16_t fp32_to_fp16_bits(float value) { + union 
{ + float f; + uint32_t u; + } f2u; + f2u.f = value; + uint32_t x = f2u.u; + + uint32_t sign = (x >> 16) & 0x8000; + int32_t exp = ((x >> 23) & 0xFF) - 127; + uint32_t mantissa = x & 0x007FFFFF; + + if (exp == 128) { + if (mantissa == 0) { + return static_cast(sign | 0x7C00); + } + return static_cast(sign | 0x7C00 | (mantissa >> 13)); + } + if (exp > 15) { + return static_cast(sign | 0x7C00); + } + if (exp < -14) { + if (exp < -24) { + return static_cast(sign); + } + mantissa |= 0x00800000; + uint32_t shift = -exp - 14; + mantissa >>= shift; + if ((mantissa & 0x1000) && ((mantissa & 0x2FFF) != 0)) { + mantissa += 0x2000; + } + return static_cast(sign | (mantissa >> 13)); + } + + uint32_t exp16 = static_cast(exp + 15) << 10; + uint32_t mantissa16 = mantissa >> 13; + if ((mantissa & 0x1000) && ((mantissa & 0x2FFF) || (mantissa16 & 1))) { + mantissa16++; + if (mantissa16 == 0x400) { + exp16 += 0x400; + mantissa16 = 0; + } + } + return static_cast(sign | exp16 | mantissa16); +} + +inline std::vector float_to_fp16_bits(const std::vector &values) { + std::vector result; + result.reserve(values.size()); + for (float f : values) { + result.push_back(fp32_to_fp16_bits(f)); + } + return result; +} + +} // namespace infinilm::detail + +namespace infinilm::quantization { + +class GPTQ_QY : public BaseQuantization { +public: + explicit GPTQ_QY(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) { + int bits = weight_bits(); + if (bits != 4) { + spdlog::warn("GPTQ_QY: bits={} not fully tested, expected 4", bits); + } + } + + QuantScheme get_quant_scheme() const override { + return QuantScheme::GPTQ_W4A16_QY; + } + + int get_packing_num() const { + return 32 / weight_bits(); + } + + int get_group_size() const { + return get_or("group_size", 128); + } + + int weight_bits() const { return get_or("bits", 4); } + bool desc_act() const { return get_or("desc_act", false); } + + // Parameter layout for GPTQ_QY (already converted format) + std::vector get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int tp_num_heads, + const infinicore::DataType &dtype, + bool bias) const override; + + int get_fused_split_dim() const override { return 1; } + + infinicore::Tensor forward( + const ParamsMap ¶ms, + const infinicore::Tensor &input, + bool has_bias, + float alpha = 1.0f) const override; + + // Split fused linear parameters into named sub-parameters + std::vector split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int narrow_dim, + int tp_rank, int tp_size, int tp_num_heads) const override; + + // Convert from GPTQ_W4A16 format and update params in-place. + // Returns a new GPTQ_QY quantization instance. Returns nullptr if + // the device is not QY. 
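A quick sanity check for the fp32 to fp16 bit-conversion helper declared above. The include path is an assumption; the expected bit patterns are the standard IEEE-754 binary16 encodings of the listed values.

// Sketch only: exercises infinilm::detail::fp32_to_fp16_bits on known values.
#include "csrc/layers/quantization/gptq_qy.hpp"   // assumed include path
#include <cassert>

int main() {
    using infinilm::detail::fp32_to_fp16_bits;
    assert(fp32_to_fp16_bits(1.0f)  == 0x3C00);   // exponent 15, mantissa 0
    assert(fp32_to_fp16_bits(0.5f)  == 0x3800);   // exponent 14
    assert(fp32_to_fp16_bits(-2.0f) == 0xC000);   // sign bit set, exponent 16
    assert(fp32_to_fp16_bits(0.0f)  == 0x0000);
    return 0;
}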
+ static std::shared_ptr convert_from_gptq( + ParamsMap ¶ms, + const infinicore::Device &device, + const nlohmann::json &quant_config); + +private: + static std::vector unpack_int32_to_nibbles_3d_(const infinicore::Tensor &packed, int bits); + static std::vector combine_nibbles_last_dim_(const std::vector &nibbles, size_t M, size_t K, size_t N); + static std::vector unpack_zeros_to_fp32_2d_(const infinicore::Tensor &packed_zeros, int bits); + static infinicore::Tensor make_tensor_from_host_(const void *data, size_t bytes, + const std::vector &shape, + infinicore::DataType dtype, const infinicore::Device &device); +}; + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/kv_quant.cpp b/csrc/layers/quantization/kv_quant.cpp index 5994f644..458e568b 100644 --- a/csrc/layers/quantization/kv_quant.cpp +++ b/csrc/layers/quantization/kv_quant.cpp @@ -7,11 +7,11 @@ namespace infinilm { void KVQuantUtils::quantize( infinicore::Tensor &k, infinicore::Tensor &v, - infinicore::quantization::KVQuantAlgo algo, + infinilm::quantization::KVQuantAlgo algo, const infinicore::Tensor &k_scale, const infinicore::Tensor &v_scale) { - if (algo == infinicore::quantization::KVQuantAlgo::NONE) { + if (algo == infinilm::quantization::KVQuantAlgo::NONE) { return; } @@ -26,12 +26,12 @@ void KVQuantUtils::quantize( void KVQuantUtils::dequantize( infinicore::Tensor &k, infinicore::Tensor &v, - infinicore::quantization::KVQuantAlgo algo, + infinilm::quantization::KVQuantAlgo algo, const infinicore::Tensor &k_scale, const infinicore::Tensor &v_scale, const infinicore::Tensor &reference) { - if (algo == infinicore::quantization::KVQuantAlgo::NONE) { + if (algo == infinilm::quantization::KVQuantAlgo::NONE) { return; // 无需反量化 } diff --git a/csrc/layers/quantization/kv_quant.hpp b/csrc/layers/quantization/kv_quant.hpp index 94383af0..91d1f500 100644 --- a/csrc/layers/quantization/kv_quant.hpp +++ b/csrc/layers/quantization/kv_quant.hpp @@ -1,6 +1,6 @@ #pragma once -#include "infinicore/quantization.hpp" +#include "quantization_scheme.hpp" #include "infinicore/tensor.hpp" #include @@ -19,7 +19,7 @@ class KVQuantUtils { static void quantize( infinicore::Tensor &k, infinicore::Tensor &v, - infinicore::quantization::KVQuantAlgo algo, + infinilm::quantization::KVQuantAlgo algo, const infinicore::Tensor &k_scale, const infinicore::Tensor &v_scale); @@ -35,7 +35,7 @@ class KVQuantUtils { static void dequantize( infinicore::Tensor &k, infinicore::Tensor &v, - infinicore::quantization::KVQuantAlgo algo, + infinilm::quantization::KVQuantAlgo algo, const infinicore::Tensor &k_scale, const infinicore::Tensor &v_scale, const infinicore::Tensor &reference); diff --git a/csrc/layers/quantization/none_quantization.cpp b/csrc/layers/quantization/none_quantization.cpp new file mode 100644 index 00000000..6f49a394 --- /dev/null +++ b/csrc/layers/quantization/none_quantization.cpp @@ -0,0 +1,65 @@ +#include "none_quantization.hpp" +#include "infinicore/ops/linear.hpp" +#include + +namespace infinilm::quantization { + +std::vector NoneQuantization::get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int /*tp_num_heads*/, + const infinicore::DataType &dtype, + bool bias) const { + + std::vector descs; + descs.push_back({"weight", {out_features, in_features}, dtype, split_dim, tp_rank, tp_size}); + if (bias) { + descs.push_back({"bias", {out_features}, dtype, split_dim >= 0 ? 0 : -1, + split_dim >= 0 ? tp_rank : 0, split_dim >= 0 ? 
tp_size : 1}); + } + return descs; +} + +infinicore::Tensor NoneQuantization::forward( + const ParamsMap ¶ms, + const infinicore::Tensor &input, + bool has_bias, + float alpha) const { + + auto input_contiguous = input->is_contiguous() ? input : input->contiguous(); + auto weight = params.at("weight"); + + std::optional bias_opt; + if (has_bias) { + bias_opt = params.at("bias"); + } + + return infinicore::op::linear(input_contiguous->contiguous(), weight->contiguous(), bias_opt, alpha); +} + +std::vector NoneQuantization::split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int narrow_dim, + int tp_rank, int tp_size, int /*tp_num_heads*/) const { + + std::vector result; + auto weight_it = params.find("weight"); + auto bias_it = params.find("bias"); + + for (const auto &s : splits) { + result.push_back({s.prefix + ".weight", + infinicore::nn::Parameter( + weight_it->second->narrow({{static_cast(narrow_dim), s.start, s.size}}), + narrow_dim, tp_rank, tp_size, s.num_shards)}); + if (bias_it != params.end()) { + result.push_back({s.prefix + ".bias", + infinicore::nn::Parameter( + bias_it->second->narrow({{0, s.start, s.size}}), + 0, tp_rank, tp_size, s.num_shards)}); + } + } + return result; +} + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/none_quantization.hpp b/csrc/layers/quantization/none_quantization.hpp new file mode 100644 index 00000000..44fd890d --- /dev/null +++ b/csrc/layers/quantization/none_quantization.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "base_quantization.hpp" +namespace infinilm::quantization { + +class NoneQuantization : public BaseQuantization { +public: + explicit NoneQuantization(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) {}; + + QuantScheme get_quant_scheme() const override { + return QuantScheme::NONE; + }; + + std::vector get_param_layout( + size_t in_features, size_t out_features, + int split_dim, int tp_rank, int tp_size, + int tp_num_heads, + const infinicore::DataType &dtype, + bool bias) const override; + + infinicore::Tensor forward( + const ParamsMap ¶ms, + const infinicore::Tensor &input, + bool has_bias, + float alpha = 1.0f) const override; + + std::vector split_params( + const std::unordered_map ¶ms, + const std::vector &splits, + int narrow_dim, + int tp_rank, int tp_size, int tp_num_heads) const override; +}; + +} // namespace infinilm::quantization diff --git a/csrc/layers/quantization/quantization.hpp b/csrc/layers/quantization/quantization.hpp new file mode 100644 index 00000000..fffcc873 --- /dev/null +++ b/csrc/layers/quantization/quantization.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "awq.hpp" +#include "base_quantization.hpp" +#include "compressed_tensors.hpp" +#include "gptq.hpp" +#include "gptq_qy.hpp" +#include "none_quantization.hpp" +#include "quantization_scheme.hpp" diff --git a/csrc/layers/quantization/quantization_scheme.hpp b/csrc/layers/quantization/quantization_scheme.hpp new file mode 100644 index 00000000..be7eb269 --- /dev/null +++ b/csrc/layers/quantization/quantization_scheme.hpp @@ -0,0 +1,18 @@ +#pragma once + +namespace infinilm::quantization { + +enum class QuantScheme { + NONE, + COMPRESSED_TENSOR_W8A8I8, + AWQ_W4A16, + GPTQ_W4A16_QY, + GPTQ_W4A16, +}; + +enum class KVQuantAlgo { + NONE, + INT8, +}; + +} // namespace infinilm::quantization diff --git a/csrc/models/infinilm_model.cpp b/csrc/models/infinilm_model.cpp index 586d7f2c..3923474e 100644 --- a/csrc/models/infinilm_model.cpp +++ b/csrc/models/infinilm_model.cpp @@ -2,6 +2,8 
@@ #include "../backends/attention_backends.hpp" #include "../cache/kv_cache.hpp" #include "../global_state/global_state.hpp" +#include "../layers/attention/attention.hpp" +#include "../layers/mlp/mlp.hpp" #include namespace infinilm { @@ -91,13 +93,20 @@ void InfinilmModel::process_weights_after_loading() { } void InfinilmModel::process_weights_recursive_(infinicore::nn::Module *module) { - auto submodules = module->modules_dict(); - for (auto &[name, sub] : submodules) { - process_weights_recursive_(sub); + for (const auto &[name, sub] : module->children()) { + process_weights_recursive_(sub.get()); } - if (auto *linear = dynamic_cast(module)) { + // Process BaseLinear (o_proj, down_proj, lm_head, etc.) + if (auto *linear = dynamic_cast(module)) { linear->process_weights_after_loading(); } + // Process fused linear held by Attention/MLP as non-registered members + if (auto *attn = dynamic_cast(module)) { + attn->process_fused_weights_after_loading(); + } + if (auto *mlp = dynamic_cast(module)) { + mlp->process_fused_weights_after_loading(); + } } } // namespace infinilm diff --git a/csrc/models/infinilm_model.hpp b/csrc/models/infinilm_model.hpp index 7570a6f2..33e3909e 100644 --- a/csrc/models/infinilm_model.hpp +++ b/csrc/models/infinilm_model.hpp @@ -3,7 +3,7 @@ #include "../backends/attention_backends.hpp" #include "../cache/cache.hpp" #include "../config/model_config.hpp" -#include "infinicore/nn/linear.hpp" +#include "../layers/linear/linear.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/tensor.hpp" diff --git a/csrc/models/llama_legacy/llama_attention.cpp b/csrc/models/llama_legacy/llama_attention.cpp index a95bb74c..ded2dbd3 100644 --- a/csrc/models/llama_legacy/llama_attention.cpp +++ b/csrc/models/llama_legacy/llama_attention.cpp @@ -1,7 +1,8 @@ #include "llama_attention.hpp" +#include "../../layers/attention/attention.hpp" +#include "../../layers/linear/linear.hpp" #include "../../utils.hpp" -#include "infinicore/nn/linear.hpp" #include "infinicore/nn/rope.hpp" #include "infinicore/ops.hpp" #include "infinicore/ops/mha_kvcache.hpp" @@ -20,65 +21,6 @@ namespace infinilm::models::llama_legacy { -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ -LlamaAttention::LlamaAttention(const LlamaConfig &config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) - : layer_idx_(layer_idx), - hidden_size_(config.hidden_size), - num_attention_heads_(config.num_attention_heads), - num_key_value_heads_(config.num_key_value_heads), - head_dim_(config.head_dim), - kv_dim_(config.kv_dim()), - use_bias_(config.attention_bias), - use_output_bias_(config.attention_output_bias), - use_qk_norm_(config.qk_norm), - max_position_embeddings_(config.max_position_embeddings), - rank_info_(rank_info), - attention_backend_(attention_backend) { - const auto &dtype{config.dtype}; - - int tp_rank = rank_info.tp_rank; - int tp_size = rank_info.tp_size; - - int num_attention_heads = config.num_attention_heads; - int num_key_value_heads = config.num_key_value_heads; - - if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) { - this->num_attention_heads_ = num_attention_heads / tp_size; - this->num_key_value_heads_ = num_key_value_heads / tp_size; - } else { - throw std::runtime_error("num_attention_heads / tp_size error."); - } - scaling_ = 1.0f / std::sqrt(static_cast(head_dim_)); - - // Initialize projection layers - INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, - dtype, device, rank_info); - // Output projection uses attention_output_bias (can be different from qkv) - INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads * head_dim_, hidden_size_, use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - - // Initialize qk RMSNorm - if (use_qk_norm_) { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, config.rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, config.rms_norm_eps, dtype, device); - } -} - LlamaAttention::LlamaAttention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, @@ -112,54 +54,21 @@ LlamaAttention::LlamaAttention(std::shared_ptr mo } scaling_ = 1.0f / std::sqrt(static_cast(head_dim_)); - auto quant_scheme = this->model_config_->get_quant_scheme(); - switch (quant_scheme) { - case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: - INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_, - dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - - case infinicore::quantization::QuantScheme::AWQ_W4A16: { - INFINILM_QKV_LINEAR_W4A16AWQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_, - dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { - - 
INFINILM_QKV_LINEAR_W4A16GPTQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_, - dtype, device, rank_info); - - INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - - break; - } - default: - INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_, - dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } + auto quantization_method = this->model_config_->get_quantization_method(); + auto register_fn = [this](const std::string &n, infinicore::nn::Parameter p) { this->register_parameter(n, std::move(p)); }; + qkv_proj_ = std::make_shared( + hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), + "q_proj", "k_proj", "v_proj", register_fn, + quantization_method, use_bias_, dtype, device, rank_info); + o_proj_ = this->register_module( + "o_proj", model_config_->get("num_attention_heads") * head_dim_, hidden_size_, quantization_method, use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); if (model_config_->get("model_type") == "qwen3") { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + q_norm_ = this->register_module("q_norm", head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + k_norm_ = this->register_module("k_norm", head_dim_, model_config_->get("rms_norm_eps"), dtype, device); } - switch (this->model_config_->get_kv_quant_scheme()) { - case (infinicore::quantization::KVQuantAlgo::INT8): { - INFINICORE_NN_PARAMETER_INIT(kv_cache_k_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1)); - INFINICORE_NN_PARAMETER_INIT(kv_cache_v_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1)); - break; - } - default: { - break; - } - } + infinilm::layers::attention::init_kv_cache_quant_params(register_fn, device, kv_cache_k_scale_, kv_cache_v_scale_); } infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_states, diff --git a/csrc/models/llama_legacy/llama_attention.hpp b/csrc/models/llama_legacy/llama_attention.hpp index c1fb4871..a4aa080b 100644 --- a/csrc/models/llama_legacy/llama_attention.hpp +++ b/csrc/models/llama_legacy/llama_attention.hpp @@ -8,7 +8,7 @@ #include "../../layers/quantization/kv_quant.hpp" #include "llama_config.hpp" -#include "infinicore/nn/linear.hpp" +#include "../../layers/linear/linear.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/nn/rmsnorm.hpp" #include "infinicore/nn/rope.hpp" @@ -39,24 +39,6 @@ class LlamaAttention : public infinicore::nn::Module { * @param layer_idx Layer index for cache access * @param dtype Optional data type for model parameters (defaults to F32) */ - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). 
- * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ - LlamaAttention(const LlamaConfig &config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - LlamaAttention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, @@ -115,18 +97,18 @@ class LlamaAttention : public infinicore::nn::Module { protected: // Projection layers - INFINICORE_NN_MODULE(infinilm::layers::linear::QKVParallelLinear, qkv_proj); - INFINICORE_NN_MODULE(infinicore::nn::RowParallelLinear, o_proj); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, q_norm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, k_norm); + std::shared_ptr qkv_proj_; + std::shared_ptr o_proj_; + std::shared_ptr q_norm_; + std::shared_ptr k_norm_; engine::distributed::RankInfo rank_info_; // Shared Rotary Position Embeddings (RoPE) std::shared_ptr rotary_emb_; // For off-line kv cache quantization - INFINICORE_NN_PARAMETER(kv_cache_k_scale); - INFINICORE_NN_PARAMETER(kv_cache_v_scale); + infinicore::nn::Parameter kv_cache_k_scale_; + infinicore::nn::Parameter kv_cache_v_scale_; private: std::shared_ptr model_config_ = std::make_shared(); diff --git a/csrc/models/llama_legacy/llama_decoder_layer.cpp b/csrc/models/llama_legacy/llama_decoder_layer.cpp index 6ea5215e..0cb7fb83 100644 --- a/csrc/models/llama_legacy/llama_decoder_layer.cpp +++ b/csrc/models/llama_legacy/llama_decoder_layer.cpp @@ -4,35 +4,6 @@ #include namespace infinilm::models::llama_legacy { -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ -LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) : layer_idx_(layer_idx), rank_info_(rank_info) { - const auto &dtype{config.dtype}; - - // Initialize layer normalization layers - INFINICORE_NN_MODULE_INIT(input_layernorm, config.hidden_size, config.rms_norm_eps, - dtype, device); - INFINICORE_NN_MODULE_INIT(post_attention_layernorm, config.hidden_size, config.rms_norm_eps, - dtype, device); - - // Initialize attention and MLP modules - INFINICORE_NN_MODULE_INIT(self_attn, config, device, layer_idx, rank_info_, attention_backend); - INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_); -} LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr model_config, const infinicore::Device &device, @@ -40,15 +11,13 @@ LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptrget_dtype()}; - // Initialize layer normalization layers - INFINICORE_NN_MODULE_INIT(input_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + input_layernorm_ = this->register_module("input_layernorm", model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + post_attention_layernorm_ = this->register_module("post_attention_layernorm", model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), dtype, device); - // Initialize attention and MLP modules - INFINICORE_NN_MODULE_INIT(self_attn, model_config_, device, layer_idx, rank_info_, attention_backend); - INFINICORE_NN_MODULE_INIT(mlp, model_config_, device, rank_info_); + self_attn_ = this->register_module("self_attn", model_config_, device, layer_idx, rank_info_, attention_backend); + mlp_ = this->register_module("mlp", model_config_, device, rank_info_); } std::tuple diff --git a/csrc/models/llama_legacy/llama_decoder_layer.hpp b/csrc/models/llama_legacy/llama_decoder_layer.hpp index 3ea152bf..9943639b 100644 --- a/csrc/models/llama_legacy/llama_decoder_layer.hpp +++ b/csrc/models/llama_legacy/llama_decoder_layer.hpp @@ -33,24 +33,6 @@ class LlamaDecoderLayer : public infinicore::nn::Module { * @param layer_idx Layer index for cache management and debugging * @param dtype Optional data type for model parameters (defaults to F32) */ - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ - LlamaDecoderLayer(const LlamaConfig &config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - LlamaDecoderLayer(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, @@ -92,12 +74,11 @@ class LlamaDecoderLayer : public infinicore::nn::Module { protected: // Layer normalization - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, post_attention_layernorm); + std::shared_ptr input_layernorm_; + std::shared_ptr post_attention_layernorm_; - // Attention and MLP - INFINICORE_NN_MODULE(LlamaAttention, self_attn); - INFINICORE_NN_MODULE(LlamaMLP, mlp); + std::shared_ptr self_attn_; + std::shared_ptr mlp_; engine::distributed::RankInfo rank_info_; std::shared_ptr model_config_; diff --git a/csrc/models/llama_legacy/llama_for_causal_lm.cpp b/csrc/models/llama_legacy/llama_for_causal_lm.cpp index d4e24310..2b0f5d72 100644 --- a/csrc/models/llama_legacy/llama_for_causal_lm.cpp +++ b/csrc/models/llama_legacy/llama_for_causal_lm.cpp @@ -3,36 +3,6 @@ #include "infinicore/nn/linear.hpp" #include "infinicore/ops.hpp" namespace infinilm::models::llama_legacy { -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ -LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) { - spdlog::warn("infinilm::models::llama_legacy: LlamaForCausalLM is no longer supported, please use the new model instead."); - - // Initialize module's device_ member - device_ = device; - const auto &dtype{config.dtype}; - // Initialize base model - INFINICORE_NN_MODULE_INIT(model, config, device, rank_info, attention_backend); - - // Initialize language modeling head - // Note: If tie_word_embeddings is true, we would share weights with embed_tokens - // For now, we create a separate linear layer - INFINICORE_NN_MODULE_INIT(lm_head, config.hidden_size, config.vocab_size, false, - dtype, device); -} LlamaForCausalLM::LlamaForCausalLM(std::shared_ptr model_config, const infinicore::Device &device, @@ -40,17 +10,11 @@ LlamaForCausalLM::LlamaForCausalLM(std::shared_ptrget_dtype()}; - // Initialize base model - INFINICORE_NN_MODULE_INIT(model, model_config, device, rank_info, attention_backend); - // Initialize language modeling head - // Note: If tie_word_embeddings is true, we would share weights with embed_tokens - // For now, we create a separate linear layer - - INFINICORE_NN_MODULE_INIT(lm_head, model_config->get("hidden_size"), model_config->get("vocab_size"), false, + model_ = this->register_module("model", model_config, device, rank_info, attention_backend); + lm_head_ = this->register_module("lm_head", model_config->get("hidden_size"), model_config->get("vocab_size"), false, dtype, device); } @@ -64,11 +28,9 @@ LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const { auto block_tables = input.block_tables; auto slot_mapping = input.slot_mapping; - // 1. Forward through base model to get hidden states auto hidden_states = model_->forward( input_ids, position_ids, past_sequence_lengths, total_sequence_length, input_offsets, cu_seqlens, block_tables, slot_mapping); - // 2. Apply language modeling head to get logits auto logits = lm_head_->forward(hidden_states); return {logits}; } diff --git a/csrc/models/llama_legacy/llama_for_causal_lm.hpp b/csrc/models/llama_legacy/llama_for_causal_lm.hpp index 1e4ac9bc..1ccc8db0 100644 --- a/csrc/models/llama_legacy/llama_for_causal_lm.hpp +++ b/csrc/models/llama_legacy/llama_for_causal_lm.hpp @@ -28,23 +28,6 @@ class LlamaForCausalLM : public InfinilmModel { * @param config Model configuration * @param device Device to create tensors on */ - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ - LlamaForCausalLM(const LlamaConfig &config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - LlamaForCausalLM(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), @@ -70,10 +53,10 @@ class LlamaForCausalLM : public InfinilmModel { protected: // Base model - INFINICORE_NN_MODULE(LlamaModel, model); + std::shared_ptr model_; // Language modeling head - INFINICORE_NN_MODULE(infinicore::nn::Linear, lm_head); + std::shared_ptr lm_head_; std::unique_ptr cache_config_; }; diff --git a/csrc/models/llama_legacy/llama_mlp.cpp b/csrc/models/llama_legacy/llama_mlp.cpp index 6dd4eaaa..955824c0 100644 --- a/csrc/models/llama_legacy/llama_mlp.cpp +++ b/csrc/models/llama_legacy/llama_mlp.cpp @@ -1,78 +1,29 @@ #include "llama_mlp.hpp" -#include "infinicore/nn/linear.hpp" +#include "../../layers/linear/linear.hpp" #include "infinicore/ops.hpp" namespace infinilm::models::llama_legacy { -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ -LlamaMLP::LlamaMLP(const LlamaConfig &config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : hidden_size_(config.hidden_size), - intermediate_size_(config.intermediate_size), - use_bias_(config.mlp_bias), rank_info_(rank_info) { - const auto &dtype{config.dtype}; - - int tp_rank = rank_info.tp_rank; - int tp_size = rank_info.tp_size; - - // Initialize projection layers - INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); -} LlamaMLP::LlamaMLP(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info) - : model_config_(model_config), hidden_size_(model_config->get("hidden_size")), - intermediate_size_(model_config->get("intermediate_size")), - use_bias_(model_config->get_or("mlp_bias", false)), rank_info_(rank_info) { + : model_config_(model_config), hidden_size_(model_config_->get("hidden_size")), + intermediate_size_(model_config_->get("intermediate_size")), + use_bias_(model_config_->get_or("mlp_bias", false)), rank_info_(rank_info) { const auto &dtype{model_config_->get_dtype()}; int tp_rank = rank_info.tp_rank; int tp_size = rank_info.tp_size; - // Initialize projection layers - auto quant_scheme = this->model_config_->get_quant_scheme(); - switch (quant_scheme) { - case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: - INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, this->model_config_->get_quantization_method(), use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, 
hidden_size_, this->model_config_->get_quantization_method(), use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - case infinicore::quantization::QuantScheme::AWQ_W4A16: - INFINILM_GATE_UP_LINEAR_W4A16AWQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, this->model_config_->get_quantization_method(), use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, this->model_config_->get_quantization_method(), use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: - INFINILM_GATE_UP_LINEAR_W4A16GPTQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, this->model_config_->get_quantization_method(), use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, this->model_config_->get_quantization_method(), use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - default: - INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, this->model_config_->get_quantization_method(), use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, this->model_config_->get_quantization_method(), use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } + auto quantization_method = this->model_config_->get_quantization_method(); + auto register_fn = [this](const std::string &n, infinicore::nn::Parameter p) { this->register_parameter(n, std::move(p)); }; + gate_up_proj_ = std::make_shared( + hidden_size_, intermediate_size_, "gate_proj", "up_proj", register_fn, + quantization_method, use_bias_, dtype, device, rank_info_); + down_proj_ = this->register_module( + "down_proj", intermediate_size_, hidden_size_, quantization_method, use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); } infinicore::Tensor LlamaMLP::forward(const infinicore::Tensor &hidden_states) const { diff --git a/csrc/models/llama_legacy/llama_mlp.hpp b/csrc/models/llama_legacy/llama_mlp.hpp index 8ad03b52..16f733e7 100644 --- a/csrc/models/llama_legacy/llama_mlp.hpp +++ b/csrc/models/llama_legacy/llama_mlp.hpp @@ -5,7 +5,7 @@ #include "../../config/model_config.hpp" #include "infinicore/device.hpp" -#include "infinicore/nn/linear.hpp" +#include "../../layers/linear/linear.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/tensor.hpp" #include "llama_config.hpp" @@ -34,22 +34,6 @@ class LlamaMLP : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ - LlamaMLP(const LlamaConfig &config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - LlamaMLP(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); @@ -67,8 +51,8 @@ class LlamaMLP : public infinicore::nn::Module { size_t intermediate_size() const { return intermediate_size_; } protected: - INFINICORE_NN_MODULE(layers::linear::GateUpParallelLinear, gate_up_proj); - INFINICORE_NN_MODULE(infinicore::nn::RowParallelLinear, down_proj); + std::shared_ptr gate_up_proj_; + std::shared_ptr down_proj_; engine::distributed::RankInfo rank_info_; size_t hidden_size_; diff --git a/csrc/models/llama_legacy/llama_model.cpp b/csrc/models/llama_legacy/llama_model.cpp index 20724135..b859bef8 100644 --- a/csrc/models/llama_legacy/llama_model.cpp +++ b/csrc/models/llama_legacy/llama_model.cpp @@ -6,54 +6,6 @@ #include namespace infinilm::models::llama_legacy { -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ -LlamaModel::LlamaModel(const LlamaConfig &config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) - : config_(config), rank_info_(rank_info) { - const auto &dtype{config.dtype}; - // Initialize token embeddings - INFINICORE_NN_MODULE_INIT(embed_tokens, config.vocab_size, config.hidden_size, - std::nullopt, dtype, device); - - // Initialize decoder layers with layer indices - // TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments - // (e.g., via a factory function or lambda that receives the layer index) - // Currently, we can't use the macro because each layer needs a different layer_idx - layers_.reserve(config.num_hidden_layers); - for (size_t i = 0; i < config.num_hidden_layers; ++i) { - layers_.push_back(this->register_module( - "layers." 
+ std::to_string(i), config, device, i, rank_info, attention_backend)); - } - - // Initialize final layer normalization - INFINICORE_NN_MODULE_INIT(norm, config.hidden_size, config.rms_norm_eps, - dtype, device); - - // Initialize Rotary Position Embeddings (shared across all layers) - // Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing - INFINICORE_NN_MODULE_INIT(rotary_emb, config.head_dim, config.max_position_embeddings, - config.rope_theta, infinicore::nn::RoPE::Algo::GPT_NEOX, - dtype, device, config.rope_scaling); - - for (auto &layer : layers_) { - if (layer) { - layer->set_rotary_emb(rotary_emb_); - } - } -} LlamaModel::LlamaModel(std::shared_ptr model_config, const infinicore::Device &device, @@ -61,24 +13,16 @@ LlamaModel::LlamaModel(std::shared_ptr model_conf backends::AttentionBackend attention_backend) : model_config_(model_config), rank_info_(rank_info) { const auto &dtype{model_config_->get_dtype()}; - // Initialize token embeddings - INFINICORE_NN_MODULE_INIT(embed_tokens, model_config_->get("vocab_size"), model_config_->get("hidden_size"), + embed_tokens_ = this->register_module("embed_tokens", model_config_->get("vocab_size"), model_config_->get("hidden_size"), std::nullopt, dtype, device); - // Initialize decoder layers with layer indices - // TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments - // (e.g., via a factory function or lambda that receives the layer index) - // Currently, we can't use the macro because each layer needs a different layer_idx layers_.reserve(model_config_->get("num_hidden_layers")); for (size_t i = 0; i < model_config_->get("num_hidden_layers"); ++i) { layers_.push_back(this->register_module( "layers." + std::to_string(i), model_config_, device, i, rank_info, attention_backend)); } - // Initialize final layer normalization - INFINICORE_NN_MODULE_INIT(norm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + norm_ = this->register_module("norm", model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), dtype, device); - // Initialize Rotary Position Embeddings (shared across all layers) - // Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing - INFINICORE_NN_MODULE_INIT(rotary_emb, model_config_->get_head_dim(), model_config_->get("max_position_embeddings"), + rotary_emb_ = this->register_module("rotary_emb", model_config_->get_head_dim(), model_config_->get("max_position_embeddings"), model_config_->get("rope_theta"), infinicore::nn::RoPE::Algo::GPT_NEOX, dtype, device, model_config_->get_rope_scaling()); @@ -150,30 +94,7 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { kv_cache_ = nullptr; return; } - if (auto kv_cache_config = dynamic_cast(cache_config); - kv_cache_config && model_config_ == nullptr) { - kv_cache_ = std::make_shared( - config_.head_dim, - config_.head_dim, - config_.num_key_value_heads, - config_.num_key_value_heads, - config_.num_hidden_layers, - config_.max_position_embeddings, - model_config_->get_kv_cache_dtype(), - *kv_cache_config, - rank_info_); - } else if (auto paged_kv_cache_config = dynamic_cast(cache_config); - paged_kv_cache_config && model_config_ == nullptr) { - kv_cache_ = std::make_shared( - config_.head_dim, - config_.head_dim, - config_.num_key_value_heads, - config_.num_key_value_heads, - config_.num_hidden_layers, - model_config_->get_kv_cache_dtype(), - *paged_kv_cache_config, - rank_info_); - } else if (auto kv_cache_config = dynamic_cast(cache_config)) { 
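+    // Legacy LlamaConfig-based cache setup removed: cache geometry is now
+    // always derived from model_config_ below.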
+ if (auto kv_cache_config = dynamic_cast(cache_config)) { kv_cache_ = std::make_shared( model_config_->get_head_dim(), model_config_->get_head_dim(), diff --git a/csrc/models/llama_legacy/llama_model.hpp b/csrc/models/llama_legacy/llama_model.hpp index 6c90cbc2..2fcc914f 100644 --- a/csrc/models/llama_legacy/llama_model.hpp +++ b/csrc/models/llama_legacy/llama_model.hpp @@ -37,23 +37,6 @@ class LlamaModel : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ - LlamaModel(const LlamaConfig &config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - LlamaModel(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), @@ -96,25 +79,19 @@ class LlamaModel : public infinicore::nn::Module { size_t num_layers() const { return model_config_->get("num_hidden_layers"); } protected: - // Token embeddings - INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens); + std::shared_ptr embed_tokens_; - // Decoder layers - INFINICORE_NN_MODULE_VEC(LlamaDecoderLayer, layers); + std::vector> layers_; - // Final normalization - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm); + std::shared_ptr norm_; - // Rotary Position Embeddings (shared across all layers) - INFINICORE_NN_MODULE(infinicore::nn::RoPE, rotary_emb); + std::shared_ptr rotary_emb_; engine::distributed::RankInfo rank_info_; std::shared_ptr kv_cache_; private: - LlamaConfig config_; - std::shared_ptr model_config_; }; diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index c1e20f76..03df3f64 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -1,5 +1,6 @@ #include "minicpm_sala_attention.hpp" #include "../../global_state/global_state.hpp" +#include "../../layers/attention/attention.hpp" #include namespace infinilm::models::minicpm_sala { @@ -35,14 +36,14 @@ AttentionBase::AttentionBase(std::shared_ptr mode auto quant_scheme = model_config->get_quant_scheme(); auto quantization_method = model_config->get_quantization_method(); switch (quant_scheme) { - case infinicore::quantization::QuantScheme::NONE: - INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, total_num_heads * head_dim_, quantization_method, + case infinilm::quantization::QuantScheme::NONE: + q_proj_ = this->register_module("q_proj", hidden_size_, total_num_heads * head_dim_, quantization_method, use_bias_, dtype, device, tp_rank, tp_size); - INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, total_num_kv_heads * head_dim_, quantization_method, + k_proj_ = this->register_module("k_proj", hidden_size_, total_num_kv_heads * head_dim_, quantization_method, 
use_bias_, dtype, device, tp_rank, tp_size); - INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, total_num_kv_heads * head_dim_, quantization_method, + v_proj_ = this->register_module("v_proj", hidden_size_, total_num_kv_heads * head_dim_, quantization_method, use_bias_, dtype, device, tp_rank, tp_size); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, + o_proj_ = this->register_module("o_proj", total_num_heads * head_dim_, hidden_size_, quantization_method, use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); break; default: @@ -57,21 +58,8 @@ AttentionBase::AttentionBase(std::shared_ptr mode num_key_value_heads_, layer_idx_, kv_cache_k_scale_, kv_cache_v_scale_, attention_backend_); - auto kv_quant_scheme = infinilm::global_state::get_infinilm_config().model_config->get_kv_quant_scheme(); - switch (kv_quant_scheme) { - case (infinicore::quantization::KVQuantAlgo::NONE): { - break; - } - case (infinicore::quantization::KVQuantAlgo::INT8): { - INFINICORE_NN_PARAMETER_INIT(kv_cache_k_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1)); - INFINICORE_NN_PARAMETER_INIT(kv_cache_v_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1)); - break; - } - default: { - throw std::runtime_error("infinilm::layers::attention: unsupported kv_quant_scheme"); - break; - } - } + infinilm::layers::attention::init_kv_cache_quant_params([this](const std::string &n, infinicore::nn::Parameter p) { this->register_parameter(n, std::move(p)); }, + device, kv_cache_k_scale_, kv_cache_v_scale_); } InfLLMv2Attention::InfLLMv2Attention(std::shared_ptr model_config, @@ -85,7 +73,7 @@ InfLLMv2Attention::InfLLMv2Attention(std::shared_ptrget_dtype()}; size_t num_attention_heads = model_config->get("num_attention_heads"); if (use_output_gate_) { - INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, num_attention_heads * head_dim_, + o_gate_ = this->register_module("o_gate", hidden_size_, num_attention_heads * head_dim_, model_config->get_quantization_method(), use_bias_, dtype, device); } } @@ -112,14 +100,14 @@ LightningAttention::LightningAttention(std::shared_ptrget("num_attention_heads"); if (qk_norm_) { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, rms_norm_eps, dtype, device); + q_norm_ = this->register_module("q_norm", head_dim_, rms_norm_eps, dtype, device); + k_norm_ = this->register_module("k_norm", head_dim_, rms_norm_eps, dtype, device); } if (use_output_norm_) { - INFINICORE_NN_MODULE_INIT(o_norm, num_attention_heads * head_dim_, rms_norm_eps, dtype, device); + o_norm_ = this->register_module("o_norm", num_attention_heads * head_dim_, rms_norm_eps, dtype, device); } if (use_output_gate_) { - INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, num_attention_heads * head_dim_, + z_proj_ = this->register_module("z_proj", hidden_size_, num_attention_heads * head_dim_, model_config->get_quantization_method(), use_bias_, dtype, device); } } diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 81a032b6..f70e60b4 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -24,10 +24,10 @@ class AttentionBase : public infinicore::nn::Module { size_t hidden_size() const { return hidden_size_; } protected: - INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, q_proj); - 
INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, k_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, v_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::RowParallelLinear, o_proj); + std::shared_ptr q_proj_; + std::shared_ptr k_proj_; + std::shared_ptr v_proj_; + std::shared_ptr o_proj_; std::shared_ptr attn_; ::infinilm::backends::AttentionBackend attention_backend_; @@ -42,8 +42,8 @@ class AttentionBase : public infinicore::nn::Module { bool use_output_bias_; // For off-line kv cache quantization - INFINICORE_NN_PARAMETER(kv_cache_k_scale); - INFINICORE_NN_PARAMETER(kv_cache_v_scale); + infinicore::nn::Parameter kv_cache_k_scale_; + infinicore::nn::Parameter kv_cache_v_scale_; }; /** @@ -60,7 +60,7 @@ class InfLLMv2Attention : public AttentionBase { protected: bool use_output_gate_; - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, o_gate); + std::shared_ptr o_gate_; }; /** @@ -79,10 +79,10 @@ class LightningAttention : public AttentionBase { bool qk_norm_; bool use_output_norm_; bool use_output_gate_; - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, q_norm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, k_norm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, o_norm); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, z_proj); + std::shared_ptr q_norm_; + std::shared_ptr k_norm_; + std::shared_ptr o_norm_; + std::shared_ptr z_proj_; }; } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp index ff3c113f..35d78596 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp @@ -15,9 +15,9 @@ MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptrget("hidden_size"); double rms_norm_eps = model_config->get("rms_norm_eps"); - INFINICORE_NN_MODULE_INIT(input_layernorm, hidden_size, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(post_attention_layernorm, hidden_size, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(mlp, model_config, device); + input_layernorm_ = this->register_module("input_layernorm", hidden_size, rms_norm_eps, dtype, device); + post_attention_layernorm_ = this->register_module("post_attention_layernorm", hidden_size, rms_norm_eps, dtype, device); + mlp_ = this->register_module("mlp", model_config, device); std::vector mixer_types = model_config->get>("mixer_types"); std::string mixer_type = mixer_types[layer_idx]; diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp index 5e8faafb..47ebeee5 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp @@ -23,10 +23,10 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { infinicore::Tensor &hidden_states); protected: - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, post_attention_layernorm); - INFINICORE_NN_MODULE(MiniCPMSALAAttention, self_attn); - INFINICORE_NN_MODULE(MiniCPMMLP, mlp); + std::shared_ptr input_layernorm_; + std::shared_ptr post_attention_layernorm_; + std::shared_ptr self_attn_; + std::shared_ptr mlp_; size_t layer_idx_; }; diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index 793f86bd..ddda5b54 100644 --- 
a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -13,8 +13,8 @@ MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM(std::shared_ptrget("vocab_size"); const auto &dtype{model_config->get_dtype()}; - INFINICORE_NN_MODULE_INIT(model, model_config, device); - INFINICORE_NN_MODULE_INIT(lm_head, hidden_size, vocab_size, false, dtype, device); + model_ = this->register_module("model", model_config, device); + lm_head_ = this->register_module("lm_head", hidden_size, vocab_size, false, dtype, device); } infinilm::InfinilmModel::Output MiniCPMSALAForCausalLM::forward(const infinilm::InfinilmModel::Input &input) const { diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp index f0d0aaae..a3271300 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp @@ -18,8 +18,8 @@ class MiniCPMSALAForCausalLM : public InfinilmModel { void reset_cache(const cache::CacheConfig *cache_config) override; protected: - INFINICORE_NN_MODULE(MiniCPMSALAModel, model); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head); + std::shared_ptr model_; + std::shared_ptr lm_head_; }; std::shared_ptr create_minicpm_sala_model_config(std::shared_ptr model_config); diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index bb765a3c..0a54257e 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -1,60 +1,7 @@ #include "model_factory.hpp" -#include "llama_legacy/llama_for_causal_lm.hpp" #include "models_registry.hpp" namespace infinilm { -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. 
- * Removal target: v0.2.0 (Q2 2026) - */ -std::shared_ptr InfinilmModelFactory::createModel( - const InfinilmModel::Config &config, - engine::distributed::RankInfo rank_info, - const cache::CacheConfig *cache, - backends::AttentionBackend attention_backend) { - std::shared_ptr model; - if (const auto llama_config_ptr = dynamic_cast(&config)) { - const auto &llama_config = *llama_config_ptr; - model = std::make_shared( - llama_config, rank_info.device, rank_info, attention_backend); - } else { - throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type"); - } - - if (cache) { - model->reset_cache(cache); - } - - return model; -} - -std::shared_ptr InfinilmModelFactory::createModel( - std::shared_ptr model_config, - engine::distributed::RankInfo rank_info, - const cache::CacheConfig *cache, - backends::AttentionBackend attention_backend) { - std::shared_ptr model; - if (true) { - model = std::make_shared( - model_config, rank_info.device, rank_info, attention_backend); - } else { - throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type"); - } - - if (cache) { - model->reset_cache(cache); - } - - return model; -} std::shared_ptr InfinilmModelFactory::createModel( std::shared_ptr model_config, diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp index 787a6406..b3c22bc3 100644 --- a/csrc/models/model_factory.hpp +++ b/csrc/models/model_factory.hpp @@ -1,39 +1,10 @@ #pragma once -#include "../backends/attention_backends.hpp" -#include "../engine/distributed/distributed.hpp" #include "infinilm_model.hpp" namespace infinilm { class InfinilmModelFactory { public: - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ - static std::shared_ptr createModel( - const InfinilmModel::Config &config, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - const cache::CacheConfig *cache = nullptr, - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release. 
- */ - static std::shared_ptr createModel( - std::shared_ptr model_config, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - const cache::CacheConfig *cache = nullptr, - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - static std::shared_ptr createModel( std::shared_ptr model_config, const infinicore::Device &device, diff --git a/csrc/models/qwen3/qwen3_attention.cpp b/csrc/models/qwen3/qwen3_attention.cpp index befab10a..902779c2 100644 --- a/csrc/models/qwen3/qwen3_attention.cpp +++ b/csrc/models/qwen3/qwen3_attention.cpp @@ -1,5 +1,6 @@ #include "qwen3_attention.hpp" #include "../../global_state/global_state.hpp" +#include "../../layers/attention/attention.hpp" #include "../../utils.hpp" namespace infinilm::models::qwen3 { @@ -29,35 +30,15 @@ Qwen3Attention::Qwen3Attention(std::shared_ptr mo num_attention_heads_ = total_num_heads / tp_size; num_key_value_heads_ = total_num_kv_heads / tp_size; - auto quant_scheme = model_config->get_quant_scheme(); auto quantization_method = model_config->get_quantization_method(); - switch (quant_scheme) { - case infinicore::quantization::QuantScheme::NONE: { - INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads, total_num_kv_heads, - quantization_method, use_bias, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, - use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { - INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads, total_num_kv_heads, - quantization_method, use_bias, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, - use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::AWQ_W4A16: { - INFINILM_QKV_LINEAR_W4A16AWQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads, total_num_kv_heads, - quantization_method, use_bias, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, - use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - default: { - throw std::runtime_error("infinilm::models::qwen3::Qwen3Attention: unsupported quantization scheme"); - break; - } - } + auto register_fn = [this](const std::string &n, infinicore::nn::Parameter p) { this->register_parameter(n, std::move(p)); }; + qkv_proj_ = std::make_shared( + hidden_size_, head_dim_, total_num_heads, total_num_kv_heads, + "q_proj", "k_proj", "v_proj", register_fn, + quantization_method, use_bias, dtype, device, rank_info); + o_proj_ = this->register_module( + "o_proj", total_num_heads * head_dim_, hidden_size_, quantization_method, + use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm); rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config, device); @@ -65,24 +46,10 @@ Qwen3Attention::Qwen3Attention(std::shared_ptr mo attn_ = std::make_shared(num_attention_heads_, head_dim_, scaling, num_key_value_heads_, layer_idx_, kv_cache_k_scale_, kv_cache_v_scale_, attention_backend_); - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, rms_norm_eps, dtype, device); + 
q_norm_ = this->register_module<infinicore::nn::RMSNorm>("q_norm", head_dim_, rms_norm_eps, dtype, device);
+    k_norm_ = this->register_module<infinicore::nn::RMSNorm>("k_norm", head_dim_, rms_norm_eps, dtype, device);
 
-    auto kv_quant_scheme = infinilm::global_state::get_infinilm_config().model_config->get_kv_quant_scheme();
-    switch (kv_quant_scheme) {
-    case (infinicore::quantization::KVQuantAlgo::NONE): {
-        break;
-    }
-    case (infinicore::quantization::KVQuantAlgo::INT8): {
-        INFINICORE_NN_PARAMETER_INIT(kv_cache_k_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1));
-        INFINICORE_NN_PARAMETER_INIT(kv_cache_v_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1));
-        break;
-    }
-    default: {
-        throw std::runtime_error("infinilm::layers::attention: unsupported kv_quant_scheme");
-        break;
-    }
-    }
+    infinilm::layers::attention::init_kv_cache_quant_params(register_fn, device, kv_cache_k_scale_, kv_cache_v_scale_);
 }
 
 infinicore::Tensor Qwen3Attention::forward(const infinicore::Tensor &positions,
diff --git a/csrc/models/qwen3/qwen3_attention.hpp b/csrc/models/qwen3/qwen3_attention.hpp
index 7d8b7180..35ba7b86 100644
--- a/csrc/models/qwen3/qwen3_attention.hpp
+++ b/csrc/models/qwen3/qwen3_attention.hpp
@@ -26,10 +26,10 @@ class Qwen3Attention : public infinicore::nn::Module {
                               const infinicore::Tensor &hidden_states) const;
 
 protected:
-    INFINICORE_NN_MODULE(infinilm::layers::linear::QKVParallelLinear, qkv_proj);
-    INFINICORE_NN_MODULE(infinilm::layers::linear::RowParallelLinear, o_proj);
-    INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, q_norm);
-    INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, k_norm);
+    std::shared_ptr<infinilm::layers::linear::QKVParallelLinear> qkv_proj_;
+    std::shared_ptr<infinilm::layers::linear::RowParallelLinear> o_proj_;
+    std::shared_ptr<infinicore::nn::RMSNorm> q_norm_;
+    std::shared_ptr<infinicore::nn::RMSNorm> k_norm_;
 
     std::shared_ptr rotary_emb_;
     std::shared_ptr attn_;
@@ -41,7 +41,7 @@ class Qwen3Attention : public infinicore::nn::Module {
     size_t head_dim_;
 
     // For off-line kv cache quantization
-    INFINICORE_NN_PARAMETER(kv_cache_k_scale);
-    INFINICORE_NN_PARAMETER(kv_cache_v_scale);
+    infinicore::nn::Parameter kv_cache_k_scale_;
+    infinicore::nn::Parameter kv_cache_v_scale_;
 };
 } // namespace infinilm::models::qwen3
diff --git a/csrc/models/qwen3_next/qwen3_next_attention.cpp b/csrc/models/qwen3_next/qwen3_next_attention.cpp
index 331773b2..7890e56c 100644
--- a/csrc/models/qwen3_next/qwen3_next_attention.cpp
+++ b/csrc/models/qwen3_next/qwen3_next_attention.cpp
@@ -35,43 +35,23 @@ Qwen3NextAttention::Qwen3NextAttention(std::shared_ptr
-    auto quant_scheme = model_config->get_quant_scheme();
     auto quantization_method = model_config->get_quantization_method();
-    switch (quant_scheme) {
-    case infinicore::quantization::QuantScheme::NONE: {
-        INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads * (1 + attn_output_gate), total_num_kv_heads, quantization_method,
-                                 use_bias, dtype, device, rank_info);
-        INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method,
-                                  use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm);
-        break;
-    }
-    case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: {
-        INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads * (1 + attn_output_gate), total_num_kv_heads, quantization_method,
-                                      use_bias, dtype, device, rank_info);
-        INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method,
-                                  use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm);
-        break;
-    }
-    case infinicore::quantization::QuantScheme::AWQ_W4A16: {
INFINILM_QKV_LINEAR_W4A16AWQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, total_num_heads * (1 + attn_output_gate), total_num_kv_heads, quantization_method, - use_bias, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, - use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - default: { - throw std::runtime_error("infinilm::models::qwen3_next::Qwen3NextAttention: unsupported quantization scheme"); - } - } + auto register_fn = [this](const std::string &n, infinicore::nn::Parameter p) { this->register_parameter(n, std::move(p)); }; + qkv_proj_ = std::make_shared( + hidden_size_, head_dim_, total_num_heads * (1 + attn_output_gate), total_num_kv_heads, + "q_proj", "k_proj", "v_proj", register_fn, + quantization_method, use_bias, dtype, device, rank_info); + o_proj_ = this->register_module( + "o_proj", total_num_heads * head_dim_, hidden_size_, quantization_method, + use_output_bias, dtype, device, tp_rank, tp_size, rank_info.comm); + q_norm_ = this->register_module("q_norm", head_dim_, rms_norm_eps, dtype, device); + k_norm_ = this->register_module("k_norm", head_dim_, rms_norm_eps, dtype, device); rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config, device); float scaling = 1.0f / std::sqrt(static_cast(head_dim_)); attn_ = std::make_shared(num_attention_heads_, head_dim_, scaling, num_key_value_heads_, layer_idx_, kv_cache_k_scale_, kv_cache_v_scale_, attention_backend_); - - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, rms_norm_eps, dtype, device); } infinicore::Tensor Qwen3NextAttention::forward(const infinicore::Tensor &positions, diff --git a/csrc/models/qwen3_next/qwen3_next_attention.hpp b/csrc/models/qwen3_next/qwen3_next_attention.hpp index abcf56ed..a2e0f313 100644 --- a/csrc/models/qwen3_next/qwen3_next_attention.hpp +++ b/csrc/models/qwen3_next/qwen3_next_attention.hpp @@ -20,10 +20,10 @@ class Qwen3NextAttention : public infinicore::nn::Module { size_t hidden_size() const { return hidden_size_; } protected: - INFINICORE_NN_MODULE(infinilm::layers::linear::QKVParallelLinear, qkv_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::RowParallelLinear, o_proj); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, q_norm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, k_norm); + std::shared_ptr qkv_proj_; + std::shared_ptr o_proj_; + std::shared_ptr q_norm_; + std::shared_ptr k_norm_; std::shared_ptr rotary_emb_; std::shared_ptr attn_; @@ -35,8 +35,8 @@ class Qwen3NextAttention : public infinicore::nn::Module { size_t head_dim_; // For off-line kv cache quantization - INFINICORE_NN_PARAMETER(kv_cache_k_scale); - INFINICORE_NN_PARAMETER(kv_cache_v_scale); + infinicore::nn::Parameter kv_cache_k_scale_; + infinicore::nn::Parameter kv_cache_v_scale_; }; } // namespace infinilm::models::qwen3_next diff --git a/csrc/models/qwen3_next/qwen3_next_decoderLayer.cpp b/csrc/models/qwen3_next/qwen3_next_decoderLayer.cpp index 7b396cb2..3cd6a725 100644 --- a/csrc/models/qwen3_next/qwen3_next_decoderLayer.cpp +++ b/csrc/models/qwen3_next/qwen3_next_decoderLayer.cpp @@ -15,16 +15,16 @@ Qwen3NextDecoderLayer::Qwen3NextDecoderLayer(std::shared_ptrget("hidden_size"); double rms_norm_eps = model_config->get("rms_norm_eps"); - INFINICORE_NN_MODULE_INIT(input_layernorm, hidden_size, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(post_attention_layernorm, hidden_size, 
rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(mlp, model_config, device); + input_layernorm_ = this->register_module("input_layernorm", hidden_size, rms_norm_eps, dtype, device); + post_attention_layernorm_ = this->register_module("post_attention_layernorm", hidden_size, rms_norm_eps, dtype, device); + mlp_ = this->register_module("mlp", model_config, device); const std::vector layer_types = model_config->get>("layer_types"); layer_type_ = layer_types[layer_idx]; if ("linear_attention" == layer_type_) { - INFINICORE_NN_MODULE_INIT(linear_attn, model_config, layer_idx, device); + linear_attn_ = this->register_module("linear_attn", model_config, layer_idx, device); } else if ("full_attention" == layer_type_) { - INFINICORE_NN_MODULE_INIT(self_attn, model_config, layer_idx, device); + self_attn_ = this->register_module("self_attn", model_config, layer_idx, device); } else { throw std::runtime_error("infinilm::models::qwen3_next::Qwen3NextDecoderLayer: unsupported layer_type '" + layer_type_ + "' for layer " + std::to_string(layer_idx)); } diff --git a/csrc/models/qwen3_next/qwen3_next_decoderLayer.hpp b/csrc/models/qwen3_next/qwen3_next_decoderLayer.hpp index 5ae636ed..4a887275 100644 --- a/csrc/models/qwen3_next/qwen3_next_decoderLayer.hpp +++ b/csrc/models/qwen3_next/qwen3_next_decoderLayer.hpp @@ -25,11 +25,11 @@ class Qwen3NextDecoderLayer : public infinicore::nn::Module { size_t layer_idx() const { return layer_idx_; } protected: - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, post_attention_layernorm); - INFINICORE_NN_MODULE(Qwen3NextAttention, self_attn); - INFINICORE_NN_MODULE(Qwen3NextGatedDeltaNet, linear_attn); - INFINICORE_NN_MODULE(Qwen3NextSparseMoeBlock, mlp); + std::shared_ptr input_layernorm_; + std::shared_ptr post_attention_layernorm_; + std::shared_ptr self_attn_; + std::shared_ptr linear_attn_; + std::shared_ptr mlp_; private: size_t layer_idx_; diff --git a/csrc/models/qwen3_next/qwen3_next_for_causal_lm.cpp b/csrc/models/qwen3_next/qwen3_next_for_causal_lm.cpp index 7d2f8a6e..86a637c6 100644 --- a/csrc/models/qwen3_next/qwen3_next_for_causal_lm.cpp +++ b/csrc/models/qwen3_next/qwen3_next_for_causal_lm.cpp @@ -14,8 +14,8 @@ Qwen3NextForCausalLM::Qwen3NextForCausalLM(std::shared_ptrget("vocab_size"); const auto &dtype{model_config->get_dtype()}; - INFINICORE_NN_MODULE_INIT(model, model_config, device); - INFINICORE_NN_MODULE_INIT(lm_head, hidden_size, vocab_size, false, dtype, device); + model_ = this->register_module("model", model_config, device); + lm_head_ = this->register_module("lm_head", hidden_size, vocab_size, false, dtype, device); } infinilm::InfinilmModel::Output Qwen3NextForCausalLM::forward(const infinilm::InfinilmModel::Input &input) const { diff --git a/csrc/models/qwen3_next/qwen3_next_for_causal_lm.hpp b/csrc/models/qwen3_next/qwen3_next_for_causal_lm.hpp index b8d1a6a9..495de855 100644 --- a/csrc/models/qwen3_next/qwen3_next_for_causal_lm.hpp +++ b/csrc/models/qwen3_next/qwen3_next_for_causal_lm.hpp @@ -18,8 +18,8 @@ class Qwen3NextForCausalLM : public InfinilmModel { void reset_cache(const cache::CacheConfig *cache_config) override; protected: - INFINICORE_NN_MODULE(Qwen3NextModel, model); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head); + std::shared_ptr model_; + std::shared_ptr lm_head_; }; std::shared_ptr create_qwen3_next_model_config(std::shared_ptr model_config); diff --git a/csrc/models/qwen3_next/qwen3_next_gated_deltanet.cpp 
b/csrc/models/qwen3_next/qwen3_next_gated_deltanet.cpp index 19094003..9d3c7dfa 100644 --- a/csrc/models/qwen3_next/qwen3_next_gated_deltanet.cpp +++ b/csrc/models/qwen3_next/qwen3_next_gated_deltanet.cpp @@ -14,7 +14,8 @@ FakeConv1d::FakeConv1d(size_t in_channels, const infinicore::DataType dtype, const infinicore::Device device) { - INFINICORE_NN_PARAMETER_INIT(weight, ({out_channels, 1, kernel_size}, dtype, device)); + weight_ = infinicore::nn::Parameter({out_channels, 1, kernel_size}, dtype, device); + this->register_parameter("weight", weight_); } Qwen3NextGatedDeltaNet::Qwen3NextGatedDeltaNet(std::shared_ptr model_config, @@ -36,19 +37,21 @@ Qwen3NextGatedDeltaNet::Qwen3NextGatedDeltaNet(std::shared_ptrget("rms_norm_eps"); size_t conv_dim = key_dim * 2 + value_dim; - INFINICORE_NN_MODULE_INIT(conv1d, conv_dim, conv_dim, linear_conv_kernel_dim, 1, linear_conv_kernel_dim - 1, 1, 1, false, dtype, device); + conv1d_ = this->register_module("conv1d", conv_dim, conv_dim, linear_conv_kernel_dim, 1, linear_conv_kernel_dim - 1, 1, 1, false, dtype, device); size_t projection_size_qkvz = key_dim * 2 + value_dim * 2; size_t projection_size_ba = linear_num_value_heads * 2; - INFINICORE_NN_MODULE_INIT(in_proj_qkvz, hidden_size, projection_size_qkvz, false, dtype, device); - INFINICORE_NN_MODULE_INIT(in_proj_ba, hidden_size, projection_size_ba, false, dtype, device); + in_proj_qkvz_ = this->register_module("in_proj_qkvz", hidden_size, projection_size_qkvz, false, dtype, device); + in_proj_ba_ = this->register_module("in_proj_ba", hidden_size, projection_size_ba, false, dtype, device); - INFINICORE_NN_PARAMETER_INIT(dt_bias, ({linear_num_value_heads}, dtype, device)); - INFINICORE_NN_PARAMETER_INIT(A_log, ({linear_num_value_heads}, dtype, device)); + dt_bias_ = infinicore::nn::Parameter({linear_num_value_heads}, dtype, device); + this->register_parameter("dt_bias", dt_bias_); + A_log_ = infinicore::nn::Parameter({linear_num_value_heads}, dtype, device); + this->register_parameter("A_log", A_log_); - INFINICORE_NN_MODULE_INIT(norm, linear_value_head_dim, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(out_proj, value_dim, hidden_size, false, dtype, device); + norm_ = this->register_module("norm", linear_value_head_dim, rms_norm_eps, dtype, device); + out_proj_ = this->register_module("out_proj", value_dim, hidden_size, false, dtype, device); } infinicore::Tensor Qwen3NextGatedDeltaNet::forward(const infinicore::Tensor &positions, diff --git a/csrc/models/qwen3_next/qwen3_next_gated_deltanet.hpp b/csrc/models/qwen3_next/qwen3_next_gated_deltanet.hpp index 0ebcace2..a892e6ef 100644 --- a/csrc/models/qwen3_next/qwen3_next_gated_deltanet.hpp +++ b/csrc/models/qwen3_next/qwen3_next_gated_deltanet.hpp @@ -20,7 +20,7 @@ class FakeConv1d : public infinicore::nn::Module { private: size_t layer_idx_; - INFINICORE_NN_PARAMETER(weight); + infinicore::nn::Parameter weight_; }; class Qwen3NextGatedDeltaNet : public infinicore::nn::Module { @@ -33,13 +33,13 @@ class Qwen3NextGatedDeltaNet : public infinicore::nn::Module { const infinicore::Tensor &hidden_states) const; private: - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, in_proj_qkvz); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, in_proj_ba); - INFINICORE_NN_MODULE(FakeConv1d, conv1d); - INFINICORE_NN_PARAMETER(dt_bias); - INFINICORE_NN_PARAMETER(A_log); - INFINICORE_NN_MODULE(Qwen3Next_Fake_RMSNormGated, norm); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, out_proj); + std::shared_ptr 
in_proj_qkvz_; + std::shared_ptr in_proj_ba_; + std::shared_ptr conv1d_; + infinicore::nn::Parameter dt_bias_; + infinicore::nn::Parameter A_log_; + std::shared_ptr norm_; + std::shared_ptr out_proj_; size_t layer_idx_; }; diff --git a/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.cpp b/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.cpp index 512fe307..a11a1ad4 100644 --- a/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.cpp +++ b/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.cpp @@ -12,7 +12,7 @@ Qwen3VLModel::Qwen3VLModel(std::shared_ptr model_ nlohmann::json &text_config_json = config_json["text_config"]; std::shared_ptr text_config = std::make_shared(text_config_json); - INFINICORE_NN_MODULE_INIT(language_model, text_config, device); + language_model_ = this->register_module("language_model", text_config, device); } infinicore::Tensor Qwen3VLModel::forward(const infinilm::InfinilmModel::Input &input) const { @@ -31,8 +31,8 @@ Qwen3VLForConditionalGeneration::Qwen3VLForConditionalGeneration(std::shared_ptr size_t vocab_size = text_config->get("vocab_size"); const auto &dtype{model_config->get_dtype()}; - INFINICORE_NN_MODULE_INIT(model, model_config, device); - INFINICORE_NN_MODULE_INIT(lm_head, hidden_size, vocab_size, false, dtype, device); + model_ = this->register_module("model", model_config, device); + lm_head_ = this->register_module("lm_head", hidden_size, vocab_size, false, dtype, device); } infinilm::InfinilmModel::Output Qwen3VLForConditionalGeneration::forward(const infinilm::InfinilmModel::Input &input) const { @@ -57,7 +57,6 @@ void Qwen3VLForConditionalGeneration::reset_cache(const cache::CacheConfig *cach const backends::AttentionBackend attention_backend = infinilm::global_state::get_infinilm_config().attention_backend; kv_cache_vec = std::move(default_allocate_kv_cache_tensors(cache_config, text_model_config, attention_backend)); } - std::shared_ptr create_qwen3_vl_model_config(std::shared_ptr model_config) { const std::string &model_type = model_config->get("model_type"); if ("qwen3_vl" != model_type) { diff --git a/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.hpp b/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.hpp index dae5bc39..16486fab 100644 --- a/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.hpp +++ b/csrc/models/qwen3_vl/qwen3_vl_for_conditional_generation.hpp @@ -14,7 +14,7 @@ class Qwen3VLModel : public infinicore::nn::Module { infinicore::Tensor forward(const infinilm::InfinilmModel::Input &input) const; protected: - INFINICORE_NN_MODULE(Qwen3VLTextModel, language_model); + std::shared_ptr language_model_; }; class Qwen3VLForConditionalGeneration : public InfinilmModel { @@ -27,8 +27,8 @@ class Qwen3VLForConditionalGeneration : public InfinilmModel { void reset_cache(const cache::CacheConfig *cache_config) override; protected: - INFINICORE_NN_MODULE(Qwen3VLModel, model); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head); + std::shared_ptr model_; + std::shared_ptr lm_head_; }; std::shared_ptr create_qwen3_vl_model_config(std::shared_ptr model_config); diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index 2741c9cd..0d480bbf 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -30,50 +30,6 @@ namespace infinilm::engine { inline void bind_infer_engine(py::module &m) { py::class_> infer_engine(m, "InferEngine"); - infer_engine - .def(py::init([]( - const InfinilmModel::Config &cfg, - const 
distributed::DistConfig &dist, - infinicore::Device::Type dev, - std::shared_ptr cache_cfg, - bool enable_graph_compiling, - const std::string &attention_backend) { - return std::make_shared( - cfg, - dist, - dev, - cache_cfg ? cache_cfg.get() : nullptr, - enable_graph_compiling, - infinilm::backends::parse_attention_backend(attention_backend)); - }), - py::arg("config"), - py::arg("distributed_config") = distributed::DistConfig(), - py::arg("device_type") = infinicore::context::getDevice().getType(), - py::arg("cache_config") = py::none(), - py::arg("enable_graph_compiling") = false, - py::arg("attention_backend") = "default") - .def("load_param", &InferEngine::load_param, - py::arg("name"), py::arg("param"), - "Load a parameter tensor into all workers (each worker picks its shard)") - .def("state_dict", [](InferEngine &self) { - py::list state_dict_tp_all; - for (const auto &state_dict_tp : self.state_dict()) { - py::dict result; - for (const auto &[name, param] : state_dict_tp) { - result[py::cast(name)] = infinicore::Tensor(param); - } - state_dict_tp_all.append(result); - } - return state_dict_tp_all; - }) - .def("process_weights_after_loading", &InferEngine::process_weights_after_loading, "Process the weights after loading on all workers (e.g., for quantization)") - .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") - .def("reset_cache", [](InferEngine &self, std::shared_ptr cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none()) - .def("get_cache_config", [](const InferEngine &self) -> std::shared_ptr { - auto cfg = self.get_cache_config(); - return cfg ? std::shared_ptr(cfg->unique_copy()) : nullptr; }) - .def("__repr__", [](const InferEngine &self) { return ""; }); - infer_engine .def(py::init([]( const std::string &model_path, @@ -116,9 +72,9 @@ inline void bind_infer_engine(py::module &m) { .def("process_weights_after_loading", &InferEngine::process_weights_after_loading, "Process the weights after loading on all workers (e.g., for quantization)") .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") .def("reset_cache", [](InferEngine &self, std::shared_ptr cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none()) - .def("get_cache_config", [](const InferEngine &self) { + .def("get_cache_config", [](const InferEngine &self) -> std::shared_ptr { auto cfg = self.get_cache_config(); - return std::shared_ptr(std::move(cfg->unique_copy())); }) + return cfg ? 
std::shared_ptr(cfg->unique_copy()) : nullptr; })
         .def("__repr__", [](const InferEngine &self) { return ""; });
 
     py::class_(infer_engine, "Input")
diff --git a/csrc/utils.hpp b/csrc/utils.hpp
index 805e1254..94f4a197 100644
--- a/csrc/utils.hpp
+++ b/csrc/utils.hpp
@@ -1,5 +1,6 @@
 #pragma once
 #include
+#include
 #include
 #include
@@ -7,6 +8,7 @@
 #include
 #include
 #include
+#include
 
 inline void assertTrue(int expr, const char *msg, const char *function, const char *file, int line) {
     if (!expr) {
@@ -120,6 +122,16 @@ inline uint16_t f32_to_bf16(float val) {
     return bf16_bits;
 }
 
+inline void set_zeros(infinicore::Tensor &tensor) {
+    std::vector<uint8_t> zeros(tensor->nbytes(), 0);
+    infinicore::context::memcpyH2D(tensor->data(), zeros.data(), tensor->nbytes(), false);
+}
+
+inline void set_minus_one(infinicore::Tensor &tensor) {
+    std::vector<uint8_t> minus_one(tensor->nbytes(), 0xFF);
+    infinicore::context::memcpyH2D(tensor->data(), minus_one.data(), tensor->nbytes(), false);
+}
+
 // Hash combine utility (similar to boost::hash_combine)
 inline void hash_combine(size_t &seed, size_t value) {
     seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
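// Editorial aside (not part of the patch): a minimal usage sketch for the
// set_zeros()/set_minus_one() helpers added above. The tensor-creation call,
// shape, and dtype below are illustrative assumptions, not the project's
// exact API surface. Both helpers fill raw bytes, so set_minus_one() reads
// back as -1 only for signed-integer dtypes; an all-0xFF byte pattern decodes
// as NaN for floating-point tensors. The helpers pass `false` as the final
// memcpyH2D argument, which may request an asynchronous copy; if so, the
// stream should be synchronized before the data is consumed.
//
//     auto flags = infinicore::Tensor::empty({num_blocks, block_size},
//                                            infinicore::DataType::I32,   // hypothetical dtype
//                                            device);                     // hypothetical device handle
//     set_zeros(flags);                  // byte-fill with 0x00
//     set_minus_one(flags);              // byte-fill with 0xFF -> each I32 element reads as -1
//     infinicore::context::syncStream(); // only needed if the copies above are asynchronous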