From d9f14e6ee703c6cbd808491f7a055d5caad41131 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Fri, 26 Jun 2026 21:09:34 +0800 Subject: [PATCH] feat: support video nsa --- csrc/engine/infer_engine.cpp | 4 +- csrc/engine/rank_worker.hpp | 2 + csrc/global_state/forward_context.hpp | 2 + csrc/models/infinilm_model.hpp | 2 + csrc/models/videonsa/videonsa_attention.cpp | 469 ++++++++++++++++ csrc/models/videonsa/videonsa_attention.hpp | 38 ++ .../videonsa_for_conditional_generation.cpp | 174 ++++++ .../videonsa_for_conditional_generation.hpp | 33 ++ csrc/models/videonsa/videonsa_vision.cpp | 288 ++++++++++ csrc/models/videonsa/videonsa_vision.hpp | 129 +++++ csrc/pybind11/engine/engine.hpp | 6 +- examples/bench_videonsa.py | 238 ++++++++ examples/test_infer.py | 13 +- python/infinilm/base_config.py | 71 ++- python/infinilm/infer_engine.py | 28 +- python/infinilm/modeling_utils.py | 15 +- python/infinilm/multimodal/multimodal.py | 17 +- python/infinilm/processors/__init__.py | 11 + .../infinilm/processors/videonsa_processor.py | 523 ++++++++++++++++++ 19 files changed, 2052 insertions(+), 11 deletions(-) create mode 100644 csrc/models/videonsa/videonsa_attention.cpp create mode 100644 csrc/models/videonsa/videonsa_attention.hpp create mode 100644 csrc/models/videonsa/videonsa_for_conditional_generation.cpp create mode 100644 csrc/models/videonsa/videonsa_for_conditional_generation.hpp create mode 100644 csrc/models/videonsa/videonsa_vision.cpp create mode 100644 csrc/models/videonsa/videonsa_vision.hpp create mode 100644 examples/bench_videonsa.py create mode 100644 python/infinilm/processors/videonsa_processor.py diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index 7c8429236..13d839c53 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -156,6 +156,7 @@ InferEngine::Input::to_model_input(infinicore::Device device) const { to_device_vec(pixel_values), to_device_vec(image_bound), to_device_vec(tgt_sizes), + visual_token_ranges, }; infinilm::global_state::get_forward_context().attn_metadata = { @@ -167,7 +168,8 @@ InferEngine::Input::to_model_input(infinicore::Device device) const { input.slot_mapping}; global_state::get_forward_context().mm_metadata = { - image_req_ids}; + image_req_ids, + visual_token_ranges}; return input; } diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp index a26a3cd69..28f2776d4 100644 --- a/csrc/engine/rank_worker.hpp +++ b/csrc/engine/rank_worker.hpp @@ -60,6 +60,8 @@ class RankWorker { std::optional> tgt_sizes; /// req_id for each pixel_values among a batch std::optional> image_req_ids; + /// Flattened [start, end) visual token ranges in the packed language sequence. + std::optional> visual_token_ranges; float temperature{1}; diff --git a/csrc/global_state/forward_context.hpp b/csrc/global_state/forward_context.hpp index 2568fc7ee..0d397a147 100644 --- a/csrc/global_state/forward_context.hpp +++ b/csrc/global_state/forward_context.hpp @@ -42,6 +42,8 @@ struct AttentionMetadata { struct MultiModalMetadata { std::optional> image_req_ids; + // Flattened [start, end) token ranges in the current packed language sequence. + std::optional> visual_token_ranges; }; struct ForwardContext { diff --git a/csrc/models/infinilm_model.hpp b/csrc/models/infinilm_model.hpp index c76a29f08..fdebc49d7 100644 --- a/csrc/models/infinilm_model.hpp +++ b/csrc/models/infinilm_model.hpp @@ -43,6 +43,8 @@ class InfinilmModel : public infinicore::nn::Module { /// Target patch sizes for each image (MiniCPM-V). /// Vector of tensors shape: [n_path, 2] if pre-flattened. std::optional> tgt_sizes; + /// Flattened [start, end) visual token ranges in the packed language sequence. + std::optional> visual_token_ranges; }; struct Output { diff --git a/csrc/models/videonsa/videonsa_attention.cpp b/csrc/models/videonsa/videonsa_attention.cpp new file mode 100644 index 000000000..828d10860 --- /dev/null +++ b/csrc/models/videonsa/videonsa_attention.cpp @@ -0,0 +1,469 @@ +#include "videonsa_attention.hpp" +#include "../../global_state/global_state.hpp" +#include "../../utils.hpp" +#include "infinicore/context/context.hpp" +#include "infinicore/ops/add.hpp" +#include "infinicore/ops/cat.hpp" +#include "infinicore/ops/matmul.hpp" +#include "infinicore/ops/mha_varlen.hpp" +#include "infinicore/ops/mrope.hpp" +#include "infinicore/ops/mul.hpp" +#include "infinicore/ops/nsa_compress_paged_cache.hpp" +#include "infinicore/ops/nsa_paged_attention.hpp" +#include "infinicore/ops/paged_caching.hpp" +#include "infinicore/ops/sigmoid.hpp" +#include "infinicore/ops/silu.hpp" +#include "infinicore/ops/softmax.hpp" +#include "infinicore/ops/sum.hpp" +#include "infinicore/ops/take.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace infinilm::models::videonsa { + +namespace { + +constexpr size_t kNsaBlockSize = 64; +constexpr int kNsaSelectBlocks = 4; + +infinicore::Tensor scalar_tensor(float value, const infinicore::Device &device) { + auto cpu = infinicore::Tensor::from_blob(&value, {1}, infinicore::DataType::F32, infinicore::Device::cpu()); + return cpu->to(device); +} + +// Temporarily creating cache here. Scheduled to be replaced by a revamped mrope module. +std::pair build_mrope_cache(size_t max_seq_len, + size_t rotary_dim, + double theta, + const infinicore::DataType &dtype, + const infinicore::Device &device) { + if (rotary_dim == 0 || rotary_dim % 2 != 0) { + throw std::invalid_argument("VideoNSAAttention: invalid mrope rotary_dim"); + } + const size_t cache_dim = rotary_dim / 2; + const size_t numel = max_seq_len * cache_dim; + std::vector sin_data(numel); + std::vector cos_data(numel); + for (size_t pos = 0; pos < max_seq_len; ++pos) { + for (size_t dim_idx = 0; dim_idx < cache_dim; ++dim_idx) { + const float inv_freq = 1.0f / std::pow(static_cast(theta), 2.0f * static_cast(dim_idx) / static_cast(rotary_dim)); + const float angle = static_cast(pos) * inv_freq; + const size_t offset = pos * cache_dim + dim_idx; + sin_data[offset] = std::sin(angle); + cos_data[offset] = std::cos(angle); + } + } + + const auto cpu = infinicore::Device::cpu(); + auto sin_cache = infinicore::Tensor::empty({max_seq_len, cache_dim}, dtype, device); + auto cos_cache = infinicore::Tensor::empty({max_seq_len, cache_dim}, dtype, device); + if (dtype == infinicore::DataType::F32) { + auto sin_cpu = infinicore::Tensor::from_blob(sin_data.data(), {max_seq_len, cache_dim}, infinicore::DataType::F32, cpu); + auto cos_cpu = infinicore::Tensor::from_blob(cos_data.data(), {max_seq_len, cache_dim}, infinicore::DataType::F32, cpu); + sin_cache->copy_from(sin_cpu); + cos_cache->copy_from(cos_cpu); + return {sin_cache, cos_cache}; + } + if (dtype == infinicore::DataType::BF16) { + std::vector sin_bf16(numel); + std::vector cos_bf16(numel); + for (size_t i = 0; i < numel; ++i) { + sin_bf16[i] = f32_to_bf16(sin_data[i]); + cos_bf16[i] = f32_to_bf16(cos_data[i]); + } + auto sin_cpu = infinicore::Tensor::from_blob(sin_bf16.data(), {max_seq_len, cache_dim}, infinicore::DataType::BF16, cpu); + auto cos_cpu = infinicore::Tensor::from_blob(cos_bf16.data(), {max_seq_len, cache_dim}, infinicore::DataType::BF16, cpu); + sin_cache->copy_from(sin_cpu); + cos_cache->copy_from(cos_cpu); + return {sin_cache, cos_cache}; + } + if (dtype == infinicore::DataType::F16) { + std::vector sin_f16(numel); + std::vector cos_f16(numel); + for (size_t i = 0; i < numel; ++i) { + sin_f16[i] = f32_to_f16(sin_data[i]); + cos_f16[i] = f32_to_f16(cos_data[i]); + } + auto sin_cpu = infinicore::Tensor::from_blob(sin_f16.data(), {max_seq_len, cache_dim}, infinicore::DataType::F16, cpu); + auto cos_cpu = infinicore::Tensor::from_blob(cos_f16.data(), {max_seq_len, cache_dim}, infinicore::DataType::F16, cpu); + sin_cache->copy_from(sin_cpu); + cos_cache->copy_from(cos_cpu); + return {sin_cache, cos_cache}; + } + throw std::runtime_error("VideoNSAAttention: mrope cache dtype is unsupported"); +} + +std::pair apply_mrope(const infinicore::Tensor &q, + const infinicore::Tensor &k, + const infinicore::Tensor &positions, + const infinicore::Tensor &cos_cache, + const infinicore::Tensor &sin_cache, + size_t head_dim, + size_t rotary_dim, + const std::array §ion, + bool interleaved) { + const size_t num_tokens = q->size(0); + auto q_flat = q->contiguous()->view({num_tokens, q->size(1) * head_dim}); + auto k_flat = k->contiguous()->view({num_tokens, k->size(1) * head_dim}); + auto qk_rope = infinicore::op::mrope(q_flat, + k_flat, + cos_cache, + sin_cache, + positions, + static_cast(head_dim), + static_cast(rotary_dim), + section[0], + section[1], + section[2], + interleaved); + return {qk_rope.first->view(q->shape()), qk_rope.second->view(k->shape())}; +} + +infinicore::Tensor mean_pool_blocks(const infinicore::Tensor &x, size_t block_size) { + const size_t seq_len = x->size(0); + std::vector blocks; + blocks.reserve((seq_len + block_size - 1) / block_size); + for (size_t start = 0; start < seq_len; start += block_size) { + const size_t len = std::min(block_size, seq_len - start); + auto block = x->narrow({{0, start, len}}); + auto pooled = infinicore::op::sum(block, {0}, false); + blocks.push_back(pooled->unsqueeze(0)); + } + return blocks.size() == 1 ? blocks.front() : infinicore::op::cat(blocks, 0); +} + +infinicore::Tensor repeat_group_tensor(const infinicore::Tensor &x, size_t repeats) { + std::vector copies; + copies.reserve(repeats); + for (size_t i = 0; i < repeats; ++i) { + copies.push_back(x); + } + return copies.size() == 1 ? copies.front() : infinicore::op::cat(copies, 0); +} + +infinicore::Tensor grouped_dense_attention(const infinicore::Tensor &q, + const infinicore::Tensor &k, + const infinicore::Tensor &v, + size_t num_heads, + size_t num_kv_heads, + size_t head_dim, + float scale) { + const size_t seq_len = q->size(0); + const size_t kv_len = k->size(0); + const size_t heads_per_group = num_heads / num_kv_heads; + std::vector group_outputs; + group_outputs.reserve(num_kv_heads); + for (size_t g = 0; g < num_kv_heads; ++g) { + auto q_group = q->narrow({{1, g * heads_per_group, heads_per_group}}) + ->permute({1, 0, 2}) + ->contiguous(); // [heads_per_group, seq, dim] + auto k_group = k->narrow({{1, g, 1}})->squeeze(1)->unsqueeze(0); // [1, kv, dim] + auto v_group = v->narrow({{1, g, 1}})->squeeze(1)->unsqueeze(0); // [1, kv, dim] + auto k_repeated = repeat_group_tensor(k_group, heads_per_group); + auto v_repeated = repeat_group_tensor(v_group, heads_per_group); + auto scores = infinicore::op::matmul(q_group, k_repeated->permute({0, 2, 1}), scale); + infinicore::op::softmax_(scores, scores, -1); + auto out = infinicore::op::matmul(scores, v_repeated) + ->view({heads_per_group, seq_len, head_dim}) + ->permute({1, 0, 2}) + ->contiguous(); + group_outputs.push_back(out); + } + return group_outputs.size() == 1 ? group_outputs.front() : infinicore::op::cat(group_outputs, 1); +} + +infinicore::Tensor expand_head_gate(const infinicore::Tensor &gate, size_t head_dim) { + const size_t seq_len = gate->size(1); + const size_t num_heads = gate->size(2); + auto flat_gate = gate->contiguous()->view({seq_len * num_heads}); + std::vector expand_indices(seq_len * num_heads * head_dim); + for (size_t row = 0; row < seq_len * num_heads; ++row) { + for (size_t d = 0; d < head_dim; ++d) { + expand_indices[row * head_dim + d] = static_cast(row); + } + } + auto indices = infinicore::Tensor::empty({seq_len, num_heads, head_dim}, infinicore::DataType::I64, gate->device()); + infinicore::context::memcpyH2D(indices->data(), expand_indices.data(), expand_indices.size() * sizeof(int64_t), false); + return infinicore::op::take(flat_gate, indices); +} + +std::array read_mrope_section(const std::shared_ptr &model_config) { + const auto &config_json = model_config->get_config_json(); + if (!config_json.contains("rope_scaling") || !config_json["rope_scaling"].contains("mrope_section")) { + throw std::runtime_error("VideoNSAAttention: rope_scaling.mrope_section is required"); + } + auto section = config_json["rope_scaling"]["mrope_section"].get>(); + if (section.size() != 3) { + throw std::runtime_error("VideoNSAAttention: mrope_section must contain three entries"); + } + return {section[0], section[1], section[2]}; +} + +infinicore::Tensor local_head_gates(const infinicore::Tensor &gates, + size_t head_dim_index, + size_t total_num_heads, + size_t local_num_heads) { + if (total_num_heads == local_num_heads) { + return gates; + } + const size_t tp_rank = static_cast(infinilm::global_state::get_tensor_model_parallel_rank()); + return gates->narrow({{head_dim_index, tp_rank * local_num_heads, local_num_heads}}); +} + +} // namespace + +VideoNSAAttention::VideoNSAAttention(std::shared_ptr model_config, + size_t layer_idx, + const infinicore::Device &device) + : infinilm::layers::attention::Attention(model_config, layer_idx, device) { + const auto &dtype{model_config->get_dtype()}; + const size_t hidden_size = model_config->get("hidden_size"); + const size_t total_num_heads = model_config->get("num_attention_heads"); + total_num_attention_heads_ = total_num_heads; + max_position_embeddings_ = model_config->get("max_position_embeddings"); + + mrope_section_ = read_mrope_section(model_config); + mrope_rotary_dim_ = 2 * static_cast(mrope_section_[0] + mrope_section_[1] + mrope_section_[2]); + if (mrope_rotary_dim_ > head_dim_) { + throw std::runtime_error("VideoNSAAttention: mrope rotary dim exceeds head dim"); + } + auto mrope_cache = build_mrope_cache(max_position_embeddings_, + mrope_rotary_dim_, + model_config->get("rope_theta"), + dtype, + device); + mrope_sin_cache_ = mrope_cache.first; + mrope_cos_cache_ = mrope_cache.second; + INFINICORE_NN_MODULE_INIT(g_proj_1, hidden_size, hidden_size, true, dtype, device); + INFINICORE_NN_MODULE_INIT(g_proj_2, hidden_size, 3 * total_num_heads, true, dtype, device); +} + +infinicore::Tensor VideoNSAAttention::forward(const infinicore::Tensor &positions, + const infinicore::Tensor &hidden_states) const { + const auto &forward_context = infinilm::global_state::get_forward_context(); + const auto &mm_metadata = forward_context.mm_metadata; + const bool has_visual_ranges = mm_metadata.visual_token_ranges.has_value() && !mm_metadata.visual_token_ranges->empty(); + const auto &attn_metadata = forward_context.attn_metadata; + const bool is_paged = ::infinilm::backends::AttentionBackend::PAGED_ATTN == attention_backend_ + || ::infinilm::backends::AttentionBackend::FLASH_ATTN == attention_backend_; + const bool is_flash_attn = ::infinilm::backends::AttentionBackend::FLASH_ATTN == attention_backend_; + const bool has_paged_metadata = attn_metadata.total_sequence_lengths.has_value() + && attn_metadata.slot_mapping.has_value() + && attn_metadata.block_tables.has_value(); + const bool is_decode = has_paged_metadata + && hidden_states->size(0) == 1 + && hidden_states->size(1) == attn_metadata.total_sequence_lengths.value()->shape()[0]; + const bool can_use_paged_decode_nsa = is_paged && has_paged_metadata && is_decode; + const bool can_use_fast_prefill = is_flash_attn + && has_paged_metadata + && hidden_states->size(0) == 1 + && !is_decode; + const bool can_use_paged_prefill_nsa = false && has_visual_ranges + && is_paged + && has_paged_metadata + && hidden_states->size(0) == 1 + && hidden_states->size(1) != attn_metadata.total_sequence_lengths.value()->shape()[0]; + + const bool can_use_static_scattered_nsa = ::infinilm::backends::AttentionBackend::STATIC_ATTN == attention_backend_; + if (!can_use_static_scattered_nsa && !can_use_fast_prefill && !can_use_paged_prefill_nsa && !can_use_paged_decode_nsa) { + return infinilm::layers::attention::Attention::forward(positions, hidden_states); + } + + if (can_use_fast_prefill && !can_use_paged_prefill_nsa) { + return infinilm::layers::attention::Attention::forward(positions, hidden_states); + } + + auto hidden_states_mutable = hidden_states; + const size_t batch_size = hidden_states->size(0); + const size_t seq_len = hidden_states->size(1); + auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable); + + auto pos_shape = positions->shape(); + infinicore::Tensor pos_ids_for_rope = positions; + if (pos_shape.size() != 1 && pos_shape.size() != 2 && pos_shape.size() != 3) { + throw std::runtime_error("VideoNSAAttention: Unexpected position_ids shape"); + } + const float scale = 1.0f / std::sqrt(static_cast(head_dim_)); + + if (can_use_static_scattered_nsa) { + auto q_static = q->view({batch_size, seq_len, num_attention_heads_, head_dim_}); + auto k_static = k->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + auto v_static = v->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + + auto q_flat = q_static->contiguous()->view({batch_size * seq_len, num_attention_heads_, head_dim_}); + auto k_flat = k_static->contiguous()->view({batch_size * seq_len, num_key_value_heads_, head_dim_}); + auto qk_rope = apply_mrope(q_flat, + k_flat, + pos_ids_for_rope, + mrope_cos_cache_.value(), + mrope_sin_cache_.value(), + head_dim_, + mrope_rotary_dim_, + mrope_section_, + mrope_interleaved_); + auto q_rope = qk_rope.first->view({batch_size, seq_len, num_attention_heads_, head_dim_}); + k_static = qk_rope.second->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + + auto &kv_cache = forward_context.kv_cache_vec[layer_idx_]; + auto k_cache_layer = kv_cache->narrow({{0, 0, 1}})->squeeze(0); + auto v_cache_layer = kv_cache->narrow({{0, 1, 1}})->squeeze(0); + const size_t cache_pos = reinterpret_cast(attn_metadata.past_sequence_lengths.value()->to(infinicore::Device::cpu())->data())[0]; + const size_t total_seq_len = cache_pos + seq_len; + k_cache_layer->narrow({{2, cache_pos, seq_len}})->copy_from(k_static->permute({0, 2, 1, 3})); + v_cache_layer->narrow({{2, cache_pos, seq_len}})->copy_from(v_static->permute({0, 2, 1, 3})); + auto k_total = k_cache_layer->narrow({{2, 0, total_seq_len}}); + auto v_total = v_cache_layer->narrow({{2, 0, total_seq_len}}); + + auto gate_hidden = g_proj_1_->forward(hidden_states_mutable); + gate_hidden = infinicore::op::silu(gate_hidden); + auto gates = g_proj_2_->forward(gate_hidden); + gates = gates->view({batch_size, seq_len, 3, total_num_attention_heads_}); + gates = local_head_gates(gates, 3, total_num_attention_heads_, num_attention_heads_); + gates = infinicore::op::sigmoid(gates); + + std::vector batch_outputs; + batch_outputs.reserve(batch_size); + for (size_t b = 0; b < batch_size; ++b) { + auto q_b = q_rope->narrow({{0, b, 1}})->squeeze(0); + auto k_b = k_total->narrow({{0, b, 1}})->squeeze(0)->permute({1, 0, 2}); + auto v_b = v_total->narrow({{0, b, 1}})->squeeze(0)->permute({1, 0, 2}); + + auto k_cmp = mean_pool_blocks(k_b, kNsaBlockSize); + auto v_cmp = mean_pool_blocks(v_b, kNsaBlockSize); + auto comp_heads = grouped_dense_attention(q_b, k_cmp, v_cmp, num_attention_heads_, num_key_value_heads_, head_dim_, scale); + + const size_t win_len = std::min(256, total_seq_len); + auto k_win = k_b->narrow({{0, total_seq_len - win_len, win_len}}); + auto v_win = v_b->narrow({{0, total_seq_len - win_len, win_len}}); + auto win_heads = grouped_dense_attention(q_b, k_win, v_win, num_attention_heads_, num_key_value_heads_, head_dim_, scale); + + auto gates_b = gates->narrow({{0, b, 1}}); + auto g_cmp = expand_head_gate(gates_b->narrow({{2, 0, 1}})->squeeze(2), head_dim_); + auto g_sel = expand_head_gate(gates_b->narrow({{2, 1, 1}})->squeeze(2), head_dim_); + auto g_win = expand_head_gate(gates_b->narrow({{2, 2, 1}})->squeeze(2), head_dim_); + + // Experimental scattered path: selected-block attention reuses the compressed output + // to keep this branch expressible with existing dense ops only. + auto comp_part = infinicore::op::mul(comp_heads, g_cmp); + auto sel_part = infinicore::op::mul(comp_heads, g_sel); + auto win_part = infinicore::op::mul(win_heads, g_win); + auto mixed = infinicore::op::add(infinicore::op::add(comp_part, sel_part), win_part); + batch_outputs.push_back(mixed->view({1, seq_len, num_attention_heads_ * head_dim_})); + } + auto attn_output = batch_outputs.size() == 1 ? batch_outputs.front() : infinicore::op::cat(batch_outputs, 0); + return o_proj_->forward(attn_output); + } + + auto q_reshaped = q->view({seq_len, num_attention_heads_, head_dim_}); + auto k_reshaped = k->view({seq_len, num_key_value_heads_, head_dim_}); + auto v_reshaped = v->view({seq_len, num_key_value_heads_, head_dim_}); + auto qk_rope = apply_mrope(q_reshaped, + k_reshaped, + pos_ids_for_rope, + mrope_cos_cache_.value(), + mrope_sin_cache_.value(), + head_dim_, + mrope_rotary_dim_, + mrope_section_, + mrope_interleaved_); + q_reshaped = qk_rope.first; + k_reshaped = qk_rope.second; + + auto &kv_cache = forward_context.kv_cache_vec[layer_idx_]; + auto k_cache_layer = kv_cache->narrow({{0, 0, 1}})->squeeze(0); + auto v_cache_layer = kv_cache->narrow({{0, 1, 1}})->squeeze(0); + auto k_cache_for_nsa = is_flash_attn ? k_cache_layer->permute({0, 2, 1, 3}) : k_cache_layer; + auto v_cache_for_nsa = is_flash_attn ? v_cache_layer->permute({0, 2, 1, 3}) : v_cache_layer; + + infinicore::op::paged_caching_(k_cache_for_nsa, v_cache_for_nsa, k_reshaped, v_reshaped, attn_metadata.slot_mapping.value()); + + if (can_use_paged_decode_nsa) { + auto gate_hidden = g_proj_1_->forward(hidden_states_mutable); + gate_hidden = infinicore::op::silu(gate_hidden); + auto gates = g_proj_2_->forward(gate_hidden); + gates = gates->view({seq_len, 3, total_num_attention_heads_}); + gates = local_head_gates(gates, 2, total_num_attention_heads_, num_attention_heads_); + gates = infinicore::op::sigmoid(gates); + + const size_t page_block_size = k_cache_for_nsa->size(2); + const size_t subblocks_per_page = page_block_size / kNsaBlockSize; + const size_t cmp_blocks = k_cache_for_nsa->size(0) * subblocks_per_page; + const bool need_cmp_alloc = !nsa_k_cmp_cache_.has_value() + || nsa_k_cmp_cache_.value()->size(0) != cmp_blocks + || nsa_k_cmp_cache_.value()->size(1) != num_key_value_heads_ + || nsa_k_cmp_cache_.value()->size(2) != head_dim_; + if (need_cmp_alloc) { + nsa_k_cmp_cache_ = infinicore::Tensor::empty({cmp_blocks, num_key_value_heads_, head_dim_}, k_cache_for_nsa->dtype(), k_cache_layer->device()); + nsa_v_cmp_cache_ = infinicore::Tensor::empty({cmp_blocks, num_key_value_heads_, head_dim_}, v_cache_for_nsa->dtype(), v_cache_layer->device()); + nsa_cmp_cache_ready_ = false; + } + const bool update_last_only = nsa_cmp_cache_ready_; + infinicore::op::nsa_compress_paged_cache_( + nsa_k_cmp_cache_.value(), + nsa_v_cmp_cache_.value(), + k_cache_for_nsa, + v_cache_for_nsa, + attn_metadata.block_tables.value(), + attn_metadata.total_sequence_lengths.value(), + static_cast(kNsaBlockSize), + update_last_only); + nsa_cmp_cache_ready_ = true; + + auto nsa_heads = infinicore::Tensor::empty({seq_len, num_attention_heads_, head_dim_}, q_reshaped->dtype(), q_reshaped->device()); + infinicore::op::nsa_paged_attention_( + nsa_heads, + q_reshaped, + nsa_k_cmp_cache_.value(), + nsa_v_cmp_cache_.value(), + k_cache_for_nsa, + v_cache_for_nsa, + attn_metadata.block_tables.value(), + attn_metadata.total_sequence_lengths.value(), + gates, + scale, + static_cast(kNsaBlockSize), + 256, + kNsaSelectBlocks); + auto attn_output = nsa_heads->view({1, seq_len, num_attention_heads_ * head_dim_}); + return o_proj_->forward(attn_output); + } + + auto k_cmp = mean_pool_blocks(k_reshaped, kNsaBlockSize); + auto v_cmp = mean_pool_blocks(v_reshaped, kNsaBlockSize); + auto nsa_heads = grouped_dense_attention(q_reshaped, k_cmp, v_cmp, num_attention_heads_, num_key_value_heads_, head_dim_, scale); + + auto gate_hidden = g_proj_1_->forward(hidden_states_mutable); + gate_hidden = infinicore::op::silu(gate_hidden); + auto gates = g_proj_2_->forward(gate_hidden); + gates = gates->view({1, seq_len, 3, total_num_attention_heads_}); + gates = local_head_gates(gates, 3, total_num_attention_heads_, num_attention_heads_); + gates = infinicore::op::sigmoid(gates); + auto gate_sum = infinicore::op::sum(gates, {2}, false); // [1, seq, heads] + auto gate_expanded = expand_head_gate(gate_sum, head_dim_); + nsa_heads = infinicore::op::mul(nsa_heads, gate_expanded); + + auto attn_output = nsa_heads->view({1, seq_len, num_attention_heads_ * head_dim_}); + return o_proj_->forward(attn_output); +} + +void VideoNSAAttention::process_weights_after_loading() { + infinilm::layers::attention::Attention::process_weights_after_loading(); + g_proj_1_->process_weights_after_loading(); + g_proj_2_->process_weights_after_loading(); +} + +void VideoNSAAttention::reset_runtime_state() const { + infinilm::layers::attention::Attention::reset_runtime_state(); + g_proj_1_->reset_runtime_state(); + g_proj_2_->reset_runtime_state(); + nsa_k_cmp_cache_.reset(); + nsa_v_cmp_cache_.reset(); + nsa_cmp_cache_ready_ = false; +} + +} // namespace infinilm::models::videonsa diff --git a/csrc/models/videonsa/videonsa_attention.hpp b/csrc/models/videonsa/videonsa_attention.hpp new file mode 100644 index 000000000..de8786912 --- /dev/null +++ b/csrc/models/videonsa/videonsa_attention.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include "../../layers/attention/attention.hpp" +#include + +#include + +namespace infinilm::models::videonsa { + +class VideoNSAAttention : public infinilm::layers::attention::Attention { +public: + VideoNSAAttention(std::shared_ptr model_config, + size_t layer_idx, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &positions, + const infinicore::Tensor &hidden_states) const; + + void process_weights_after_loading() override; + void reset_runtime_state() const override; + +protected: + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, g_proj_1); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, g_proj_2); + + mutable std::optional nsa_k_cmp_cache_; + mutable std::optional nsa_v_cmp_cache_; + mutable bool nsa_cmp_cache_ready_ = false; + mutable std::optional mrope_sin_cache_; + mutable std::optional mrope_cos_cache_; + std::array mrope_section_ = {0, 0, 0}; + size_t mrope_rotary_dim_ = 0; + bool mrope_interleaved_ = false; + size_t total_num_attention_heads_ = 0; + size_t max_position_embeddings_ = 0; +}; + +} // namespace infinilm::models::videonsa diff --git a/csrc/models/videonsa/videonsa_for_conditional_generation.cpp b/csrc/models/videonsa/videonsa_for_conditional_generation.cpp new file mode 100644 index 000000000..54fbad26b --- /dev/null +++ b/csrc/models/videonsa/videonsa_for_conditional_generation.cpp @@ -0,0 +1,174 @@ +#include "videonsa_for_conditional_generation.hpp" +#include "../../global_state/global_state.hpp" +#include "../models_registry.hpp" +#include "infinicore/ops/cat.hpp" +#include +#include + +namespace infinilm::models::videonsa { + +namespace { + +std::shared_ptr text_config_from(std::shared_ptr model_config) { + nlohmann::json &config_json = model_config->get_config_json(); + nlohmann::json text_config_json = config_json.contains("text_config") ? config_json["text_config"] : config_json; + text_config_json["model_type"] = "videonsa"; + if (!text_config_json.contains("torch_dtype") && config_json.contains("torch_dtype")) { + text_config_json["torch_dtype"] = config_json["torch_dtype"]; + } + if (!text_config_json.contains("head_dim")) { + text_config_json["head_dim"] = text_config_json["hidden_size"].get() / text_config_json["num_attention_heads"].get(); + } + if (!text_config_json.contains("attention_bias")) { + text_config_json["attention_bias"] = true; + } + return std::make_shared(text_config_json); +} + +} // namespace + +VideoNSAForConditionalGeneration::VideoNSAForConditionalGeneration(std::shared_ptr model_config, + const infinicore::Device &device) { + model_config_ = model_config; + auto text_config = text_config_from(model_config); + const size_t hidden_size = text_config->get("hidden_size"); + const size_t vocab_size = text_config->get("vocab_size"); + const auto &dtype{text_config->get_dtype()}; + + INFINICORE_NN_MODULE_INIT(model, text_config, device); + INFINICORE_NN_MODULE_INIT(visual, model_config->get_config_json()["vision_config"], dtype, device); + INFINICORE_NN_MODULE_INIT(lm_head, hidden_size, vocab_size, false, dtype, device); +} + +void VideoNSAForConditionalGeneration::replace_embeddings(infinicore::Tensor inputs_embeds, + const infinicore::Tensor &vision_hidden, + const infinicore::Tensor &image_bound) const { + auto bounds_cpu = image_bound->to(infinicore::Device::cpu()); + auto out_slice = inputs_embeds->squeeze(0); + auto bound_slice = bounds_cpu->squeeze(0); + auto bound_count = bound_slice->size(0); + size_t vision_offset = 0; + for (size_t i = 0; i < bound_count; ++i) { + auto bound = bound_slice->narrow({{0, i, 1}}); + auto bound_ptr = reinterpret_cast(bound->data()); + auto start = bound_ptr[0]; + auto end = bound_ptr[1]; + if (end <= start) { + continue; + } + const size_t len = static_cast(end - start); + auto patch_embed = vision_hidden->narrow({{0, vision_offset, len}}); + out_slice->narrow({{0, size_t(start), len}})->copy_from(patch_embed); + vision_offset += len; + } +} + +infinilm::InfinilmModel::Output VideoNSAForConditionalGeneration::forward(const infinilm::InfinilmModel::Input &input) const { + if (input.pixel_values.has_value() && input.pixel_values.value().size() > 0) { + if (!input.image_bound.has_value() || !input.tgt_sizes.has_value()) { + throw std::runtime_error("VideoNSAForConditionalGeneration: image_bound and tgt_sizes must be provided with pixel_values"); + } + auto input_ids = input.input_ids.value(); + auto inputs_embeds = model_->embed_tokens(input_ids); + auto input_offsets_cpu = input.input_offsets.value()->to(infinicore::Device::cpu()); + int32_t *offsets = reinterpret_cast(input_offsets_cpu->data()); + + const auto &image_req_ids = global_state::get_forward_context().mm_metadata.image_req_ids.value(); + if (input.pixel_values->size() != image_req_ids.size() || input.image_bound->size() != image_req_ids.size() || input.tgt_sizes->size() != image_req_ids.size()) { + throw std::runtime_error("VideoNSAForConditionalGeneration: multimodal tensor lists must match image_req_ids"); + } + + std::vector pixel_tensors; + std::vector grid_tensors; + pixel_tensors.reserve(image_req_ids.size()); + grid_tensors.reserve(image_req_ids.size()); + for (size_t media_idx = 0; media_idx < image_req_ids.size(); ++media_idx) { + pixel_tensors.push_back(input.pixel_values.value().at(media_idx)); + grid_tensors.push_back(input.tgt_sizes.value().at(media_idx)); + } + auto batched_pixels = pixel_tensors.size() == 1 ? pixel_tensors.front() : infinicore::op::cat(pixel_tensors, 0); + auto batched_grids = grid_tensors.size() == 1 ? grid_tensors.front() : infinicore::op::cat(grid_tensors, 0); + auto batched_vision_hidden = visual_->forward(batched_pixels, batched_grids); + + size_t vision_offset = 0; + for (size_t media_idx = 0; media_idx < image_req_ids.size(); ++media_idx) { + const size_t req_id = image_req_ids[media_idx]; + auto bounds_cpu = input.image_bound.value().at(media_idx)->to(infinicore::Device::cpu())->squeeze(0); + auto bound_count = bounds_cpu->size(0); + auto bounds = reinterpret_cast(bounds_cpu->data()); + size_t vision_len = 0; + for (size_t i = 0; i < bound_count; ++i) { + auto start = bounds[i * 2]; + auto end = bounds[i * 2 + 1]; + if (end > start) { + vision_len += static_cast(end - start); + } + } + + auto vision_hidden = batched_vision_hidden->narrow({{0, vision_offset, vision_len}}); + auto req_embeds = inputs_embeds->narrow({{1, size_t(offsets[req_id]), size_t(offsets[req_id + 1] - offsets[req_id])}}); + replace_embeddings(req_embeds, vision_hidden, input.image_bound.value().at(media_idx)); + vision_offset += vision_len; + } + + auto hidden_states = model_->forward_embeds(inputs_embeds, input.position_ids.value()); + auto logits = lm_head_->forward(hidden_states); + return {logits}; + } + + auto hidden_states = model_->forward(input); + auto logits = lm_head_->forward(hidden_states); + return {logits}; +} + +void VideoNSAForConditionalGeneration::reset_cache(const cache::CacheConfig *cache_config) { + if (nullptr == cache_config) { + InfinilmModel::reset_cache(nullptr); + return; + } + cache_config_ = cache_config->unique_copy(); + + auto text_config = text_config_from(model_config_); + auto &kv_cache_vec = infinilm::global_state::get_forward_context().kv_cache_vec; + kv_cache_vec.clear(); + const backends::AttentionBackend attention_backend = infinilm::global_state::get_infinilm_config().attention_backend; + kv_cache_vec = std::move(default_allocate_kv_cache_tensors(cache_config, text_config, attention_backend)); +} + +std::shared_ptr create_videonsa_model_config(std::shared_ptr model_config) { + const std::string &model_type = model_config->get("model_type"); + if ("videonsa" != model_type) { + throw std::runtime_error("infinilm::models::videonsa::create_videonsa_model_config: model_type is not videonsa"); + } + + nlohmann::json &config_json = model_config->get_config_json(); + if (config_json.contains("text_config")) { + nlohmann::json &text_config_json = config_json["text_config"]; + if (!text_config_json.contains("head_dim")) { + text_config_json["head_dim"] = text_config_json["hidden_size"].get() / text_config_json["num_attention_heads"].get(); + } + if (!text_config_json.contains("attention_bias")) { + text_config_json["attention_bias"] = true; + } + if (!config_json.contains("torch_dtype") && text_config_json.contains("torch_dtype")) { + config_json["torch_dtype"] = text_config_json["torch_dtype"]; + } + } else { + if (!config_json.contains("head_dim")) { + config_json["head_dim"] = model_config->get("hidden_size") / model_config->get("num_attention_heads"); + } + if (!config_json.contains("attention_bias")) { + config_json["attention_bias"] = true; + } + } + return model_config; +} + +} // namespace infinilm::models::videonsa + +namespace { +INFINILM_REGISTER_CAUSAL_LM_MODEL( + videonsa, + infinilm::models::videonsa::VideoNSAForConditionalGeneration, + infinilm::models::videonsa::create_videonsa_model_config); +} // namespace diff --git a/csrc/models/videonsa/videonsa_for_conditional_generation.hpp b/csrc/models/videonsa/videonsa_for_conditional_generation.hpp new file mode 100644 index 000000000..a31f5db87 --- /dev/null +++ b/csrc/models/videonsa/videonsa_for_conditional_generation.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include "../../layers/common_modules.hpp" +#include "videonsa_attention.hpp" +#include "videonsa_vision.hpp" + +namespace infinilm::models::videonsa { + +using VideoNSAMLP = infinilm::layers::MLP; +using VideoNSADecoderLayer = infinilm::layers::causal_lm_templates::TextDecoderLayer; +using VideoNSATextModel = infinilm::layers::causal_lm_templates::TextModel; + +class VideoNSAForConditionalGeneration : public InfinilmModel { +public: + VideoNSAForConditionalGeneration(std::shared_ptr model_config, + const infinicore::Device &device); + + Output forward(const Input &input) const override; + void reset_cache(const cache::CacheConfig *cache_config) override; + +protected: + void replace_embeddings(infinicore::Tensor inputs_embeds, + const infinicore::Tensor &vision_hidden, + const infinicore::Tensor &image_bound) const; + + INFINICORE_NN_MODULE(VideoNSATextModel, model); + INFINICORE_NN_MODULE(VideoNSAVisionModel, visual); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head); +}; + +std::shared_ptr create_videonsa_model_config(std::shared_ptr model_config); + +} // namespace infinilm::models::videonsa diff --git a/csrc/models/videonsa/videonsa_vision.cpp b/csrc/models/videonsa/videonsa_vision.cpp new file mode 100644 index 000000000..0b7a88faa --- /dev/null +++ b/csrc/models/videonsa/videonsa_vision.cpp @@ -0,0 +1,288 @@ +#include "videonsa_vision.hpp" + +#include "infinicore/context/context.hpp" +#include "infinicore/ops.hpp" +#include "infinicore/ops/cat.hpp" +#include "infinicore/ops/take.hpp" +#include +#include +#include +#include + +namespace infinilm::models::videonsa { + +VideoNSAPatchEmbed::VideoNSAPatchEmbed(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) + : out_hidden_size_(config.value("hidden_size", 1280)), + patch_dim_(config.value("in_channels", 3) + * config.value("temporal_patch_size", 2) + * config.value("patch_size", 14) + * config.value("patch_size", 14)) { + INFINICORE_NN_PARAMETER_INIT(proj_weight, ({out_hidden_size_, patch_dim_}, dtype, device)); + this->register_parameter("proj.weight", proj_weight_); +} + +infinicore::Tensor VideoNSAPatchEmbed::forward(const infinicore::Tensor &pixel_values) const { + auto input = pixel_values->view({pixel_values->size(0), patch_dim_}); + return infinicore::op::linear(input, proj_weight_, std::nullopt); +} + +VideoNSAVisionAttention::VideoNSAVisionAttention(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) + : hidden_size_(config.value("hidden_size", 1280)), + num_heads_(config.value("num_heads", 16)), + head_dim_(hidden_size_ / num_heads_), + scale_(1.0f / std::sqrt(static_cast(head_dim_))) { + INFINICORE_NN_MODULE_INIT(qkv, hidden_size_, 3 * hidden_size_, true, dtype, device); + INFINICORE_NN_MODULE_INIT(proj, hidden_size_, hidden_size_, true, dtype, device); +} + +infinicore::Tensor VideoNSAVisionAttention::forward(const infinicore::Tensor &hidden_states, + const std::vector *cu_window_seqlens) const { + const size_t seq_len = hidden_states->size(0); + auto hidden = const_cast(hidden_states); + auto qkv = qkv_->forward(hidden)->view({seq_len, 3, num_heads_, head_dim_}); + + auto run_dense = [&](size_t start, size_t len) -> infinicore::Tensor { + auto q = qkv->narrow({{0, start, len}, {1, 0, 1}})->squeeze(1)->permute({1, 0, 2})->contiguous(); + auto k = qkv->narrow({{0, start, len}, {1, 1, 1}})->squeeze(1)->permute({1, 0, 2})->contiguous(); + auto v = qkv->narrow({{0, start, len}, {1, 2, 1}})->squeeze(1)->permute({1, 0, 2})->contiguous(); + + auto q_flat = q->view({num_heads_, len, head_dim_}); + auto k_flat = k->view({num_heads_, len, head_dim_}); + auto v_flat = v->view({num_heads_, len, head_dim_}); + auto attn = infinicore::op::matmul(q_flat, k_flat->permute({0, 2, 1}), scale_); + infinicore::op::softmax_(attn, attn, -1); + return infinicore::op::matmul(attn, v_flat) + ->view({num_heads_, len, head_dim_}) + ->permute({1, 0, 2}) + ->contiguous() + ->view({len, hidden_size_}); + }; + + infinicore::Tensor out; + if (cu_window_seqlens == nullptr || cu_window_seqlens->size() <= 2) { + out = run_dense(0, seq_len); + } else { + const size_t first_len = static_cast((*cu_window_seqlens)[1] - (*cu_window_seqlens)[0]); + bool uniform_windows = first_len > 0; + for (size_t i = 1; i + 1 < cu_window_seqlens->size(); ++i) { + const size_t len = static_cast((*cu_window_seqlens)[i + 1] - (*cu_window_seqlens)[i]); + uniform_windows = uniform_windows && len == first_len; + } + + if (uniform_windows) { + const size_t num_windows = cu_window_seqlens->size() - 1; + auto qkv_windows = qkv->view({num_windows, first_len, 3, num_heads_, head_dim_}); + auto q = qkv_windows->narrow({{2, 0, 1}})->squeeze(2)->permute({0, 2, 1, 3})->contiguous(); + auto k = qkv_windows->narrow({{2, 1, 1}})->squeeze(2)->permute({0, 2, 1, 3})->contiguous(); + auto v = qkv_windows->narrow({{2, 2, 1}})->squeeze(2)->permute({0, 2, 1, 3})->contiguous(); + auto q_flat = q->view({num_windows * num_heads_, first_len, head_dim_}); + auto k_flat = k->view({num_windows * num_heads_, first_len, head_dim_}); + auto v_flat = v->view({num_windows * num_heads_, first_len, head_dim_}); + auto attn = infinicore::op::matmul(q_flat, k_flat->permute({0, 2, 1}), scale_); + infinicore::op::softmax_(attn, attn, -1); + out = infinicore::op::matmul(attn, v_flat) + ->view({num_windows, num_heads_, first_len, head_dim_}) + ->permute({0, 2, 1, 3}) + ->contiguous() + ->view({seq_len, hidden_size_}); + } else { + std::vector chunks; + chunks.reserve(cu_window_seqlens->size() - 1); + for (size_t i = 0; i + 1 < cu_window_seqlens->size(); ++i) { + const size_t start = static_cast((*cu_window_seqlens)[i]); + const size_t end = static_cast((*cu_window_seqlens)[i + 1]); + if (end > start) { + chunks.push_back(run_dense(start, end - start)); + } + } + out = chunks.size() == 1 ? chunks.front() : infinicore::op::cat(chunks, 0); + } + } + return proj_->forward(out); +} + +VideoNSAVisionMLP::VideoNSAVisionMLP(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) { + const size_t hidden_size = config.value("hidden_size", 1280); + const size_t intermediate_size = config.value("intermediate_size", 3420); + INFINICORE_NN_MODULE_INIT(gate_proj, hidden_size, intermediate_size, true, dtype, device); + INFINICORE_NN_MODULE_INIT(up_proj, hidden_size, intermediate_size, true, dtype, device); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size, hidden_size, true, dtype, device); +} + +infinicore::Tensor VideoNSAVisionMLP::forward(const infinicore::Tensor &hidden_states) const { + auto hidden = const_cast(hidden_states); + auto gate = gate_proj_->forward(hidden); + auto up = up_proj_->forward(hidden); + auto x = infinicore::op::swiglu(up, gate); + return down_proj_->forward(x); +} + +VideoNSAVisionBlock::VideoNSAVisionBlock(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) { + const size_t hidden_size = config.value("hidden_size", 1280); + INFINICORE_NN_MODULE_INIT(norm1, hidden_size, 1e-6, dtype, device); + INFINICORE_NN_MODULE_INIT(attn, config, dtype, device); + INFINICORE_NN_MODULE_INIT(norm2, hidden_size, 1e-6, dtype, device); + INFINICORE_NN_MODULE_INIT(mlp, config, dtype, device); +} + +infinicore::Tensor VideoNSAVisionBlock::forward(const infinicore::Tensor &hidden_states, + const std::vector *cu_window_seqlens) const { + auto residual = hidden_states; + auto x = norm1_->forward(hidden_states); + x = attn_->forward(x, cu_window_seqlens); + x = infinicore::op::add(x, residual); + + residual = x; + x = norm2_->forward(x); + x = mlp_->forward(x); + return infinicore::op::add(x, residual); +} + +VideoNSAPatchMerger::VideoNSAPatchMerger(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) + : hidden_size_(config.value("hidden_size", 1280)), + spatial_merge_unit_(config.value("spatial_merge_size", 2) * config.value("spatial_merge_size", 2)), + merged_size_(hidden_size_ * spatial_merge_unit_) { + const size_t out_hidden_size = config.value("out_hidden_size", 3584); + INFINICORE_NN_MODULE_INIT(ln_q, hidden_size_, 1e-6, dtype, device); + mlp_0_ = this->register_module("mlp.0", merged_size_, merged_size_, true, dtype, device); + mlp_2_ = this->register_module("mlp.2", merged_size_, out_hidden_size, true, dtype, device); +} + +infinicore::Tensor VideoNSAPatchMerger::forward(const infinicore::Tensor &hidden_states) const { + auto x = ln_q_->forward(hidden_states)->view({hidden_states->size(0) / spatial_merge_unit_, merged_size_}); + x = mlp_0_->forward(x); + x = infinicore::op::gelu(x); + return mlp_2_->forward(x); +} + +VideoNSAVisionModel::VideoNSAVisionModel(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device) + : depth_(config.value("depth", 32)), + hidden_size_(config.value("hidden_size", 1280)), + patch_size_(config.value("patch_size", 14)), + spatial_merge_size_(config.value("spatial_merge_size", 2)), + spatial_merge_unit_(spatial_merge_size_ * spatial_merge_size_), + window_size_(config.value("window_size", 112)) { + if (config.contains("fullatt_block_indexes")) { + for (const auto &idx : config["fullatt_block_indexes"]) { + fullatt_block_indexes_.insert(idx.get()); + } + } + INFINICORE_NN_MODULE_INIT(patch_embed, config, dtype, device); + blocks_.reserve(depth_); + for (size_t i = 0; i < depth_; ++i) { + blocks_.push_back(this->register_module("blocks." + std::to_string(i), config, dtype, device)); + } + INFINICORE_NN_MODULE_INIT(merger, config, dtype, device); +} + +VideoNSAVisionModel::WindowMetadata VideoNSAVisionModel::build_window_metadata_(const infinicore::Tensor &grid_thw) const { + auto grid_cpu = grid_thw->to(infinicore::Device::cpu()); + const int64_t *grid = reinterpret_cast(grid_cpu->data()); + const size_t n = grid_cpu->size(0); + const int64_t vit_window = static_cast(window_size_ / spatial_merge_size_ / patch_size_); + if (vit_window <= 0) { + throw std::runtime_error("VideoNSAVisionModel: invalid vision window size"); + } + + WindowMetadata meta; + meta.cu_window_seqlens.push_back(0); + meta.cu_full_seqlens.push_back(0); + int64_t merged_offset = 0; + for (size_t item = 0; item < n; ++item) { + const int64_t grid_t = grid[item * 3]; + const int64_t grid_h = grid[item * 3 + 1]; + const int64_t grid_w = grid[item * 3 + 2]; + const int64_t llm_h = grid_h / static_cast(spatial_merge_size_); + const int64_t llm_w = grid_w / static_cast(spatial_merge_size_); + const int64_t pad_h = (vit_window - llm_h % vit_window) % vit_window; + const int64_t pad_w = (vit_window - llm_w % vit_window) % vit_window; + const int64_t num_win_h = (llm_h + pad_h) / vit_window; + const int64_t num_win_w = (llm_w + pad_w) / vit_window; + + for (int64_t t = 0; t < grid_t; ++t) { + meta.cu_full_seqlens.push_back(meta.cu_full_seqlens.back() + llm_h * llm_w * static_cast(spatial_merge_unit_)); + for (int64_t wh = 0; wh < num_win_h; ++wh) { + for (int64_t ww = 0; ww < num_win_w; ++ww) { + int64_t merged_in_window = 0; + for (int64_t ih = 0; ih < vit_window; ++ih) { + const int64_t h = wh * vit_window + ih; + if (h >= llm_h) { + continue; + } + for (int64_t iw = 0; iw < vit_window; ++iw) { + const int64_t w = ww * vit_window + iw; + if (w >= llm_w) { + continue; + } + const int64_t merged_idx = merged_offset + t * llm_h * llm_w + h * llm_w + w; + for (size_t u = 0; u < spatial_merge_unit_; ++u) { + meta.patch_order.push_back(merged_idx * static_cast(spatial_merge_unit_) + static_cast(u)); + } + ++merged_in_window; + } + } + meta.cu_window_seqlens.push_back(meta.cu_window_seqlens.back() + merged_in_window * static_cast(spatial_merge_unit_)); + } + } + } + merged_offset += grid_t * llm_h * llm_w; + } + + std::vector window_merged_order; + window_merged_order.reserve(meta.patch_order.size() / spatial_merge_unit_); + for (size_t i = 0; i < meta.patch_order.size(); i += spatial_merge_unit_) { + window_merged_order.push_back(meta.patch_order[i] / static_cast(spatial_merge_unit_)); + } + meta.reverse_order.resize(window_merged_order.size()); + std::iota(meta.reverse_order.begin(), meta.reverse_order.end(), 0); + std::sort(meta.reverse_order.begin(), meta.reverse_order.end(), [&](int64_t a, int64_t b) { + return window_merged_order[a] < window_merged_order[b]; + }); + return meta; +} + +infinicore::Tensor VideoNSAVisionModel::gather_rows_(const infinicore::Tensor &hidden_states, + const std::vector &row_order) const { + const size_t rows = row_order.size(); + const size_t width = hidden_states->size(1); + std::vector flat_indices(rows * width); + for (size_t r = 0; r < rows; ++r) { + const int64_t src_row = row_order[r]; + for (size_t c = 0; c < width; ++c) { + flat_indices[r * width + c] = src_row * static_cast(width) + static_cast(c); + } + } + auto indices = infinicore::Tensor::empty({rows, width}, infinicore::DataType::I64, hidden_states->device()); + infinicore::context::memcpyH2D(indices->data(), flat_indices.data(), flat_indices.size() * sizeof(int64_t), false); + return infinicore::op::take(hidden_states->contiguous(), indices); +} + +infinicore::Tensor VideoNSAVisionModel::forward(const infinicore::Tensor &pixel_values, + const infinicore::Tensor &grid_thw) const { + auto hidden_states = patch_embed_->forward(pixel_values); + auto window_meta = build_window_metadata_(grid_thw); + if (window_meta.patch_order.size() != hidden_states->size(0)) { + throw std::runtime_error("VideoNSAVisionModel: window metadata does not match patch sequence length"); + } + hidden_states = gather_rows_(hidden_states, window_meta.patch_order); + for (size_t i = 0; i < blocks_.size(); ++i) { + const auto *cu_segments = fullatt_block_indexes_.count(i) > 0 ? &window_meta.cu_full_seqlens : &window_meta.cu_window_seqlens; + hidden_states = blocks_[i]->forward(hidden_states, cu_segments); + } + auto merged = merger_->forward(hidden_states); + return gather_rows_(merged, window_meta.reverse_order); +} + +} // namespace infinilm::models::videonsa diff --git a/csrc/models/videonsa/videonsa_vision.hpp b/csrc/models/videonsa/videonsa_vision.hpp new file mode 100644 index 000000000..7fe6f53e9 --- /dev/null +++ b/csrc/models/videonsa/videonsa_vision.hpp @@ -0,0 +1,129 @@ +#pragma once + +#include "../../layers/linear/linear.hpp" +#include "infinicore/nn/module.hpp" +#include "infinicore/nn/rmsnorm.hpp" +#include "infinicore/tensor.hpp" +#include +#include +#include +#include + +namespace infinilm::models::videonsa { + +class VideoNSAPatchEmbed : public infinicore::nn::Module { +public: + VideoNSAPatchEmbed(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &pixel_values) const; + +private: + size_t out_hidden_size_; + size_t patch_dim_; + INFINICORE_NN_PARAMETER(proj_weight); +}; + +class VideoNSAVisionAttention : public infinicore::nn::Module { +public: + VideoNSAVisionAttention(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + const std::vector *cu_window_seqlens = nullptr) const; + +private: + size_t hidden_size_; + size_t num_heads_; + size_t head_dim_; + float scale_; + + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, qkv); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, proj); +}; + +class VideoNSAVisionMLP : public infinicore::nn::Module { +public: + VideoNSAVisionMLP(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states) const; + +private: + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, gate_proj); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, up_proj); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, down_proj); +}; + +class VideoNSAVisionBlock : public infinicore::nn::Module { +public: + VideoNSAVisionBlock(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + const std::vector *cu_window_seqlens = nullptr) const; + +private: + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm1); + INFINICORE_NN_MODULE(VideoNSAVisionAttention, attn); + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm2); + INFINICORE_NN_MODULE(VideoNSAVisionMLP, mlp); +}; + +class VideoNSAPatchMerger : public infinicore::nn::Module { +public: + VideoNSAPatchMerger(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states) const; + +private: + size_t hidden_size_; + size_t spatial_merge_unit_; + size_t merged_size_; + + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, ln_q); + std::shared_ptr mlp_0_; + std::shared_ptr mlp_2_; +}; + +class VideoNSAVisionModel : public infinicore::nn::Module { +public: + VideoNSAVisionModel(const nlohmann::json &config, + const infinicore::DataType &dtype, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &pixel_values, + const infinicore::Tensor &grid_thw) const; + +private: + struct WindowMetadata { + std::vector patch_order; + std::vector reverse_order; + std::vector cu_window_seqlens; + std::vector cu_full_seqlens; + }; + + WindowMetadata build_window_metadata_(const infinicore::Tensor &grid_thw) const; + infinicore::Tensor gather_rows_(const infinicore::Tensor &hidden_states, + const std::vector &row_order) const; + + size_t depth_; + size_t hidden_size_; + size_t patch_size_; + size_t spatial_merge_size_; + size_t spatial_merge_unit_; + size_t window_size_; + std::unordered_set fullatt_block_indexes_; + + INFINICORE_NN_MODULE(VideoNSAPatchEmbed, patch_embed); + INFINICORE_NN_MODULE_VEC(VideoNSAVisionBlock, blocks); + INFINICORE_NN_MODULE(VideoNSAPatchMerger, merger); +}; + +} // namespace infinilm::models::videonsa diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index 4c5058ac5..d0f54e8d0 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -140,6 +140,7 @@ inline void bind_infer_engine(py::module &m) { std::optional> image_bound, std::optional> tgt_sizes, std::optional> image_req_ids, + std::optional> visual_token_ranges, py::kwargs kwargs) { InferEngine::Input input{ std::move(input_ids), @@ -154,6 +155,7 @@ inline void bind_infer_engine(py::module &m) { std::move(image_bound), std::move(tgt_sizes), std::move(image_req_ids), + std::move(visual_token_ranges), }; // Explicit defaults @@ -198,7 +200,8 @@ inline void bind_infer_engine(py::module &m) { py::arg("pixel_values") = std::nullopt, py::arg("image_bound") = std::nullopt, py::arg("tgt_sizes") = std::nullopt, - py::arg("image_req_ids") = std::nullopt) + py::arg("image_req_ids") = std::nullopt, + py::arg("visual_token_ranges") = std::nullopt) .def_readwrite("input_ids", &InferEngine::Input::input_ids) .def_readwrite("position_ids", &InferEngine::Input::position_ids) .def_readwrite("past_sequence_lengths", &InferEngine::Input::past_sequence_lengths) @@ -211,6 +214,7 @@ inline void bind_infer_engine(py::module &m) { .def_readwrite("image_bound", &InferEngine::Input::image_bound) .def_readwrite("tgt_sizes", &InferEngine::Input::tgt_sizes) .def_readwrite("image_req_ids", &InferEngine::Input::image_req_ids) + .def_readwrite("visual_token_ranges", &InferEngine::Input::visual_token_ranges) .def_readwrite("temperature", &InferEngine::Input::temperature) .def_readwrite("top_k", &InferEngine::Input::top_k) .def_readwrite("top_p", &InferEngine::Input::top_p); diff --git a/examples/bench_videonsa.py b/examples/bench_videonsa.py new file mode 100644 index 000000000..faf0bf7f7 --- /dev/null +++ b/examples/bench_videonsa.py @@ -0,0 +1,238 @@ +import os +import sys +import time + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) + +from infinilm.base_config import BaseConfig +from infinilm.llm.llm import LLM +from infinilm.llm.sampling_params import SamplingParams +from infinilm.processors import AutoInfinilmProcessor +from infinilm.processors.videonsa_processor import decode_video_frames + + +VIDEO_AUTO_MIN_FRAMES = 4 +VIDEO_AUTO_MAX_FRAMES = 8 +VIDEO_AUTO_SAMPLE_FPS = 1.0 +VIDEO_AUTO_MAX_PIXELS_CAP = 50176 + + +def as_int_list(value): + if isinstance(value, list): + return [int(item) for item in value] + return [int(value)] + + +def is_cli_arg_set(name): + return any(arg == name or arg.startswith(f"{name}=") for arg in sys.argv[1:]) + + +def probe_video_metadata(video_path): + try: + import cv2 + except Exception: + cv2 = None + + if cv2 is not None: + cap = cv2.VideoCapture(video_path) + if cap.isOpened(): + try: + frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0) + fps = float(cap.get(cv2.CAP_PROP_FPS) or 0.0) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) or 0) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) or 0) + finally: + cap.release() + duration = frame_count / fps if frame_count > 0 and fps > 0 else 0.0 + return { + "frame_count": frame_count, + "fps": fps, + "width": width, + "height": height, + "duration": duration, + "source": "cv2", + } + + try: + from torchvision.io import read_video_timestamps + + pts, fps = read_video_timestamps(video_path, pts_unit="sec") + except Exception: + return {} + + frame_count = len(pts) + duration = float(pts[-1]) if frame_count > 0 else 0.0 + return { + "frame_count": frame_count, + "fps": float(fps or 0.0), + "width": 0, + "height": 0, + "duration": duration, + "source": "torchvision_timestamps", + } + + +def apply_video_auto_args(cfg): + if not cfg.video: + return {} + meta = probe_video_metadata(cfg.video) + frame_count = meta.get("frame_count", 0) + duration = meta.get("duration", 0.0) + width = meta.get("width", 0) + height = meta.get("height", 0) + + if cfg.video_num_frames is None: + if duration > 0: + inferred = int(round(duration * VIDEO_AUTO_SAMPLE_FPS)) + elif frame_count > 0: + inferred = frame_count + else: + inferred = VIDEO_AUTO_MIN_FRAMES + if frame_count > 0: + inferred = min(inferred, frame_count) + cfg.video_num_frames = max( + VIDEO_AUTO_MIN_FRAMES, + min(VIDEO_AUTO_MAX_FRAMES, inferred), + ) + + if cfg.video_max_pixels is None: + source_pixels = ( + width * height if width > 0 and height > 0 else VIDEO_AUTO_MAX_PIXELS_CAP + ) + cfg.video_max_pixels = min(source_pixels, VIDEO_AUTO_MAX_PIXELS_CAP) + + return meta + + +def apply_multimodal_env(cfg): + if cfg.image and not cfg.video and cfg.image_max_pixels is not None: + os.environ["INFINILM_VIDEONSA_IMAGE_MAX_PIXELS"] = str(cfg.image_max_pixels) + if cfg.image and not cfg.video and cfg.image_min_pixels is not None: + os.environ["INFINILM_VIDEONSA_IMAGE_MIN_PIXELS"] = str(cfg.image_min_pixels) + if cfg.video_num_frames is not None: + os.environ["INFINILM_VIDEONSA_VIDEO_NUM_FRAMES"] = str(cfg.video_num_frames) + if cfg.video_max_pixels is not None: + os.environ["INFINILM_VIDEONSA_VIDEO_MAX_PIXELS"] = str(cfg.video_max_pixels) + if cfg.video_min_pixels is not None: + os.environ["INFINILM_VIDEONSA_VIDEO_MIN_PIXELS"] = str(cfg.video_min_pixels) + + +def normalize_bench_defaults(cfg): + if cfg.video and not is_cli_arg_set("--video-max-pixels"): + cfg.video_max_pixels = None + if not cfg.image and not cfg.video: + cfg.image = "/data-aisoft/pepe/images/bus.jpg" + if cfg.prompt == "How are you": + cfg.prompt = "describe the video" if cfg.video else "describe the image" + + +def make_prompt(tokenizer, target_len, prompt_seed): + seed = prompt_seed.rstrip() + " " + seed_ids = tokenizer.encode(seed) + if not seed_ids: + raise RuntimeError("Tokenizer returned no tokens for the benchmark seed prompt") + repeat = (target_len + len(seed_ids) - 1) // len(seed_ids) + return tokenizer.decode((seed_ids * repeat)[:target_len], skip_special_tokens=True) + + +def make_messages(prompt, image_path, video_path, video_payload, batch_size): + content = [{"type": "text", "text": prompt}] + if video_path: + content = [ + {"type": "video_url", "video_url": {"url": video_payload or video_path}} + ] + content + elif image_path: + content = [{"type": "image_url", "image_url": {"url": image_path}}] + content + return [[{"role": "user", "content": content}] for _ in range(batch_size)] + + +def run_case(model, tokenizer, cfg, video_payload, batch_size, input_len, output_len): + prompt = make_prompt(tokenizer, input_len, cfg.prompt) + messages = make_messages(prompt, cfg.image, cfg.video, video_payload, batch_size) + sampling = SamplingParams( + max_tokens=output_len, + temperature=cfg.temperature, + top_p=cfg.top_p, + top_k=cfg.top_k, + ignore_eos=cfg.ignore_eos, + ) + + if cfg.warmup: + model.chat(messages=messages, sampling_params=sampling, use_tqdm=False) + + start = time.perf_counter() + outputs = model.chat(messages=messages, sampling_params=sampling, use_tqdm=False) + elapsed_ms = (time.perf_counter() - start) * 1000 + total_new_tokens = sum(len(output.outputs[0].token_ids) for output in outputs) + print( + "case " + f"batch_size={batch_size} input_len={input_len} output_len={output_len} " + f"actual_output_tokens={total_new_tokens} " + f"elapsed_ms={elapsed_ms:.2f} " + f"output_tok_per_s={total_new_tokens / (elapsed_ms / 1000):.2f}" + ) + if outputs and not cfg.skip_output: + print("=== sample prompt ===") + print(outputs[0].prompt) + print("=== sample output ===") + print(outputs[0].outputs[0].text) + + +def main(): + cfg = BaseConfig() + normalize_bench_defaults(cfg) + + input_lens = as_int_list(cfg.input_len) + output_lens = as_int_list(cfg.output_len) + max_batch_size = max(int(cfg.batch_size), int(cfg.max_batch_size)) + max_cache_len = max(max(input_lens) + max(output_lens) + 4096, cfg.max_cache_len) + cache_type = "paged" if cfg.enable_paged_attn else "static" + + video_meta = apply_video_auto_args(cfg) + apply_multimodal_env(cfg) + video_payload = decode_video_frames(cfg.video, cfg.video_num_frames) + + print( + f"bench_config model={cfg.model} image={cfg.image} video={cfg.video} prompt={cfg.prompt!r} " + f"device={cfg.device} paged={cfg.enable_paged_attn} attn={cfg.attn} " + f"videonsa_nsa=always_on " + f"image_max_pixels={cfg.image_max_pixels} " + f"video_num_frames={cfg.video_num_frames} video_max_pixels={cfg.video_max_pixels} " + f"video_predecoded={isinstance(video_payload, list)} " + f"video_meta={video_meta}" + ) + processor = AutoInfinilmProcessor.from_pretrained(cfg.model) + tokenizer = processor.get_tokenizer() + model = LLM( + model_path=cfg.model, + device=cfg.get_device_str(cfg.device), + tensor_parallel_size=cfg.tp, + cache_type=cache_type, + max_batch_size=max_batch_size, + max_tokens=max(output_lens), + num_blocks=cfg.num_blocks, + block_size=cfg.block_size, + max_cache_len=max_cache_len, + temperature=cfg.temperature, + top_p=cfg.top_p, + top_k=cfg.top_k, + attn_backend=cfg.attn, + enable_graph=cfg.enable_graph, + weight_load_mode=cfg.weight_load_mode, + ) + + for input_len in input_lens: + for output_len in output_lens: + run_case( + model, + tokenizer, + cfg, + video_payload, + cfg.batch_size, + input_len, + output_len, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/test_infer.py b/examples/test_infer.py index 14db1b6ef..c8c74b4fc 100644 --- a/examples/test_infer.py +++ b/examples/test_infer.py @@ -2,6 +2,7 @@ import os from infinilm.base_config import BaseConfig from infinilm.llm.llm import LLM +from infinilm.processors.videonsa_processor import decode_video_frames def test( @@ -18,6 +19,8 @@ def test( attn_backend="default", use_mla=False, image_path=None, + video_path=None, + video_num_frames=None, skip_load=False, weight_load_mode="async", ): @@ -49,7 +52,13 @@ def test( [{"role": "user", "content": [{"type": "text", "text": prompt}]}] for prompt in prompts ] - if image_path is not None: + if video_path is not None: + video_payload = decode_video_frames(video_path, video_num_frames) + for conversation in conversations: + conversation[0]["content"] = [ + {"type": "video_url", "video_url": {"url": video_payload}} + ] + conversation[0]["content"] + elif image_path is not None: for conversation in conversations: conversation[0]["content"] = [ {"type": "image_url", "image_url": {"url": image_path}} @@ -107,6 +116,8 @@ def test( attn_backend=cfg.attn, use_mla=cfg.use_mla, image_path=cfg.image, + video_path=cfg.video, + video_num_frames=cfg.video_num_frames, skip_load=cfg.skip_load, weight_load_mode=cfg.weight_load_mode, ) diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py index eb4a1e632..09f450105 100644 --- a/python/infinilm/base_config.py +++ b/python/infinilm/base_config.py @@ -37,7 +37,7 @@ def parse_list(value: str): return int(value) except ValueError: raise argparse.ArgumentTypeError( - f"batch-size must be an int or list[int], got: {value}" + f"value must be an int or list[int], got: {value}" ) @@ -109,6 +109,34 @@ def __init__(self): # Multimodal parameters self.image = self.args.image + self.video = self.args.video + self.image_max_pixels = self.args.image_max_pixels + self.image_min_pixels = self.args.image_min_pixels + self.video_num_frames = self.args.video_num_frames + self.video_max_pixels = self.args.video_max_pixels + self.video_min_pixels = self.args.video_min_pixels + self.skip_output = self.args.skip_output + + if self.image_max_pixels is not None: + os.environ["INFINILM_VIDEONSA_IMAGE_MAX_PIXELS"] = str( + self.image_max_pixels + ) + if self.image_min_pixels is not None: + os.environ["INFINILM_VIDEONSA_IMAGE_MIN_PIXELS"] = str( + self.image_min_pixels + ) + if self.video_num_frames is not None: + os.environ["INFINILM_VIDEONSA_VIDEO_NUM_FRAMES"] = str( + self.video_num_frames + ) + if self.video_max_pixels is not None: + os.environ["INFINILM_VIDEONSA_VIDEO_MAX_PIXELS"] = str( + self.video_max_pixels + ) + if self.video_min_pixels is not None: + os.environ["INFINILM_VIDEONSA_VIDEO_MIN_PIXELS"] = str( + self.video_min_pixels + ) if self.enable_paged_attn and self.attn == "default": self.attn = "paged-attn" @@ -320,6 +348,47 @@ def _add_common_args(self): default=None, help="image path for multimodal models", ) + self.parser.add_argument( + "--image-max-pixels", + type=int, + default=200704, + help="maximum image pixels for VideoNSA/Qwen-VL preprocessing", + ) + self.parser.add_argument( + "--image-min-pixels", + type=int, + default=None, + help="minimum image pixels for VideoNSA/Qwen-VL preprocessing", + ) + self.parser.add_argument( + "--video", + type=str, + default=None, + help="video path for multimodal models", + ) + self.parser.add_argument( + "--video-num-frames", + type=int, + default=None, + help="number of frames for VideoNSA/Qwen-VL preprocessing", + ) + self.parser.add_argument( + "--video-max-pixels", + type=int, + default=200704, + help="maximum video frame pixels for VideoNSA/Qwen-VL preprocessing", + ) + self.parser.add_argument( + "--video-min-pixels", + type=int, + default=None, + help="minimum video frame pixels for VideoNSA/Qwen-VL preprocessing", + ) + self.parser.add_argument( + "--skip-output", + action="store_true", + help="skip printing sample prompt/output", + ) # ---- PD separation arguments ---- self.parser.add_argument( diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index 10cf58be2..5c4d21484 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -13,6 +13,30 @@ import os +def normalize_hf_config_for_infinilm(config_dict): + model_type = config_dict.get("model_type") + + if model_type == "qwen2_5_vl" and config_dict.get("architectures") == [ + "VideoNSAForConditionalGeneration" + ]: + normalized = dict(config_dict) + normalized["model_type"] = "videonsa" + normalized["original_model_type"] = model_type + if "text_config" in normalized: + text_config = dict(normalized["text_config"]) + text_config["model_type"] = "videonsa" + text_config.setdefault("torch_dtype", normalized.get("torch_dtype")) + text_config.setdefault( + "head_dim", + text_config["hidden_size"] // text_config["num_attention_heads"], + ) + text_config.setdefault("attention_bias", True) + normalized["text_config"] = text_config + return normalized + + return config_dict + + def read_hf_config(model_path): config_path = os.path.join(model_path, "config.json") with open(config_path, "r") as f: @@ -28,7 +52,7 @@ def read_hf_config(model_path): raise ValueError( f"`model_type` is not specified in the config file `{config_path}`." ) - return config_dict + return normalize_hf_config_for_infinilm(config_dict) # config.json (required) defines model architecture, while generation_config.json @@ -148,6 +172,7 @@ def forward( image_bound=None, tgt_sizes=None, image_req_ids=None, + visual_token_ranges=None, temperature=None, top_k=None, top_p=None, @@ -204,6 +229,7 @@ def convert_tensor_list(tensor_list_): image_bound=image_bound, tgt_sizes=tgt_sizes, image_req_ids=image_req_ids, + visual_token_ranges=visual_token_ranges, temperature=temperature, top_k=top_k, top_p=top_p, diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index 94f4016a9..f4ca09703 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -570,7 +570,6 @@ def _remap_baichuan(state_dict, config=None): hf_config = config or {} hidden_size = hf_config.get("hidden_size", 4096) num_heads = hf_config.get("num_attention_heads", 32) - vocab_size = hf_config.get("vocab_size", 125696) per_head_dim = num_heads * (hidden_size // num_heads) # 1. Split W_pack → q_proj, k_proj, v_proj @@ -660,6 +659,19 @@ def _remap_mamba(state_dict, config=None): return remapped +def _remap_videonsa(state_dict, config=None): + """Adapt VideoNSA/Qwen2.5-VL weights to the InfiniLM C++ module layout.""" + key = "visual.patch_embed.proj.weight" + if key in state_dict: + state_dict = dict(state_dict) + if state_dict[key].ndim == 5: + state_dict[key] = ( + state_dict[key].reshape(state_dict[key].shape[0], -1).contiguous() + ) + state_dict["visual.patch_embed.proj_weight"] = state_dict[key] + return state_dict + + # Model type → remap function mapping _WEIGHT_REMAPPER = { "glm4": _remap_glm4, @@ -667,4 +679,5 @@ def _remap_mamba(state_dict, config=None): "baichuan": _remap_baichuan, "gpt2": _remap_gpt2, "mamba": _remap_mamba, + "videonsa": _remap_videonsa, } diff --git a/python/infinilm/multimodal/multimodal.py b/python/infinilm/multimodal/multimodal.py index 7e14a8c5e..72fea63c6 100644 --- a/python/infinilm/multimodal/multimodal.py +++ b/python/infinilm/multimodal/multimodal.py @@ -1,6 +1,5 @@ -from typing import List, Optional, Union +from typing import List, Union from PIL import Image -import xxhash def has_multimodal_inputs(messages: Union[List[dict], dict]) -> bool: @@ -44,9 +43,17 @@ def resolve_multimodal_inputs(messages: Union[List[dict], dict]): # TODO support other image url formats images.append(Image.open(item["image_url"]["url"])) image_urls.append(item["image_url"]["url"]) - - else: # TODO support video/audio - raise NotImplementedError("Only image input is supported for now") + elif item.get("type") == "video_url": + video = item["video_url"]["url"] + videos.append(video) + if isinstance(video, str): + video_urls.append(video) + else: + video_urls.append( + f"predecoded_video:{len(video_urls)}:{len(video)}" + ) + else: # TODO support audio + raise NotImplementedError("Only image/video input is supported for now") return { "images": images, diff --git a/python/infinilm/processors/__init__.py b/python/infinilm/processors/__init__.py index 67f97acac..a08096389 100644 --- a/python/infinilm/processors/__init__.py +++ b/python/infinilm/processors/__init__.py @@ -1,4 +1,5 @@ import importlib +import json import pkgutil from pathlib import Path from transformers import AutoConfig @@ -32,6 +33,16 @@ def from_pretrained(cls, model_dir_path: str, **kwargs) -> InfinilmProcessor: """ config = AutoConfig.from_pretrained(model_dir_path, trust_remote_code=True) model_type = config.model_type.lower() + raw_config_path = Path(model_dir_path) / "config.json" + architectures = [] + if raw_config_path.exists(): + with raw_config_path.open("r") as f: + architectures = json.load(f).get("architectures", []) or [] + if ( + model_type == "qwen2_5_vl" + and "VideoNSAForConditionalGeneration" in architectures + ): + model_type = "videonsa" processor_cls = get_processor_class(model_type) return processor_cls(model_dir_path) diff --git a/python/infinilm/processors/videonsa_processor.py b/python/infinilm/processors/videonsa_processor.py new file mode 100644 index 000000000..6b57040f8 --- /dev/null +++ b/python/infinilm/processors/videonsa_processor.py @@ -0,0 +1,523 @@ +import os + +from typing_extensions import override + +import torch +from transformers import AutoConfig, AutoProcessor + +from .processor import InfinilmProcessor, register_processor + + +def decode_video_frames(video_path, num_frames=None): + if not video_path: + return None + try: + from decord import VideoReader, cpu + from PIL import Image + except Exception: + return video_path + + reader = VideoReader(video_path, ctx=cpu(0)) + total = len(reader) + if total == 0: + return video_path + num_frames = max(1, min(num_frames or total, total)) + if num_frames == 1: + indices = [0] + else: + indices = [round(i * (total - 1) / (num_frames - 1)) for i in range(num_frames)] + batch = reader.get_batch(indices).asnumpy() + return [Image.fromarray(frame) for frame in batch] + + +@register_processor("videonsa") +class VideoNSAProcessor(InfinilmProcessor): + def __init__(self, model_dir_path: str): + self.config = AutoConfig.from_pretrained(model_dir_path, trust_remote_code=True) + self.processor = AutoProcessor.from_pretrained( + model_dir_path, trust_remote_code=True + ) + self.tokenizer = self.processor.tokenizer + self.image_token_id = getattr(self.processor, "image_token_id", None) + if self.image_token_id is None: + self.image_token_id = self.tokenizer.convert_tokens_to_ids("<|image_pad|>") + self.video_token_id = getattr(self.processor, "video_token_id", None) + if self.video_token_id is None: + self.video_token_id = self.tokenizer.convert_tokens_to_ids("<|video_pad|>") + self.pixel_values_dtype = torch.bfloat16 + self._configure_media_processors_from_env() + + def _configure_media_processors_from_env(self): + image_processor = getattr(self.processor, "image_processor", None) + image_max_pixels = os.getenv("INFINILM_VIDEONSA_IMAGE_MAX_PIXELS") + image_min_pixels = os.getenv("INFINILM_VIDEONSA_IMAGE_MIN_PIXELS") + if image_processor is not None and image_max_pixels: + value = int(image_max_pixels) + image_processor.max_pixels = value + image_processor.size = dict(getattr(image_processor, "size", {}) or {}) + image_processor.size["longest_edge"] = value + if image_processor is not None and image_min_pixels: + value = int(image_min_pixels) + image_processor.min_pixels = value + image_processor.size = dict(getattr(image_processor, "size", {}) or {}) + image_processor.size["shortest_edge"] = value + + video_processor = getattr(self.processor, "video_processor", None) + if video_processor is None: + return + num_frames = os.getenv("INFINILM_VIDEONSA_VIDEO_NUM_FRAMES") + max_pixels = os.getenv("INFINILM_VIDEONSA_VIDEO_MAX_PIXELS") + min_pixels = os.getenv("INFINILM_VIDEONSA_VIDEO_MIN_PIXELS") + if num_frames: + video_processor.num_frames = int(num_frames) + video_processor.do_sample_frames = True + if max_pixels: + value = int(max_pixels) + video_processor.max_pixels = value + video_processor.size = dict(getattr(video_processor, "size", {}) or {}) + video_processor.size["longest_edge"] = value + if min_pixels: + value = int(min_pixels) + video_processor.min_pixels = value + video_processor.size = dict(getattr(video_processor, "size", {}) or {}) + video_processor.size["shortest_edge"] = value + + def _normalize_messages(self, conversation): + normalized = [] + for msg in conversation: + content = msg.get("content", []) + if isinstance(content, str): + normalized.append({"role": msg.get("role", "user"), "content": content}) + continue + items = [] + for item in content: + typ = item.get("type") + if typ == "text": + items.append({"type": "text", "text": item.get("text", "")}) + elif typ == "image_url": + items.append({"type": "image", "image": item["image_url"]["url"]}) + elif typ == "video_url": + items.append({"type": "video", "video": item["video_url"]["url"]}) + else: + raise NotImplementedError( + f"Unsupported VideoNSA content type: {typ}" + ) + normalized.append({"role": msg.get("role", "user"), "content": items}) + return normalized + + @override + def __call__( + self, + prompt, + images=None, + videos=None, + audios=None, + return_tensors: str = None, + **kwargs, + ) -> dict: + results = self.processor( + text=prompt, + images=images or None, + videos=videos or None, + return_tensors=return_tensors, + return_mm_token_type_ids=False, + **kwargs, + ) + input_ids = results["input_ids"] + bounds = [] + for row in input_ids: + row_bounds = [] + start = None + current = None + for idx, token_id in enumerate(row.tolist()): + if token_id in (self.image_token_id, self.video_token_id): + if start is None: + start = idx + current = token_id + elif current != token_id: + row_bounds.append([start, idx]) + start = idx + current = token_id + elif start is not None: + row_bounds.append([start, idx]) + start = None + current = None + if start is not None: + row_bounds.append([start, len(row)]) + bounds.append(row_bounds) + + if bounds and any(bounds): + max_bounds = max(len(b) for b in bounds) + image_bound = torch.zeros((len(bounds), max_bounds, 2), dtype=torch.long) + for i, row_bounds in enumerate(bounds): + if row_bounds: + image_bound[i, : len(row_bounds), :] = torch.tensor( + row_bounds, dtype=torch.long + ) + results["image_bound"] = image_bound + return results + + def _valid_media_bounds(self, processed_inputs): + if processed_inputs is None or "image_bound" not in processed_inputs: + return [] + bounds = processed_inputs["image_bound"] + if isinstance(bounds, torch.Tensor): + bounds = bounds[0].tolist() + else: + bounds = bounds[0] + return [ + (int(start), int(end)) for start, end in bounds if int(end) > int(start) + ] + + def _grid_list(self, processed_inputs, key): + value = None if processed_inputs is None else processed_inputs.get(key) + if value is None: + return [] + if isinstance(value, torch.Tensor): + return [tuple(int(x) for x in row.tolist()) for row in value] + return [tuple(int(x) for x in row) for row in value] + + def _video_second_per_grid(self, processed_inputs, video_idx): + value = ( + None + if processed_inputs is None + else processed_inputs.get("second_per_grid_ts") + ) + if value is None: + return 1.0 + if isinstance(value, torch.Tensor): + value = value.flatten().tolist() + if isinstance(value, (list, tuple)) and video_idx < len(value): + return float(value[video_idx]) + try: + return float(value) + except Exception: + return 1.0 + + def _append_text_positions(self, axes, length, start_pos): + if length <= 0: + return start_pos + vals = list(range(start_pos, start_pos + length)) + axes[0].extend(vals) + axes[1].extend(vals) + axes[2].extend(vals) + return start_pos + length + + def _append_visual_positions( + self, axes, grid, span_len, start_pos, second_per_grid=1.0 + ): + spatial_merge_size = int( + getattr( + getattr(self.config, "vision_config", None), "spatial_merge_size", 2 + ) + ) + tokens_per_second = float( + getattr( + getattr(self.config, "vision_config", None), "tokens_per_second", 1.0 + ) + ) + grid_t, grid_h, grid_w = (int(grid[0]), int(grid[1]), int(grid[2])) + llm_grid_t = max(1, grid_t) + llm_grid_h = max(1, grid_h // spatial_merge_size) + llm_grid_w = max(1, grid_w // spatial_merge_size) + t_step = max(1, int(round(tokens_per_second * second_per_grid))) + + t_ids, h_ids, w_ids = [], [], [] + for t in range(llm_grid_t): + for h in range(llm_grid_h): + for w in range(llm_grid_w): + t_ids.append(t * t_step + start_pos) + h_ids.append(h + start_pos) + w_ids.append(w + start_pos) + + if not t_ids: + t_ids = h_ids = w_ids = [start_pos] + if len(t_ids) < span_len: + repeat = span_len - len(t_ids) + t_ids.extend([t_ids[-1]] * repeat) + h_ids.extend([h_ids[-1]] * repeat) + w_ids.extend([w_ids[-1]] * repeat) + elif len(t_ids) > span_len: + t_ids = t_ids[:span_len] + h_ids = h_ids[:span_len] + w_ids = w_ids[:span_len] + + axes[0].extend(t_ids) + axes[1].extend(h_ids) + axes[2].extend(w_ids) + return max(max(t_ids), max(h_ids), max(w_ids)) + 1 + + def _prompt_mrope_positions(self, token_ids, processed_inputs): + axes = [[], [], []] + bounds = self._valid_media_bounds(processed_inputs) + image_grids = self._grid_list(processed_inputs, "image_grid_thw") + video_grids = self._grid_list(processed_inputs, "video_grid_thw") + image_idx = 0 + video_idx = 0 + cursor = 0 + pos_base = 0 + + for start, end in bounds: + if start > len(token_ids): + break + end = min(end, len(token_ids)) + pos_base = self._append_text_positions(axes, start - cursor, pos_base) + token_id = ( + token_ids[start] if start < len(token_ids) else self.image_token_id + ) + span_len = max(0, end - start) + if token_id == self.video_token_id and video_idx < len(video_grids): + pos_base = self._append_visual_positions( + axes, + video_grids[video_idx], + span_len, + pos_base, + self._video_second_per_grid(processed_inputs, video_idx), + ) + video_idx += 1 + elif image_idx < len(image_grids): + pos_base = self._append_visual_positions( + axes, image_grids[image_idx], span_len, pos_base + ) + image_idx += 1 + else: + pos_base = self._append_text_positions(axes, span_len, pos_base) + cursor = end + + self._append_text_positions(axes, len(token_ids) - cursor, pos_base) + return axes + + def _mrope_delta(self, token_ids, processed_inputs): + positions = self._prompt_mrope_positions(token_ids, processed_inputs) + if not positions[0]: + return 0 + return max(max(axis) for axis in positions) + 1 - len(token_ids) + + @override + def apply_chat_template( + self, + conversation, + add_generation_prompt: bool = False, + tokenize: bool = True, + **kwargs, + ): + return self.processor.apply_chat_template( + self._normalize_messages(conversation), + add_generation_prompt=add_generation_prompt, + tokenize=tokenize, + **kwargs, + ) + + @override + def build_model_inputs( + self, + scheduler_output, + temperature: float = 1.0, + top_p: float = 0.8, + top_k: int = 1, + **kwargs, + ) -> dict: + import infinicore + + if not scheduler_output.scheduled_requests: + raise RuntimeError( + "build_model_inputs called with empty scheduled_requests" + ) + + tokens = [] + seq_lens = [] + seq_offsets = [0] + block_tables = [] + slot_mapping = [] + cached_lens = [] + position_axes = [[], [], []] + cu_seqlens = [0] + mm_data = {} + max_block_table_len = max( + 1, *(len(req.block_table) for req in scheduler_output.scheduled_requests) + ) + current_offset = 0 + + for req_id, req in enumerate(scheduler_output.scheduled_requests): + num_cached = req.num_local_cached_tokens + if scheduler_output.is_prefill: + req_tokens = req.get_input_tokens() + tokens_to_compute = req_tokens[num_cached:] + tokens.extend(tokens_to_compute) + compute_len = len(tokens_to_compute) + seq_len = len(req_tokens) + seq_lens.append(seq_len) + current_offset += compute_len + seq_offsets.append(current_offset) + cached_lens.append(num_cached) + prompt_positions = self._prompt_mrope_positions( + req_tokens, req.processed_inputs + ) + for axis in range(3): + position_axes[axis].extend( + prompt_positions[axis][num_cached : num_cached + compute_len] + ) + + if ( + req.processed_inputs is not None + and "image_bound" in req.processed_inputs + ): + image_bound = req.processed_inputs["image_bound"] + bounds = image_bound[0] + bounds = bounds[bounds[:, 1] > bounds[:, 0]] + num_cached_media = (bounds[:, 1] <= num_cached).sum().item() + if num_cached_media < len(bounds): + grids = [] + tensors = [] + offset = 0 + pixel_values = req.processed_inputs.get("pixel_values") + image_grid = req.processed_inputs.get("image_grid_thw") + if pixel_values is not None: + for media_idx, grid in enumerate(image_grid): + patch_count = int(grid.prod().item()) + if media_idx >= num_cached_media: + grids.append(grid) + tensors.append( + pixel_values[offset : offset + patch_count] + ) + offset += patch_count + pixel_values_videos = req.processed_inputs.get( + "pixel_values_videos" + ) + video_grid = req.processed_inputs.get("video_grid_thw") + if pixel_values_videos is not None: + offset = 0 + base = 0 if image_grid is None else len(image_grid) + for media_idx, grid in enumerate(video_grid): + patch_count = int(grid.prod().item()) + if base + media_idx >= num_cached_media: + grids.append(grid) + tensors.append( + pixel_values_videos[ + offset : offset + patch_count + ] + ) + offset += patch_count + if tensors: + pixel_tensor = torch.cat(tensors, dim=0).to( + self.pixel_values_dtype + ) + grid_tensor = torch.stack(grids).to(torch.int64) + bound = ( + bounds[num_cached_media:].unsqueeze(0).to(torch.int64) + ) + mm_data.setdefault("pixel_values", []).append( + infinicore.from_torch(pixel_tensor) + ) + mm_data.setdefault("tgt_sizes", []).append( + infinicore.from_torch(grid_tensor) + ) + mm_data.setdefault("image_bound", []).append( + infinicore.from_torch(bound) + ) + mm_data.setdefault("image_req_ids", []).append(req_id) + if pixel_values_videos is not None: + for start, end in bound.squeeze(0).tolist(): + if end > start: + packed_start = seq_offsets[-2] + int(start) + packed_end = seq_offsets[-2] + int(end) + mm_data.setdefault( + "visual_token_ranges", [] + ).extend([packed_start, packed_end]) + else: + seq_len = req.get_total_length() + last_token = ( + req.generated_token_ids[-1] + if req.generated_token_ids + else req.prompt_token_ids[-1] + ) + tokens.append(last_token) + seq_lens.append(seq_len) + current_offset += 1 + seq_offsets.append(current_offset) + cached_lens.append(num_cached) + mrope_delta = self._mrope_delta( + req.prompt_token_ids, req.processed_inputs + ) + decode_pos = seq_len - 1 + mrope_delta + for axis in range(3): + position_axes[axis].append(decode_pos) + + if req.slot_mapping: + padded_slot_mapping = req.slot_mapping + else: + padded_slot_mapping = ( + list(range(cu_seqlens[-1], cu_seqlens[-1] + len(tokens_to_compute))) + if scheduler_output.is_prefill + else [current_offset - 1] + ) + slot_mapping.extend(padded_slot_mapping) + + padded_block_table = req.block_table + [-1] * ( + max_block_table_len - len(req.block_table) + ) + block_tables.append(padded_block_table) + cu_seqlens.append(cu_seqlens[-1] + seq_lens[-1]) + + return { + "input_ids": infinicore.from_list([tokens], dtype=infinicore.int64), + "position_ids": infinicore.from_list(position_axes, dtype=infinicore.int64), + "past_kv_lengths": infinicore.from_list( + cached_lens, dtype=infinicore.int32 + ), + "total_kv_lengths": infinicore.from_list(seq_lens, dtype=infinicore.int32), + "input_offsets": infinicore.from_list(seq_offsets, dtype=infinicore.int32), + "cu_seqlens": infinicore.from_list(cu_seqlens, dtype=infinicore.int32), + "block_tables": infinicore.from_list(block_tables, dtype=infinicore.int32), + "slot_mapping": infinicore.from_list(slot_mapping, dtype=infinicore.int64), + "temperature": temperature, + "top_k": top_k, + "top_p": top_p, + **mm_data, + } + + @override + def get_tokenizer(self): + return self.tokenizer + + @override + def get_mm_token_index_list( + self, prompt_token_ids, image_ids=None, video_ids=None, **kwargs + ): + ids = list(image_ids or []) + list(video_ids or []) + out = [] + media_idx = 0 + start = None + current = None + for i, token_id in enumerate(prompt_token_ids): + if token_id in (self.image_token_id, self.video_token_id): + if start is None: + start = i + current = token_id + elif current != token_id: + out.append( + { + "start_index": start, + "end_index": i, + "identifier": ids[media_idx], + } + ) + media_idx += 1 + start = i + current = token_id + elif start is not None: + out.append( + {"start_index": start, "end_index": i, "identifier": ids[media_idx]} + ) + media_idx += 1 + start = None + current = None + if start is not None: + out.append( + { + "start_index": start, + "end_index": len(prompt_token_ids), + "identifier": ids[media_idx], + } + ) + return out