Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion csrc/engine/infer_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
to_device_vec(pixel_values),
to_device_vec(image_bound),
to_device_vec(tgt_sizes),
visual_token_ranges,
};

infinilm::global_state::get_forward_context().attn_metadata = {
Expand All @@ -167,7 +168,8 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
input.slot_mapping};

global_state::get_forward_context().mm_metadata = {
image_req_ids};
image_req_ids,
visual_token_ranges};

return input;
}
Expand Down
2 changes: 2 additions & 0 deletions csrc/engine/rank_worker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ class RankWorker {
std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
/// req_id for each pixel_values among a batch
std::optional<std::vector<size_t>> image_req_ids;
/// Flattened [start, end) visual token ranges in the packed language sequence.
std::optional<std::vector<size_t>> visual_token_ranges;

float temperature{1};

Expand Down
2 changes: 2 additions & 0 deletions csrc/global_state/forward_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ struct AttentionMetadata {

struct MultiModalMetadata {
std::optional<std::vector<size_t>> image_req_ids;
// Flattened [start, end) token ranges in the current packed language sequence.
std::optional<std::vector<size_t>> visual_token_ranges;
Comment thread
wooway777 marked this conversation as resolved.
};

struct ForwardContext {
Expand Down
2 changes: 2 additions & 0 deletions csrc/models/infinilm_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ class InfinilmModel : public infinicore::nn::Module {
/// Target patch sizes for each image (MiniCPM-V).
/// Vector of tensors shape: [n_path, 2] if pre-flattened.
std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
/// Flattened [start, end) visual token ranges in the packed language sequence.
std::optional<std::vector<size_t>> visual_token_ranges;
};

struct Output {
Expand Down
469 changes: 469 additions & 0 deletions csrc/models/videonsa/videonsa_attention.cpp

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions csrc/models/videonsa/videonsa_attention.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#pragma once

#include "../../layers/attention/attention.hpp"
#include <array>

#include <optional>

namespace infinilm::models::videonsa {

/// Attention layer for the VideoNSA text decoder.
///
/// Extends the shared `infinilm::layers::attention::Attention` layer with two
/// extra replicated linear projections and several lazily populated runtime
/// caches. The implementation lives in videonsa_attention.cpp.
class VideoNSAAttention : public infinilm::layers::attention::Attention {
public:
/// @param model_config Model configuration this layer is built from.
/// @param layer_idx    Index of the decoder layer that owns this attention module.
/// @param device       Device the layer is created for.
VideoNSAAttention(std::shared_ptr<infinilm::config::ModelConfig> model_config,
size_t layer_idx,
const infinicore::Device &device);

/// Runs attention for `hidden_states` using `positions` and returns the result tensor.
/// Declared const; any runtime caching goes through the `mutable` members below.
infinicore::Tensor forward(const infinicore::Tensor &positions,
const infinicore::Tensor &hidden_states) const;

/// Hook overridden from the base layer; by its name it runs once weights are loaded.
void process_weights_after_loading() override;
/// Hook overridden from the base layer. It is const, so it can only touch the
/// `mutable` members. NOTE(review): presumably clears the caches below — confirm in the .cpp.
void reset_runtime_state() const override;

protected:
// Two additional replicated linear sub-modules.
// NOTE(review): presumably the NSA gating projections — confirm in the .cpp.
INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, g_proj_1);
INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, g_proj_2);

// Runtime caches are `mutable` so that const member functions can update them.
// NOTE(review): "cmp" presumably stands for the compressed K/V branch — confirm.
mutable std::optional<infinicore::Tensor> nsa_k_cmp_cache_;
mutable std::optional<infinicore::Tensor> nsa_v_cmp_cache_;
// Starts false. NOTE(review): presumably flags the two caches above as valid — confirm.
mutable bool nsa_cmp_cache_ready_ = false;
// NOTE(review): presumably sin/cos tables for multimodal RoPE, built on demand — confirm.
mutable std::optional<infinicore::Tensor> mrope_sin_cache_;
mutable std::optional<infinicore::Tensor> mrope_cos_cache_;
// Three-way section split for mrope; defaults to all zeros until configured.
std::array<int, 3> mrope_section_ = {0, 0, 0};
size_t mrope_rotary_dim_ = 0;
bool mrope_interleaved_ = false;
// NOTE(review): presumably the head count before any tensor-parallel split — confirm.
size_t total_num_attention_heads_ = 0;
size_t max_position_embeddings_ = 0;
};

} // namespace infinilm::models::videonsa
174 changes: 174 additions & 0 deletions csrc/models/videonsa/videonsa_for_conditional_generation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#include "videonsa_for_conditional_generation.hpp"
#include "../../global_state/global_state.hpp"
#include "../models_registry.hpp"
#include "infinicore/ops/cat.hpp"
#include <stdexcept>
#include <string>

namespace infinilm::models::videonsa {

namespace {

/// Derives a standalone text-model configuration from the full model config.
///
/// Copies the nested `text_config` object when the root config has one,
/// otherwise copies the root config itself. The copy is then normalised:
///  - `model_type` is forced to "videonsa";
///  - `torch_dtype` is inherited from the root config when the copy lacks it;
///  - `head_dim` defaults to `hidden_size / num_attention_heads`;
///  - `attention_bias` defaults to true.
/// The config held by `model_config` is not written to by this function.
///
/// @param model_config Full (possibly multimodal) model configuration.
/// @return A newly allocated ModelConfig built from the normalised copy.
std::shared_ptr<infinilm::config::ModelConfig> text_config_from(std::shared_ptr<infinilm::config::ModelConfig> model_config) {
    nlohmann::json &root = model_config->get_config_json();

    // Work on a copy so the caller's configuration stays untouched.
    const bool has_nested_text_config = root.contains("text_config");
    nlohmann::json text_json = has_nested_text_config ? root["text_config"] : root;

    text_json["model_type"] = "videonsa";

    const bool dtype_only_on_root = !text_json.contains("torch_dtype") && root.contains("torch_dtype");
    if (dtype_only_on_root) {
        text_json["torch_dtype"] = root["torch_dtype"];
    }

    if (!text_json.contains("head_dim")) {
        const size_t hidden_size = text_json["hidden_size"].get<size_t>();
        const size_t num_heads = text_json["num_attention_heads"].get<size_t>();
        text_json["head_dim"] = hidden_size / num_heads;
    }

    if (!text_json.contains("attention_bias")) {
        text_json["attention_bias"] = true;
    }

    return std::make_shared<infinilm::config::ModelConfig>(text_json);
}

} // namespace

/// Builds the three sub-modules of the model: the text decoder (`model`),
/// the vision encoder (`visual`) and the output projection (`lm_head`).
///
/// @param model_config Full model configuration; its `vision_config` entry is
///                     handed to the vision encoder, and the text settings are
///                     derived from it via text_config_from().
/// @param device       Device on which the sub-modules are created.
VideoNSAForConditionalGeneration::VideoNSAForConditionalGeneration(std::shared_ptr<infinilm::config::ModelConfig> model_config,
                                                                   const infinicore::Device &device) {
    model_config_ = model_config;

    // Language-model hyperparameters and dtype come from the derived text
    // config, not from the root config.
    auto text_config = text_config_from(model_config);
    const size_t embed_dim = text_config->get<size_t>("hidden_size");
    const size_t vocab = text_config->get<size_t>("vocab_size");
    const auto &param_dtype = text_config->get_dtype();

    INFINICORE_NN_MODULE_INIT(model, text_config, device);
    INFINICORE_NN_MODULE_INIT(visual, model_config->get_config_json()["vision_config"], param_dtype, device);
    // `false` is forwarded as-is; NOTE(review): presumably the bias flag of the linear layer — confirm.
    INFINICORE_NN_MODULE_INIT(lm_head, embed_dim, vocab, false, param_dtype, device);
}

/// Overwrites placeholder token embeddings with vision encoder outputs.
///
/// `image_bound` is read on the CPU as rows of `[start, end)` int64 pairs.
/// For each non-empty pair, the next `end - start` rows of `vision_hidden`
/// are copied into rows `[start, end)` of `inputs_embeds` (after dropping its
/// leading dimension). Pairs with `end <= start` are skipped and consume no
/// vision rows.
///
/// @param inputs_embeds Embeddings of a single request; written in place.
///                      Assumes a leading dimension of size 1 — TODO confirm.
/// @param vision_hidden Vision features, one row per visual token (dim 0).
/// @param image_bound   Token ranges to overwrite, relative to `inputs_embeds`.
/// @throws std::runtime_error if a range is negative, runs past the end of
///         `inputs_embeds`, or needs more rows than `vision_hidden` still has.
void VideoNSAForConditionalGeneration::replace_embeddings(infinicore::Tensor inputs_embeds,
                                                          const infinicore::Tensor &vision_hidden,
                                                          const infinicore::Tensor &image_bound) const {
    auto bounds_cpu = image_bound->to(infinicore::Device::cpu());
    auto out_slice = inputs_embeds->squeeze(0);
    auto bound_slice = bounds_cpu->squeeze(0);

    const size_t bound_count = static_cast<size_t>(bound_slice->size(0));
    const size_t seq_len = static_cast<size_t>(out_slice->size(0));
    const size_t vision_rows = static_cast<size_t>(vision_hidden->size(0));

    size_t vision_offset = 0;
    for (size_t i = 0; i < bound_count; ++i) {
        auto bound = bound_slice->narrow({{0, i, 1}});
        const auto *bound_ptr = reinterpret_cast<const int64_t *>(bound->data());
        const int64_t start = bound_ptr[0];
        const int64_t end = bound_ptr[1];
        if (end <= start) {
            continue;
        }

        // A negative start would wrap to a huge size_t below, so reject it
        // here. Once start >= 0 and end > start, both casts are safe.
        if (start < 0 || static_cast<size_t>(end) > seq_len) {
            throw std::runtime_error("VideoNSAForConditionalGeneration: image_bound range [" + std::to_string(start) + ", " + std::to_string(end) + ") is outside the request of length " + std::to_string(seq_len));
        }

        const size_t len = static_cast<size_t>(end - start);
        // vision_offset never exceeds vision_rows, so the subtraction cannot underflow.
        if (len > vision_rows - vision_offset) {
            throw std::runtime_error("VideoNSAForConditionalGeneration: image_bound needs " + std::to_string(vision_offset + len) + " vision tokens but only " + std::to_string(vision_rows) + " are available");
        }

        auto patch_embed = vision_hidden->narrow({{0, vision_offset, len}});
        out_slice->narrow({{0, static_cast<size_t>(start), len}})->copy_from(patch_embed);
        vision_offset += len;
    }
}

/// Runs one forward pass and returns the LM-head logits.
///
/// Without pixel values the input goes straight through the text model.
/// With pixel values:
///  1. token embeddings are computed from `input_ids`;
///  2. all media items are encoded by the vision model in one batched call;
///  3. for each media item, its slice of the vision output replaces the
///     embeddings at the positions listed in its `image_bound`, inside the
///     request selected by `image_req_ids` and `input_offsets`;
///  4. the text model runs on the patched embeddings.
///
/// @param input Model input. In the multimodal path `input_ids`,
///              `input_offsets`, `position_ids`, `image_bound` and `tgt_sizes`
///              must be set, and the forward context's
///              `mm_metadata.image_req_ids` must hold one entry per media item.
/// @return Output holding the logits.
/// @throws std::runtime_error if a required field is missing, the per-media
///         lists differ in length, or the vision encoder returns fewer tokens
///         than the image bounds require.
infinilm::InfinilmModel::Output VideoNSAForConditionalGeneration::forward(const infinilm::InfinilmModel::Input &input) const {
    const bool has_media = input.pixel_values.has_value() && !input.pixel_values->empty();
    if (!has_media) {
        // Text-only step: no embedding patching needed.
        auto hidden_states = model_->forward(input);
        auto logits = lm_head_->forward(hidden_states);
        return {logits};
    }

    // Validate every required field up front so a malformed request fails with
    // a descriptive error before any encoder work is done.
    if (!input.image_bound.has_value() || !input.tgt_sizes.has_value()) {
        throw std::runtime_error("VideoNSAForConditionalGeneration: image_bound and tgt_sizes must be provided with pixel_values");
    }
    if (!input.input_ids.has_value() || !input.input_offsets.has_value() || !input.position_ids.has_value()) {
        throw std::runtime_error("VideoNSAForConditionalGeneration: input_ids, input_offsets and position_ids must be provided with pixel_values");
    }
    const auto &mm_metadata = global_state::get_forward_context().mm_metadata;
    if (!mm_metadata.image_req_ids.has_value()) {
        throw std::runtime_error("VideoNSAForConditionalGeneration: mm_metadata.image_req_ids must be set with pixel_values");
    }

    const auto &pixel_values = input.pixel_values.value();
    const auto &image_bounds = input.image_bound.value();
    const auto &tgt_sizes = input.tgt_sizes.value();
    const auto &image_req_ids = mm_metadata.image_req_ids.value();
    const size_t media_count = image_req_ids.size();
    if (pixel_values.size() != media_count || image_bounds.size() != media_count || tgt_sizes.size() != media_count) {
        throw std::runtime_error("VideoNSAForConditionalGeneration: multimodal tensor lists must match image_req_ids");
    }

    auto input_ids = input.input_ids.value();
    auto inputs_embeds = model_->embed_tokens(input_ids);
    auto input_offsets_cpu = input.input_offsets.value()->to(infinicore::Device::cpu());
    // Read as int32 request boundaries: request r covers [offsets[r], offsets[r + 1]).
    // NOTE(review): req_id is not range-checked against the offsets buffer here.
    const auto *offsets = reinterpret_cast<const int32_t *>(input_offsets_cpu->data());

    // A single tensor is used directly; several are concatenated along dim 0.
    // The lists are consumed in place, without per-element copies.
    const auto concat_dim0 = [](const std::vector<infinicore::Tensor> &tensors) {
        return tensors.size() == 1 ? tensors.front() : infinicore::op::cat(tensors, 0);
    };
    auto batched_pixels = concat_dim0(pixel_values);
    auto batched_grids = concat_dim0(tgt_sizes);
    auto batched_vision_hidden = visual_->forward(batched_pixels, batched_grids);
    const size_t vision_rows = static_cast<size_t>(batched_vision_hidden->size(0));

    size_t vision_offset = 0;
    for (size_t media_idx = 0; media_idx < media_count; ++media_idx) {
        const size_t req_id = image_req_ids[media_idx];
        const auto &media_bound = image_bounds.at(media_idx);

        // Sum the non-empty [start, end) ranges to find how many vision tokens
        // belong to this media item.
        auto bounds_cpu = media_bound->to(infinicore::Device::cpu())->squeeze(0);
        const size_t bound_count = static_cast<size_t>(bounds_cpu->size(0));
        const auto *bounds = reinterpret_cast<const int64_t *>(bounds_cpu->data());
        size_t vision_len = 0;
        for (size_t i = 0; i < bound_count; ++i) {
            const int64_t start = bounds[i * 2];
            const int64_t end = bounds[i * 2 + 1];
            if (end > start) {
                vision_len += static_cast<size_t>(end - start);
            }
        }

        // vision_offset never exceeds vision_rows, so the subtraction cannot underflow.
        if (vision_len > vision_rows - vision_offset) {
            throw std::runtime_error("VideoNSAForConditionalGeneration: vision encoder produced " + std::to_string(vision_rows) + " tokens but image_bound requires at least " + std::to_string(vision_offset + vision_len));
        }

        auto vision_hidden = batched_vision_hidden->narrow({{0, vision_offset, vision_len}});
        const size_t req_start = static_cast<size_t>(offsets[req_id]);
        const size_t req_len = static_cast<size_t>(offsets[req_id + 1] - offsets[req_id]);
        auto req_embeds = inputs_embeds->narrow({{1, req_start, req_len}});
        replace_embeddings(req_embeds, vision_hidden, media_bound);
        vision_offset += vision_len;
    }

    auto hidden_states = model_->forward_embeds(inputs_embeds, input.position_ids.value());
    auto logits = lm_head_->forward(hidden_states);
    return {logits};
}

/// Replaces the KV cache with one allocated for `cache_config`.
///
/// A null `cache_config` is delegated unchanged to InfinilmModel::reset_cache.
/// Otherwise a private copy of the config is stored and the forward context's
/// KV cache tensors are rebuilt from the text config (not the root multimodal
/// config) and the configured attention backend.
///
/// @param cache_config New cache configuration, or nullptr.
void VideoNSAForConditionalGeneration::reset_cache(const cache::CacheConfig *cache_config) {
    if (cache_config == nullptr) {
        InfinilmModel::reset_cache(nullptr);
        return;
    }

    cache_config_ = cache_config->unique_copy();

    auto text_config = text_config_from(model_config_);

    // The existing entries are cleared before the replacement is allocated.
    auto &kv_caches = infinilm::global_state::get_forward_context().kv_cache_vec;
    kv_caches.clear();

    const backends::AttentionBackend backend = infinilm::global_state::get_infinilm_config().attention_backend;
    kv_caches = std::move(default_allocate_kv_cache_tensors(cache_config, text_config, backend));
}

/// Normalises a VideoNSA model configuration in place and returns it.
///
/// Fills in defaults the rest of the model code relies on:
///  - with a nested `text_config`: `head_dim` and `attention_bias` are
///    defaulted inside it, and the root `torch_dtype` is inherited from it
///    when the root has none;
///  - without one: `head_dim` and `attention_bias` are defaulted on the root.
/// `head_dim` defaults to `hidden_size / num_attention_heads` and
/// `attention_bias` defaults to true.
///
/// @param model_config Configuration whose `model_type` must be "videonsa".
/// @return The same `model_config`, after modification.
/// @throws std::runtime_error if `model_type` is not "videonsa".
std::shared_ptr<infinilm::config::ModelConfig> create_videonsa_model_config(std::shared_ptr<infinilm::config::ModelConfig> model_config) {
    const std::string &declared_type = model_config->get<std::string>("model_type");
    if (declared_type != "videonsa") {
        throw std::runtime_error("infinilm::models::videonsa::create_videonsa_model_config: model_type is not videonsa");
    }

    // Shared default for both config layouts.
    const auto default_attention_bias = [](nlohmann::json &cfg) {
        if (!cfg.contains("attention_bias")) {
            cfg["attention_bias"] = true;
        }
    };

    nlohmann::json &root = model_config->get_config_json();

    if (!root.contains("text_config")) {
        // Flat layout: the text settings live directly on the root object.
        if (!root.contains("head_dim")) {
            root["head_dim"] = model_config->get<size_t>("hidden_size") / model_config->get<size_t>("num_attention_heads");
        }
        default_attention_bias(root);
        return model_config;
    }

    // Nested layout: patch the text_config object in place.
    nlohmann::json &text = root["text_config"];
    if (!text.contains("head_dim")) {
        text["head_dim"] = text["hidden_size"].get<size_t>() / text["num_attention_heads"].get<size_t>();
    }
    default_attention_bias(text);

    const bool dtype_only_in_text = !root.contains("torch_dtype") && text.contains("torch_dtype");
    if (dtype_only_in_text) {
        root["torch_dtype"] = text["torch_dtype"];
    }
    return model_config;
}

} // namespace infinilm::models::videonsa

// File-local registration of the VideoNSA model. The macro receives the key
// `videonsa`, the model class, and the config factory defined above.
// NOTE(review): presumably this makes configs with model_type "videonsa"
// resolve to this class — confirm against models_registry.hpp.
namespace {
INFINILM_REGISTER_CAUSAL_LM_MODEL(
videonsa,
infinilm::models::videonsa::VideoNSAForConditionalGeneration,
infinilm::models::videonsa::create_videonsa_model_config);
} // namespace
33 changes: 33 additions & 0 deletions csrc/models/videonsa/videonsa_for_conditional_generation.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

#include "../../layers/common_modules.hpp"
#include "videonsa_attention.hpp"
#include "videonsa_vision.hpp"

namespace infinilm::models::videonsa {

// The text decoder is assembled from the shared causal-LM templates; only the
// attention layer is VideoNSA-specific.
using VideoNSAMLP = infinilm::layers::MLP;
using VideoNSADecoderLayer = infinilm::layers::causal_lm_templates::TextDecoderLayer<VideoNSAAttention, VideoNSAMLP>;
using VideoNSATextModel = infinilm::layers::causal_lm_templates::TextModel<VideoNSADecoderLayer>;

/// Multimodal VideoNSA model: a vision encoder, a text decoder and an LM head.
class VideoNSAForConditionalGeneration : public InfinilmModel {
public:
/// @param model_config Full model configuration (text and vision settings).
/// @param device       Device on which the sub-modules are created.
VideoNSAForConditionalGeneration(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device);

/// Runs one forward pass for `input` and returns the model output.
Output forward(const Input &input) const override;
/// Resets the cache for `cache_config`; overrides the InfinilmModel hook.
void reset_cache(const cache::CacheConfig *cache_config) override;

protected:
/// Helper that writes `vision_hidden` into `inputs_embeds` as directed by
/// `image_bound`. `inputs_embeds` is taken by value.
/// NOTE(review): presumably a handle sharing storage, so the write is visible
/// to the caller — confirm against infinicore::Tensor.
void replace_embeddings(infinicore::Tensor inputs_embeds,
const infinicore::Tensor &vision_hidden,
const infinicore::Tensor &image_bound) const;

// Sub-modules: text decoder, vision encoder, and output projection.
INFINICORE_NN_MODULE(VideoNSATextModel, model);
INFINICORE_NN_MODULE(VideoNSAVisionModel, visual);
INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head);
};

/// Config factory for the VideoNSA model: takes a model configuration and
/// returns a model configuration. Defined in the matching .cpp file.
std::shared_ptr<infinilm::config::ModelConfig> create_videonsa_model_config(std::shared_ptr<infinilm::config::ModelConfig> model_config);

} // namespace infinilm::models::videonsa
Loading