diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index 46c1485f663..b68b55d1e81 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -24,6 +24,8 @@ GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t b GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft); +GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer); + // device buffer GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device); diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4806b90894b..f7052bfc823 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -79,6 +79,17 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, add_extra_inputs(); } +void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) { + m_cgraph = cgraph; + m_model_inputs.clear(); + m_model_outputs.clear(); + m_node_info_list.clear(); + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + auto * cur_node = cgraph->nodes[node_n]; + set_input_output(cur_node); + } +} + GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights) { m_cgraph = cgraph; m_model_weights = model_weights; @@ -330,6 +341,7 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr auto * mask = node->src[3]; std::string mask_name(mask->name); + model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer); if (mask_name.find("swa") != std::string::npos) { model_params.swa_layers.push_back(layer); model_params.ctx_per_seq_swa = cache_k->ne[1]; @@ -358,7 +370,7 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr break; } if (node->op == GGML_OP_ROPE) { - model_params.rope_params = node->op_params; + memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15); } } auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1]; @@ 
-405,7 +417,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co // kvcache input_shape = ov::PartialShape{get_shape(input)}; if (!m_is_static) { - // do not fix ctx size to make llama-bench work + // do not fix ctx size to make llama-bench work across test params input_shape[2] = -1; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 260cc0cedbb..c8e3edeaf89 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -5,6 +5,7 @@ #include "openvino/decoder.hpp" #include +#include #include #include #include @@ -20,20 +21,21 @@ struct ModelParams { int n_heads = -1; int n_heads_kv = -1; int head_size = -1; - int32_t * rope_params = nullptr; + int32_t rope_params[15]; std::vector swa_layers; std::vector kv_names; + size_t kv_buffer_ctx_id = 0; - bool operator==(const ModelParams & other) const { - return n_seq == other.n_seq && n_heads == other.n_heads && n_heads_kv == other.n_heads_kv && - head_size == other.head_size && rope_params == other.rope_params && swa_layers == other.swa_layers && - ctx_per_seq == other.ctx_per_seq && ctx_per_seq_swa == other.ctx_per_seq_swa; + bool same_rope_params(const ModelParams & other) const { + return memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0; } - bool can_reuse_dynamically(const ModelParams & other) const { return *this == other; } + bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); } - bool can_reuse_statically(const ModelParams & other) const { return *this == other; } + bool can_reuse_statically(const ModelParams & other) const { return same_rope_params(other) && ctx == other.ctx; } + + bool kv_buffer_changed(const ModelParams & other) const { return kv_buffer_ctx_id != other.kv_buffer_ctx_id; } }; struct ComputeParams { @@ -170,7 +172,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { int get_input_len() const { return 
m_compute_params.input_len; } - virtual int32_t * get_rope_params() const override { return m_model_params.rope_params; } + virtual int32_t * get_rope_params() const override { return const_cast(m_model_params.rope_params); } virtual std::map get_kv_param_res_names() const override; @@ -213,6 +215,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { static std::string compute_op_type(const ggml_tensor * node); void add_extra_inputs(); + void update_io(ggml_cgraph * cgraph); + inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) { return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE; } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index b2d5234083b..87577dde9c7 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -8,6 +8,7 @@ #include "ggml-quants.hpp" #include "ggml.h" +#include #include #include #include @@ -53,6 +54,7 @@ struct ggml_backend_openvino_buffer_context { int device; std::string name; + size_t id; // For non-weight buffers (KV cache, compute), we still use contiguous allocation void * data; @@ -71,6 +73,10 @@ struct ggml_backend_openvino_buffer_context { ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) : device(device), name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)), + id([]() { + static std::atomic next_id{1}; + return next_id.fetch_add(1); + }()), data(nullptr), size(size), is_remote(is_remote) { @@ -107,6 +113,8 @@ struct ggml_backend_openvino_buffer_context { ~ggml_backend_openvino_buffer_context() { // Clean up all tensor extras + GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device, + size / 1024 / 1024); for (auto & pair : tensor_extras) { delete pair.second; } @@ -587,6 +595,14 @@ bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) { return 
buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer; } +size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) { + if (!ggml_backend_buffer_is_openvino(buffer)) { + return 0; + } + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + return ctx->id; +} + bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) { return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 83d3b3afee2..69cac19019c 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -76,7 +76,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ComputeParams c_params; std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static); - const auto key = compute_graph_key(cgraph); + graph_key key(cgraph); bool cache_hit; int64_t decoder_end_time; @@ -90,19 +90,22 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin auto it = decoder_cache.find(key); cache_hit = it != decoder_cache.end(); + ModelParams old_m_params; if (cache_hit) { ggml_decoder = it->second; - cache_hit = ggml_decoder->get_model_params().can_reuse_dynamically(m_params); + old_m_params = ggml_decoder->get_model_params(); + cache_hit = old_m_params.can_reuse_dynamically(m_params); } if (cache_hit) { std::map> model_weights; - ggml_decoder = decoder_cache[key]; ggml_decoder->set_compute_params(c_params); ggml_decoder->set_model_params(m_params); + if (old_m_params.kv_buffer_changed(m_params)) { + ggml_decoder->update_io(cgraph); + } ggml_decoder->add_extra_inputs(); - infer_request = infer_request_cache[key]; - + infer_request = infer_request_cache.at(key); if (stateful) { const auto * inp_pos = get_inp_pos_tensor(cgraph); int32_t * pos_data = (int32_t *) inp_pos->data; @@ -240,7 +243,7 @@ enum ggml_status 
ov_graph_compute_static(ggml_cgraph * cgraph) { const auto * inp_pos = get_inp_pos_tensor(cgraph); const auto is_prefill = get_is_prefill(inp_pos); - const auto key = compute_graph_key(cgraph); + graph_key key(cgraph); bool cache_hit; int64_t decoder_end_time; @@ -254,19 +257,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { auto it = decoder_cache.find(key); cache_hit = it != decoder_cache.end(); + ModelParams old_m_params; if (cache_hit) { ggml_decoder = it->second; - cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params); + old_m_params = ggml_decoder->get_model_params(); + cache_hit = old_m_params.can_reuse_statically(m_params); } if (cache_hit) { std::map> model_weights; - ggml_decoder = decoder_cache[key]; ggml_decoder->m_is_prefill = is_prefill; ggml_decoder->set_model_params(m_params); ggml_decoder->set_compute_params(c_params); + if (old_m_params.kv_buffer_changed(m_params)) { + ggml_decoder->update_io(cgraph); + } ggml_decoder->add_extra_inputs(); - infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key]; + infer_request = is_prefill ? 
infer_request_cache_prefill.at(key) : infer_request_cache.at(key); decoder_end_time = ggml_time_us(); conversion_end_time = decoder_end_time; @@ -761,17 +768,4 @@ bool get_is_prefill(const ggml_tensor * inp_pos) { return inp_pos->ne[0] > 1; } -graph_key compute_graph_key(ggml_cgraph * cgraph) { - graph_key key; - key.n_nodes = cgraph->n_nodes; - - for (int i = 0; i < cgraph->n_nodes; ++i) { - const auto * node = cgraph->nodes[i]; - if (node->op == GGML_OP_SET_ROWS && strncmp(node->src[2]->name, "cache_k_l0", 10) == 0) { - key.cache_k_l0 = node->src[2]; - } - } - return key; -} - #pragma GCC diagnostic pop diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 72ef904f741..7c403b7d890 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -5,20 +5,33 @@ #include #include #include +#include struct graph_key { - size_t n_nodes; - void * cache_k_l0; + int n_nodes; + std::string first_node_name; + std::string last_node_name; + + graph_key(const ggml_cgraph * cgraph) : n_nodes(cgraph->n_nodes) { + if (n_nodes > 0) { + first_node_name = cgraph->nodes[0]->name; + last_node_name = cgraph->nodes[n_nodes - 1]->name; + } + } bool operator==(const graph_key & other) const { - return n_nodes == other.n_nodes && cache_k_l0 == other.cache_k_l0; + return n_nodes == other.n_nodes && first_node_name == other.first_node_name && + last_node_name == other.last_node_name; } }; struct graph_key_hash { size_t operator()(const graph_key & key) const { - size_t h = std::hash{}(key.n_nodes); - h ^= std::hash{}(key.cache_k_l0) + 0x9e3779b9 + (h << 6) + (h >> 2); + size_t h = std::hash{}(key.n_nodes); + if (key.n_nodes > 0) { + h ^= std::hash{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + } return h; } }; @@ -66,8 +79,6 @@ const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph); bool get_is_prefill(const ggml_tensor * inp_pos); -graph_key 
compute_graph_key(struct ggml_cgraph * cgraph); - ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name); ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);