Skip to content

Commit e92c699

Browse files
committed
Avoid re-compilation in llama-bench
1 parent 6817c80 commit e92c699

6 files changed

Lines changed: 78 additions & 39 deletions

File tree

ggml/include/ggml-openvino.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,8 @@ GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t b
2424

2525
GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);
2626

27+
GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer);
28+
2729
// device buffer
2830
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);
2931

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,17 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
7979
add_extra_inputs();
8080
}
8181

82+
// Re-points this decoder at `cgraph` and rebuilds its per-node I/O records.
// Stores the graph pointer, empties m_model_inputs, m_model_outputs and
// m_node_info_list, then calls set_input_output() on every node in graph order.
void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
83+
m_cgraph = cgraph;
84+
// Discard the records collected for the previously bound graph.
// NOTE(review): presumably set_input_output() below repopulates these containers - confirm.
m_model_inputs.clear();
85+
m_model_outputs.clear();
86+
m_node_info_list.clear();
87+
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
88+
auto * cur_node = cgraph->nodes[node_n];
89+
set_input_output(cur_node);
90+
}
91+
}
92+
8293
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
8394
m_cgraph = cgraph;
8495
m_model_weights = model_weights;
@@ -330,6 +341,7 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
330341
auto * mask = node->src[3];
331342
std::string mask_name(mask->name);
332343

344+
model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
333345
if (mask_name.find("swa") != std::string::npos) {
334346
model_params.swa_layers.push_back(layer);
335347
model_params.ctx_per_seq_swa = cache_k->ne[1];
@@ -358,7 +370,7 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
358370
break;
359371
}
360372
if (node->op == GGML_OP_ROPE) {
361-
model_params.rope_params = node->op_params;
373+
memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
362374
}
363375
}
364376
auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
@@ -405,7 +417,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
405417
// kvcache
406418
input_shape = ov::PartialShape{get_shape(input)};
407419
if (!m_is_static) {
408-
// do not fix ctx size to make llama-bench work
420+
// do not fix ctx size to make llama-bench work across test params
409421
input_shape[2] = -1;
410422
}
411423

ggml/src/ggml-openvino/ggml-decoder.h

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
#include "openvino/decoder.hpp"
66

77
#include <cstdint>
8+
#include <cstring>
89
#include <map>
910
#include <memory>
1011
#include <openvino/core/partial_shape.hpp>
@@ -20,20 +21,21 @@ struct ModelParams {
2021
int n_heads = -1;
2122
int n_heads_kv = -1;
2223
int head_size = -1;
23-
int32_t * rope_params = nullptr;
24+
int32_t rope_params[15];
2425
std::vector<int> swa_layers;
2526

2627
std::vector<std::string> kv_names;
28+
size_t kv_buffer_ctx_id = 0;
2729

28-
bool operator==(const ModelParams & other) const {
29-
return n_seq == other.n_seq && n_heads == other.n_heads && n_heads_kv == other.n_heads_kv &&
30-
head_size == other.head_size && rope_params == other.rope_params && swa_layers == other.swa_layers &&
31-
ctx_per_seq == other.ctx_per_seq && ctx_per_seq_swa == other.ctx_per_seq_swa;
30+
bool same_rope_params(const ModelParams & other) const {
31+
return memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0;
3232
}
3333

34-
bool can_reuse_dynamically(const ModelParams & other) const { return *this == other; }
34+
bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); }
3535

36-
bool can_reuse_statically(const ModelParams & other) const { return *this == other; }
36+
bool can_reuse_statically(const ModelParams & other) const { return same_rope_params(other) && ctx == other.ctx; }
37+
38+
bool kv_buffer_changed(const ModelParams & other) const { return kv_buffer_ctx_id != other.kv_buffer_ctx_id; }
3739
};
3840

3941
struct ComputeParams {
@@ -170,7 +172,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
170172

171173
int get_input_len() const { return m_compute_params.input_len; }
172174

173-
virtual int32_t * get_rope_params() const override { return m_model_params.rope_params; }
175+
virtual int32_t * get_rope_params() const override { return const_cast<int32_t *>(m_model_params.rope_params); }
174176

175177
virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
176178

@@ -213,6 +215,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
213215
static std::string compute_op_type(const ggml_tensor * node);
214216
void add_extra_inputs();
215217

218+
void update_io(ggml_cgraph * cgraph);
219+
216220
// Returns true when `op` is a GGML_OP_GET_ROWS node, `tensor` is that node's
// src[1] operand, and the node's src[0] operand has op GGML_OP_NONE.
// NOTE(review): presumably this singles out the token-id input of an embedding
// lookup (src[0] being a plain weight tensor) - confirm against callers.
inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) {
217221
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE;
218222
}

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
#include "ggml-quants.hpp"
99
#include "ggml.h"
1010

11+
#include <atomic>
1112
#include <cstdint>
1213
#include <cstring>
1314
#include <memory>
@@ -53,6 +54,7 @@
5354
struct ggml_backend_openvino_buffer_context {
5455
int device;
5556
std::string name;
57+
size_t id;
5658

5759
// For non-weight buffers (KV cache, compute), we still use contiguous allocation
5860
void * data;
@@ -71,6 +73,10 @@ struct ggml_backend_openvino_buffer_context {
7173
ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) :
7274
device(device),
7375
name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
76+
id([]() {
77+
static std::atomic<size_t> next_id{1};
78+
return next_id.fetch_add(1);
79+
}()),
7480
data(nullptr),
7581
size(size),
7682
is_remote(is_remote) {
@@ -107,6 +113,8 @@ struct ggml_backend_openvino_buffer_context {
107113

108114
~ggml_backend_openvino_buffer_context() {
109115
// Clean up all tensor extras
116+
GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device,
117+
size / 1024 / 1024);
110118
for (auto & pair : tensor_extras) {
111119
delete pair.second;
112120
}
@@ -587,6 +595,14 @@ bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
587595
return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
588596
}
589597

598+
// Returns the `id` field of the OpenVINO buffer context attached to `buffer`.
// Returns 0 when ggml_backend_buffer_is_openvino(buffer) is false, i.e. the
// buffer's context is not a ggml_backend_openvino_buffer_context.
size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
599+
if (!ggml_backend_buffer_is_openvino(buffer)) {
600+
return 0;
601+
}
602+
// The check above is what makes this cast of the opaque context pointer valid.
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
603+
return ctx->id;
604+
}
605+
590606
// Reports whether `buft` is an OpenVINO buffer type by comparing its
// iface.get_name callback with ggml_backend_openvino_buffer_type_get_name.
bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
591607
return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
592608
}

ggml/src/ggml-openvino/utils.cpp

Lines changed: 16 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
7676
ComputeParams c_params;
7777
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
7878

79-
const auto key = compute_graph_key(cgraph);
79+
graph_key key(cgraph);
8080
bool cache_hit;
8181

8282
int64_t decoder_end_time;
@@ -90,19 +90,22 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
9090
auto it = decoder_cache.find(key);
9191

9292
cache_hit = it != decoder_cache.end();
93+
ModelParams old_m_params;
9394
if (cache_hit) {
9495
ggml_decoder = it->second;
95-
cache_hit = ggml_decoder->get_model_params().can_reuse_dynamically(m_params);
96+
old_m_params = ggml_decoder->get_model_params();
97+
cache_hit = old_m_params.can_reuse_dynamically(m_params);
9698
}
9799

98100
if (cache_hit) {
99101
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
100-
ggml_decoder = decoder_cache[key];
101102
ggml_decoder->set_compute_params(c_params);
102103
ggml_decoder->set_model_params(m_params);
104+
if (old_m_params.kv_buffer_changed(m_params)) {
105+
ggml_decoder->update_io(cgraph);
106+
}
103107
ggml_decoder->add_extra_inputs();
104-
infer_request = infer_request_cache[key];
105-
108+
infer_request = infer_request_cache.at(key);
106109
if (stateful) {
107110
const auto * inp_pos = get_inp_pos_tensor(cgraph);
108111
int32_t * pos_data = (int32_t *) inp_pos->data;
@@ -240,7 +243,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
240243

241244
const auto * inp_pos = get_inp_pos_tensor(cgraph);
242245
const auto is_prefill = get_is_prefill(inp_pos);
243-
const auto key = compute_graph_key(cgraph);
246+
graph_key key(cgraph);
244247
bool cache_hit;
245248

246249
int64_t decoder_end_time;
@@ -254,19 +257,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
254257
auto it = decoder_cache.find(key);
255258

256259
cache_hit = it != decoder_cache.end();
260+
ModelParams old_m_params;
257261
if (cache_hit) {
258262
ggml_decoder = it->second;
259-
cache_hit = ggml_decoder->get_model_params().can_reuse_statically(m_params);
263+
old_m_params = ggml_decoder->get_model_params();
264+
cache_hit = old_m_params.can_reuse_statically(m_params);
260265
}
261266

262267
if (cache_hit) {
263268
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
264-
ggml_decoder = decoder_cache[key];
265269
ggml_decoder->m_is_prefill = is_prefill;
266270
ggml_decoder->set_model_params(m_params);
267271
ggml_decoder->set_compute_params(c_params);
272+
if (old_m_params.kv_buffer_changed(m_params)) {
273+
ggml_decoder->update_io(cgraph);
274+
}
268275
ggml_decoder->add_extra_inputs();
269-
infer_request = is_prefill ? infer_request_cache_prefill[key] : infer_request_cache[key];
276+
infer_request = is_prefill ? infer_request_cache_prefill.at(key) : infer_request_cache.at(key);
270277

271278
decoder_end_time = ggml_time_us();
272279
conversion_end_time = decoder_end_time;
@@ -761,17 +768,4 @@ bool get_is_prefill(const ggml_tensor * inp_pos) {
761768
return inp_pos->ne[0] > 1;
762769
}
763770

764-
graph_key compute_graph_key(ggml_cgraph * cgraph) {
765-
graph_key key;
766-
key.n_nodes = cgraph->n_nodes;
767-
768-
for (int i = 0; i < cgraph->n_nodes; ++i) {
769-
const auto * node = cgraph->nodes[i];
770-
if (node->op == GGML_OP_SET_ROWS && strncmp(node->src[2]->name, "cache_k_l0", 10) == 0) {
771-
key.cache_k_l0 = node->src[2];
772-
}
773-
}
774-
return key;
775-
}
776-
777771
#pragma GCC diagnostic pop

ggml/src/ggml-openvino/utils.h

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -5,20 +5,33 @@
55
#include <algorithm>
66
#include <cstddef>
77
#include <openvino/runtime/core.hpp>
8+
#include <string>
89

910
struct graph_key {
10-
size_t n_nodes;
11-
void * cache_k_l0;
11+
int n_nodes;
12+
std::string first_node_name;
13+
std::string last_node_name;
14+
15+
graph_key(const ggml_cgraph * cgraph) : n_nodes(cgraph->n_nodes) {
16+
if (n_nodes > 0) {
17+
first_node_name = cgraph->nodes[0]->name;
18+
last_node_name = cgraph->nodes[n_nodes - 1]->name;
19+
}
20+
}
1221

1322
bool operator==(const graph_key & other) const {
14-
return n_nodes == other.n_nodes && cache_k_l0 == other.cache_k_l0;
23+
return n_nodes == other.n_nodes && first_node_name == other.first_node_name &&
24+
last_node_name == other.last_node_name;
1525
}
1626
};
1727

1828
struct graph_key_hash {
1929
size_t operator()(const graph_key & key) const {
20-
size_t h = std::hash<size_t>{}(key.n_nodes);
21-
h ^= std::hash<void *>{}(key.cache_k_l0) + 0x9e3779b9 + (h << 6) + (h >> 2);
30+
size_t h = std::hash<int>{}(key.n_nodes);
31+
if (key.n_nodes > 0) {
32+
h ^= std::hash<std::string>{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
33+
h ^= std::hash<std::string>{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2);
34+
}
2235
return h;
2336
}
2437
};
@@ -66,8 +79,6 @@ const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph);
6679

6780
bool get_is_prefill(const ggml_tensor * inp_pos);
6881

69-
graph_key compute_graph_key(struct ggml_cgraph * cgraph);
70-
7182
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
7283
ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
7384
const std::string & param_name);

0 commit comments

Comments
 (0)