From e0c377fb6807ea0bbc7aecae23a0544ac064ff42 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 22 Jan 2026 12:20:50 +0800 Subject: [PATCH 1/4] Fix llama-bench -p -n where p<=256 --- ggml/src/ggml-openvino/utils.cpp | 12 +++++------- ggml/src/ggml-openvino/utils.h | 9 +++------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index f7d62588c87..2d30eef941f 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -768,14 +768,12 @@ graph_key compute_graph_key(ggml_cgraph * cgraph) { graph_key key; key.n_nodes = cgraph->n_nodes; - if (cgraph->n_nodes > 0) { - key.first_node_name = std::string(cgraph->nodes[0]->name); - key.last_node_name = std::string(cgraph->nodes[cgraph->n_nodes - 1]->name); - } else { - key.first_node_name = ""; - key.last_node_name = ""; + for (int i = 0; i < cgraph->n_nodes; ++i) { + const auto * node = cgraph->nodes[i]; + if (node->op == GGML_OP_SET_ROWS && strncmp(node->src[2]->name, "cache_k_l0", 10) == 0) { + key.cache_k_l0 = node->src[2]; + } } - return key; } diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 47bf2d4ff17..72ef904f741 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -8,20 +8,17 @@ struct graph_key { size_t n_nodes; - std::string first_node_name; - std::string last_node_name; + void * cache_k_l0; bool operator==(const graph_key & other) const { - return n_nodes == other.n_nodes && first_node_name == other.first_node_name && - last_node_name == other.last_node_name; + return n_nodes == other.n_nodes && cache_k_l0 == other.cache_k_l0; } }; struct graph_key_hash { size_t operator()(const graph_key & key) const { size_t h = std::hash{}(key.n_nodes); - h ^= std::hash{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); - h ^= std::hash{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(key.cache_k_l0) + 0x9e3779b9 + (h << 6) + (h >> 2); return h; } }; From ff9bb1ab144343972e22e48d3d070857e9c50713 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 22 Jan 2026 15:52:10 +0800 Subject: [PATCH 2/4] Fix --direct-io 0 --- ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index de986ea42d6..06bff5a2b77 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -943,7 +943,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_openvino_host(buft); + return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft); GGML_UNUSED(dev); } From a6eafbc3d110255baf96928ad5f61f22a2528438 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Wed, 21 Jan 2026 15:17:11 -0800 Subject: [PATCH 3/4] Stateful fix for shape errors after rebase --- ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp | 7 ++++++- ggml/src/ggml-openvino/openvino/op/rope.cpp | 9 +++++++-- ggml/src/ggml-openvino/openvino/utils.cpp | 7 ++++++- ggml/src/ggml-openvino/utils.cpp | 4 ++-- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 2b7f13629f2..dbaa814acc2 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -26,7 +26,12 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { src1 = context.get_input(1); } else { auto combined = context.get_input(0); - auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {3}); + // TODO: Will it work if we set it to "-1" for all cases? + int32_t split_dim = 3; + if (context.is_stateful()) { + split_dim = -1; + } + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {split_dim}); auto split = std::make_shared(combined, split_axis, 2); src0 = split->output(0); src1 = split->output(1); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 01bc46131e1..ef55f449d86 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -104,8 +104,12 @@ OutputVector translate_rope(const NodeContext & context) { ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); res = std::make_shared(stack, data_shape, false); } else if (mode == ROPE_TYPE_NEOX) { + int32_t split_dim = 3; + if (context.is_stateful()) { + split_dim = 2; + } auto data_split = std::make_shared( - data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3}), 2); + data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {split_dim}), 2); Output slice_data_node_0 = data_split->outputs()[0]; Output slice_data_node_1 = data_split->outputs()[1]; @@ -117,9 +121,10 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(slice_data_node_0, sin_theta_node), std::make_shared(slice_data_node_1, cos_theta_node)); + // TODO: Will it work if we set it to "-1" for all cases? int32_t concat_dim = 3; if (context.is_stateful()) { - concat_dim = 2; + concat_dim = -1; } res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, concat_dim); } diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index b7553f99c86..e94c19ff42b 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -213,10 +213,15 @@ ov::Output process_view_input(const NodeContext & context, int input_i } int64_t slice_end = split_addr + slice_len; + int32_t axes_val = 3; + if (context.is_stateful()) { + axes_val = 2; + } + auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr}); auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end}); auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {axes_val}); auto sliced = std::make_shared(input, begin, end, stride, axes); return sliced; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2d30eef941f..b20f6264c58 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -614,10 +614,10 @@ ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, con auto output_type = ggml_decoder->get_ov_type(ggml_tensor); auto output_shape = ggml_decoder->get_shape(ggml_tensor); - if (ggml_decoder->is_static() && result_name == "result_output" && output_shape[2] == 0) { + if (ggml_decoder->is_static() && (result_name == "result_output" || result_name == "result_norm") && output_shape[2] == 0) { output_shape[2] = 1; } - if (ggml_decoder->is_stateful() && result_name == "result_output") { + if (ggml_decoder->is_stateful() && (result_name == "result_output" || result_name == "result_norm")) { std::vector output_shape_3d; for (size_t i=1; i Date: Sun, 25 Jan 2026 17:27:56 -0800 Subject: [PATCH 4/4] Simplification for stateful and update output shape processing --- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 7 +--- ggml/src/ggml-openvino/openvino/op/rope.cpp | 13 ++------ .../openvino/translate_session.cpp | 25 +++++++++++++++ ggml/src/ggml-openvino/utils.cpp | 32 +++++++------------ 4 files changed, 40 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index dbaa814acc2..6e0b85517e6 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -26,12 +26,7 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { src1 = context.get_input(1); } else { auto combined = context.get_input(0); - // TODO: Will it work if we set it to "-1" for all cases? - int32_t split_dim = 3; - if (context.is_stateful()) { - split_dim = -1; - } - auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {split_dim}); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {-1}); auto split = std::make_shared(combined, split_axis, 2); src0 = split->output(0); src1 = split->output(1); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index ef55f449d86..fb19af8e0bd 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -104,12 +104,8 @@ OutputVector translate_rope(const NodeContext & context) { ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); res = std::make_shared(stack, data_shape, false); } else if (mode == ROPE_TYPE_NEOX) { - int32_t split_dim = 3; - if (context.is_stateful()) { - split_dim = 2; - } auto data_split = std::make_shared( - data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {split_dim}), 2); + data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2); Output slice_data_node_0 = data_split->outputs()[0]; Output slice_data_node_1 = data_split->outputs()[1]; @@ -121,12 +117,7 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(slice_data_node_0, sin_theta_node), std::make_shared(slice_data_node_1, cos_theta_node)); - // TODO: Will it work if we set it to "-1" for all cases? - int32_t concat_dim = 3; - if (context.is_stateful()) { - concat_dim = -1; - } - res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, concat_dim); + res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, -1); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index adb3025d175..b7e7b58531f 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -29,8 +29,10 @@ #include #include #include +#include #include #include +#include namespace ov { namespace frontend { @@ -252,6 +254,29 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); } manager.run_passes(model); + if (ggml_model_decoder->is_stateful()) { + auto output_names = ggml_model_decoder->get_model_output_names(); + std::map model_output_indexes; + for (size_t i=0; iget_output_size(); i++) { + auto output_friendly_name = model->output(i).get_node_shared_ptr()->get_friendly_name(); + auto output_id = model_output_indexes[output_friendly_name]; + auto model_output_shape = model->output(i).get_partial_shape(); + auto decoder_output_shape = ggml_model_decoder->get_output_shape(output_id); + if (model_output_shape.rank().is_static() && decoder_output_shape.rank().is_static() + && model_output_shape.rank().get_length() + 1 == decoder_output_shape.rank().get_length() + && decoder_output_shape[0].is_static() && decoder_output_shape[0].get_length() == 1) { + ppp.output(i).postprocess().custom([](const ov::Output& node) { + auto axes = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {0}); + return std::make_shared(node, axes); + }); + } + } + model = ppp.build(); + } } return model; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index b20f6264c58..50078524063 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -103,10 +103,12 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ggml_decoder->add_extra_inputs(); infer_request = infer_request_cache[key]; - auto * inp_pos = get_inp_pos_tensor(cgraph); - int32_t * pos_data = (int32_t *) inp_pos->data; - if (pos_data[0] == 0) { - infer_request->reset_state(); + if (stateful) { + auto * inp_pos = get_inp_pos_tensor(cgraph); + int32_t * pos_data = (int32_t *) inp_pos->data; + if (pos_data[0] == 0) { + infer_request->reset_state(); + } } decoder_end_time = ggml_time_us(); @@ -351,7 +353,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { } for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]); + ov::Tensor output_tensor(infer_request->get_output_tensor(i).get_element_type(), infer_request->get_output_tensor(i).get_shape(), ggml_tensor->data); infer_request->set_output_tensor(i, output_tensor); } @@ -378,7 +381,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { } for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]); + ov::Tensor output_tensor(infer_request->get_output_tensor(i).get_element_type(), infer_request->get_output_tensor(i).get_shape(), ggml_tensor->data); infer_request->set_output_tensor(i, output_tensor); } @@ -614,20 +618,8 @@ ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, con auto output_type = ggml_decoder->get_ov_type(ggml_tensor); auto output_shape = ggml_decoder->get_shape(ggml_tensor); - if (ggml_decoder->is_static() && (result_name == "result_output" || result_name == "result_norm") && output_shape[2] == 0) { - output_shape[2] = 1; - } - if (ggml_decoder->is_stateful() && (result_name == "result_output" || result_name == "result_norm")) { - std::vector output_shape_3d; - for (size_t i=1; idata); - return output_tensor; - } else { - ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); - return output_tensor; - } + ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); + return output_tensor; } size_t checksum(const void * data, size_t size) {