Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ggml/src/ggml-openvino/ggml-openvino.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -943,7 +943,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
}

// Returns true when `buft` is accepted by ggml_backend_buft_is_openvino() or
// ggml_backend_buft_is_openvino_host(). The `dev` argument is never read.
static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_openvino_host(buft);
// NOTE(review): unreachable - the statement above already returned. This line differs only in
// calling ggml_backend_buft_is_host() instead of ggml_backend_buft_is_openvino_host(); the two
// returns look like the before/after sides of a change shown together. Confirm which one is
// intended and drop the other.
return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft);
// Placed after the return, so never executed; presumably only marks `dev` as intentionally
// unused for the compiler - confirm against the GGML_UNUSED definition.
GGML_UNUSED(dev);
}

Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ OutputVector translate_glu_swiglu(const NodeContext & context) {
src1 = context.get_input(1);
} else {
auto combined = context.get_input(0);
auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {3});
auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {-1});
auto split = std::make_shared<ov::op::v1::Split>(combined, split_axis, 2);
src0 = split->output(0);
src1 = split->output(1);
Expand Down
8 changes: 2 additions & 6 deletions ggml/src/ggml-openvino/openvino/op/rope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ OutputVector translate_rope(const NodeContext & context) {
res = std::make_shared<ov::op::v1::Reshape>(stack, data_shape, false);
} else if (mode == ROPE_TYPE_NEOX) {
auto data_split = std::make_shared<ov::op::v1::Split>(
data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3}), 2);
data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2);
Output<Node> slice_data_node_0 = data_split->outputs()[0];
Output<Node> slice_data_node_1 = data_split->outputs()[1];

Expand All @@ -117,11 +117,7 @@ OutputVector translate_rope(const NodeContext & context) {
std::make_shared<ov::op::v1::Multiply>(slice_data_node_0, sin_theta_node),
std::make_shared<ov::op::v1::Multiply>(slice_data_node_1, cos_theta_node));

int32_t concat_dim = 3;
if (context.is_stateful()) {
concat_dim = 2;
}
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, concat_dim);
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, -1);
}

return rename_outputs_with_suffix({res}, context.get_name());
Expand Down
25 changes: 25 additions & 0 deletions ggml/src/ggml-openvino/openvino/translate_session.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@
#include <openvino/op/squeeze.hpp>
#include <openvino/op/strided_slice.hpp>
#include <openvino/op/transpose.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <openvino/pass/constant_folding.hpp>
#include <openvino/pass/make_stateful.hpp>
#include <openvino/core/preprocess/pre_post_process.hpp>

namespace ov {
namespace frontend {
Expand Down Expand Up @@ -252,6 +254,29 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
manager.register_pass<pass::SqueezeMatmul>();
}
manager.run_passes(model);
if (ggml_model_decoder->is_stateful()) {
auto output_names = ggml_model_decoder->get_model_output_names();
std::map<std::string, int> model_output_indexes;
for (size_t i=0; i<output_names.size(); i++) {
model_output_indexes.insert(std::make_pair(output_names[i], i));
}
ov::preprocess::PrePostProcessor ppp(model);
for (size_t i=0; i<model->get_output_size(); i++) {
auto output_friendly_name = model->output(i).get_node_shared_ptr()->get_friendly_name();
auto output_id = model_output_indexes[output_friendly_name];
auto model_output_shape = model->output(i).get_partial_shape();
auto decoder_output_shape = ggml_model_decoder->get_output_shape(output_id);
if (model_output_shape.rank().is_static() && decoder_output_shape.rank().is_static()
&& model_output_shape.rank().get_length() + 1 == decoder_output_shape.rank().get_length()
&& decoder_output_shape[0].is_static() && decoder_output_shape[0].get_length() == 1) {
ppp.output(i).postprocess().custom([](const ov::Output<ov::Node>& node) {
auto axes = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {0});
return std::make_shared<ov::op::v0::Unsqueeze>(node, axes);
});
}
}
model = ppp.build();
}
}
return model;
}
Expand Down
7 changes: 6 additions & 1 deletion ggml/src/ggml-openvino/openvino/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,15 @@ ov::Output<ov::Node> process_view_input(const NodeContext & context, int input_i
}
int64_t slice_end = split_addr + slice_len;

int32_t axes_val = 3;
if (context.is_stateful()) {
axes_val = 2;
}

auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr});
auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end});
auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {axes_val});
auto sliced = std::make_shared<ov::op::v8::Slice>(input, begin, end, stride, axes);
return sliced;
}
Expand Down
44 changes: 17 additions & 27 deletions ggml/src/ggml-openvino/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,12 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
ggml_decoder->add_extra_inputs();
infer_request = infer_request_cache[key];

auto * inp_pos = get_inp_pos_tensor(cgraph);
int32_t * pos_data = (int32_t *) inp_pos->data;
if (pos_data[0] == 0) {
infer_request->reset_state();
if (stateful) {
auto * inp_pos = get_inp_pos_tensor(cgraph);
int32_t * pos_data = (int32_t *) inp_pos->data;
if (pos_data[0] == 0) {
infer_request->reset_state();
}
}

decoder_end_time = ggml_time_us();
Expand Down Expand Up @@ -351,7 +353,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
}

for (size_t i = 0; i < ov_output_names.size(); i++) {
auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
ov::Tensor output_tensor(infer_request->get_output_tensor(i).get_element_type(), infer_request->get_output_tensor(i).get_shape(), ggml_tensor->data);
infer_request->set_output_tensor(i, output_tensor);
}

Expand All @@ -378,7 +381,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
}

for (size_t i = 0; i < ov_output_names.size(); i++) {
auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
ov::Tensor output_tensor(infer_request->get_output_tensor(i).get_element_type(), infer_request->get_output_tensor(i).get_shape(), ggml_tensor->data);
infer_request->set_output_tensor(i, output_tensor);
}

Expand Down Expand Up @@ -614,20 +618,8 @@ ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, con
auto output_type = ggml_decoder->get_ov_type(ggml_tensor);
auto output_shape = ggml_decoder->get_shape(ggml_tensor);

if (ggml_decoder->is_static() && result_name == "result_output" && output_shape[2] == 0) {
output_shape[2] = 1;
}
if (ggml_decoder->is_stateful() && result_name == "result_output") {
std::vector<long unsigned int> output_shape_3d;
for (size_t i=1; i<output_shape.size(); i++) {
output_shape_3d.push_back(output_shape[i]);
}
ov::Tensor output_tensor(output_type, output_shape_3d, ggml_tensor->data);
return output_tensor;
} else {
ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data);
return output_tensor;
}
ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data);
return output_tensor;
}

size_t checksum(const void * data, size_t size) {
Expand Down Expand Up @@ -768,14 +760,12 @@ graph_key compute_graph_key(ggml_cgraph * cgraph) {
graph_key key;
key.n_nodes = cgraph->n_nodes;

if (cgraph->n_nodes > 0) {
key.first_node_name = std::string(cgraph->nodes[0]->name);
key.last_node_name = std::string(cgraph->nodes[cgraph->n_nodes - 1]->name);
} else {
key.first_node_name = "";
key.last_node_name = "";
for (int i = 0; i < cgraph->n_nodes; ++i) {
const auto * node = cgraph->nodes[i];
if (node->op == GGML_OP_SET_ROWS && strncmp(node->src[2]->name, "cache_k_l0", 10) == 0) {
key.cache_k_l0 = node->src[2];
}
}

return key;
}

Expand Down
9 changes: 3 additions & 6 deletions ggml/src/ggml-openvino/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,17 @@

// Key describing a compute graph; hashed by graph_key_hash.
//
// Invariant: every field that graph_key_hash mixes into the hash is also
// compared by operator==, so two keys that compare equal always hash equally.
struct graph_key {
    // Default member initializers keep a default-constructed key well-defined
    // even when a producer only fills in some of the fields (for example when
    // no matching node is found and cache_k_l0 is never assigned).
    size_t      n_nodes = 0;
    std::string first_node_name;
    std::string last_node_name;
    void *      cache_k_l0 = nullptr;

    // Field-by-field equality over all members, including cache_k_l0.
    bool operator==(const graph_key & other) const {
        return n_nodes == other.n_nodes && first_node_name == other.first_node_name &&
               last_node_name == other.last_node_name && cache_k_l0 == other.cache_k_l0;
    }
};

// Hash functor for graph_key.
//
// Starts from the hash of n_nodes and folds in the hashes of first_node_name,
// last_node_name and cache_k_l0, in that order, using the
// `seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2)` combining step.
struct graph_key_hash {
    size_t operator()(const graph_key & key) const {
        size_t seed = std::hash<size_t>{}(key.n_nodes);

        // One combining step: the right-hand side is evaluated with the
        // current seed before the xor-assignment updates it.
        const auto mix = [&seed](size_t value) {
            seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
        };

        mix(std::hash<std::string>{}(key.first_node_name));
        mix(std::hash<std::string>{}(key.last_node_name));
        mix(std::hash<void *>{}(key.cache_k_l0));
        return seed;
    }
};
Expand Down