Skip to content

Commit 2cad824

Browse files
committed
Offload token embd to CPU backend (breaks NPU)
1 parent e059015 commit 2cad824

3 files changed

Lines changed: 9 additions & 3 deletions

File tree

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
171171
if (!naive && !src->view_src) {
172172
ggml_backend_buffer * buffer = src->buffer;
173173

174-
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
174+
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT ||
175+
src_name.find("OPENVINO#") == 0) {
175176
ov::PartialShape stateful_kv_shape;
176177
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
177178
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
@@ -402,6 +403,11 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
402403
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
403404
input_shape = ov::PartialShape{1, 1, 1, len};
404405

406+
} else if (is_inp_emb(input, op)) {
407+
// embeddings
408+
input_shape = ov::PartialShape{get_shape(input)};
409+
input_shape[2] = -1;
410+
405411
} else if (is_output_idx(input, op)) {
406412
// output index
407413
input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};

ggml/src/ggml-openvino/ggml-decoder.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -226,7 +226,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
226226
}
227227

228228
inline static bool is_inp_emb(const ggml_tensor * tensor, const ggml_tensor * op) {
229-
return tensor->op == GGML_OP_GET_ROWS && op->op == GGML_OP_RMS_NORM;
229+
return op->op == GGML_OP_RMS_NORM && (tensor->op == GGML_OP_GET_ROWS || tensor->op == GGML_OP_NONE);
230230
}
231231

232232
inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -941,7 +941,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
941941
}
942942

943943
static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
944-
return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft);
944+
return ggml_backend_buft_is_openvino(buft);
945945
GGML_UNUSED(dev);
946946
}
947947

0 commit comments

Comments
 (0)