
Commit 399b609

Free weight buffers for GPU (breaks llama-bench)
1 parent aca7c53 commit 399b609

3 files changed

Lines changed: 49 additions & 18 deletions


ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 15 additions & 15 deletions
@@ -538,8 +538,8 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no

 // Static cache for quantized weight nodes (keyed by tensor data pointer)
 // This is a fallback for when tensors don't have pre-built constants in extra
-static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
-static std::mutex s_quantized_weight_cache_mutex;
+// static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
+// static std::mutex s_quantized_weight_cache_mutex;

 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
     // Check if we have a pre-built constant from the OpenVINO backend buffer
@@ -571,14 +571,14 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor

     // Fallback: Check static cache for quantized weights (keyed by data pointer)
     // This handles cases where tensors weren't loaded through OpenVINO buffer
-    if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        auto it = s_quantized_weight_cache.find(tensor->data);
-        if (it != s_quantized_weight_cache.end()) {
-            GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
-            return it->second;
-        }
-    }
+    // if (ggml_is_quantized(tensor->type)) {
+    //     std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
+    //     auto it = s_quantized_weight_cache.find(tensor->data);
+    //     if (it != s_quantized_weight_cache.end()) {
+    //         GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
+    //         return it->second;
+    //     }
+    // }

     GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra);

@@ -593,11 +593,11 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
     result->set_friendly_name(tensor->name);

     // Cache the quantized weight node for future reuse
-    if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        s_quantized_weight_cache[tensor->data] = result;
-        GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
-    }
+    // if (ggml_is_quantized(tensor->type)) {
+    //     std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
+    //     s_quantized_weight_cache[tensor->data] = result;
+    //     GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
+    // }

     return result;
 }
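
The cache that this diff comments out was keyed by the raw tensor->data pointer. Once weight buffers can be freed after compilation, as the free_weight_buffers() introduced in ggml-openvino.cpp below does on GPU, that key is no longer stable: a later allocation can land at the same address and the cache would hand back a constant built from unrelated data. A minimal, self-contained sketch of that hazard follows; the map, the string payload, and main() are illustrative stand-ins, not code from the backend.

#include <cstdlib>
#include <iostream>
#include <string>
#include <unordered_map>

// Stand-in for s_quantized_weight_cache: raw data pointer -> cached value.
static std::unordered_map<const void *, std::string> cache;

int main() {
    void * first = std::malloc(256);
    cache[first] = "constant built from tensor A";
    std::free(first);                  // weight buffer released early

    void * second = std::malloc(256);  // the allocator may reuse the address
    auto it = cache.find(second);
    if (it != cache.end()) {
        // A stale hit would return data belonging to the wrong tensor.
        std::cout << "stale hit: " << it->second << "\n";
    }
    std::free(second);
    return 0;
}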

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 29 additions & 3 deletions
@@ -105,18 +105,40 @@ struct ggml_backend_openvino_buffer_context {
         }
     }

-    ~ggml_backend_openvino_buffer_context() {
-        // Clean up all tensor extras
+    void free() {
+        if (data == nullptr) {
+            return;
+        }
         for (auto & pair : tensor_extras) {
             delete pair.second;
         }
         tensor_extras.clear();
-        if (!is_remote && data != nullptr) {
+        if (!is_remote) {
             ggml_aligned_free(data, size);
+        } else {
+            ov_buffer.reset();
         }
+        data = nullptr;
     }
+
+    ~ggml_backend_openvino_buffer_context() { free(); }
 };

+void free_weight_buffers(ggml_cgraph * cgraph) {
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        auto * node = cgraph->nodes[i];
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            auto * src = node->src[j];
+            if (src == nullptr) {
+                break;
+            }
+            if (src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                static_cast<ggml_backend_openvino_buffer_context *>(src->buffer->context)->free();
+            }
+        }
+    }
+}
+
 // Buffer type context (per-device)
 struct ggml_backend_openvino_buffer_type_context {
     int device;
@@ -125,8 +147,12 @@ struct ggml_backend_openvino_buffer_type_context {

 // Buffer interface functions
 static void ggml_backend_openvino_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    if (buffer == nullptr || buffer->context == nullptr) {
+        return;
+    }
     ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
     delete ctx;
+    buffer->context = nullptr;
 }

 static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer) {
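
The buffer context now separates releasing memory from destruction: free() can be called early by free_weight_buffers() once the weights live in the compiled model, and the destructor calls the same free() again when the backend eventually drops the buffer. The data == nullptr guard is what makes that double call safe. Below is a minimal sketch of this idempotent-free pattern, with illustrative names only.

#include <cstddef>
#include <cstdlib>

struct buffer_ctx {
    void *      data = nullptr;
    std::size_t size = 0;

    void free() {
        if (data == nullptr) {
            return;        // already released, nothing to do
        }
        std::free(data);
        data = nullptr;    // mark as released so a later call is a no-op
    }

    ~buffer_ctx() { free(); }
};

int main() {
    buffer_ctx ctx;
    ctx.data = std::malloc(64);
    ctx.size = 64;
    ctx.free();    // explicit early release, e.g. right after compilation
    return 0;      // destructor runs free() again and returns immediately
}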

ggml/src/ggml-openvino/utils.cpp

Lines changed: 5 additions & 0 deletions
@@ -140,6 +140,11 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
     } else {
         compiled_model = core.compile_model(model, device, config);
     }
+    if (ggml_openvino_get_device_name() == "GPU") {
+        // Defined in ggml-openvino.cpp where ggml_backend_openvino_buffer_context is available
+        extern void free_weight_buffers(ggml_cgraph * cgraph);
+        free_weight_buffers(cgraph);
+    }
     compile_end_time = ggml_time_us();
     infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
     infer_request_cache[key] = infer_request;
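
Note the block-scope extern declaration at the call site: free_weight_buffers() is declared inside the caller instead of a shared header, since ggml_backend_openvino_buffer_context is only visible in ggml-openvino.cpp. A minimal sketch of that linkage pattern follows; the file names and release_scratch() are hypothetical, not from the repository.

// impl.cpp: translation unit that owns the definition
#include <cstdio>
void release_scratch() { std::puts("scratch released"); }

// caller.cpp: no shared header required
int main() {
    // Block-scope declaration: only this function sees release_scratch().
    extern void release_scratch();
    release_scratch();
    return 0;
}
// Build both files together, e.g.: g++ impl.cpp caller.cpp -o demo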
