
Commit 399b609

Free weight buffers for GPU (breaks llama-bench)
1 parent aca7c53 commit 399b609

3 files changed

Lines changed: 49 additions & 18 deletions


ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 15 additions & 15 deletions
@@ -538,8 +538,8 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no

 // Static cache for quantized weight nodes (keyed by tensor data pointer)
 // This is a fallback for when tensors don't have pre-built constants in extra
-static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
-static std::mutex s_quantized_weight_cache_mutex;
+// static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
+// static std::mutex s_quantized_weight_cache_mutex;

 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
     // Check if we have a pre-built constant from the OpenVINO backend buffer
@@ -571,14 +571,14 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor

     // Fallback: Check static cache for quantized weights (keyed by data pointer)
     // This handles cases where tensors weren't loaded through OpenVINO buffer
-    if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        auto it = s_quantized_weight_cache.find(tensor->data);
-        if (it != s_quantized_weight_cache.end()) {
-            GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
-            return it->second;
-        }
-    }
+    // if (ggml_is_quantized(tensor->type)) {
+    //     std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
+    //     auto it = s_quantized_weight_cache.find(tensor->data);
+    //     if (it != s_quantized_weight_cache.end()) {
+    //         GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
+    //         return it->second;
+    //     }
+    // }

     GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra);

@@ -593,11 +593,11 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
     result->set_friendly_name(tensor->name);

     // Cache the quantized weight node for future reuse
-    if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        s_quantized_weight_cache[tensor->data] = result;
-        GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
-    }
+    // if (ggml_is_quantized(tensor->type)) {
+    //     std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
+    //     s_quantized_weight_cache[tensor->data] = result;
+    //     GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
+    // }

     return result;
 }
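
The cache that this diff comments out was keyed by the raw tensor->data pointer. Once weight buffers can be freed after compilation, as the free_weight_buffers() introduced in ggml-openvino.cpp below does on GPU, that key is no longer stable: a later allocation can land at the same address and the cache would hand back a constant built from unrelated data. A minimal, self-contained sketch of that hazard follows; the map, the string payload, and main() are illustrative stand-ins, not code from the backend.

#include <cstdlib>
#include <iostream>
#include <string>
#include <unordered_map>

// Stand-in for s_quantized_weight_cache: raw data pointer -> cached value.
static std::unordered_map<const void *, std::string> cache;

int main() {
    void * first = std::malloc(256);
    cache[first] = "constant built from tensor A";
    std::free(first);                  // weight buffer released early

    void * second = std::malloc(256);  // the allocator may reuse the address
    auto it = cache.find(second);
    if (it != cache.end()) {
        // A stale hit would return data belonging to the wrong tensor.
        std::cout << "stale hit: " << it->second << "\n";
    }
    std::free(second);
    return 0;
}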

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 29 additions & 3 deletions
@@ -105,18 +105,40 @@ struct ggml_backend_openvino_buffer_context {
         }
     }

-    ~ggml_backend_openvino_buffer_context() {
-        // Clean up all tensor extras
+    void free() {
+        if (data == nullptr) {
+            return;
+        }
         for (auto & pair : tensor_extras) {
             delete pair.second;
         }
         tensor_extras.clear();
-        if (!is_remote && data != nullptr) {
+        if (!is_remote) {
             ggml_aligned_free(data, size);
+        } else {
+            ov_buffer.reset();
         }
+        data = nullptr;
     }
+
+    ~ggml_backend_openvino_buffer_context() { free(); }
 };

+void free_weight_buffers(ggml_cgraph * cgraph) {
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        auto * node = cgraph->nodes[i];
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            auto * src = node->src[j];
+            if (src == nullptr) {
+                break;
+            }
+            if (src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                static_cast<ggml_backend_openvino_buffer_context *>(src->buffer->context)->free();
+            }
+        }
+    }
+}
+
 // Buffer type context (per-device)
 struct ggml_backend_openvino_buffer_type_context {
     int device;
@@ -125,8 +147,12 @@ struct ggml_backend_openvino_buffer_type_context {

 // Buffer interface functions
 static void ggml_backend_openvino_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    if (buffer == nullptr || buffer->context == nullptr) {
+        return;
+    }
     ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
     delete ctx;
+    buffer->context = nullptr;
 }

 static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer) {
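
The buffer context now separates releasing memory from destruction: free() can be called early by free_weight_buffers() once the weights live in the compiled model, and the destructor calls the same free() again when the backend eventually drops the buffer. The data == nullptr guard is what makes that double call safe. Below is a minimal sketch of this idempotent-free pattern, with illustrative names only.

#include <cstddef>
#include <cstdlib>

struct buffer_ctx {
    void *      data = nullptr;
    std::size_t size = 0;

    void free() {
        if (data == nullptr) {
            return;        // already released, nothing to do
        }
        std::free(data);
        data = nullptr;    // mark as released so a later call is a no-op
    }

    ~buffer_ctx() { free(); }
};

int main() {
    buffer_ctx ctx;
    ctx.data = std::malloc(64);
    ctx.size = 64;
    ctx.free();    // explicit early release, e.g. right after compilation
    return 0;      // destructor runs free() again and returns immediately
}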

ggml/src/ggml-openvino/utils.cpp

Lines changed: 5 additions & 0 deletions
@@ -140,6 +140,11 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
     } else {
         compiled_model = core.compile_model(model, device, config);
     }
+    if (ggml_openvino_get_device_name() == "GPU") {
+        // Defined in ggml-openvino.cpp where ggml_backend_openvino_buffer_context is available
+        extern void free_weight_buffers(ggml_cgraph * cgraph);
+        free_weight_buffers(cgraph);
+    }
     compile_end_time = ggml_time_us();
     infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
     infer_request_cache[key] = infer_request;
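
Note the block-scope extern declaration at the call site: free_weight_buffers() is declared inside the caller instead of a shared header, since ggml_backend_openvino_buffer_context is only visible in ggml-openvino.cpp. A minimal sketch of that linkage pattern follows; the file names and release_scratch() are hypothetical, not from the repository.

// impl.cpp: translation unit that owns the definition
#include <cstdio>
void release_scratch() { std::puts("scratch released"); }

// caller.cpp: no shared header required
int main() {
    // Block-scope declaration: only this function sees release_scratch().
    extern void release_scratch();
    release_scratch();
    return 0;
}
// Build both files together, e.g.: g++ impl.cpp caller.cpp -o demo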
