@@ -551,13 +551,13 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
551551}
552552
553553std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node (ggml_tensor * tensor) {
554+ const bool is_ov_buffer = ggml_backend_buffer_is_openvino (tensor->buffer );
555+
554556 // Check if we have a pre-built constant from the OpenVINO backend buffer
555557 // This is set during ggml_backend_openvino_buffer_set_tensor
556558 if (tensor->extra ) {
557- if (!ggml_backend_buffer_is_openvino (tensor->buffer )) {
558- OPENVINO_ASSERT (false , " Unsupported weight tensor: " + std::string (tensor->name ) +
559- " Possibly this is a cpu backend repacked quantized weights" );
560- }
559+ OPENVINO_ASSERT (is_ov_buffer, " Unsupported weight tensor: " + std::string (tensor->name ) +
560+ " Possibly this is a cpu backend repacked quantized weights" );
561561 // Cast to our extra base type and check the type
562562 auto * extra_base = static_cast <ggml_openvino_extra_base *>(tensor->extra );
563563
@@ -578,12 +578,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
578578 }
579579 }
580580
581- // Fallback: tensor doesn't have a pre-built extra. The buffer type can only be
582- // openvino_host_buffer_type, which has enough space (get_alloc_size returns
583- // layout.total_size for quantized 2D tensors) to store extracted data in-place.
584- // Build the weight node and store it in tensor->extra for future reuse.
585581 GGML_LOG_DEBUG (" %s: creating new weight node for %s\n " , __func__, tensor->name );
586-
587582 static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
588583 GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
589584 GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
@@ -594,21 +589,28 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
594589
595590 OvWeight ov_weight;
596591 if (ggml_is_quantized (tensor->type )) {
597- // For quantized weights, copy raw data to a temp buffer first because
598- // process_weight_tensor reads from data and writes extracted results
599- // (weights/scales/zp) to output_base_ptr — they would overlap if both
600- // point to tensor->data.
601- size_t raw_size = ggml_nbytes (tensor);
602- std::vector<uint8_t > tmp (raw_size);
603- memcpy (tmp.data (), tensor->data , raw_size);
604- ov_weight = process_weight_tensor (tensor, tmp.data (), tensor->data );
592+ if (is_ov_buffer) {
593+ // For quantized weights, copy raw data to a temp buffer first because
594+ // process_weight_tensor reads from data and writes extracted results
595+ // (weights/scales/zp) to output_base_ptr — they would overlap if both
596+ // point to tensor->data.
597+ size_t raw_size = ggml_nbytes (tensor);
598+ std::vector<uint8_t > tmp (raw_size);
599+ memcpy (tmp.data (), tensor->data , raw_size);
600+ ov_weight = process_weight_tensor (tensor, tmp.data (), tensor->data );
601+ } else {
602+ ov_weight = process_weight_tensor (tensor, tensor->data , nullptr );
603+ }
605604 } else {
606605 // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
607606 // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
608607 ov_weight = process_weight_tensor (tensor, tensor->data , tensor->data );
609608 }
610609
611610 ov_weight.weight_node ->set_friendly_name (tensor->name );
611+ if (!is_ov_buffer) {
612+ return ov_weight.weight_node ;
613+ }
612614
613615 ggml_openvino_extra_base * extra;
614616 if (ov_weight.is_quantized ()) {
0 commit comments