Extract zero points (zp) directly instead of biases

wine99 · wine99 · commit 656c43b88177 · 2026-02-05T11:12:50.000+08:00
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -508,10 +508,10 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
 
 std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) {
     std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
-    static std::mutex weights_mutex;
+    // static std::mutex weights_mutex;
     auto * nodes = cgraph->nodes;
     auto n_nodes = cgraph->n_nodes;
-    std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
+    std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
         for (int i = 0; i < GGML_MAX_SRC; i++) {
             auto * src = node->src[i];
             if (src == nullptr) {
@@ -522,21 +522,26 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
             if (!src->view_src) {
                 ggml_backend_buffer * buffer = src->buffer;
                 if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
-                    bool should_create = false;
-                    {
-                        std::lock_guard<std::mutex> lock(weights_mutex);
-                        if (model_weights.find(src_name) == model_weights.end()) {
-                            model_weights[src_name] = nullptr;
-                            should_create = true;
-                        }
-                    }
-                    if (should_create) {
+                    // bool should_create = false;
+                    // {
+                    //     std::lock_guard<std::mutex> lock(weights_mutex);
+                    //     if (model_weights.find(src_name) == model_weights.end()) {
+                    //         model_weights[src_name] = nullptr;
+                    //         should_create = true;
+                    //     }
+                    // }
+                    // if (should_create) {
+                    //     auto weight_node = create_weight_node(src);
+                    //     weight_node->set_friendly_name(src_name);
+                    //     {
+                    //         std::lock_guard<std::mutex> lock(weights_mutex);
+                    //         model_weights[src_name] = weight_node;
+                    //     }
+                    // }
+                    if (model_weights.find(src_name) == model_weights.end()) {
                         auto weight_node = create_weight_node(src);
                         weight_node->set_friendly_name(src_name);
-                        {
-                            std::lock_guard<std::mutex> lock(weights_mutex);
-                            model_weights[src_name] = weight_node;
-                        }
+                        model_weights[src_name] = weight_node;
                     }
                 }
             }
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -209,12 +209,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
         layout.is_requant = true;
         layout.requant_type = requant_type;
 
-        // Special case: requant to F16 - just store F16 weights, no scales/biases
+        // Special case: requant to F16 - just store F16 weights, no scales/zp
         if (requant_type.value() == ExtraQuantType::F16) {
             layout.weights_size = n_elements * sizeof(uint16_t);  // F16 = 2 bytes
             layout.total_size = layout.weights_size;
             layout.weights_offset = 0;
-            // No scales/biases for F16
+            // No scales/zp for F16
             return layout;
         }
 
@@ -255,14 +255,15 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
             layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
             int64_t n_blocks = n_elements / layout.weights_per_block;
             layout.scales_size = n_blocks * sizeof(uint16_t);
-            // For symmetric quantization, we only need one bias value (not one per block)
-            layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
+            // For symmetric quantization, we only need one zp value (not one per block)
+            // Zero points are stored in U4 or U8 format matching the weight type
+            size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
+            layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
 
             layout.weights_offset = 0;
             layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
-            layout.biases_offset =
-                layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
-            layout.total_size = layout.biases_offset + layout.biases_size;
+            layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
+            layout.total_size = layout.zp_offset + layout.zp_size;
             layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
             return layout;
         }
@@ -305,17 +306,19 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
     layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
 
-    // Scales and biases: F16 per block
+    // Scales: F16 per block
     int64_t n_blocks = n_elements / layout.weights_per_block;
     layout.scales_size = n_blocks * sizeof(uint16_t);  // F16 = 2 bytes
-    // For symmetric quantization, we only need one bias value (not one per block)
-    layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
+    // Zero points: U4 or U8 matching weight type
+    // For symmetric quantization, we only need one zp value (not one per block)
+    size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
+    layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
 
-    // Layout in buffer: [weights | scales | biases] with alignment
+    // Layout in buffer: [weights | scales | zp] with alignment
     layout.weights_offset = 0;
     layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
-    layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
-    layout.total_size = layout.biases_offset + layout.biases_size;
+    layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
+    layout.total_size = layout.zp_offset + layout.zp_size;
 
     return layout;
 }
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h
@@ -110,16 +110,19 @@ struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
         : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}
 };
 
-// Extra data for quantized weight tensors - stores extracted weights/scales/biases and ov::Constant
+// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant
 struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
     ov::Tensor weights;   // U4 or U8 extracted weights
     ov::Tensor scales;    // F16 scales
-    ov::Tensor biases;    // F16 biases (zero points)
+    ov::Tensor zp;        // U4 or U8 zero points (same type as weights)
     std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO weight subgraph
 
-    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor b, std::shared_ptr<ov::Node> c)
-        : ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
-          weights(std::move(w)), scales(std::move(s)), biases(std::move(b)), constant(std::move(c)) {}
+    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> c) :
+        ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
+        weights(std::move(w)),
+        scales(std::move(s)),
+        zp(std::move(z)),
+        constant(std::move(c)) {}
 };
 
 // Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
@@ -133,7 +136,7 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
 // =====================================================
 // Extracted Size Calculation for Quantized Tensors
 // =====================================================
-// For quantized tensors, we need extra space to store extracted weights, scales, and biases.
+// For quantized tensors, we need extra space to store extracted weights, scales, and zero points.
 // Returns the total size needed in the buffer for extracted data.
 
 struct ggml_openvino_extracted_layout {
@@ -142,10 +145,10 @@ struct ggml_openvino_extracted_layout {
     size_t weights_size;      // Size of weights in bytes
     size_t scales_offset;     // Offset to scales in buffer
     size_t scales_size;       // Size of scales in bytes
-    size_t biases_offset;     // Offset to biases in buffer
-    size_t biases_size;       // Size of biases in bytes
+    size_t zp_offset;         // Offset to zero points in buffer
+    size_t zp_size;           // Size of zero points in bytes (U4 or U8)
     bool is_u4;               // true for U4 weights, false for U8
-    int64_t weights_per_block;// weights per scale/bias block
+    int64_t weights_per_block;  // weights per scale/zp block
     bool is_symmetric;        // true for symmetric quantization
 
     // Requantization info
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -259,13 +259,15 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
                 ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
                 ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
                                          static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
+                // zp shape: scalar for symmetric, per-block for asymmetric
+                ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
 
                 ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
                 ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-                ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
+                ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset);
 
                 auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
-                                                                        std::move(biases), constant);
+                                                                        std::move(zp), constant);
                 ctx->tensor_extras[tensor] = extra;
                 tensor->extra = extra;
 
@@ -487,10 +489,9 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff
     if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) {
         ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor);
         if (layout.total_size > 0) {
-            GGML_LOG_DEBUG(
-                "%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu biases=%zu)\n",
-                __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, layout.scales_size,
-                layout.biases_size);
+            GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n",
+                           __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size,
+                           layout.scales_size, layout.zp_size);
             return layout.total_size;
         }
     }
diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp