Skip to content

Commit 656c43b

Browse files
committed
Extract zp directly instead of bias
1 parent e92c699 commit 656c43b

6 files changed

Lines changed: 297 additions & 280 deletions

File tree

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -508,10 +508,10 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
508508

509509
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) {
510510
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
511-
static std::mutex weights_mutex;
511+
// static std::mutex weights_mutex;
512512
auto * nodes = cgraph->nodes;
513513
auto n_nodes = cgraph->n_nodes;
514-
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
514+
std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
515515
for (int i = 0; i < GGML_MAX_SRC; i++) {
516516
auto * src = node->src[i];
517517
if (src == nullptr) {
@@ -522,21 +522,26 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
522522
if (!src->view_src) {
523523
ggml_backend_buffer * buffer = src->buffer;
524524
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
525-
bool should_create = false;
526-
{
527-
std::lock_guard<std::mutex> lock(weights_mutex);
528-
if (model_weights.find(src_name) == model_weights.end()) {
529-
model_weights[src_name] = nullptr;
530-
should_create = true;
531-
}
532-
}
533-
if (should_create) {
525+
// bool should_create = false;
526+
// {
527+
// std::lock_guard<std::mutex> lock(weights_mutex);
528+
// if (model_weights.find(src_name) == model_weights.end()) {
529+
// model_weights[src_name] = nullptr;
530+
// should_create = true;
531+
// }
532+
// }
533+
// if (should_create) {
534+
// auto weight_node = create_weight_node(src);
535+
// weight_node->set_friendly_name(src_name);
536+
// {
537+
// std::lock_guard<std::mutex> lock(weights_mutex);
538+
// model_weights[src_name] = weight_node;
539+
// }
540+
// }
541+
if (model_weights.find(src_name) == model_weights.end()) {
534542
auto weight_node = create_weight_node(src);
535543
weight_node->set_friendly_name(src_name);
536-
{
537-
std::lock_guard<std::mutex> lock(weights_mutex);
538-
model_weights[src_name] = weight_node;
539-
}
544+
model_weights[src_name] = weight_node;
540545
}
541546
}
542547
}

ggml/src/ggml-openvino/ggml-openvino-extra.cpp

Lines changed: 16 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -209,12 +209,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
209209
layout.is_requant = true;
210210
layout.requant_type = requant_type;
211211

212-
// Special case: requant to F16 - just store F16 weights, no scales/biases
212+
// Special case: requant to F16 - just store F16 weights, no scales/zp
213213
if (requant_type.value() == ExtraQuantType::F16) {
214214
layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes
215215
layout.total_size = layout.weights_size;
216216
layout.weights_offset = 0;
217-
// No scales/biases for F16
217+
// No scales/zp for F16
218218
return layout;
219219
}
220220

@@ -255,14 +255,15 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
255255
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
256256
int64_t n_blocks = n_elements / layout.weights_per_block;
257257
layout.scales_size = n_blocks * sizeof(uint16_t);
258-
// For symmetric quantization, we only need one bias value (not one per block)
259-
layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
258+
// For symmetric quantization, we only need one zp value (not one per block)
259+
// Zero points are stored in U4 or U8 format matching the weight type
260+
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
261+
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
260262

261263
layout.weights_offset = 0;
262264
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
263-
layout.biases_offset =
264-
layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
265-
layout.total_size = layout.biases_offset + layout.biases_size;
265+
layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
266+
layout.total_size = layout.zp_offset + layout.zp_size;
266267
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
267268
return layout;
268269
}
@@ -305,17 +306,19 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
305306
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
306307
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
307308

308-
// Scales and biases: F16 per block
309+
// Scales: F16 per block
309310
int64_t n_blocks = n_elements / layout.weights_per_block;
310311
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
311-
// For symmetric quantization, we only need one bias value (not one per block)
312-
layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
312+
// Zero points: U4 or U8 matching weight type
313+
// For symmetric quantization, we only need one zp value (not one per block)
314+
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
315+
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
313316

314-
// Layout in buffer: [weights | scales | biases] with alignment
317+
// Layout in buffer: [weights | scales | zp] with alignment
315318
layout.weights_offset = 0;
316319
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
317-
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
318-
layout.total_size = layout.biases_offset + layout.biases_size;
320+
layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
321+
layout.total_size = layout.zp_offset + layout.zp_size;
319322

320323
return layout;
321324
}

ggml/src/ggml-openvino/ggml-openvino-extra.h

Lines changed: 12 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -110,16 +110,19 @@ struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
110110
: ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}
111111
};
112112

113-
// Extra data for quantized weight tensors - stores extracted weights/scales/biases and ov::Constant
113+
// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant
114114
struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
115115
ov::Tensor weights; // U4 or U8 extracted weights
116116
ov::Tensor scales; // F16 scales
117-
ov::Tensor biases; // F16 biases (zero points)
117+
ov::Tensor zp; // U4 or U8 zero points (same type as weights)
118118
std::shared_ptr<ov::Node> constant; // Pre-built OpenVINO weight subgraph
119119

120-
ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor b, std::shared_ptr<ov::Node> c)
121-
: ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
122-
weights(std::move(w)), scales(std::move(s)), biases(std::move(b)), constant(std::move(c)) {}
120+
ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> c) :
121+
ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
122+
weights(std::move(w)),
123+
scales(std::move(s)),
124+
zp(std::move(z)),
125+
constant(std::move(c)) {}
123126
};
124127

125128
// Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
@@ -133,7 +136,7 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
133136
// =====================================================
134137
// Extracted Size Calculation for Quantized Tensors
135138
// =====================================================
136-
// For quantized tensors, we need extra space to store extracted weights, scales, and biases.
139+
// For quantized tensors, we need extra space to store extracted weights, scales, and zero points.
137140
// Returns the total size needed in the buffer for extracted data.
138141

139142
struct ggml_openvino_extracted_layout {
@@ -142,10 +145,10 @@ struct ggml_openvino_extracted_layout {
142145
size_t weights_size; // Size of weights in bytes
143146
size_t scales_offset; // Offset to scales in buffer
144147
size_t scales_size; // Size of scales in bytes
145-
size_t biases_offset; // Offset to biases in buffer
146-
size_t biases_size; // Size of biases in bytes
148+
size_t zp_offset; // Offset to zero points in buffer
149+
size_t zp_size; // Size of zero points in bytes (U4 or U8)
147150
bool is_u4; // true for U4 weights, false for U8
148-
int64_t weights_per_block;// weights per scale/bias block
151+
int64_t weights_per_block; // weights per scale/zp block
149152
bool is_symmetric; // true for symmetric quantization
150153

151154
// Requantization info

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -259,13 +259,15 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
259259
ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
260260
ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
261261
static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
262+
// zp shape: scalar for symmetric, per-block for asymmetric
263+
ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
262264

263265
ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
264266
ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
265-
ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
267+
ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset);
266268

267269
auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
268-
std::move(biases), constant);
270+
std::move(zp), constant);
269271
ctx->tensor_extras[tensor] = extra;
270272
tensor->extra = extra;
271273

@@ -487,10 +489,9 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff
487489
if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) {
488490
ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor);
489491
if (layout.total_size > 0) {
490-
GGML_LOG_DEBUG(
491-
"%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu biases=%zu)\n",
492-
__func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, layout.scales_size,
493-
layout.biases_size);
492+
GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n",
493+
__func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size,
494+
layout.scales_size, layout.zp_size);
494495
return layout.total_size;
495496
}
496497
}

0 commit comments

Comments
 (0)