@@ -209,12 +209,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
209209 layout.is_requant = true ;
210210 layout.requant_type = requant_type;
211211
212- // Special case: requant to F16 - just store F16 weights, no scales/biases
212+ // Special case: requant to F16 - just store F16 weights, no scales/zp
213213 if (requant_type.value () == ExtraQuantType::F16) {
214214 layout.weights_size = n_elements * sizeof (uint16_t ); // F16 = 2 bytes
215215 layout.total_size = layout.weights_size ;
216216 layout.weights_offset = 0 ;
217- // No scales/biases for F16
217+ // No scales/zp for F16
218218 return layout;
219219 }
220220
@@ -255,14 +255,15 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
255255 layout.weights_size = layout.is_u4 ? (n_elements / 2 ) : n_elements;
256256 int64_t n_blocks = n_elements / layout.weights_per_block ;
257257 layout.scales_size = n_blocks * sizeof (uint16_t );
258- // For symmetric quantization, we only need one bias value (not one per block)
259- layout.biases_size = layout.is_symmetric ? sizeof (uint16_t ) : n_blocks * sizeof (uint16_t );
258+ // For symmetric quantization, we only need one zp value (not one per block)
259+ // Zero points are stored in U4 or U8 format matching the weight type
260+ size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
261+ layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1 ) / 2 ) : n_zp_elements;
260262
261263 layout.weights_offset = 0 ;
262264 layout.scales_offset = ((layout.weights_size + alignment - 1 ) / alignment) * alignment;
263- layout.biases_offset =
264- layout.scales_offset + ((layout.scales_size + alignment - 1 ) / alignment) * alignment;
265- layout.total_size = layout.biases_offset + layout.biases_size ;
265+ layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1 ) / alignment) * alignment;
266+ layout.total_size = layout.zp_offset + layout.zp_size ;
266267 layout.total_size = std::max (layout.total_size , ggml_nbytes (tensor));
267268 return layout;
268269 }
@@ -305,17 +306,19 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
305306 // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
306307 layout.weights_size = layout.is_u4 ? (n_elements / 2 ) : n_elements;
307308
308- // Scales and biases : F16 per block
309+ // Scales: F16 per block
309310 int64_t n_blocks = n_elements / layout.weights_per_block ;
310311 layout.scales_size = n_blocks * sizeof (uint16_t ); // F16 = 2 bytes
311- // For symmetric quantization, we only need one bias value (not one per block)
312- layout.biases_size = layout.is_symmetric ? sizeof (uint16_t ) : n_blocks * sizeof (uint16_t );
312+ // Zero points: U4 or U8 matching weight type
313+ // For symmetric quantization, we only need one zp value (not one per block)
314+ size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
315+ layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1 ) / 2 ) : n_zp_elements;
313316
314- // Layout in buffer: [weights | scales | biases ] with alignment
317+ // Layout in buffer: [weights | scales | zp ] with alignment
315318 layout.weights_offset = 0 ;
316319 layout.scales_offset = ((layout.weights_size + alignment - 1 ) / alignment) * alignment;
317- layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1 ) / alignment) * alignment;
318- layout.total_size = layout.biases_offset + layout.biases_size ;
320+ layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1 ) / alignment) * alignment;
321+ layout.total_size = layout.zp_offset + layout.zp_size ;
319322
320323 return layout;
321324}
0 commit comments