-
Notifications
You must be signed in to change notification settings - Fork 28
fix: Q1_0_g128 CPU dot product int truncation #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9343e09
3909d58
77c6355
d603bf4
12788a2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| *.gguf filter=lfs diff=lfs merge=lfs -text |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -65,52 +65,56 @@ static inline int hsum_i32_4(const __m128i a) { | |
| return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); | ||
| } | ||
|
|
||
| #if defined(__AVX2__) || defined(__AVX512F__) | ||
| static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { | ||
| const __m256i ax = _mm256_sign_epi8(x, x); | ||
| const __m256i sy = _mm256_sign_epi8(y, x); | ||
| return _mm256_maddubs_epi16(ax, sy); | ||
| } | ||
|
|
||
| // spread 32 bits to 32 bytes { 0x00, 0xFF } | ||
| static inline __m256i bytes_from_bits_32(const uint8_t * x) { | ||
| uint32_t x32; | ||
| memcpy(&x32, x, sizeof(uint32_t)); | ||
| const __m256i shuf_mask = _mm256_set_epi64x( | ||
| 0x0303030303030303, 0x0202020202020202, | ||
| 0x0101010101010101, 0x0000000000000000); | ||
| __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); | ||
| const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); | ||
| bytes = _mm256_or_si256(bytes, bit_mask); | ||
| return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); | ||
| } | ||
| #if defined(__AVX2__) | ||
| // AVX2: single-pass byte-level processing, fully unrolled k-loop. | ||
| // Pipeline: broadcast+shuffle -> AND+cmpeq -> XOR+SUB -> maddubs+madd -> cvt+fma | ||
| const __m256i ones_8 = _mm256_set1_epi8(1); | ||
| const __m256i ones_16 = _mm256_set1_epi16(1); | ||
| const __m256i byte_shuf = _mm256_setr_epi8( | ||
| 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, | ||
| 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3); | ||
| const __m256i bit_masks = _mm256_setr_epi8( | ||
| 1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128, | ||
| 1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128); | ||
| const __m256i zero = _mm256_setzero_si256(); | ||
| __m256 acc = _mm256_setzero_ps(); | ||
|
|
||
| // Unpack 32 4-bit fields into 32 bytes | ||
| // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval | ||
| static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) | ||
| { | ||
| const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); | ||
| const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); | ||
| const __m256i lowMask = _mm256_set1_epi8( 0xF ); | ||
| return _mm256_and_si256(lowMask, bytes); | ||
| } | ||
| for (int ib = 0; ib < nb; ++ib) { | ||
| const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); | ||
| const uint32_t * qs32 = (const uint32_t *)x[ib].qs; | ||
|
|
||
| #define Q1_AVX2_BLOCK(K) \ | ||
| { \ | ||
| const __m256i y = _mm256_loadu_si256((const __m256i *)y_ptr[K].qs); \ | ||
| const __m256i sm = _mm256_cmpeq_epi8(_mm256_and_si256( \ | ||
| _mm256_shuffle_epi8(_mm256_set1_epi32((int)qs32[K]), byte_shuf), \ | ||
| bit_masks), zero); \ | ||
| const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(y, sm), sm); \ | ||
| const __m256i s32 = _mm256_madd_epi16( \ | ||
| _mm256_maddubs_epi16(ones_8, sy), ones_16); \ | ||
| acc_block = (K == 0) \ | ||
| ? _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \ | ||
| _mm256_cvtepi32_ps(s32)) \ | ||
| : _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \ | ||
| _mm256_cvtepi32_ps(s32), acc_block); \ | ||
| } | ||
|
|
||
| // add int16_t pairwise and return as float vector | ||
| static inline __m256 sum_i16_pairs_float(const __m256i x) { | ||
| const __m256i ones = _mm256_set1_epi16(1); | ||
| const __m256i summed_pairs = _mm256_madd_epi16(ones, x); | ||
| return _mm256_cvtepi32_ps(summed_pairs); | ||
| } | ||
| const block_q8_0 * y_ptr = &y[ib*4]; | ||
| __m256 acc_block; | ||
| Q1_AVX2_BLOCK(0) | ||
| Q1_AVX2_BLOCK(1) | ||
| Q1_AVX2_BLOCK(2) | ||
| Q1_AVX2_BLOCK(3) | ||
| #undef Q1_AVX2_BLOCK | ||
|
|
||
| static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { | ||
| #if defined(__AVX512VNNI__) && defined(__AVX512VL__) | ||
| const __m256i zero = _mm256_setzero_si256(); | ||
| const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); | ||
| return _mm256_cvtepi32_ps(summed_pairs); | ||
| #elif defined(__AVXVNNI__) | ||
| const __m256i zero = _mm256_setzero_si256(); | ||
| const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy); | ||
| return _mm256_cvtepi32_ps(summed_pairs); | ||
| acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc); | ||
| } | ||
| { | ||
| const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0), | ||
| _mm256_extractf128_ps(acc, 1)); | ||
| const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h)); | ||
| *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q))); | ||
| } | ||
| #else | ||
| // Perform multiplication and create 16-bit values | ||
| const __m256i dot = _mm256_maddubs_epi16(ax, sy); | ||
|
|
@@ -664,13 +668,63 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons | |
|
|
||
| float sumf = 0; | ||
|
|
||
| // Each Q1_0_g128 block has 128 elements | ||
| // Each Q8_0 block has 32 elements | ||
| // So we need 4 Q8_0 blocks per Q1_0_g128 block | ||
| #if defined(__AVX2__) | ||
| // AVX2: process 32 Q8_0 values per sub-block in two 16-element passes. | ||
|
Comment on lines 668 to +672
|
||
| // Sign-extend int8->int16, expand 1-bit weights to masks, blend to negate, | ||
| // then madd->fma accumulation. | ||
| const __m256i ones_16 = _mm256_set1_epi16(1); | ||
| const __m256i bmask = _mm256_setr_epi16( | ||
| 1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, | ||
| 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, (short)(1<<15)); | ||
| __m256 acc = _mm256_setzero_ps(); | ||
|
|
||
| for (int ib = 0; ib < nb; ++ib) { | ||
| const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); | ||
| __m256 acc_block = _mm256_setzero_ps(); | ||
|
|
||
| for (int k = 0; k < 4; k++) { | ||
| const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); | ||
| const __m256i y_bytes = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs); | ||
|
|
||
| uint32_t bits; | ||
| memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits)); | ||
|
|
||
| // Lower 16 elements: sign-extend int8->int16, apply sign from weight bits | ||
| const __m256i y_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(y_bytes)); | ||
| const __m256i neg_lo = _mm256_sub_epi16(_mm256_setzero_si256(), y_lo); | ||
| const __m256i mask_lo = _mm256_cmpeq_epi16( | ||
| _mm256_and_si256(_mm256_set1_epi16((short)(bits & 0xFFFF)), bmask), bmask); | ||
| const __m256i signed_lo = _mm256_blendv_epi8(neg_lo, y_lo, mask_lo); | ||
|
|
||
| // Upper 16 elements | ||
| const __m256i y_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(y_bytes, 1)); | ||
| const __m256i neg_hi = _mm256_sub_epi16(_mm256_setzero_si256(), y_hi); | ||
| const __m256i mask_hi = _mm256_cmpeq_epi16( | ||
| _mm256_and_si256(_mm256_set1_epi16((short)(bits >> 16)), bmask), bmask); | ||
| const __m256i signed_hi = _mm256_blendv_epi8(neg_hi, y_hi, mask_hi); | ||
|
|
||
| // Pair-wise sum int16->int32, combine halves, convert to float, FMA | ||
| const __m256i sum_32 = _mm256_add_epi32( | ||
| _mm256_madd_epi16(signed_lo, ones_16), | ||
| _mm256_madd_epi16(signed_hi, ones_16)); | ||
| acc_block = _mm256_fmadd_ps(_mm256_set1_ps(d1), | ||
| _mm256_cvtepi32_ps(sum_32), acc_block); | ||
| } | ||
| acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc); | ||
| } | ||
| // Horizontal reduction: 256 -> 128 -> scalar | ||
| { | ||
| const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0), | ||
| _mm256_extractf128_ps(acc, 1)); | ||
| const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h)); | ||
| *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q))); | ||
| } | ||
| #else | ||
| // Scalar fallback | ||
| for (int ib = 0; ib < nb; ++ib) { | ||
| const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); | ||
|
|
||
| int sumi = 0; | ||
| float sumi = 0; | ||
|
|
||
| // Process 4 Q8_0 blocks (4 * 32 = 128 elements) | ||
| for (int k = 0; k < 4; k++) { | ||
|
|
@@ -697,6 +751,7 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons | |
| } | ||
|
|
||
| *s = sumf; | ||
| #endif | ||
| } | ||
|
|
||
| void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -1185,15 +1185,16 @@ static void ggml_compute_forward_mul_mat_one_chunk( | |||||||||||
| assert(ne12 % ne02 == 0); | ||||||||||||
| assert(ne13 % ne03 == 0); | ||||||||||||
|
|
||||||||||||
| // block-tiling attempt | ||||||||||||
| const int64_t blck_0 = 16; | ||||||||||||
| // COM6-inspired block-tiling: larger blocks for Q1_0_g128 (1-bit weights are tiny, | ||||||||||||
| // so we can fit more rows in L1). Prefetch next weight block while processing current. | ||||||||||||
| const int64_t blck_0 = (type == GGML_TYPE_Q1_0_g128) ? 64 : 16; | ||||||||||||
| const int64_t blck_1 = 16; | ||||||||||||
|
|
||||||||||||
| const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; | ||||||||||||
|
|
||||||||||||
| // attempt to reduce false-sharing (does not seem to make a difference) | ||||||||||||
| // 16 * 2, accounting for mmla kernels | ||||||||||||
| float tmp[32]; | ||||||||||||
| // Size: blck_0 * 2 (accounting for mmla kernels that compute 2 rows at once) | ||||||||||||
| float tmp[128]; | ||||||||||||
|
Comment on lines +1188 to +1197
|
||||||||||||
| // Size: blck_0 * 2 (accounting for mmla kernels that compute 2 rows at once) | |
| float tmp[128]; | |
| // Size: blck_0 * num_rows_per_vec_dot | |
| const int64_t tmp_size = blck_0 * num_rows_per_vec_dot; | |
| float tmp[tmp_size]; |
Copilot AI — Apr 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
__builtin_prefetch is a GCC/Clang builtin and is not available on all supported toolchains (notably MSVC). Consider guarding this with compiler checks (e.g., __GNUC__/__clang__) and/or using an existing cross-platform prefetch abstraction (or _mm_prefetch on x86) so CPU builds remain portable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `#if defined(__AVX2__)` block here contains executable statements that reference `nb`, `x`, `y_ptr`, and `s`, but this code is located at file scope (inside the helper section) rather than inside a function. This will not compile and also appears to have overwritten the intended `mul_sum_us8_pairs_float(...)` implementation. Please move this logic into the appropriate vec_dot function (or restore the helper to only operate on its parameters and return a value).