← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q5_0_sse_v2.c
Go to the documentation of this file.
1 /**
2  * @file gemm_kernels_q5_0_sse_v2.c
3  * @brief SSE-optimized GEMM kernels for Q5_0 x Q8_K quantization
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  */
14 
15 #include <immintrin.h>
16 #include <stdint.h>
17 #include <string.h>
18 #include <stdio.h>
19 
20 #include "ckernel_quant.h"
21 
22 void quantize_row_q8_k(const float *x, void *vy, int k);
23 void gemm_nt_q5_0_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K);
24 
25 static inline float dot_q5_0_q8_k_32_sse(const block_q5_0 *bw, const block_q8_K *ba, int q8_offset) {
26  const uint8_t *qs_w = bw->qs;
27  const int8_t *qs_a = ba->qs + q8_offset;
28 
29  uint32_t qh;
30  memcpy(&qh, bw->qh, sizeof(qh));
31 
32  // Vectorize bit extraction
33  // Load 16 bytes of low nibbles
34  __m128i qs_vec = _mm_loadu_si128((const __m128i *)qs_w);
35  __m128i mask_0f = _mm_set1_epi8(0x0F);
36 
37  __m128i w_lo = _mm_and_si128(qs_vec, mask_0f);
38  __m128i w_hi = _mm_and_si128(_mm_srli_epi16(qs_vec, 4), mask_0f);
39 
40  // Load high bits from qh
41  // This is still a bit scalar but we can use shuffle for some of it if needed
42  // For now, let's just make sure we handle the -16 offset correctly in SIMD.
43 
44  uint8_t w[32];
45  for (int j = 0; j < 16; j++) {
46  w[j] = (qs_w[j] & 0x0F) | (((qh >> (j + 0)) << 4) & 0x10);
47  w[j+16] = (qs_w[j] >> 4) | ((qh >> (j + 12)) & 0x10);
48  }
49 
50  __m128i vw0 = _mm_loadu_si128((const __m128i *)&w[0]);
51  __m128i vw1 = _mm_loadu_si128((const __m128i *)&w[16]);
52  __m128i va0 = _mm_loadu_si128((const __m128i *)&qs_a[0]);
53  __m128i va1 = _mm_loadu_si128((const __m128i *)&qs_a[16]);
54 
55  // Dot product: unsigned 8-bit * signed 8-bit -> signed 16-bit
56  __m128i p0 = _mm_maddubs_epi16(vw0, va0);
57  __m128i p1 = _mm_maddubs_epi16(vw1, va1);
58 
59  // Sum to i32
60  __m128i one = _mm_set1_epi16(1);
61  __m128i s0 = _mm_madd_epi16(p0, one);
62  __m128i s1 = _mm_madd_epi16(p1, one);
63  __m128i acc_i32 = _mm_add_epi32(s0, s1);
64 
65  // Horizontal sum of i32
66  acc_i32 = _mm_add_epi32(acc_i32, _mm_shuffle_epi32(acc_i32, _MM_SHUFFLE(1, 0, 3, 2)));
67  acc_i32 = _mm_add_epi32(acc_i32, _mm_shuffle_epi32(acc_i32, _MM_SHUFFLE(0, 1, 0, 1)));
68  int32_t dot_wa = _mm_cvtsi128_si32(acc_i32);
69 
70  // sum((w - 16) * a) = sum(w*a) - 16 * sum(a)
71  int32_t sum_a = (int32_t)ba->bsums[q8_offset/16] + (int32_t)ba->bsums[q8_offset/16 + 1];
72 
73  float result = ((float)dot_wa - 16.0f * (float)sum_a) * CK_FP16_TO_FP32(bw->d) * ba->d;
74  return result;
75 }
76 
77 void gemm_nt_q5_0_sse_v2(const float *A,
78  const void *B,
79  const float *bias,
80  float *C,
81  int M, int N, int K)
82 {
83  if (K % QK_K != 0) {
84  gemm_nt_q5_0_ref(A, B, bias, C, M, N, K);
85  return;
86  }
87 
88  size_t q8_size = (K / QK_K) * sizeof(block_q8_K);
89  block_q8_K *A_q8 = (block_q8_K *)alloca(q8_size);
90 
91  const block_q5_0 *weights = (const block_q5_0 *)B;
92  const int blocks_per_row = K / 32;
93 
94  for (int m = 0; m < M; m++) {
95  quantize_row_q8_k(&A[m * K], A_q8, K);
96 
97  for (int n = 0; n < N; n++) {
98  float sumf = 0.0f;
99  const block_q5_0 *w_row = weights + n * blocks_per_row;
100 
101  for (int b = 0; b < blocks_per_row; b++) {
102  int q8_block_idx = (b * 32) / QK_K;
103  int q8_offset = (b * 32) % QK_K;
104  sumf += dot_q5_0_q8_k_32_sse(&w_row[b], &A_q8[q8_block_idx], q8_offset);
105  }
106 
107  C[m * N + n] = sumf + (bias ? bias[n] : 0.0f);
108  }
109  }
110 }
Block structures used by the weight-only quantization formats.
#define CK_FP16_TO_FP32(x)
#define QK_K
void gemm_nt_q5_0_sse_v2(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void quantize_row_q8_k(const float *x, void *vy, int k)
void gemm_nt_q5_0_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
GEMM with transposed Q5_0 weights: C = A @ B^T.
static float dot_q5_0_q8_k_32_sse(const block_q5_0 *bw, const block_q8_K *ba, int q8_offset)
#define C(color)
Definition: show_config.c:39
ck_half d
Definition: ckernel_quant.h:70
uint8_t qh[4]
Definition: ckernel_quant.h:71
uint8_t qs[32/2]
Definition: ckernel_quant.h:72
int8_t qs[256]
int16_t bsums[256/16]