← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q5_k.c
Go to the documentation of this file.
1 /**
2  * @file gemm_kernels_q5_k.c
3  * @brief GEMM/GEMV kernels with Q5_K quantized weights
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * Implements matrix multiplication where:
15  * - Activations (input): FP32
16  * - Weights: Q5_K (5-bit super-block quant)
17  * - Output: FP32
18  *
19  * Q5_K Format (256 weights per super-block):
20  * - d: FP16 super-block scale
21  * - dmin: FP16 super-block minimum
22  * - scales[12]: 8 sub-block scales + 8 sub-block mins (6 bits each, packed)
23  * - qh[32]: high bits for 256 weights (1 bit each)
24  * - qs[128]: low 4 bits for 256 weights (4 bits each)
25  *
26  * Total: 2 + 2 + 12 + 32 + 128 = 176 bytes per 256 weights = 5.5 bits/weight
27  *
28  * Dequantization formula (matches llama.cpp):
29  * w = d * (scale/64) * q - dmin * (mins/64)
30  * where q = qs_val | (qh_bit << 4) = 5-bit value [0, 31]
31  */
32 
33 #include <stdint.h>
34 #include <stddef.h>
35 #include <string.h>
36 #include "ckernel_quant.h"
37 
38 /* Include SIMD headers based on available extensions */
39 #if defined(__AVX512F__) || defined(__AVX2__) || defined(__AVX__) || defined(__SSE4_1__)
40 #include <immintrin.h>
41 #endif
42 
43 /* Q5_K constants */
44 #define QK_K 256
45 
46 /*
47  * Q5_K / Q4_K scale unpacking.
48  *
49  * K-quant super-blocks pack 8 sub-block scales and 8 sub-block mins
50  * into 12 bytes using a 6-bit encoding:
51  *
52  * scales[0..3]: low 6 bits of sub-block scales 0-3
53  * scales[4..7]: low 6 bits of sub-block mins 0-3
54  * scales[8..11]: low 4 bits = scales 4-7, high 4 bits = mins 4-7
55  * ...with the top 2 bits of scales/mins 4-7 stored
56  * in the top 2 bits of scales[0..3] and scales[4..7]
57  *
 * WHY THIS IS ERROR-PRONE:
 * The mapping between array index j and which bytes to read from
 * scales[] changes across three ranges (j<4, j<8, j>=8). In the
 * j=4..7 case the scale line and the min line pull their top 2 bits
 * from DIFFERENT bytes: the scale's top 2 bits live in scales[j-4]
 * (bytes 0-3), while the min's top 2 bits live in scales[j]
 * (bytes 4-7). It is tempting to write scales[j-4] on both lines
 * for symmetry, but that reads the scale's spare bits twice and
 * corrupts the mins of sub-blocks 4-7.
66  *
67  * Reference: llama.cpp ggml-quants.c get_scale_min_k4()
68  */
69 
/* Helper: extract the 6-bit scale and 6-bit min for sub-block j from the
 * packed 12-byte scales[] array. Matches llama.cpp's get_scale_min_k4().
 *
 * Packing (shared by Q4_K and Q5_K super-blocks):
 *   scales[0..3]:  low 6 bits of scales 0-3; bits 6-7 = top 2 bits of scales 4-7
 *   scales[4..7]:  low 6 bits of mins   0-3; bits 6-7 = top 2 bits of mins   4-7
 *   scales[8..11]: low nibble = low 4 bits of scales 4-7,
 *                  high nibble = low 4 bits of mins 4-7
 *
 * Valid sub-block indices for Q5_K are j in [0, 8).
 */
static inline void get_q5_k_scale_min(int j, const uint8_t *scales,
                                      uint8_t *scale, uint8_t *min) {
    if (j < 4) {
        *scale = scales[j] & 63;
        *min = scales[j + 4] & 63;
    } else if (j < 8) {
        *scale = (scales[j + 4] & 0x0F) | ((scales[j - 4] >> 6) << 4);
        /* BUG FIX: the min's top 2 bits come from scales[j] (bytes 4-7),
         * NOT scales[j-4] (bytes 0-3), which holds the scale's top bits.
         * See llama.cpp ggml-quants.c get_scale_min_k4(). */
        *min = (scales[j + 4] >> 4) | ((scales[j] >> 6) << 4);
    } else {
        /* Unreachable for Q5_K dequant (callers pass sb = 0..7). */
        *scale = (scales[j - 4] & 0x0F) | ((scales[j - 8] >> 6) << 4);
        *min = (scales[j - 4] >> 4) | ((scales[j - 8] >> 6) << 4);
    }
}
87 
88 /* ============================================================================
89  * GEMV Reference: y = W @ x (W is Q5_K, x and y are FP32)
90  * ============================================================================ */
91 
92 void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K)
93 {
94  const block_q5_K *blocks = (const block_q5_K *)W;
95  const int blocks_per_row = K / QK_K;
96 
97  for (int m = 0; m < M; m++) {
98  const float *x_row = x;
99  float sum = 0.0f;
100 
101  for (int b = 0; b < blocks_per_row; b++) {
102  const block_q5_K *block = &blocks[m * blocks_per_row + b];
103  const float d = CK_FP16_TO_FP32(block->d);
104  const float dmin = CK_FP16_TO_FP32(block->dmin);
105  const uint8_t *scales = block->scales;
106  const uint8_t *qh = block->qh;
107  const uint8_t *qs = block->qs;
108 
109  /* Process 8 sub-blocks of 32 weights each */
110  for (int sb = 0; sb < 8; sb++) {
111  uint8_t sc, m;
112  get_q5_k_scale_min(sb, scales, &sc, &m);
113 
114  const float d_sub = d * (float)sc / 64.0f;
115  const float m_sub = dmin * (float)m / 64.0f;
116 
117  /* Each sub-block has 32 weights: low 4 bits in qs, high 1 bit in qh */
118  const int qs_offset = sb * 16; /* 16 bytes per sub-block */
119  const int qh_offset = sb * 4; /* 4 bytes per sub-block */
120 
121  for (int i = 0; i < 32; i++) {
122  uint8_t qs_val = (qs[qs_offset + i/2] >> (4 * (i % 2))) & 0xF;
123  uint8_t qh_bit = (qh[qh_offset + i/8] >> (i % 8)) & 1;
124  uint8_t q = qs_val | (qh_bit << 4);
125 
126  /* Q5_K dequantization: w = d * sc/64 * q - dmin * m/64 */
127  float w = d_sub * (float)q - m_sub;
128  sum += w * x_row[b * QK_K + sb * 32 + i];
129  }
130  }
131  }
132 
133  y[m] = sum;
134  }
135 }
136 
137 /* ============================================================================
138  * GEMM NT Reference: C = A @ B^T + bias
139  * - A: FP32 activation matrix [M, K]
140  * - B: Q5_K weight matrix [N, K] (stored transposed, accessed as [N, K])
141  * - bias: Optional FP32 bias [N]
142  * - C: FP32 output matrix [M, N]
143  * ============================================================================ */
144 
145 void gemm_nt_q5_k_ref(const float *A,
146  const void *B,
147  const float *bias,
148  float *C,
149  int M, int N, int K)
150 {
151  const block_q5_K *blocks = (const block_q5_K *)B;
152  const int blocks_per_col = K / QK_K;
153 
154  for (int m = 0; m < M; m++) {
155  const float *a_row = &A[m * K];
156 
157  for (int n = 0; n < N; n++) {
158  float sum = 0.0f;
159 
160  for (int b = 0; b < blocks_per_col; b++) {
161  const block_q5_K *block = &blocks[n * blocks_per_col + b];
162  const float d = CK_FP16_TO_FP32(block->d);
163  const float dmin = CK_FP16_TO_FP32(block->dmin);
164  const uint8_t *scales = block->scales;
165  const uint8_t *qh = block->qh;
166  const uint8_t *qs = block->qs;
167 
168  /* Process 8 sub-blocks of 32 weights each */
169  for (int sb = 0; sb < 8; sb++) {
170  uint8_t sc, m;
171  get_q5_k_scale_min(sb, scales, &sc, &m);
172 
173  const float d_sub = d * (float)sc / 64.0f;
174  const float m_sub = dmin * (float)m / 64.0f;
175 
176  const int qs_offset = sb * 16;
177  const int qh_offset = sb * 4;
178 
179  for (int i = 0; i < 32; i++) {
180  uint8_t qs_val = (qs[qs_offset + i/2] >> (4 * (i % 2))) & 0xF;
181  uint8_t qh_bit = (qh[qh_offset + i/8] >> (i % 8)) & 1;
182  uint8_t q = qs_val | (qh_bit << 4);
183 
184  float w = d_sub * (float)q - m_sub;
185  sum += w * a_row[b * QK_K + sb * 32 + i];
186  }
187  }
188  }
189 
190  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
191  }
192  }
193 }
194 
195 /* ============================================================================
196  * Dispatch wrappers - select best available implementation
197  * ============================================================================ */
198 
/**
 * GEMV dispatch wrapper: select the best available kernel for y = W @ x.
 *
 * No SIMD specializations exist yet (AVX-512 / AVX2 / AVX / SSE4.1 are
 * all TODO), so every build currently routes to the scalar reference
 * kernel. Add #if defined(...) tiers here as vectorized kernels land.
 */
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)
{
    gemv_q5_k_ref(y, W, x, M, K);
}
217 
/**
 * GEMM NT dispatch wrapper: select the best available kernel for
 * C = A @ B^T + bias.
 *
 * No SIMD specializations exist yet (AVX-512 / AVX2 / AVX / SSE4.1 are
 * all TODO), so every build currently routes to the scalar reference
 * kernel. Add #if defined(...) tiers here as vectorized kernels land.
 */
void gemm_nt_q5_k(const float *A,
                  const void *B,
                  const float *bias,
                  float *C,
                  int M, int N, int K)
{
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
}
Quantization block structures for weight-only quantization.
#define CK_FP16_TO_FP32(x)
void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K)
static void get_q5_k_scale_min(int j, const uint8_t *scales, uint8_t *scale, uint8_t *min)
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)
void gemm_nt_q5_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void gemm_nt_q5_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
#define QK_K
#define C(color)
Definition: show_config.c:39
ck_half dmin
uint8_t qh[256/8]
uint8_t qs[256/2]
uint8_t scales[12]