← Back to C-Kernel-Engine Docs Doxygen Source Documentation
dequant_kernels.c
Go to the documentation of this file.
1 /**
2  * @file dequant_kernels.c
3  * @brief Dequantization kernels for GGML-compatible formats
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * Implements dequantization from Q4_0, Q5_0, Q5_1, Q4_K, Q6_K, Q8_0 to FP32.
15  * These kernels are used as building blocks for quantized GEMM/GEMV.
16  *
17  * Key optimization: Dequantize into registers, use immediately in FMA,
18  * never write intermediate FP32 values to memory.
19  */
20 
21 #include <stdint.h>
22 #include <stddef.h>
23 #include <string.h>
24 #include <immintrin.h>
25 #include "ckernel_quant.h"
26 
27 /* ============================================================================
28  * Q4_0 Dequantization
29  * - 32 weights per block, 1 FP16 scale
30  * - Weights stored as signed 4-bit (-8 to +7)
31  * ============================================================================ */
32 
33 /**
34  * @brief Dequantize a single Q4_0 block to FP32
35  * @param block Pointer to Q4_0 block (18 bytes)
36  * @param output Output FP32 array (32 floats)
37  */
38 void dequant_q4_0_block(const block_q4_0 *block, float *output)
39 {
40  const float d = GGML_FP16_TO_FP32(block->d);
41 
42  for (int i = 0; i < QK4_0 / 2; i++) {
43  const uint8_t packed = block->qs[i];
44 
45  /* Lower nibble: elements 0..15 */
46  const int8_t q0 = (packed & 0x0F) - 8;
47  /* Upper nibble: elements 16..31 */
48  const int8_t q1 = (packed >> 4) - 8;
49 
50  output[i] = d * (float)q0;
51  output[i + QK4_0 / 2] = d * (float)q1;
52  }
53 }
54 
55 /**
56  * @brief Dequantize Q4_0 row (multiple blocks)
57  * @param src Q4_0 data
58  * @param dst FP32 output
59  * @param n_elements Number of elements to dequantize
60  */
61 void dequant_q4_0_row(const void *src, float *dst, size_t n_elements)
62 {
63  const block_q4_0 *blocks = (const block_q4_0 *)src;
64  const size_t n_blocks = n_elements / QK4_0;
65 
66  for (size_t b = 0; b < n_blocks; b++) {
67  dequant_q4_0_block(&blocks[b], &dst[b * QK4_0]);
68  }
69 }
70 
71 #ifdef __AVX512F__
/**
 * @brief Dequantize Q4_0 block using AVX-512 (16 floats at a time)
 * @param block Pointer to Q4_0 block
 * @param out_lo Lower 16 floats (weights 0-15)
 * @param out_hi Upper 16 floats (weights 16-31)
 *
 * Output ordering matches the scalar dequant_q4_0_block: byte i of qs
 * holds weight i in its low nibble and weight i+16 in its high nibble,
 * so out_lo/out_hi are sequential -- no shuffle is required.
 */
void dequant_q4_0_block_avx512(const block_q4_0 *block,
    __m512 *out_lo, __m512 *out_hi)
{
    const __m512 scale = _mm512_set1_ps(GGML_FP16_TO_FP32(block->d));
    const __m512i offset = _mm512_set1_epi32(8);

    /* Load 16 bytes = 32 x 4-bit weights */
    __m128i packed = _mm_loadu_si128((const __m128i *)block->qs);

    /* Low nibbles: weights 0..15, in order; unbias by 8 */
    __m512i lo_nibbles = _mm512_cvtepu8_epi32(packed);
    lo_nibbles = _mm512_and_epi32(lo_nibbles, _mm512_set1_epi32(0x0F));
    lo_nibbles = _mm512_sub_epi32(lo_nibbles, offset);

    /* High nibbles: weights 16..31, in order; unbias by 8 */
    __m512i hi_nibbles = _mm512_cvtepu8_epi32(packed);
    hi_nibbles = _mm512_srli_epi32(hi_nibbles, 4);
    hi_nibbles = _mm512_sub_epi32(hi_nibbles, offset);

    /* Convert to float and scale */
    *out_lo = _mm512_mul_ps(_mm512_cvtepi32_ps(lo_nibbles), scale);
    *out_hi = _mm512_mul_ps(_mm512_cvtepi32_ps(hi_nibbles), scale);
}
104 #endif /* __AVX512F__ */
105 
106 /* ============================================================================
107  * Q4_1 Dequantization
108  * - 32 weights per block, 1 FP16 scale + 1 FP16 min
109  * - Weights stored as unsigned 4-bit (0 to 15)
110  * ============================================================================ */
111 
112 /**
113  * @brief Dequantize a single Q4_1 block to FP32
114  * @param block Pointer to Q4_1 block (20 bytes)
115  * @param output Output FP32 array (32 floats)
116  */
117 void dequant_q4_1_block(const block_q4_1 *block, float *output)
118 {
119  const float d = GGML_FP16_TO_FP32(block->d);
120  const float m = GGML_FP16_TO_FP32(block->m);
121 
122  for (int i = 0; i < QK4_1 / 2; i++) {
123  const uint8_t packed = block->qs[i];
124 
125  /* Lower nibble: unsigned 0-15 */
126  const int q0 = (packed & 0x0F);
127  /* Upper nibble: unsigned 0-15 */
128  const int q1 = (packed >> 4);
129 
130  /* Dequantize: w = d * q + m */
131  output[i] = d * (float)q0 + m;
132  output[i + QK4_1 / 2] = d * (float)q1 + m;
133  }
134 }
135 
136 /**
137  * @brief Dequantize Q4_1 row (multiple blocks)
138  */
139 void dequant_q4_1_row(const void *src, float *dst, size_t n_elements)
140 {
141  const block_q4_1 *blocks = (const block_q4_1 *)src;
142  const size_t n_blocks = n_elements / QK4_1;
143 
144  for (size_t b = 0; b < n_blocks; b++) {
145  dequant_q4_1_block(&blocks[b], &dst[b * QK4_1]);
146  }
147 }
148 
149 /* ============================================================================
150  * Q5_0 Dequantization
151  * - 32 weights per block, 1 FP16 scale
152  * - Low 4 bits + 1 high bit packed separately
153  * - Weights are 5-bit signed (-16 to +15)
154  * ============================================================================ */
155 
/**
 * @brief Dequantize a single Q5_0 block to FP32
 * @param block Pointer to Q5_0 block (22 bytes)
 * @param output Output FP32 array (32 floats)
 *
 * Weights are 5-bit signed values in [-16, +15]: the low 4 bits live in
 * the qs[] nibbles and the 5th (high) bit of each weight is packed into
 * the 32-bit qh field, one bit per weight. Dequantized as w = d * q.
 */
void dequant_q5_0_block(const block_q5_0 *block, float *output)
{
    const float d = GGML_FP16_TO_FP32(block->d);

    /* Get high bits as a 32-bit integer (memcpy avoids unaligned/aliasing UB) */
    uint32_t qh;
    memcpy(&qh, block->qh, sizeof(qh));

    /* llama.cpp Q5_0 layout:
     * - Weight j uses: low nibble of qs[j], high bit from qh bit j
     * - Weight j+16 uses: high nibble of qs[j], high bit from qh bit (j+12)
     */
    for (int j = 0; j < QK5_0 / 2; j++) {
        const uint8_t packed = block->qs[j];

        /* Extract low 4 bits for two weights */
        const int lo = (packed & 0x0F);
        const int hi = (packed >> 4);

        /* Extract high bits from qh - matches llama.cpp exactly.
         * xh_0 selects bit j of qh, moved into bit position 4.
         * xh_1 selects bit (j+16) of qh: the & 0x10 picks bit 4 of the
         * shifted value, i.e. original bit (j+12)+4. */
        const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
        const int xh_1 = ((qh >> (j + 12))) & 0x10;

        /* Combine: 5-bit value, range 0-31, then subtract 16 */
        const int q0 = (lo | xh_0) - 16;
        const int q1 = (hi | xh_1) - 16;

        output[j] = d * (float)q0;
        output[j + 16] = d * (float)q1;
    }
}
192 
193 /**
194  * @brief Dequantize Q5_0 row (multiple blocks)
195  */
196 void dequant_q5_0_row(const void *src, float *dst, size_t n_elements)
197 {
198  const block_q5_0 *blocks = (const block_q5_0 *)src;
199  const size_t n_blocks = n_elements / QK5_0;
200 
201  for (size_t b = 0; b < n_blocks; b++) {
202  dequant_q5_0_block(&blocks[b], &dst[b * QK5_0]);
203  }
204 }
205 
206 /* ============================================================================
207  * Q5_1 Dequantization
208  * - 32 weights per block, 1 FP16 scale + 1 FP16 min
209  * - Low 4 bits + 1 high bit packed separately
210  * - Weights are unsigned 5-bit (0 to 31), scaled and offset by min
211  * ============================================================================ */
212 
213 /**
214  * @brief Dequantize a single Q5_1 block to FP32
215  * @param block Pointer to Q5_1 block (24 bytes)
216  * @param output Output FP32 array (32 floats)
217  */
218 void dequant_q5_1_block(const block_q5_1 *block, float *output)
219 {
220  const float d = GGML_FP16_TO_FP32(block->d);
221  const float m = GGML_FP16_TO_FP32(block->m);
222 
223  /* Get high bits as a 32-bit integer */
224  uint32_t qh;
225  memcpy(&qh, block->qh, sizeof(qh));
226 
227  /* llama.cpp Q5_1 layout (same as Q5_0):
228  * - Weight j uses: low nibble of qs[j], high bit from qh bit j
229  * - Weight j+16 uses: high nibble of qs[j], high bit from qh bit (j+12)
230  */
231  for (int j = 0; j < QK5_1 / 2; j++) {
232  const uint8_t packed = block->qs[j];
233 
234  /* Extract low 4 bits for two weights */
235  const int lo = (packed & 0x0F);
236  const int hi = (packed >> 4);
237 
238  /* Extract high bits from qh - matches llama.cpp exactly */
239  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
240  const int xh_1 = ((qh >> (j + 12))) & 0x10;
241 
242  /* Combine: 5-bit unsigned value, range 0-31 */
243  const int q0 = (lo | xh_0);
244  const int q1 = (hi | xh_1);
245 
246  /* Dequantize: w = d * q + m */
247  output[j] = d * (float)q0 + m;
248  output[j + 16] = d * (float)q1 + m;
249  }
250 }
251 
252 /**
253  * @brief Dequantize Q5_1 row (multiple blocks)
254  */
255 void dequant_q5_1_row(const void *src, float *dst, size_t n_elements)
256 {
257  const block_q5_1 *blocks = (const block_q5_1 *)src;
258  const size_t n_blocks = n_elements / QK5_1;
259 
260  for (size_t b = 0; b < n_blocks; b++) {
261  dequant_q5_1_block(&blocks[b], &dst[b * QK5_1]);
262  }
263 }
264 
265 /* ============================================================================
266  * Q8_0 Dequantization
267  * - 32 weights per block, 1 FP16 scale
268  * - Weights stored as signed 8-bit
269  * ============================================================================ */
270 
271 /**
272  * @brief Dequantize a single Q8_0 block to FP32
273  */
274 void dequant_q8_0_block(const block_q8_0 *block, float *output)
275 {
276  const float d = GGML_FP16_TO_FP32(block->d);
277 
278  for (int i = 0; i < QK8_0; i++) {
279  output[i] = d * (float)block->qs[i];
280  }
281 }
282 
283 /**
284  * @brief Dequantize Q8_0 row (multiple blocks)
285  */
286 void dequant_q8_0_row(const void *src, float *dst, size_t n_elements)
287 {
288  const block_q8_0 *blocks = (const block_q8_0 *)src;
289  const size_t n_blocks = n_elements / QK8_0;
290 
291  for (size_t b = 0; b < n_blocks; b++) {
292  dequant_q8_0_block(&blocks[b], &dst[b * QK8_0]);
293  }
294 }
295 
296 #ifdef __AVX512F__
/**
 * @brief Dequantize Q8_0 block using AVX-512
 * @param block Pointer to Q8_0 block (FP16 scale + 32 signed int8 weights)
 * @param out0 Output: weights 0-15 as FP32
 * @param out1 Output: weights 16-31 as FP32
 *
 * Output ordering matches the scalar dequant_q8_0_block (sequential).
 */
void dequant_q8_0_block_avx512(const block_q8_0 *block,
    __m512 *out0, __m512 *out1)
{
    const __m512 scale = _mm512_set1_ps(GGML_FP16_TO_FP32(block->d));

    /* Load 32 x int8 as two __m128i */
    __m128i q0 = _mm_loadu_si128((const __m128i *)&block->qs[0]);
    __m128i q1 = _mm_loadu_si128((const __m128i *)&block->qs[16]);

    /* Sign-extend to 32-bit and convert to float */
    __m512i i0 = _mm512_cvtepi8_epi32(q0);
    __m512i i1 = _mm512_cvtepi8_epi32(q1);

    *out0 = _mm512_mul_ps(_mm512_cvtepi32_ps(i0), scale);
    *out1 = _mm512_mul_ps(_mm512_cvtepi32_ps(i1), scale);
}
316 #endif /* __AVX512F__ */
317 
318 /* ============================================================================
319  * Q4_K Dequantization (Primary Target for Q4_K_M)
320  * - 256 weights per super-block
321  * - 8 sub-blocks of 32 weights each
322  * - Two-level scaling: super-block d/dmin + sub-block 6-bit scales
323  * ============================================================================ */
324 
325 /**
326  * @brief Dequantize a single Q4_K block to FP32
327  *
328  * This matches llama.cpp's dequantize_row_q4_K exactly:
329  * - Formula: weight = d * scale * q - dmin * m
330  * - Layout: 4 iterations of 64 weights each
331  * - First 32: low nibbles of qs[0..31] with scale[2*iter], min[2*iter]
332  * - Next 32: high nibbles of qs[0..31] with scale[2*iter+1], min[2*iter+1]
333  */
334 void dequant_q4_k_block(const block_q4_K *block, float *output)
335 {
336  const float d = GGML_FP16_TO_FP32(block->d);
337  const float dmin = GGML_FP16_TO_FP32(block->dmin);
338 
339  /* Unpack the 6-bit sub-block scales and mins */
340  uint8_t sc[8], m[8];
341  unpack_q4_k_scales(block->scales, sc, m);
342 
343  /* llama.cpp layout: 4 iterations of 64 weights each */
344  for (int iter = 0; iter < 4; iter++) {
345  const float d1 = d * (float)sc[2 * iter];
346  const float m1 = dmin * (float)m[2 * iter];
347  const float d2 = d * (float)sc[2 * iter + 1];
348  const float m2 = dmin * (float)m[2 * iter + 1];
349 
350  const uint8_t *qs = &block->qs[iter * 32];
351  float *out = &output[iter * 64];
352 
353  /* First 32 weights: low nibbles */
354  for (int l = 0; l < 32; l++) {
355  const int q = (qs[l] & 0x0F);
356  out[l] = d1 * (float)q - m1;
357  }
358 
359  /* Next 32 weights: high nibbles */
360  for (int l = 0; l < 32; l++) {
361  const int q = (qs[l] >> 4);
362  out[32 + l] = d2 * (float)q - m2;
363  }
364  }
365 }
366 
367 /**
368  * @brief Dequantize Q4_K row (multiple blocks)
369  */
370 void dequant_q4_k_row(const void *src, float *dst, size_t n_elements)
371 {
372  const block_q4_K *blocks = (const block_q4_K *)src;
373  const size_t n_blocks = n_elements / QK_K;
374 
375  for (size_t b = 0; b < n_blocks; b++) {
376  dequant_q4_k_block(&blocks[b], &dst[b * QK_K]);
377  }
378 }
379 
380 /* ============================================================================
381  * Q6_K Dequantization
382  * - 256 weights per block
383  * - 16 sub-blocks of 16 weights, int8 scales + FP16 super-scale
384  * ============================================================================ */
385 
/**
 * @brief Dequantize a single Q6_K block to FP32
 * @param block Pointer to Q6_K block (256 weights)
 * @param output Output FP32 array (256 floats)
 *
 * Each 6-bit weight combines 4 low bits from ql[] with 2 high bits from
 * qh[], then subtracts the bias of 32 (signed range -32..+31). Weights
 * are scaled per 16-element sub-block: w = d * scales[sub] * q.
 * The block is processed in two halves of 128 weights, advancing the
 * ql/qh/scales pointers between halves — mirrors llama.cpp's
 * dequantize_row_q6_K layout exactly.
 */
void dequant_q6_k_block(const block_q6_K *block, float *output)
{
    const float d = GGML_FP16_TO_FP32(block->d);
    const uint8_t *ql = block->ql;
    const uint8_t *qh = block->qh;
    const int8_t *sc = block->scales;
    float *y = output;

    for (int n = 0; n < QK_K; n += 128) {
        for (int l = 0; l < 32; ++l) {
            /* is selects the 16-element sub-block scale within this half */
            const int is = l / 16;
            /* Four weights per iteration, at offsets l, l+32, l+64, l+96:
             * each pairs a ql nibble with a 2-bit field of qh[l]. */
            const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
            const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
            const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
            const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;

            y[l + 0] = d * (float)sc[is + 0] * (float)q1;
            y[l + 32] = d * (float)sc[is + 2] * (float)q2;
            y[l + 64] = d * (float)sc[is + 4] * (float)q3;
            y[l + 96] = d * (float)sc[is + 6] * (float)q4;
        }
        /* Advance to the second 128-weight half */
        y += 128;
        ql += 64;
        qh += 32;
        sc += 8;
    }
}
416 
417 /**
418  * @brief Dequantize Q6_K row (multiple blocks)
419  */
420 void dequant_q6_k_row(const void *src, float *dst, size_t n_elements)
421 {
422  const block_q6_K *blocks = (const block_q6_K *)src;
423  const size_t n_blocks = n_elements / QK_K;
424 
425  for (size_t b = 0; b < n_blocks; b++) {
426  dequant_q6_k_block(&blocks[b], &dst[b * QK_K]);
427  }
428 }
429 
430 #ifdef __AVX512F__
431 /**
432  * @brief Dequantize one Q4_K sub-block (32 weights) using AVX-512
433  *
434  * @param qs Pointer to 16 bytes of packed 4-bit weights
435  * @param scale Pre-computed d * sub_scale
436  * @param min_val Pre-computed dmin * sub_min
437  * @param out0 Output: weights 0-15
438  * @param out1 Output: weights 16-31
439  */
440 /**
441  * @brief Dequantize full Q4_K block using AVX-512
442  *
443  * This matches llama.cpp's dequantize_row_q4_K exactly:
444  * - Formula: weight = d * scale * q - dmin * m
445  * - Layout: 4 iterations of 64 weights each
446  * - First 32: low nibbles of qs[0..31] with scale[2*iter], min[2*iter]
447  * - Next 32: high nibbles of qs[0..31] with scale[2*iter+1], min[2*iter+1]
448  */
449 void dequant_q4_k_block_avx512(const block_q4_K *block, float *output)
450 {
451  const float d = GGML_FP16_TO_FP32(block->d);
452  const float dmin = GGML_FP16_TO_FP32(block->dmin);
453 
454  uint8_t sc[8], m[8];
455  unpack_q4_k_scales(block->scales, sc, m);
456 
457  const __m512i mask_lo = _mm512_set1_epi32(0x0F);
458 
459  /* llama.cpp layout: 4 iterations of 64 weights each */
460  for (int iter = 0; iter < 4; iter++) {
461  const float d1 = d * (float)sc[2 * iter];
462  const float m1 = dmin * (float)m[2 * iter];
463  const float d2 = d * (float)sc[2 * iter + 1];
464  const float m2 = dmin * (float)m[2 * iter + 1];
465 
466  const __m512 vd1 = _mm512_set1_ps(d1);
467  const __m512 vm1 = _mm512_set1_ps(m1);
468  const __m512 vd2 = _mm512_set1_ps(d2);
469  const __m512 vm2 = _mm512_set1_ps(m2);
470 
471  const uint8_t *qs = &block->qs[iter * 32];
472  float *out = &output[iter * 64];
473 
474  /* Process first 32 weights (low nibbles) in two 16-float chunks */
475  for (int chunk = 0; chunk < 2; chunk++) {
476  __m128i packed = _mm_loadu_si128((const __m128i *)&qs[chunk * 16]);
477  __m512i bytes = _mm512_cvtepu8_epi32(packed);
478  __m512i lo = _mm512_and_epi32(bytes, mask_lo);
479  /* w = d1 * q - m1: fnmadd computes -(a*b) + c = c - a*b = -m1 + d1*q */
480  __m512 w = _mm512_fnmadd_ps(_mm512_set1_ps(1.0f), vm1,
481  _mm512_mul_ps(_mm512_cvtepi32_ps(lo), vd1));
482  _mm512_storeu_ps(&out[chunk * 16], w);
483  }
484 
485  /* Process next 32 weights (high nibbles) in two 16-float chunks */
486  for (int chunk = 0; chunk < 2; chunk++) {
487  __m128i packed = _mm_loadu_si128((const __m128i *)&qs[chunk * 16]);
488  __m512i bytes = _mm512_cvtepu8_epi32(packed);
489  __m512i hi = _mm512_srli_epi32(bytes, 4);
490  /* w = d2 * q - m2 */
491  __m512 w = _mm512_fnmadd_ps(_mm512_set1_ps(1.0f), vm2,
492  _mm512_mul_ps(_mm512_cvtepi32_ps(hi), vd2));
493  _mm512_storeu_ps(&out[32 + chunk * 16], w);
494  }
495  }
496 }
497 #endif /* __AVX512F__ */
498 
499 /* ============================================================================
500  * Generic Dequantization Dispatch
501  * ============================================================================ */
502 
503 #include "ckernel_dtype.h"
504 
505 /**
506  * @brief Dequantize a row of quantized data to FP32
507  * @param dtype Data type (must be quantized type)
508  * @param src Source quantized data
509  * @param dst Destination FP32 buffer
510  * @param n_elements Number of elements
511  */
512 void dequant_row(CKDataType dtype, const void *src, float *dst, size_t n_elements)
513 {
514  switch (dtype) {
515  case CK_DT_Q4_0:
516  dequant_q4_0_row(src, dst, n_elements);
517  break;
518  case CK_DT_Q4_1:
519  dequant_q4_1_row(src, dst, n_elements);
520  break;
521  case CK_DT_Q5_0:
522  dequant_q5_0_row(src, dst, n_elements);
523  break;
524  case CK_DT_Q5_1:
525  dequant_q5_1_row(src, dst, n_elements);
526  break;
527  case CK_DT_Q4_K:
528  dequant_q4_k_row(src, dst, n_elements);
529  break;
530  case CK_DT_Q6_K:
531  dequant_q6_k_row(src, dst, n_elements);
532  break;
533  case CK_DT_Q8_0:
534  dequant_q8_0_row(src, dst, n_elements);
535  break;
536  default:
537  /* Not a quantized type - no-op or error */
538  break;
539  }
540 }
CKDataType
Supported data types in C-Kernel-Engine.
Definition: ckernel_dtype.h:27
@ CK_DT_Q4_K
Definition: ckernel_dtype.h:40
@ CK_DT_Q4_0
Definition: ckernel_dtype.h:38
@ CK_DT_Q8_0
Definition: ckernel_dtype.h:42
@ CK_DT_Q5_0
Definition: ckernel_dtype.h:44
@ CK_DT_Q6_K
Definition: ckernel_dtype.h:41
@ CK_DT_Q4_1
Definition: ckernel_dtype.h:39
@ CK_DT_Q5_1
Definition: ckernel_dtype.h:45
Quantization block structures for weight-only quantization.
#define QK5_0
Definition: ckernel_quant.h:67
#define GGML_FP16_TO_FP32
#define QK5_1
Definition: ckernel_quant.h:84
#define QK4_0
Definition: ckernel_quant.h:35
#define QK4_1
Definition: ckernel_quant.h:50
static void unpack_q4_k_scales(const uint8_t *scales, uint8_t *sc, uint8_t *m)
Unpack Q4_K sub-block scales and mins.
#define QK8_0
#define QK_K
void dequant_q4_0_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_0 row (multiple blocks)
void dequant_q5_0_block(const block_q5_0 *block, float *output)
Dequantize a single Q5_0 block to FP32.
void dequant_q5_0_row(const void *src, float *dst, size_t n_elements)
Dequantize Q5_0 row (multiple blocks)
void dequant_q8_0_block(const block_q8_0 *block, float *output)
Dequantize a single Q8_0 block to FP32.
void dequant_q4_1_block(const block_q4_1 *block, float *output)
Dequantize a single Q4_1 block to FP32.
void dequant_q6_k_block(const block_q6_K *block, float *output)
Dequantize a single Q6_K block to FP32.
void dequant_q4_k_block(const block_q4_K *block, float *output)
Dequantize a single Q4_K block to FP32.
void dequant_q4_0_block(const block_q4_0 *block, float *output)
Dequantize a single Q4_0 block to FP32.
void dequant_row(CKDataType dtype, const void *src, float *dst, size_t n_elements)
Dequantize a row of quantized data to FP32.
void dequant_q8_0_row(const void *src, float *dst, size_t n_elements)
Dequantize Q8_0 row (multiple blocks)
void dequant_q5_1_block(const block_q5_1 *block, float *output)
Dequantize a single Q5_1 block to FP32.
void dequant_q5_1_row(const void *src, float *dst, size_t n_elements)
Dequantize Q5_1 row (multiple blocks)
void dequant_q4_1_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_1 row (multiple blocks)
void dequant_q6_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q6_K row (multiple blocks)
void dequant_q4_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_K row (multiple blocks)
ck_half d
Definition: ckernel_quant.h:38
uint8_t qs[32/2]
Definition: ckernel_quant.h:39
ck_half m
Definition: ckernel_quant.h:54
ck_half d
Definition: ckernel_quant.h:53
uint8_t qs[32/2]
Definition: ckernel_quant.h:55
uint8_t scales[12]
uint8_t qs[256/2]
ck_half dmin
ck_half d
Definition: ckernel_quant.h:70
uint8_t qh[4]
Definition: ckernel_quant.h:71
uint8_t qs[32/2]
Definition: ckernel_quant.h:72
uint8_t qs[32/2]
Definition: ckernel_quant.h:90
uint8_t qh[4]
Definition: ckernel_quant.h:89
ck_half m
Definition: ckernel_quant.h:88
ck_half d
Definition: ckernel_quant.h:87
uint8_t ql[256/2]
int8_t scales[256/16]
uint8_t qh[256/4]
int8_t qs[32]