← Back to C-Kernel-Engine Docs Doxygen Source Documentation
dequant_kernels.c File Reference

Dequantization kernels for GGML-compatible formats. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <immintrin.h>
#include "ckernel_quant.h"
#include "ckernel_dtype.h"

Go to the source code of this file.

Functions

void dequant_q4_0_block (const block_q4_0 *block, float *output)
 Dequantize a single Q4_0 block to FP32. More...
 
void dequant_q4_0_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q4_0 row (multiple blocks) More...
 
void dequant_q4_1_block (const block_q4_1 *block, float *output)
 Dequantize a single Q4_1 block to FP32. More...
 
void dequant_q4_1_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q4_1 row (multiple blocks) More...
 
void dequant_q4_k_block (const block_q4_K *block, float *output)
 Dequantize a single Q4_K block to FP32. More...
 
void dequant_q4_k_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q4_K row (multiple blocks) More...
 
void dequant_q5_0_block (const block_q5_0 *block, float *output)
 Dequantize a single Q5_0 block to FP32. More...
 
void dequant_q5_0_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q5_0 row (multiple blocks) More...
 
void dequant_q5_1_block (const block_q5_1 *block, float *output)
 Dequantize a single Q5_1 block to FP32. More...
 
void dequant_q5_1_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q5_1 row (multiple blocks) More...
 
void dequant_q6_k_block (const block_q6_K *block, float *output)
 Dequantize a single Q6_K block to FP32. More...
 
void dequant_q6_k_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q6_K row (multiple blocks) More...
 
void dequant_q8_0_block (const block_q8_0 *block, float *output)
 Dequantize a single Q8_0 block to FP32. More...
 
void dequant_q8_0_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q8_0 row (multiple blocks) More...
 
void dequant_row (CKDataType dtype, const void *src, float *dst, size_t n_elements)
 Dequantize a row of quantized data to FP32. More...
 

Detailed Description

Dequantization kernels for GGML-compatible formats.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Implements dequantization from Q4_0, Q4_1, Q5_0, Q5_1, Q4_K, Q6_K, and Q8_0 to FP32. These kernels are used as building blocks for quantized GEMM/GEMV.

Key optimization: Dequantize into registers, use immediately in FMA, never write intermediate FP32 values to memory.

Definition in file dequant_kernels.c.

Function Documentation

◆ dequant_q4_0_block()

void dequant_q4_0_block ( const block_q4_0 *  block,
float *  output 
)

Dequantize a single Q4_0 block to FP32.

Parameters
block: Pointer to Q4_0 block (18 bytes)
output: Output FP32 array (32 floats)

Definition at line 38 of file dequant_kernels.c.

39 {
40  const float d = GGML_FP16_TO_FP32(block->d);
41 
42  for (int i = 0; i < QK4_0 / 2; i++) {
43  const uint8_t packed = block->qs[i];
44 
45  /* Lower nibble: elements 0..15 */
46  const int8_t q0 = (packed & 0x0F) - 8;
47  /* Upper nibble: elements 16..31 */
48  const int8_t q1 = (packed >> 4) - 8;
49 
50  output[i] = d * (float)q0;
51  output[i + QK4_0 / 2] = d * (float)q1;
52  }
53 }
#define GGML_FP16_TO_FP32
#define QK4_0
Definition: ckernel_quant.h:35
ck_half d
Definition: ckernel_quant.h:38
uint8_t qs[32/2]
Definition: ckernel_quant.h:39

References block_q4_0::d, GGML_FP16_TO_FP32, QK4_0, and block_q4_0::qs.

Referenced by dequant_q4_0_row().

◆ dequant_q4_0_row()

void dequant_q4_0_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q4_0 row (multiple blocks)

Parameters
src: Q4_0 data
dst: FP32 output
n_elements: Number of elements to dequantize (only whole blocks of QK4_0 elements are processed; a trailing partial block is ignored)

Definition at line 61 of file dequant_kernels.c.

62 {
63  const block_q4_0 *blocks = (const block_q4_0 *)src;
64  const size_t n_blocks = n_elements / QK4_0;
65 
66  for (size_t b = 0; b < n_blocks; b++) {
67  dequant_q4_0_block(&blocks[b], &dst[b * QK4_0]);
68  }
69 }
void dequant_q4_0_block(const block_q4_0 *block, float *output)
Dequantize a single Q4_0 block to FP32.

References dequant_q4_0_block(), and QK4_0.

Referenced by ck_test_dequant_q4_0(), and dequant_row().

◆ dequant_q4_1_block()

void dequant_q4_1_block ( const block_q4_1 *  block,
float *  output 
)

Dequantize a single Q4_1 block to FP32.

Parameters
block: Pointer to Q4_1 block (20 bytes)
output: Output FP32 array (32 floats)

Definition at line 117 of file dequant_kernels.c.

118 {
119  const float d = GGML_FP16_TO_FP32(block->d);
120  const float m = GGML_FP16_TO_FP32(block->m);
121 
122  for (int i = 0; i < QK4_1 / 2; i++) {
123  const uint8_t packed = block->qs[i];
124 
125  /* Lower nibble: unsigned 0-15 */
126  const int q0 = (packed & 0x0F);
127  /* Upper nibble: unsigned 0-15 */
128  const int q1 = (packed >> 4);
129 
130  /* Dequantize: w = d * q + m */
131  output[i] = d * (float)q0 + m;
132  output[i + QK4_1 / 2] = d * (float)q1 + m;
133  }
134 }
#define QK4_1
Definition: ckernel_quant.h:50
ck_half m
Definition: ckernel_quant.h:54
ck_half d
Definition: ckernel_quant.h:53
uint8_t qs[32/2]
Definition: ckernel_quant.h:55

References block_q4_1::d, GGML_FP16_TO_FP32, block_q4_1::m, QK4_1, and block_q4_1::qs.

Referenced by dequant_q4_1_row().

◆ dequant_q4_1_row()

void dequant_q4_1_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q4_1 row (multiple blocks)

Definition at line 139 of file dequant_kernels.c.

140 {
141  const block_q4_1 *blocks = (const block_q4_1 *)src;
142  const size_t n_blocks = n_elements / QK4_1;
143 
144  for (size_t b = 0; b < n_blocks; b++) {
145  dequant_q4_1_block(&blocks[b], &dst[b * QK4_1]);
146  }
147 }
void dequant_q4_1_block(const block_q4_1 *block, float *output)
Dequantize a single Q4_1 block to FP32.

References dequant_q4_1_block(), and QK4_1.

Referenced by dequant_row().

◆ dequant_q4_k_block()

void dequant_q4_k_block ( const block_q4_K *  block,
float *  output 
)

Dequantize a single Q4_K block to FP32.

This matches llama.cpp's dequantize_row_q4_K exactly:

  • Formula: weight = d * scale * q - dmin * m
  • Layout: 4 iterations of 64 weights each
    • First 32: low nibbles of qs[32*iter .. 32*iter+31] with scale[2*iter], min[2*iter]
    • Next 32: high nibbles of the same 32 bytes with scale[2*iter+1], min[2*iter+1]

Definition at line 334 of file dequant_kernels.c.

335 {
336  const float d = GGML_FP16_TO_FP32(block->d);
337  const float dmin = GGML_FP16_TO_FP32(block->dmin);
338 
339  /* Unpack the 6-bit sub-block scales and mins */
340  uint8_t sc[8], m[8];
341  unpack_q4_k_scales(block->scales, sc, m);
342 
343  /* llama.cpp layout: 4 iterations of 64 weights each */
344  for (int iter = 0; iter < 4; iter++) {
345  const float d1 = d * (float)sc[2 * iter];
346  const float m1 = dmin * (float)m[2 * iter];
347  const float d2 = d * (float)sc[2 * iter + 1];
348  const float m2 = dmin * (float)m[2 * iter + 1];
349 
350  const uint8_t *qs = &block->qs[iter * 32];
351  float *out = &output[iter * 64];
352 
353  /* First 32 weights: low nibbles */
354  for (int l = 0; l < 32; l++) {
355  const int q = (qs[l] & 0x0F);
356  out[l] = d1 * (float)q - m1;
357  }
358 
359  /* Next 32 weights: high nibbles */
360  for (int l = 0; l < 32; l++) {
361  const int q = (qs[l] >> 4);
362  out[32 + l] = d2 * (float)q - m2;
363  }
364  }
365 }
static void unpack_q4_k_scales(const uint8_t *scales, uint8_t *sc, uint8_t *m)
Unpack Q4_K sub-block scales and mins.
uint8_t scales[12]
uint8_t qs[256/2]
ck_half dmin

References block_q4_K::d, block_q4_K::dmin, GGML_FP16_TO_FP32, block_q4_K::qs, block_q4_K::scales, and unpack_q4_k_scales().

Referenced by dequant_q4_k_row().

◆ dequant_q4_k_row()

void dequant_q4_k_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q4_K row (multiple blocks)

Definition at line 370 of file dequant_kernels.c.

371 {
372  const block_q4_K *blocks = (const block_q4_K *)src;
373  const size_t n_blocks = n_elements / QK_K;
374 
375  for (size_t b = 0; b < n_blocks; b++) {
376  dequant_q4_k_block(&blocks[b], &dst[b * QK_K]);
377  }
378 }
#define QK_K
void dequant_q4_k_block(const block_q4_K *block, float *output)
Dequantize a single Q4_K block to FP32.

References dequant_q4_k_block(), and QK_K.

Referenced by ck_test_dequant_q4_k(), dequant_row(), and embedding_forward_q4_k().

◆ dequant_q5_0_block()

void dequant_q5_0_block ( const block_q5_0 *  block,
float *  output 
)

Dequantize a single Q5_0 block to FP32.

Parameters
block: Pointer to Q5_0 block (22 bytes)
output: Output FP32 array (32 floats)

Definition at line 161 of file dequant_kernels.c.

162 {
163  const float d = GGML_FP16_TO_FP32(block->d);
164 
165  /* Get high bits as a 32-bit integer */
166  uint32_t qh;
167  memcpy(&qh, block->qh, sizeof(qh));
168 
169  /* llama.cpp Q5_0 layout:
170  * - Weight j uses: low nibble of qs[j], high bit from qh bit j
171  * - Weight j+16 uses: high nibble of qs[j], high bit from qh bit (j+16), extracted as (qh >> (j+12)) & 0x10
172  */
173  for (int j = 0; j < QK5_0 / 2; j++) {
174  const uint8_t packed = block->qs[j];
175 
176  /* Extract low 4 bits for two weights */
177  const int lo = (packed & 0x0F);
178  const int hi = (packed >> 4);
179 
180  /* Extract high bits from qh - matches llama.cpp exactly */
181  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
182  const int xh_1 = ((qh >> (j + 12))) & 0x10;
183 
184  /* Combine: 5-bit value, range 0-31, then subtract 16 */
185  const int q0 = (lo | xh_0) - 16;
186  const int q1 = (hi | xh_1) - 16;
187 
188  output[j] = d * (float)q0;
189  output[j + 16] = d * (float)q1;
190  }
191 }
#define QK5_0
Definition: ckernel_quant.h:67
ck_half d
Definition: ckernel_quant.h:70
uint8_t qh[4]
Definition: ckernel_quant.h:71
uint8_t qs[32/2]
Definition: ckernel_quant.h:72

References block_q5_0::d, GGML_FP16_TO_FP32, block_q5_0::qh, QK5_0, and block_q5_0::qs.

Referenced by dequant_q5_0_row().

◆ dequant_q5_0_row()

void dequant_q5_0_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q5_0 row (multiple blocks)

Definition at line 196 of file dequant_kernels.c.

197 {
198  const block_q5_0 *blocks = (const block_q5_0 *)src;
199  const size_t n_blocks = n_elements / QK5_0;
200 
201  for (size_t b = 0; b < n_blocks; b++) {
202  dequant_q5_0_block(&blocks[b], &dst[b * QK5_0]);
203  }
204 }
void dequant_q5_0_block(const block_q5_0 *block, float *output)
Dequantize a single Q5_0 block to FP32.

References dequant_q5_0_block(), and QK5_0.

Referenced by dequant_row().

◆ dequant_q5_1_block()

void dequant_q5_1_block ( const block_q5_1 *  block,
float *  output 
)

Dequantize a single Q5_1 block to FP32.

Parameters
block: Pointer to Q5_1 block (24 bytes)
output: Output FP32 array (32 floats)

Definition at line 218 of file dequant_kernels.c.

219 {
220  const float d = GGML_FP16_TO_FP32(block->d);
221  const float m = GGML_FP16_TO_FP32(block->m);
222 
223  /* Get high bits as a 32-bit integer */
224  uint32_t qh;
225  memcpy(&qh, block->qh, sizeof(qh));
226 
227  /* llama.cpp Q5_1 layout (same as Q5_0):
228  * - Weight j uses: low nibble of qs[j], high bit from qh bit j
229  * - Weight j+16 uses: high nibble of qs[j], high bit from qh bit (j+16), extracted as (qh >> (j+12)) & 0x10
230  */
231  for (int j = 0; j < QK5_1 / 2; j++) {
232  const uint8_t packed = block->qs[j];
233 
234  /* Extract low 4 bits for two weights */
235  const int lo = (packed & 0x0F);
236  const int hi = (packed >> 4);
237 
238  /* Extract high bits from qh - matches llama.cpp exactly */
239  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
240  const int xh_1 = ((qh >> (j + 12))) & 0x10;
241 
242  /* Combine: 5-bit unsigned value, range 0-31 */
243  const int q0 = (lo | xh_0);
244  const int q1 = (hi | xh_1);
245 
246  /* Dequantize: w = d * q + m */
247  output[j] = d * (float)q0 + m;
248  output[j + 16] = d * (float)q1 + m;
249  }
250 }
#define QK5_1
Definition: ckernel_quant.h:84
uint8_t qs[32/2]
Definition: ckernel_quant.h:90
uint8_t qh[4]
Definition: ckernel_quant.h:89
ck_half m
Definition: ckernel_quant.h:88
ck_half d
Definition: ckernel_quant.h:87

References block_q5_1::d, GGML_FP16_TO_FP32, block_q5_1::m, block_q5_1::qh, QK5_1, and block_q5_1::qs.

Referenced by dequant_q5_1_row().

◆ dequant_q5_1_row()

void dequant_q5_1_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q5_1 row (multiple blocks)

Definition at line 255 of file dequant_kernels.c.

256 {
257  const block_q5_1 *blocks = (const block_q5_1 *)src;
258  const size_t n_blocks = n_elements / QK5_1;
259 
260  for (size_t b = 0; b < n_blocks; b++) {
261  dequant_q5_1_block(&blocks[b], &dst[b * QK5_1]);
262  }
263 }
void dequant_q5_1_block(const block_q5_1 *block, float *output)
Dequantize a single Q5_1 block to FP32.

References dequant_q5_1_block(), and QK5_1.

Referenced by ck_test_dequant_q5_1(), and dequant_row().

◆ dequant_q6_k_block()

void dequant_q6_k_block ( const block_q6_K *  block,
float *  output 
)

Dequantize a single Q6_K block to FP32.

Definition at line 389 of file dequant_kernels.c.

390 {
391  const float d = GGML_FP16_TO_FP32(block->d);
392  const uint8_t *ql = block->ql;
393  const uint8_t *qh = block->qh;
394  const int8_t *sc = block->scales;
395  float *y = output;
396 
397  for (int n = 0; n < QK_K; n += 128) {
398  for (int l = 0; l < 32; ++l) {
399  const int is = l / 16;
400  const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
401  const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
402  const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
403  const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
404 
405  y[l + 0] = d * (float)sc[is + 0] * (float)q1;
406  y[l + 32] = d * (float)sc[is + 2] * (float)q2;
407  y[l + 64] = d * (float)sc[is + 4] * (float)q3;
408  y[l + 96] = d * (float)sc[is + 6] * (float)q4;
409  }
410  y += 128;
411  ql += 64;
412  qh += 32;
413  sc += 8;
414  }
415 }
uint8_t ql[256/2]
int8_t scales[256/16]
uint8_t qh[256/4]

References block_q6_K::d, GGML_FP16_TO_FP32, block_q6_K::qh, QK_K, block_q6_K::ql, and block_q6_K::scales.

Referenced by dequant_q6_k_row().

◆ dequant_q6_k_row()

void dequant_q6_k_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q6_K row (multiple blocks)

Definition at line 420 of file dequant_kernels.c.

421 {
422  const block_q6_K *blocks = (const block_q6_K *)src;
423  const size_t n_blocks = n_elements / QK_K;
424 
425  for (size_t b = 0; b < n_blocks; b++) {
426  dequant_q6_k_block(&blocks[b], &dst[b * QK_K]);
427  }
428 }
void dequant_q6_k_block(const block_q6_K *block, float *output)
Dequantize a single Q6_K block to FP32.

References dequant_q6_k_block(), and QK_K.

Referenced by ck_test_dequant_q6_k(), ck_test_gemv_q6_k(), dequant_row(), and embedding_forward_q6_k().

◆ dequant_q8_0_block()

void dequant_q8_0_block ( const block_q8_0 *  block,
float *  output 
)

Dequantize a single Q8_0 block to FP32.

Definition at line 274 of file dequant_kernels.c.

275 {
276  const float d = GGML_FP16_TO_FP32(block->d);
277 
278  for (int i = 0; i < QK8_0; i++) {
279  output[i] = d * (float)block->qs[i];
280  }
281 }
#define QK8_0
int8_t qs[32]

References block_q8_0::d, GGML_FP16_TO_FP32, QK8_0, and block_q8_0::qs.

Referenced by dequant_q8_0_row().

◆ dequant_q8_0_row()

void dequant_q8_0_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q8_0 row (multiple blocks)

Definition at line 286 of file dequant_kernels.c.

287 {
288  const block_q8_0 *blocks = (const block_q8_0 *)src;
289  const size_t n_blocks = n_elements / QK8_0;
290 
291  for (size_t b = 0; b < n_blocks; b++) {
292  dequant_q8_0_block(&blocks[b], &dst[b * QK8_0]);
293  }
294 }
void dequant_q8_0_block(const block_q8_0 *block, float *output)
Dequantize a single Q8_0 block to FP32.

References dequant_q8_0_block(), and QK8_0.

Referenced by dequant_row(), and embedding_forward_q8_0().

◆ dequant_row()

void dequant_row ( CKDataType  dtype,
const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize a row of quantized data to FP32.

Parameters
dtype: Data type (must be a quantized type)
src: Source quantized data
dst: Destination FP32 buffer
n_elements: Number of elements

Definition at line 512 of file dequant_kernels.c.

513 {
514  switch (dtype) {
515  case CK_DT_Q4_0:
516  dequant_q4_0_row(src, dst, n_elements);
517  break;
518  case CK_DT_Q4_1:
519  dequant_q4_1_row(src, dst, n_elements);
520  break;
521  case CK_DT_Q5_0:
522  dequant_q5_0_row(src, dst, n_elements);
523  break;
524  case CK_DT_Q5_1:
525  dequant_q5_1_row(src, dst, n_elements);
526  break;
527  case CK_DT_Q4_K:
528  dequant_q4_k_row(src, dst, n_elements);
529  break;
530  case CK_DT_Q6_K:
531  dequant_q6_k_row(src, dst, n_elements);
532  break;
533  case CK_DT_Q8_0:
534  dequant_q8_0_row(src, dst, n_elements);
535  break;
536  default:
537  /* Not a supported quantized type - silently ignored; dst is left unmodified */
538  break;
539  }
540 }
@ CK_DT_Q4_K
Definition: ckernel_dtype.h:40
@ CK_DT_Q4_0
Definition: ckernel_dtype.h:38
@ CK_DT_Q8_0
Definition: ckernel_dtype.h:42
@ CK_DT_Q5_0
Definition: ckernel_dtype.h:44
@ CK_DT_Q6_K
Definition: ckernel_dtype.h:41
@ CK_DT_Q4_1
Definition: ckernel_dtype.h:39
@ CK_DT_Q5_1
Definition: ckernel_dtype.h:45
void dequant_q4_0_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_0 row (multiple blocks)
void dequant_q5_0_row(const void *src, float *dst, size_t n_elements)
Dequantize Q5_0 row (multiple blocks)
void dequant_q8_0_row(const void *src, float *dst, size_t n_elements)
Dequantize Q8_0 row (multiple blocks)
void dequant_q5_1_row(const void *src, float *dst, size_t n_elements)
Dequantize Q5_1 row (multiple blocks)
void dequant_q4_1_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_1 row (multiple blocks)
void dequant_q6_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q6_K row (multiple blocks)
void dequant_q4_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_K row (multiple blocks)

References CK_DT_Q4_0, CK_DT_Q4_1, CK_DT_Q4_K, CK_DT_Q5_0, CK_DT_Q5_1, CK_DT_Q6_K, CK_DT_Q8_0, dequant_q4_0_row(), dequant_q4_1_row(), dequant_q4_k_row(), dequant_q5_0_row(), dequant_q5_1_row(), dequant_q6_k_row(), and dequant_q8_0_row().