Dequantization kernels for GGML-compatible formats. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <immintrin.h>
#include "ckernel_quant.h"
#include "ckernel_dtype.h"

Functions
void	dequant_q4_0_block (const block_q4_0 *block, float *output)
	Dequantize a single Q4_0 block to FP32. More...

void	dequant_q4_0_row (const void *src, float *dst, size_t n_elements)
	Dequantize Q4_0 row (multiple blocks) More...

void	dequant_q4_1_block (const block_q4_1 *block, float *output)
	Dequantize a single Q4_1 block to FP32. More...

void	dequant_q4_1_row (const void *src, float *dst, size_t n_elements)
	Dequantize Q4_1 row (multiple blocks) More...

void	dequant_q4_k_block (const block_q4_K *block, float *output)
	Dequantize a single Q4_K block to FP32. More...

void	dequant_q4_k_row (const void *src, float *dst, size_t n_elements)
	Dequantize Q4_K row (multiple blocks) More...

void	dequant_q5_0_block (const block_q5_0 *block, float *output)
	Dequantize a single Q5_0 block to FP32. More...

void	dequant_q5_0_row (const void *src, float *dst, size_t n_elements)
	Dequantize Q5_0 row (multiple blocks) More...

void	dequant_q5_1_block (const block_q5_1 *block, float *output)
	Dequantize a single Q5_1 block to FP32. More...

void	dequant_q5_1_row (const void *src, float *dst, size_t n_elements)
	Dequantize Q5_1 row (multiple blocks) More...

void	dequant_q6_k_block (const block_q6_K *block, float *output)
	Dequantize a single Q6_K block to FP32. More...

void	dequant_q6_k_row (const void *src, float *dst, size_t n_elements)
	Dequantize Q6_K row (multiple blocks) More...

void	dequant_q8_0_block (const block_q8_0 *block, float *output)
	Dequantize a single Q8_0 block to FP32. More...

void	dequant_q8_0_row (const void *src, float *dst, size_t n_elements)
	Dequantize Q8_0 row (multiple blocks) More...

void	dequant_row (CKDataType dtype, const void *src, float *dst, size_t n_elements)
	Dequantize a row of quantized data to FP32. More...

Detailed Description

Dequantization kernels for GGML-compatible formats.

CK-ENGINE KERNEL RULES:

NO malloc/free - memory via bump allocator, pointers passed in
NO OpenMP - parallelization at orchestrator/codegen layer
API must define: inputs, outputs, workspace, and memory layouts
Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Implements dequantization from Q4_0, Q4_1, Q5_0, Q5_1, Q4_K, Q6_K, and Q8_0 to FP32. These kernels are used as building blocks for quantized GEMM/GEMV.

Key optimization: Dequantize into registers, use immediately in FMA, never write intermediate FP32 values to memory.

Definition in file dequant_kernels.c.

Function Documentation

◆ dequant_q4_0_block()

void dequant_q4_0_block	(	const block_q4_0 *	block,
		float *	output
	)

Dequantize a single Q4_0 block to FP32.

Parameters

block	Pointer to Q4_0 block (18 bytes)
output	Output FP32 array (32 floats)

Definition at line 38 of file dequant_kernels.c.

 {
     const float d = GGML_FP16_TO_FP32(block->d);
  
     for (int i = 0; i < QK4_0 / 2; i++) {
         const uint8_t packed = block->qs[i];
  
         /* Lower nibble: elements 0..15 */
         const int8_t q0 = (packed & 0x0F) - 8;
         /* Upper nibble: elements 16..31 */
         const int8_t q1 = (packed >> 4) - 8;
  
         output[i] = d * (float)q0;
         output[i + QK4_0 / 2] = d * (float)q1;
     }
 }

References block_q4_0::d, GGML_FP16_TO_FP32, QK4_0, and block_q4_0::qs.

Referenced by dequant_q4_0_row().

◆ dequant_q4_0_row()

void dequant_q4_0_row	(	const void *	src,
		float *	dst,
		size_t	n_elements
	)

Dequantize Q4_0 row (multiple blocks)

Parameters

src	Q4_0 data
dst	FP32 output
n_elements	Number of elements to dequantize (only whole blocks are processed: the count is divided by QK4_0 and any remainder is ignored)

Definition at line 61 of file dequant_kernels.c.

 {
     const block_q4_0 *blocks = (const block_q4_0 *)src;
     const size_t n_blocks = n_elements / QK4_0;
  
     for (size_t b = 0; b < n_blocks; b++) {
         dequant_q4_0_block(&blocks[b], &dst[b * QK4_0]);
     }
 }

References dequant_q4_0_block(), and QK4_0.

Referenced by ck_test_dequant_q4_0(), and dequant_row().

◆ dequant_q4_1_block()

void dequant_q4_1_block	(	const block_q4_1 *	block,
		float *	output
	)

Dequantize a single Q4_1 block to FP32.

Parameters

block	Pointer to Q4_1 block (20 bytes)
output	Output FP32 array (32 floats)

Definition at line 117 of file dequant_kernels.c.

 {
     const float d = GGML_FP16_TO_FP32(block->d);
     const float m = GGML_FP16_TO_FP32(block->m);
  
     for (int i = 0; i < QK4_1 / 2; i++) {
         const uint8_t packed = block->qs[i];
  
         /* Lower nibble: unsigned 0-15 */
         const int q0 = (packed & 0x0F);
         /* Upper nibble: unsigned 0-15 */
         const int q1 = (packed >> 4);
  
         /* Dequantize: w = d * q + m */
         output[i] = d * (float)q0 + m;
         output[i + QK4_1 / 2] = d * (float)q1 + m;
     }
 }

References block_q4_1::d, GGML_FP16_TO_FP32, block_q4_1::m, QK4_1, and block_q4_1::qs.

Referenced by dequant_q4_1_row().

◆ dequant_q4_1_row()

void dequant_q4_1_row	(	const void *	src,
		float *	dst,
		size_t	n_elements
	)

Dequantize Q4_1 row (multiple blocks)

Definition at line 139 of file dequant_kernels.c.

 {
     const block_q4_1 *blocks = (const block_q4_1 *)src;
     const size_t n_blocks = n_elements / QK4_1;
  
     for (size_t b = 0; b < n_blocks; b++) {
         dequant_q4_1_block(&blocks[b], &dst[b * QK4_1]);
     }
 }

References dequant_q4_1_block(), and QK4_1.

Referenced by dequant_row().

◆ dequant_q4_k_block()

void dequant_q4_k_block	(	const block_q4_K *	block,
		float *	output
	)

Dequantize a single Q4_K block to FP32.

This matches llama.cpp's dequantize_row_q4_K exactly:

Formula: weight = d * scale * q - dmin * m
Layout: 4 iterations of 64 weights each
- First 32: low nibbles of qs[0..31] with scale[2*iter], min[2*iter]
- Next 32: high nibbles of qs[0..31] with scale[2*iter+1], min[2*iter+1]

Definition at line 334 of file dequant_kernels.c.

 {
     const float d = GGML_FP16_TO_FP32(block->d);
     const float dmin = GGML_FP16_TO_FP32(block->dmin);
  
     /* Unpack the 6-bit sub-block scales and mins */
     uint8_t sc[8], m[8];
     unpack_q4_k_scales(block->scales, sc, m);
  
     /* llama.cpp layout: 4 iterations of 64 weights each */
     for (int iter = 0; iter < 4; iter++) {
         const float d1 = d * (float)sc[2 * iter];
         const float m1 = dmin * (float)m[2 * iter];
         const float d2 = d * (float)sc[2 * iter + 1];
         const float m2 = dmin * (float)m[2 * iter + 1];
  
         const uint8_t *qs = &block->qs[iter * 32];
         float *out = &output[iter * 64];
  
         /* First 32 weights: low nibbles */
         for (int l = 0; l < 32; l++) {
             const int q = (qs[l] & 0x0F);
             out[l] = d1 * (float)q - m1;
         }
  
         /* Next 32 weights: high nibbles */
         for (int l = 0; l < 32; l++) {
             const int q = (qs[l] >> 4);
             out[32 + l] = d2 * (float)q - m2;
         }
     }
 }

References block_q4_K::d, block_q4_K::dmin, GGML_FP16_TO_FP32, block_q4_K::qs, block_q4_K::scales, and unpack_q4_k_scales().

Referenced by dequant_q4_k_row().

◆ dequant_q4_k_row()

void dequant_q4_k_row	(	const void *	src,
		float *	dst,
		size_t	n_elements
	)

Dequantize Q4_K row (multiple blocks)

Definition at line 370 of file dequant_kernels.c.

 {
     const block_q4_K *blocks = (const block_q4_K *)src;
     const size_t n_blocks = n_elements / QK_K;
  
     for (size_t b = 0; b < n_blocks; b++) {
         dequant_q4_k_block(&blocks[b], &dst[b * QK_K]);
     }
 }

References dequant_q4_k_block(), and QK_K.

Referenced by ck_test_dequant_q4_k(), dequant_row(), and embedding_forward_q4_k().

◆ dequant_q5_0_block()

void dequant_q5_0_block	(	const block_q5_0 *	block,
		float *	output
	)

Dequantize a single Q5_0 block to FP32.

Parameters

block	Pointer to Q5_0 block (22 bytes)
output	Output FP32 array (32 floats)

Definition at line 161 of file dequant_kernels.c.

 {
     const float d = GGML_FP16_TO_FP32(block->d);
  
     /* Get high bits as a 32-bit integer */
     uint32_t qh;
     memcpy(&qh, block->qh, sizeof(qh));
  
     /* llama.cpp Q5_0 layout:
      * - Weight j uses: low nibble of qs[j], high bit from qh bit j
      * - Weight j+16 uses: high nibble of qs[j], high bit from qh bit (j+12)
      */
     for (int j = 0; j < QK5_0 / 2; j++) {
         const uint8_t packed = block->qs[j];
  
         /* Extract low 4 bits for two weights */
         const int lo = (packed & 0x0F);
         const int hi = (packed >> 4);
  
         /* Extract high bits from qh - matches llama.cpp exactly */
         const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
         const int xh_1 = ((qh >> (j + 12))) & 0x10;
  
         /* Combine: 5-bit value, range 0-31, then subtract 16 */
         const int q0 = (lo | xh_0) - 16;
         const int q1 = (hi | xh_1) - 16;
  
         output[j] = d * (float)q0;
         output[j + 16] = d * (float)q1;
     }
 }

References block_q5_0::d, GGML_FP16_TO_FP32, block_q5_0::qh, QK5_0, and block_q5_0::qs.

Referenced by dequant_q5_0_row().

◆ dequant_q5_0_row()

void dequant_q5_0_row	(	const void *	src,
		float *	dst,
		size_t	n_elements
	)

Dequantize Q5_0 row (multiple blocks)

Definition at line 196 of file dequant_kernels.c.

 {
     const block_q5_0 *blocks = (const block_q5_0 *)src;
     const size_t n_blocks = n_elements / QK5_0;
  
     for (size_t b = 0; b < n_blocks; b++) {
         dequant_q5_0_block(&blocks[b], &dst[b * QK5_0]);
     }
 }

References dequant_q5_0_block(), and QK5_0.

Referenced by dequant_row().

◆ dequant_q5_1_block()

void dequant_q5_1_block	(	const block_q5_1 *	block,
		float *	output
	)

Dequantize a single Q5_1 block to FP32.

Parameters

block	Pointer to Q5_1 block (24 bytes)
output	Output FP32 array (32 floats)

Definition at line 218 of file dequant_kernels.c.

 {
     const float d = GGML_FP16_TO_FP32(block->d);
     const float m = GGML_FP16_TO_FP32(block->m);
  
     /* Get high bits as a 32-bit integer */
     uint32_t qh;
     memcpy(&qh, block->qh, sizeof(qh));
  
     /* llama.cpp Q5_1 layout (same as Q5_0):
      * - Weight j uses: low nibble of qs[j], high bit from qh bit j
      * - Weight j+16 uses: high nibble of qs[j], high bit from qh bit (j+12)
      */
     for (int j = 0; j < QK5_1 / 2; j++) {
         const uint8_t packed = block->qs[j];
  
         /* Extract low 4 bits for two weights */
         const int lo = (packed & 0x0F);
         const int hi = (packed >> 4);
  
         /* Extract high bits from qh - matches llama.cpp exactly */
         const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
         const int xh_1 = ((qh >> (j + 12))) & 0x10;
  
         /* Combine: 5-bit unsigned value, range 0-31 */
         const int q0 = (lo | xh_0);
         const int q1 = (hi | xh_1);
  
         /* Dequantize: w = d * q + m */
         output[j] = d * (float)q0 + m;
         output[j + 16] = d * (float)q1 + m;
     }
 }

References block_q5_1::d, GGML_FP16_TO_FP32, block_q5_1::m, block_q5_1::qh, QK5_1, and block_q5_1::qs.

Referenced by dequant_q5_1_row().

◆ dequant_q5_1_row()

void dequant_q5_1_row	(	const void *	src,
		float *	dst,
		size_t	n_elements
	)

Dequantize Q5_1 row (multiple blocks)

Definition at line 255 of file dequant_kernels.c.

 {
     const block_q5_1 *blocks = (const block_q5_1 *)src;
     const size_t n_blocks = n_elements / QK5_1;
  
     for (size_t b = 0; b < n_blocks; b++) {
         dequant_q5_1_block(&blocks[b], &dst[b * QK5_1]);
     }
 }

References dequant_q5_1_block(), and QK5_1.

Referenced by ck_test_dequant_q5_1(), and dequant_row().

◆ dequant_q6_k_block()

void dequant_q6_k_block	(	const block_q6_K *	block,
		float *	output
	)

Dequantize a single Q6_K block to FP32.

Definition at line 389 of file dequant_kernels.c.

 {
     /*
      * Expand one Q6_K super-block into QK_K floats written to output.
      *
      * Each weight is a 6-bit value built from 4 bits of ql and 2 bits of
      * qh, recentred by subtracting 32, then multiplied by the block scale
      * d and one int8 sub-scale from block->scales.
      */
     const float d = GGML_FP16_TO_FP32(block->d);
     const uint8_t *ql = block->ql;
     const uint8_t *qh = block->qh;
     const int8_t *sc = block->scales;
     float *y = output;
  
     /* Each outer pass produces 128 outputs from 64 ql bytes, 32 qh bytes
      * and 8 sub-scales; the cursors are advanced at the bottom. */
     for (int n = 0; n < QK_K; n += 128) {
         for (int l = 0; l < 32; ++l) {
             /* Sub-scale offset within each group of 32 outputs:
              * 0 for l in 0..15, 1 for l in 16..31. */
             const int is = l / 16;
             /* One qh byte supplies the top 2 bits of four weights (bit
              * pairs 0-1, 2-3, 4-5, 6-7). The low 4 bits come from the
              * low or high nibble of ql[l] or ql[l + 32]. The combined
              * value is 0..63, so after -32 it is -32..31. */
             const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
             const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
             const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
             const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
  
             /* The four weights land 32 outputs apart; each group of 32
              * uses its own pair of sub-scales (is + 0/2/4/6). The
              * product is evaluated left to right: (d * scale) * q. */
             y[l + 0] = d * (float)sc[is + 0] * (float)q1;
             y[l + 32] = d * (float)sc[is + 2] * (float)q2;
             y[l + 64] = d * (float)sc[is + 4] * (float)q3;
             y[l + 96] = d * (float)sc[is + 6] * (float)q4;
         }
         /* Move every cursor on to the next 128-weight group. */
         y += 128;
         ql += 64;
         qh += 32;
         sc += 8;
     }
 }

References block_q6_K::d, GGML_FP16_TO_FP32, block_q6_K::qh, QK_K, block_q6_K::ql, and block_q6_K::scales.

Referenced by dequant_q6_k_row().

◆ dequant_q6_k_row()

void dequant_q6_k_row	(	const void *	src,
		float *	dst,
		size_t	n_elements
	)

Dequantize Q6_K row (multiple blocks)

Definition at line 420 of file dequant_kernels.c.

 {
     const block_q6_K *blocks = (const block_q6_K *)src;
     const size_t n_blocks = n_elements / QK_K;
  
     for (size_t b = 0; b < n_blocks; b++) {
         dequant_q6_k_block(&blocks[b], &dst[b * QK_K]);
     }
 }

References dequant_q6_k_block(), and QK_K.

Referenced by ck_test_dequant_q6_k(), ck_test_gemv_q6_k(), dequant_row(), and embedding_forward_q6_k().

◆ dequant_q8_0_block()

void dequant_q8_0_block	(	const block_q8_0 *	block,
		float *	output
	)

Dequantize a single Q8_0 block to FP32.

Definition at line 274 of file dequant_kernels.c.

 {
     const float d = GGML_FP16_TO_FP32(block->d);
  
     for (int i = 0; i < QK8_0; i++) {
         output[i] = d * (float)block->qs[i];
     }
 }

References block_q8_0::d, GGML_FP16_TO_FP32, QK8_0, and block_q8_0::qs.

Referenced by dequant_q8_0_row().

◆ dequant_q8_0_row()

void dequant_q8_0_row	(	const void *	src,
		float *	dst,
		size_t	n_elements
	)

Dequantize Q8_0 row (multiple blocks)

Definition at line 286 of file dequant_kernels.c.

 {
     const block_q8_0 *blocks = (const block_q8_0 *)src;
     const size_t n_blocks = n_elements / QK8_0;
  
     for (size_t b = 0; b < n_blocks; b++) {
         dequant_q8_0_block(&blocks[b], &dst[b * QK8_0]);
     }
 }

References dequant_q8_0_block(), and QK8_0.

Referenced by dequant_row(), and embedding_forward_q8_0().

◆ dequant_row()

void dequant_row	(	CKDataType	dtype,
		const void *	src,
		float *	dst,
		size_t	n_elements
	)

Dequantize a row of quantized data to FP32.

Parameters

dtype	Data type (must be one of the quantized types handled here; any other value is a silent no-op and dst is left unwritten)
src	Source quantized data
dst	Destination FP32 buffer
n_elements	Number of elements

Definition at line 512 of file dequant_kernels.c.

 {
     switch (dtype) {
     case CK_DT_Q4_0:
         dequant_q4_0_row(src, dst, n_elements);
         break;
     case CK_DT_Q4_1:
         dequant_q4_1_row(src, dst, n_elements);
         break;
     case CK_DT_Q5_0:
         dequant_q5_0_row(src, dst, n_elements);
         break;
     case CK_DT_Q5_1:
         dequant_q5_1_row(src, dst, n_elements);
         break;
     case CK_DT_Q4_K:
         dequant_q4_k_row(src, dst, n_elements);
         break;
     case CK_DT_Q6_K:
         dequant_q6_k_row(src, dst, n_elements);
         break;
     case CK_DT_Q8_0:
         dequant_q8_0_row(src, dst, n_elements);
         break;
     default:
         /* Not a quantized type - no-op or error */
         break;
     }
 }

References CK_DT_Q4_0, CK_DT_Q4_1, CK_DT_Q4_K, CK_DT_Q5_0, CK_DT_Q5_1, CK_DT_Q6_K, CK_DT_Q8_0, dequant_q4_0_row(), dequant_q4_1_row(), dequant_q4_k_row(), dequant_q5_0_row(), dequant_q5_1_row(), dequant_q6_k_row(), and dequant_q8_0_row().

Functions

Detailed Description

CK-ENGINE KERNEL RULES:

Function Documentation

◆ dequant_q4_0_block()

◆ dequant_q4_0_row()

◆ dequant_q4_1_block()

◆ dequant_q4_1_row()

◆ dequant_q4_k_block()

◆ dequant_q4_k_row()

◆ dequant_q5_0_block()

◆ dequant_q5_0_row()

◆ dequant_q5_1_block()

◆ dequant_q5_1_row()

◆ dequant_q6_k_block()

◆ dequant_q6_k_row()

◆ dequant_q8_0_block()

◆ dequant_q8_0_row()

◆ dequant_row()