← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q5_k.c File Reference

GEMM/GEMV kernels with Q5_K quantized weights. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Macros

#define QK_K   256
 

Functions

void gemm_nt_q5_k (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q5_k_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemv_q5_k (float *y, const void *W, const float *x, int M, int K)
 
void gemv_q5_k_ref (float *y, const void *W, const float *x, int M, int K)
 
static void get_q5_k_scale_min (int j, const uint8_t *scales, uint8_t *scale, uint8_t *min)
 

Detailed Description

GEMM/GEMV kernels with Q5_K quantized weights.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Implements matrix multiplication where:

  • Activations (input): FP32
  • Weights: Q5_K (5-bit super-block quant)
  • Output: FP32

Q5_K Format (256 weights per super-block):

  • d: FP16 super-block scale
  • dmin: FP16 super-block minimum
  • scales[12]: 8 sub-block scales + 8 sub-block mins (6 bits each, packed)
  • qh[32]: high bits for 256 weights (1 bit each)
  • qs[128]: low 4 bits for 256 weights (4 bits each)

Total: 2 + 2 + 12 + 32 + 128 = 176 bytes per 256 weights = 5.5 bits/weight

Dequantization formula (matches llama.cpp): w = d * (scale/64) * q - dmin * (mins/64) where q = qs_val | (qh_bit << 4) = 5-bit value [0, 31]

Definition in file gemm_kernels_q5_k.c.

Macro Definition Documentation

◆ QK_K

#define QK_K   256

Definition at line 44 of file gemm_kernels_q5_k.c.

Function Documentation

◆ gemm_nt_q5_k()

/**
 * GEMM with Q5_K quantized weights: C[M x N] = A[M x K] * B^T (+ bias).
 *
 * ISA dispatch shell. Dedicated AVX-512 / AVX2 / AVX / SSE4.1 kernels are
 * still TODO, so every build path currently lands on the scalar reference
 * implementation.
 */
void gemm_nt_q5_k(const float *A, const void *B, const float *bias,
                  float *C, int M, int N, int K)
{
#if defined(__AVX512F__)
    /* TODO: AVX-512 implementation */
#elif defined(__AVX2__)
    /* TODO: AVX-2 implementation */
#elif defined(__AVX__)
    /* TODO: AVX implementation */
#elif defined(__SSE4_1__)
    /* TODO: SSE4.1 implementation */
#endif
    /* Single fall-through call until the SIMD variants above exist. */
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
}
void gemm_nt_q5_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
#define C(color)
Definition: show_config.c:39

References gemm_nt_q5_k_ref(). (Doxygen's extra "C" cross-reference was a mislink of the output parameter C to an unrelated C(color) macro in show_config.c.)

Referenced by ck_test_gemm_q5_k().

◆ gemm_nt_q5_k_ref()

void gemm_nt_q5_k_ref ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 145 of file gemm_kernels_q5_k.c.

150 {
151  const block_q5_K *blocks = (const block_q5_K *)B;
152  const int blocks_per_col = K / QK_K;
153 
154  for (int m = 0; m < M; m++) {
155  const float *a_row = &A[m * K];
156 
157  for (int n = 0; n < N; n++) {
158  float sum = 0.0f;
159 
160  for (int b = 0; b < blocks_per_col; b++) {
161  const block_q5_K *block = &blocks[n * blocks_per_col + b];
162  const float d = CK_FP16_TO_FP32(block->d);
163  const float dmin = CK_FP16_TO_FP32(block->dmin);
164  const uint8_t *scales = block->scales;
165  const uint8_t *qh = block->qh;
166  const uint8_t *qs = block->qs;
167 
168  /* Process 8 sub-blocks of 32 weights each */
169  for (int sb = 0; sb < 8; sb++) {
170  uint8_t sc, m;
171  get_q5_k_scale_min(sb, scales, &sc, &m);
172 
173  const float d_sub = d * (float)sc / 64.0f;
174  const float m_sub = dmin * (float)m / 64.0f;
175 
176  const int qs_offset = sb * 16;
177  const int qh_offset = sb * 4;
178 
179  for (int i = 0; i < 32; i++) {
180  uint8_t qs_val = (qs[qs_offset + i/2] >> (4 * (i % 2))) & 0xF;
181  uint8_t qh_bit = (qh[qh_offset + i/8] >> (i % 8)) & 1;
182  uint8_t q = qs_val | (qh_bit << 4);
183 
184  float w = d_sub * (float)q - m_sub;
185  sum += w * a_row[b * QK_K + sb * 32 + i];
186  }
187  }
188  }
189 
190  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
191  }
192  }
193 }
#define CK_FP16_TO_FP32(x)
static void get_q5_k_scale_min(int j, const uint8_t *scales, uint8_t *scale, uint8_t *min)
#define QK_K
ck_half dmin
uint8_t qh[256/8]
uint8_t qs[256/2]
uint8_t scales[12]

References CK_FP16_TO_FP32, block_q5_K::d, block_q5_K::dmin, get_q5_k_scale_min(), block_q5_K::qh, QK_K, block_q5_K::qs, and block_q5_K::scales. (Doxygen's extra "C" cross-reference was a mislink of the output parameter C to an unrelated C(color) macro in show_config.c.)

Referenced by gemm_nt_q5_k().

◆ gemv_q5_k()

/**
 * GEMV with Q5_K quantized weights: y[M] = W[M x K] * x[K].
 *
 * ISA dispatch shell. Dedicated AVX-512 / AVX2 / AVX / SSE4.1 kernels are
 * still TODO, so every build path currently lands on the scalar reference
 * implementation.
 */
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)
{
#if defined(__AVX512F__)
    /* TODO: AVX-512 implementation */
#elif defined(__AVX2__)
    /* TODO: AVX-2 implementation */
#elif defined(__AVX__)
    /* TODO: AVX implementation */
#elif defined(__SSE4_1__)
    /* TODO: SSE4.1 implementation */
#endif
    /* Single fall-through call until the SIMD variants above exist. */
    gemv_q5_k_ref(y, W, x, M, K);
}
void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K)

References gemv_q5_k_ref().

Referenced by ck_test_gemv_q5_k().

◆ gemv_q5_k_ref()

void gemv_q5_k_ref ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Definition at line 92 of file gemm_kernels_q5_k.c.

93 {
94  const block_q5_K *blocks = (const block_q5_K *)W;
95  const int blocks_per_row = K / QK_K;
96 
97  for (int m = 0; m < M; m++) {
98  const float *x_row = x;
99  float sum = 0.0f;
100 
101  for (int b = 0; b < blocks_per_row; b++) {
102  const block_q5_K *block = &blocks[m * blocks_per_row + b];
103  const float d = CK_FP16_TO_FP32(block->d);
104  const float dmin = CK_FP16_TO_FP32(block->dmin);
105  const uint8_t *scales = block->scales;
106  const uint8_t *qh = block->qh;
107  const uint8_t *qs = block->qs;
108 
109  /* Process 8 sub-blocks of 32 weights each */
110  for (int sb = 0; sb < 8; sb++) {
111  uint8_t sc, m;
112  get_q5_k_scale_min(sb, scales, &sc, &m);
113 
114  const float d_sub = d * (float)sc / 64.0f;
115  const float m_sub = dmin * (float)m / 64.0f;
116 
117  /* Each sub-block has 32 weights: low 4 bits in qs, high 1 bit in qh */
118  const int qs_offset = sb * 16; /* 16 bytes per sub-block */
119  const int qh_offset = sb * 4; /* 4 bytes per sub-block */
120 
121  for (int i = 0; i < 32; i++) {
122  uint8_t qs_val = (qs[qs_offset + i/2] >> (4 * (i % 2))) & 0xF;
123  uint8_t qh_bit = (qh[qh_offset + i/8] >> (i % 8)) & 1;
124  uint8_t q = qs_val | (qh_bit << 4);
125 
126  /* Q5_K dequantization: w = d * sc/64 * q - dmin * m/64 */
127  float w = d_sub * (float)q - m_sub;
128  sum += w * x_row[b * QK_K + sb * 32 + i];
129  }
130  }
131  }
132 
133  y[m] = sum;
134  }
135 }

References CK_FP16_TO_FP32, block_q5_K::d, block_q5_K::dmin, get_q5_k_scale_min(), block_q5_K::qh, QK_K, block_q5_K::qs, and block_q5_K::scales.

Referenced by gemv_q5_k().

◆ get_q5_k_scale_min()

/**
 * Extract the 6-bit sub-block scale and min for sub-block j (valid j: 0..7;
 * both callers iterate sb over [0, 8)) from the packed 12-byte Q5_K scales
 * array. Packing (same as llama.cpp's get_scale_min_k4):
 *   bytes 0-3 : scales 0-3 low 6 bits; top 2 bits = high bits of scales 4-7
 *   bytes 4-7 : mins   0-3 low 6 bits; top 2 bits = high bits of mins   4-7
 *   bytes 8-11: scales 4-7 low nibble | mins 4-7 high nibble
 */
static inline void get_q5_k_scale_min(int j, const uint8_t *scales,
                                      uint8_t *scale, uint8_t *min)
{
    if (j < 4) {
        *scale = scales[j] & 63;
        *min = scales[j + 4] & 63;
    } else {
        /*
         * BUGFIX: the min's high 2 bits live in scales[j] (bytes 4-7), not
         * scales[j-4] — the previous code reused the scale's high-bit field,
         * corrupting mins >= 16 in sub-blocks 4-7 and diverging from
         * llama.cpp's get_scale_min_k4. The old dead 'j >= 8' branch (never
         * reached; callers pass 0..7) has been removed.
         */
        *scale = (scales[j + 4] & 0x0F) | ((scales[j - 4] >> 6) << 4);
        *min = (scales[j + 4] >> 4) | ((scales[j] >> 6) << 4);
    }
}

Referenced by gemm_nt_q5_k_ref(), and gemv_q5_k_ref().