← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q5_k.c File Reference

GEMM/GEMV kernels with Q5_K quantized weights. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Macros

#define QK_K   256
 

Functions

void gemm_nt_q5_k (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q5_k_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemv_q5_k (float *y, const void *W, const float *x, int M, int K)
 
void gemv_q5_k_ref (float *y, const void *W, const float *x, int M, int K)
 
static void get_q5_k_scale_min (int j, const uint8_t *scales, uint8_t *scale, uint8_t *min)
 

Detailed Description

GEMM/GEMV kernels with Q5_K quantized weights.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Implements matrix multiplication where:

  • Activations (input): FP32
  • Weights: Q5_K (5-bit super-block quant)
  • Output: FP32

Q5_K Format (256 weights per super-block):

  • d: FP16 super-block scale
  • dmin: FP16 super-block minimum
  • scales[12]: 8 sub-block scales + 8 sub-block mins (6 bits each, packed)
  • qh[32]: high bits for 256 weights (1 bit each)
  • qs[128]: low 4 bits for 256 weights (4 bits each)

Total: 2 + 2 + 12 + 32 + 128 = 176 bytes per 256 weights = 5.5 bits/weight

Dequantization formula (matches llama.cpp): w = d * (scale/64) * q - dmin * (mins/64) where q = qs_val | (qh_bit << 4) = 5-bit value [0, 31]

Definition in file gemm_kernels_q5_k.c.

Macro Definition Documentation

◆ QK_K

#define QK_K   256

Definition at line 44 of file gemm_kernels_q5_k.c.

Function Documentation

◆ gemm_nt_q5_k()

/**
 * GEMM with Q5_K quantized weights: C[M x N] = A[M x K] * B^T (+ bias).
 *
 * ISA dispatch shell. Dedicated AVX-512 / AVX2 / AVX / SSE4.1 kernels are
 * still TODO, so every build path currently lands on the scalar reference
 * implementation.
 */
void gemm_nt_q5_k(const float *A, const void *B, const float *bias,
                  float *C, int M, int N, int K)
{
#if defined(__AVX512F__)
    /* TODO: AVX-512 implementation */
#elif defined(__AVX2__)
    /* TODO: AVX-2 implementation */
#elif defined(__AVX__)
    /* TODO: AVX implementation */
#elif defined(__SSE4_1__)
    /* TODO: SSE4.1 implementation */
#endif
    /* Single fall-through call until the SIMD variants above exist. */
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
}
void gemm_nt_q5_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
#define C(color)
Definition: show_config.c:39

References gemm_nt_q5_k_ref(). (Doxygen's extra "C" cross-reference was a mislink of the output parameter C to an unrelated C(color) macro in show_config.c.)

Referenced by ck_test_gemm_q5_k().

◆ gemm_nt_q5_k_ref()

void gemm_nt_q5_k_ref ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 145 of file gemm_kernels_q5_k.c.

150 {
151  const block_q5_K *blocks = (const block_q5_K *)B;
152  const int blocks_per_col = K / QK_K;
153 
154  for (int m = 0; m < M; m++) {
155  const float *a_row = &A[m * K];
156 
157  for (int n = 0; n < N; n++) {
158  float sum = 0.0f;
159 
160  for (int b = 0; b < blocks_per_col; b++) {
161  const block_q5_K *block = &blocks[n * blocks_per_col + b];
162  const float d = CK_FP16_TO_FP32(block->d);
163  const float dmin = CK_FP16_TO_FP32(block->dmin);
164  const uint8_t *scales = block->scales;
165  const uint8_t *qh = block->qh;
166  const uint8_t *qs = block->qs;
167 
168  /* Process 8 sub-blocks of 32 weights each */
169  for (int sb = 0; sb < 8; sb++) {
170  uint8_t sc, m;
171  get_q5_k_scale_min(sb, scales, &sc, &m);
172 
173  const float d_sub = d * (float)sc / 64.0f;
174  const float m_sub = dmin * (float)m / 64.0f;
175 
176  const int qs_offset = sb * 16;
177  const int qh_offset = sb * 4;
178 
179  for (int i = 0; i < 32; i++) {
180  uint8_t qs_val = (qs[qs_offset + i/2] >> (4 * (i % 2))) & 0xF;
181  uint8_t qh_bit = (qh[qh_offset + i/8] >> (i % 8)) & 1;
182  uint8_t q = qs_val | (qh_bit << 4);
183 
184  float w = d_sub * (float)q - m_sub;
185  sum += w * a_row[b * QK_K + sb * 32 + i];
186  }
187  }
188  }
189 
190  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
191  }
192  }
193 }
#define CK_FP16_TO_FP32(x)
static void get_q5_k_scale_min(int j, const uint8_t *scales, uint8_t *scale, uint8_t *min)
#define QK_K
ck_half dmin
uint8_t qh[256/8]
uint8_t qs[256/2]
uint8_t scales[12]

References CK_FP16_TO_FP32, block_q5_K::d, block_q5_K::dmin, get_q5_k_scale_min(), block_q5_K::qh, QK_K, block_q5_K::qs, and block_q5_K::scales. (Doxygen's extra "C" cross-reference was a mislink of the output parameter C to an unrelated C(color) macro in show_config.c.)

Referenced by gemm_nt_q5_k().

◆ gemv_q5_k()

/**
 * GEMV with Q5_K quantized weights: y[M] = W[M x K] * x[K].
 *
 * ISA dispatch shell. Dedicated AVX-512 / AVX2 / AVX / SSE4.1 kernels are
 * still TODO, so every build path currently lands on the scalar reference
 * implementation.
 */
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)
{
#if defined(__AVX512F__)
    /* TODO: AVX-512 implementation */
#elif defined(__AVX2__)
    /* TODO: AVX-2 implementation */
#elif defined(__AVX__)
    /* TODO: AVX implementation */
#elif defined(__SSE4_1__)
    /* TODO: SSE4.1 implementation */
#endif
    /* Single fall-through call until the SIMD variants above exist. */
    gemv_q5_k_ref(y, W, x, M, K);
}
void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K)

References gemv_q5_k_ref().

Referenced by ck_test_gemv_q5_k().

◆ gemv_q5_k_ref()

void gemv_q5_k_ref ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Definition at line 92 of file gemm_kernels_q5_k.c.

93 {
94  const block_q5_K *blocks = (const block_q5_K *)W;
95  const int blocks_per_row = K / QK_K;
96 
97  for (int m = 0; m < M; m++) {
98  const float *x_row = x;
99  float sum = 0.0f;
100 
101  for (int b = 0; b < blocks_per_row; b++) {
102  const block_q5_K *block = &blocks[m * blocks_per_row + b];
103  const float d = CK_FP16_TO_FP32(block->d);
104  const float dmin = CK_FP16_TO_FP32(block->dmin);
105  const uint8_t *scales = block->scales;
106  const uint8_t *qh = block->qh;
107  const uint8_t *qs = block->qs;
108 
109  /* Process 8 sub-blocks of 32 weights each */
110  for (int sb = 0; sb < 8; sb++) {
111  uint8_t sc, m;
112  get_q5_k_scale_min(sb, scales, &sc, &m);
113 
114  const float d_sub = d * (float)sc / 64.0f;
115  const float m_sub = dmin * (float)m / 64.0f;
116 
117  /* Each sub-block has 32 weights: low 4 bits in qs, high 1 bit in qh */
118  const int qs_offset = sb * 16; /* 16 bytes per sub-block */
119  const int qh_offset = sb * 4; /* 4 bytes per sub-block */
120 
121  for (int i = 0; i < 32; i++) {
122  uint8_t qs_val = (qs[qs_offset + i/2] >> (4 * (i % 2))) & 0xF;
123  uint8_t qh_bit = (qh[qh_offset + i/8] >> (i % 8)) & 1;
124  uint8_t q = qs_val | (qh_bit << 4);
125 
126  /* Q5_K dequantization: w = d * sc/64 * q - dmin * m/64 */
127  float w = d_sub * (float)q - m_sub;
128  sum += w * x_row[b * QK_K + sb * 32 + i];
129  }
130  }
131  }
132 
133  y[m] = sum;
134  }
135 }

References CK_FP16_TO_FP32, block_q5_K::d, block_q5_K::dmin, get_q5_k_scale_min(), block_q5_K::qh, QK_K, block_q5_K::qs, and block_q5_K::scales.

Referenced by gemv_q5_k().

◆ get_q5_k_scale_min()

/**
 * Extract the 6-bit sub-block scale and min for sub-block j (valid j: 0..7;
 * both callers iterate sb over [0, 8)) from the packed 12-byte Q5_K scales
 * array. Packing (same as llama.cpp's get_scale_min_k4):
 *   bytes 0-3 : scales 0-3 low 6 bits; top 2 bits = high bits of scales 4-7
 *   bytes 4-7 : mins   0-3 low 6 bits; top 2 bits = high bits of mins   4-7
 *   bytes 8-11: scales 4-7 low nibble | mins 4-7 high nibble
 */
static inline void get_q5_k_scale_min(int j, const uint8_t *scales,
                                      uint8_t *scale, uint8_t *min)
{
    if (j < 4) {
        *scale = scales[j] & 63;
        *min = scales[j + 4] & 63;
    } else {
        /*
         * BUGFIX: the min's high 2 bits live in scales[j] (bytes 4-7), not
         * scales[j-4] — the previous code reused the scale's high-bit field,
         * corrupting mins >= 16 in sub-blocks 4-7 and diverging from
         * llama.cpp's get_scale_min_k4. The old dead 'j >= 8' branch (never
         * reached; callers pass 0..7) has been removed.
         */
        *scale = (scales[j + 4] & 0x0F) | ((scales[j - 4] >> 6) << 4);
        *min = (scales[j + 4] >> 4) | ((scales[j] >> 6) << 4);
    }
}

Referenced by gemm_nt_q5_k_ref(), and gemv_q5_k_ref().