← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q5_1.c File Reference

GEMM/GEMV kernels with Q5_1 quantized weights. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

float dot_q5_1 (const void *w_q5_1, const float *x, int K)
 
void gemm_nt_q5_1 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 GEMM with transposed Q5_1 weights: C = A @ B^T. More...
 
void gemm_q5_1 (float *Y, const void *W, const float *X, int M, int N, int K)
 Matrix-matrix multiply with Q5_1 weights. More...
 
void gemm_q5_1_backward (float *dX, const void *W, const float *dY, int M, int N, int K)
 Batched backward pass. More...
 
void gemv_q5_1 (float *y, const void *W, const float *x, int M, int K)
 Auto-dispatch GEMV. More...
 
void gemv_q5_1_backward (float *dX, const void *W, const float *dY, int M, int K)
 Auto-dispatch backward. More...
 
void gemv_q5_1_backward_ref (float *dX, const void *W, const float *dY, int M, int K)
 Backward pass: compute input gradient. More...
 
void gemv_q5_1_ref (float *y, const void *W, const float *x, int M, int K)
 Matrix-vector multiply with Q5_1 weights (scalar reference) More...
 

Detailed Description

GEMM/GEMV kernels with Q5_1 quantized weights.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Q5_1 Format:

  • 32 weights per block
  • 1 FP16 scale (d) per block
  • 1 FP16 minimum (m) per block
  • Low 4-bits stored like Q4_1 (16 bytes)
  • High 1-bit packed separately (4 bytes)
  • 24 bytes per 32 weights = 6.0 bits/weight

Dequantization: w = d * q5 + m where q5 = low4bit | (highbit << 4), giving values 0-31

Operations:

  • Forward:  Y = W @ X   (W is Q5_1; X and Y are FP32)
  • Backward: dX = W^T @ dY   (gradient w.r.t. the input)

Definition in file gemm_kernels_q5_1.c.

Function Documentation

◆ dot_q5_1()

/**
 * @brief Dot product of one Q5_1-quantized weight row with an FP32 vector.
 *
 * @param w_q5_1  Quantized row in Q5_1 block format [K weights]
 * @param x       FP32 input vector [K]
 * @param K       Vector length (must be a multiple of the Q5_1 block size)
 * @return        The scalar dot product.
 */
float dot_q5_1(const void *w_q5_1, const float *x, int K)
{
    float out;

    /* A dot product is just a GEMV with a single output row (M == 1),
     * so delegate to the auto-dispatching GEMV kernel. */
    gemv_q5_1(&out, w_q5_1, x, 1, K);
    return out;
}
void gemv_q5_1(float *y, const void *W, const float *x, int M, int K)
Auto-dispatch GEMV.

References gemv_q5_1().

◆ gemm_nt_q5_1()

void gemm_nt_q5_1 ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

GEMM with transposed Q5_1 weights: C = A @ B^T.

Parameters
AInput activations [M x K], row-major FP32
BWeight matrix in Q5_1 format [N x K], row-major quantized
biasOptional bias [N], NULL if not used
COutput [M x N], row-major FP32
MBatch size (number of tokens)
NOutput dimension
KInput dimension

Definition at line 309 of file gemm_kernels_q5_1.c.

314 {
315  const block_q5_1 *blocks = (const block_q5_1 *)B;
316  const int blocks_per_row = K / QK5_1;
317 
318  for (int m = 0; m < M; m++) {
319  const float *a_row = &A[m * K];
320 
321  for (int n = 0; n < N; n++) {
322  float sum = 0.0f;
323 
324  for (int b = 0; b < blocks_per_row; b++) {
325  const block_q5_1 *block = &blocks[n * blocks_per_row + b];
326  const float d = CK_FP16_TO_FP32(block->d);
327  const float min = CK_FP16_TO_FP32(block->m);
328  const float *ap = &a_row[b * QK5_1];
329 
330  uint32_t qh;
331  memcpy(&qh, block->qh, sizeof(qh));
332 
333  /* First 16 weights: low nibbles, high bits from qh[0:15] */
334  for (int j = 0; j < QK5_1 / 2; j++) {
335  const int lo = (block->qs[j] & 0x0F);
336  const int hi = ((qh >> j) & 1) << 4;
337  sum += (d * (float)(lo | hi) + min) * ap[j];
338  }
339 
340  /* Second 16 weights: high nibbles, high bits from qh[16:31] */
341  for (int j = 0; j < QK5_1 / 2; j++) {
342  const int lo = (block->qs[j] >> 4);
343  const int hi = ((qh >> (j + 16)) & 1) << 4;
344  sum += (d * (float)(lo | hi) + min) * ap[j + QK5_1 / 2];
345  }
346  }
347 
348  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
349  }
350  }
351 }
#define CK_FP16_TO_FP32(x)
#define QK5_1
Definition: ckernel_quant.h:84
#define C(color)
Definition: show_config.c:39
uint8_t qs[32/2]
Definition: ckernel_quant.h:90
uint8_t qh[4]
Definition: ckernel_quant.h:89
ck_half m
Definition: ckernel_quant.h:88
ck_half d
Definition: ckernel_quant.h:87

References C, CK_FP16_TO_FP32, block_q5_1::d, block_q5_1::m, block_q5_1::qh, QK5_1, and block_q5_1::qs.

Referenced by ck_gemm_nt_quant(), and ck_test_gemm_q5_1().

◆ gemm_q5_1()

/**
 * @brief Matrix-matrix multiply with Q5_1 weights.
 *
 * Implemented as N independent GEMVs: column n of X produces column n
 * of Y (each stored contiguously with stride K and M respectively).
 *
 * @param Y  Output [N x M]
 * @param W  Weight matrix in Q5_1 format [M x K]
 * @param X  Input [N x K]
 * @param M  Output dimension per column
 * @param N  Number of columns (batch)
 * @param K  Input dimension (must be a multiple of QK5_1)
 */
void gemm_q5_1(float *Y, const void *W, const float *X, int M, int N, int K)
{
    for (int col = 0; col < N; ++col) {
        gemv_q5_1(Y + col * M, W, X + col * K, M, K);
    }
}

References gemv_q5_1().

◆ gemm_q5_1_backward()

/**
 * @brief Batched backward pass: dX = W^T @ dY for each of N columns.
 *
 * Mirrors gemm_q5_1: one backward GEMV per column.
 *
 * @param dX  Output gradient w.r.t. input [N x K]
 * @param W   Weight matrix in Q5_1 format [M x K]
 * @param dY  Gradient w.r.t. output [N x M]
 * @param M   Number of weight rows
 * @param N   Number of columns (batch)
 * @param K   Input dimension (must be a multiple of QK5_1)
 */
void gemm_q5_1_backward(float *dX, const void *W, const float *dY, int M, int N, int K)
{
    for (int col = 0; col < N; ++col) {
        gemv_q5_1_backward(dX + col * K, W, dY + col * M, M, K);
    }
}
void gemv_q5_1_backward(float *dX, const void *W, const float *dY, int M, int K)
Auto-dispatch backward.

References gemv_q5_1_backward().

◆ gemv_q5_1()

/**
 * @brief Auto-dispatch GEMV: y = W @ x with Q5_1 weights.
 *
 * Selects the AVX-512 kernel when compiled with AVX-512F support,
 * otherwise falls back to the scalar reference implementation.
 *
 * @param y  Output vector [M]
 * @param W  Weight matrix in Q5_1 format [M x K]
 * @param x  Input vector [K]
 * @param M  Number of output rows
 * @param K  Number of columns (must be a multiple of QK5_1)
 */
void gemv_q5_1(float *y, const void *W, const float *x, int M, int K)
{
#ifndef __AVX512F__
    gemv_q5_1_ref(y, W, x, M, K);
#else
    gemv_q5_1_avx512(y, W, x, M, K);
#endif
}
void gemv_q5_1_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q5_1 weights (scalar reference)

References gemv_q5_1_ref().

Referenced by ck_test_gemv_q5_1(), dot_q5_1(), and gemm_q5_1().

◆ gemv_q5_1_backward()

/**
 * @brief Auto-dispatch backward GEMV: dX = W^T @ dY.
 *
 * Only the scalar reference path exists for the backward pass, so this
 * wrapper exists purely for API symmetry with gemv_q5_1().
 *
 * @param dX  Output gradient w.r.t. input [K]
 * @param W   Weight matrix in Q5_1 format [M x K]
 * @param dY  Gradient w.r.t. output [M]
 * @param M   Number of output rows
 * @param K   Number of columns (must be a multiple of QK5_1)
 */
void gemv_q5_1_backward(float *dX, const void *W, const float *dY, int M, int K)
{
    gemv_q5_1_backward_ref(dX, W, dY, M, K);
}
void gemv_q5_1_backward_ref(float *dX, const void *W, const float *dY, int M, int K)
Backward pass: compute input gradient.

References gemv_q5_1_backward_ref().

Referenced by gemm_q5_1_backward().

◆ gemv_q5_1_backward_ref()

void gemv_q5_1_backward_ref ( float *  dX,
const void *  W,
const float *  dY,
int  M,
int  K 
)

Backward pass: compute input gradient.

Parameters
dXOutput gradient w.r.t. input [K]
WWeight matrix in Q5_1 format [M x K]
dYGradient w.r.t. output [M]
MNumber of output rows
KNumber of columns (input dimension)

Definition at line 226 of file gemm_kernels_q5_1.c.

230 {
231  const block_q5_1 *blocks = (const block_q5_1 *)W;
232  const int blocks_per_row = K / QK5_1;
233 
234  /* Zero output gradient */
235  memset(dX, 0, K * sizeof(float));
236 
237  /* Accumulate: dX += W^T @ dY */
238  for (int row = 0; row < M; row++) {
239  const float dy = dY[row];
240 
241  for (int b = 0; b < blocks_per_row; b++) {
242  const block_q5_1 *block = &blocks[row * blocks_per_row + b];
243  const float d = CK_FP16_TO_FP32(block->d);
244  const float m = CK_FP16_TO_FP32(block->m);
245  float *dxp = &dX[b * QK5_1];
246 
247  /* Get high bits */
248  uint32_t qh;
249  memcpy(&qh, block->qh, sizeof(qh));
250 
251  /* First 16 weights: low nibbles, high bits from qh[0:15] */
252  for (int j = 0; j < QK5_1 / 2; j++) {
253  const int lo = (block->qs[j] & 0x0F);
254  const int hi = ((qh >> j) & 1) << 4;
255  const float w = d * (float)(lo | hi) + m;
256  dxp[j] += w * dy;
257  }
258 
259  /* Second 16 weights: high nibbles, high bits from qh[16:31] */
260  for (int j = 0; j < QK5_1 / 2; j++) {
261  const int lo = (block->qs[j] >> 4);
262  const int hi = ((qh >> (j + 16)) & 1) << 4;
263  const float w = d * (float)(lo | hi) + m;
264  dxp[j + QK5_1 / 2] += w * dy;
265  }
266  }
267  }
268 }

References CK_FP16_TO_FP32, block_q5_1::d, block_q5_1::m, block_q5_1::qh, QK5_1, and block_q5_1::qs.

Referenced by gemv_q5_1_backward().

◆ gemv_q5_1_ref()

void gemv_q5_1_ref ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Matrix-vector multiply with Q5_1 weights (scalar reference)

Parameters
yOutput vector [M]
WWeight matrix in Q5_1 format [M x K]
xInput vector [K]
MNumber of output rows
KNumber of columns (must be multiple of 32)

Definition at line 52 of file gemm_kernels_q5_1.c.

56 {
57  const block_q5_1 *blocks = (const block_q5_1 *)W;
58  const int blocks_per_row = K / QK5_1;
59 
60  for (int row = 0; row < M; row++) {
61  float sum = 0.0f;
62 
63  for (int b = 0; b < blocks_per_row; b++) {
64  const block_q5_1 *block = &blocks[row * blocks_per_row + b];
65  const float d = CK_FP16_TO_FP32(block->d);
66  const float m = CK_FP16_TO_FP32(block->m);
67  const float *xp = &x[b * QK5_1];
68 
69  /* Get high bits as 32-bit integer */
70  uint32_t qh;
71  memcpy(&qh, block->qh, sizeof(qh));
72 
73  /* GGML Q5_1 layout: weights 0-15 from LOW nibbles, 16-31 from HIGH nibbles.
74  * High bits: bits 0-15 of qh → first half, bits 16-31 → second half.
75  * NOT interleaved like Q4_0/Q4_1. */
76 
77  /* First 16 weights: low nibbles of qs[j], high bit from qh bits 0-15 */
78  for (int j = 0; j < QK5_1 / 2; j++) {
79  const int lo = (block->qs[j] & 0x0F);
80  const int hi = ((qh >> j) & 1) << 4;
81  const float w = d * (float)(lo | hi) + m;
82  sum += w * xp[j];
83  }
84 
85  /* Second 16 weights: high nibbles of qs[j], high bit from qh bits 16-31 */
86  for (int j = 0; j < QK5_1 / 2; j++) {
87  const int lo = (block->qs[j] >> 4);
88  const int hi = ((qh >> (j + 16)) & 1) << 4;
89  const float w = d * (float)(lo | hi) + m;
90  sum += w * xp[j + QK5_1 / 2];
91  }
92  }
93 
94  y[row] = sum;
95  }
96 }

References CK_FP16_TO_FP32, block_q5_1::d, block_q5_1::m, block_q5_1::qh, QK5_1, and block_q5_1::qs.

Referenced by gemv_q5_1().