← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q6k.c File Reference

GEMM/GEMV kernels with Q6_K quantized weights. More...

#include <stdint.h>
#include <stddef.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

static float dot_q6_k_ref (const block_q6_K *w, const float *x, int K)
 
void gemm_nt_q6_k (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q6_k_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_q6_k (float *Y, const void *W, const float *X, int M, int N, int K)
 
void gemv_q6_k (float *y, const void *W, const float *x, int M, int K)
 

Detailed Description

GEMM/GEMV kernels with Q6_K quantized weights.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Implements matrix multiplication where:

  • Activations (input): FP32
  • Weights: Q6_K (6-bit k-quant, int8 scales)
  • Output: FP32

Q6_K Format (256 weights per block):

  • d: FP16 super-block scale
  • ql: 128 bytes (low 4 bits of each weight)
  • qh: 64 bytes (high 2 bits of each weight)
  • scales: 16 int8 sub-block scales

Definition in file gemm_kernels_q6k.c.

Function Documentation

◆ dot_q6_k_ref()

static float dot_q6_k_ref ( const block_q6_K w,
const float *  x,
int  K 
)
static

Definition at line 39 of file gemm_kernels_q6k.c.

42 {
43  const int blocks_per_row = K / QK_K;
44  float sum = 0.0f;
45 
46  for (int b = 0; b < blocks_per_row; ++b) {
47  const block_q6_K *block = &w[b];
48  const float d = GGML_FP16_TO_FP32(block->d);
49 
50  const uint8_t *ql = block->ql;
51  const uint8_t *qh = block->qh;
52  const int8_t *sc = block->scales;
53  const float *xp = x + (size_t)b * QK_K;
54 
55  for (int n = 0; n < QK_K; n += 128) {
56  for (int l = 0; l < 32; ++l) {
57  const int is = l / 16;
58  const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
59  const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
60  const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
61  const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
62 
63  sum += (d * (float)sc[is + 0] * (float)q1) * xp[l + 0];
64  sum += (d * (float)sc[is + 2] * (float)q2) * xp[l + 32];
65  sum += (d * (float)sc[is + 4] * (float)q3) * xp[l + 64];
66  sum += (d * (float)sc[is + 6] * (float)q4) * xp[l + 96];
67  }
68  xp += 128;
69  ql += 64;
70  qh += 32;
71  sc += 8;
72  }
73  }
74 
75  return sum;
76 }
#define GGML_FP16_TO_FP32
#define QK_K
uint8_t ql[256/2]
int8_t scales[256/16]
uint8_t qh[256/4]

References block_q6_K::d, GGML_FP16_TO_FP32, block_q6_K::qh, QK_K, block_q6_K::ql, and block_q6_K::scales.

Referenced by gemv_q6_k().

◆ gemm_nt_q6_k()

void gemm_nt_q6_k ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 212 of file gemm_kernels_q6k.c.

/*
 * GEMM with FP32 activations A [M x K], transposed Q6_K weights B
 * (N rows of K quantized weights each), and optional FP32 bias [N].
 * Writes C as [M x N], row-major. Silently returns on NULL A/B/C or
 * non-positive dimensions.
 */
void gemm_nt_q6_k(const float *A, const void *B, const float *bias,
                  float *C, int M, int N, int K)
{
    if (A == NULL || B == NULL || C == NULL) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    /* gemm_q6_k produces Y as [batch x M_out] where batch = M (tokens)
     * and M_out = N (output channels). */
    gemm_q6_k(C, B, A, /*M_out=*/N, /*N_batch=*/M, K);

    if (bias == NULL) {
        return;
    }

    /* Broadcast the per-channel bias across every token row. */
    for (int t = 0; t < M; ++t) {
        float *out = C + (size_t)t * (size_t)N;
        for (int c = 0; c < N; ++c) {
            out[c] += bias[c];
        }
    }
}
void gemm_q6_k(float *Y, const void *W, const float *X, int M, int N, int K)
#define C(color)
Definition: show_config.c:39

References C, and gemm_q6_k().

Referenced by ck_gemm_nt_quant(), gemm_nt_q6_k_ref(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ gemm_nt_q6_k_ref()

void gemm_nt_q6_k_ref ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 243 of file gemm_kernels_q6k.c.

/*
 * Reference-path alias for gemm_nt_q6_k: forwards all arguments
 * unchanged. Kept so parity harnesses can target a stable "ref" symbol.
 */
void gemm_nt_q6_k_ref(const float *A, const void *B, const float *bias,
                      float *C, int M, int N, int K)
{
    gemm_nt_q6_k(A, B, bias, C, M, N, K);
}
void gemm_nt_q6_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

References C, and gemm_nt_q6_k().

Referenced by gemm_nt_q6_k_sse().

◆ gemm_q6_k()

void gemm_q6_k ( float *  Y,
const void *  W,
const float *  X,
int  M,
int  N,
int  K 
)

Definition at line 195 of file gemm_kernels_q6k.c.

/*
 * Batched GEMV over Q6_K weights.
 *
 * Y : output, N rows of M floats ([batch x M_out])
 * W : Q6_K weight matrix, M rows of K/QK_K blocks each
 * X : input, N rows of K floats
 * M : output channels per batch row
 * N : batch size (number of input rows)
 * K : inner dimension; assumed multiple of QK_K
 *
 * Silently returns on NULL pointers or non-positive dimensions.
 */
void gemm_q6_k(float *Y, const void *W, const float *X, int M, int N, int K)
{
    if (!Y || !W || !X) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    for (int n = 0; n < N; ++n) {
        /* Index arithmetic in size_t: n*M / n*K in int overflows (UB)
         * once batch * dim exceeds INT_MAX on large models. */
        gemv_q6_k(Y + (size_t)n * (size_t)M, W, X + (size_t)n * (size_t)K, M, K);
    }
}
void gemv_q6_k(float *y, const void *W, const float *x, int M, int K)

References gemv_q6_k().

Referenced by gemm_nt_q6_k().

◆ gemv_q6_k()

void gemv_q6_k ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Definition at line 169 of file gemm_kernels_q6k.c.

173 {
174  if (!y || !W || !x) {
175  return;
176  }
177  if (M <= 0 || K <= 0) {
178  return;
179  }
180  // TEMPORARILY DISABLE NEW AVX KERNELS - USE REFERENCE ONLY
181 
182  const block_q6_K *blocks = (const block_q6_K *)W;
183  const int blocks_per_row = K / QK_K;
184 
185  for (int row = 0; row < M; ++row) {
186  const block_q6_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
187 #if defined(__AVX__) && !defined(__AVX512F__)
188  y[row] = dot_q6_k_avx(w_row, x, K);
189 #else
190  y[row] = dot_q6_k_ref(w_row, x, K);
191 #endif
192  }
193 }
static float dot_q6_k_ref(const block_q6_K *w, const float *x, int K)

References dot_q6_k_ref(), and QK_K.

Referenced by gemm_q6_k().