← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q6k.c File Reference

GEMM/GEMV kernels with Q6_K quantized weights. More...

#include <stdint.h>
#include <stddef.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

static float dot_q6_k_ref (const block_q6_K *w, const float *x, int K)
 
void gemm_nt_q6_k (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q6_k_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_q6_k (float *Y, const void *W, const float *X, int M, int N, int K)
 
void gemv_q6_k (float *y, const void *W, const float *x, int M, int K)
 

Detailed Description

GEMM/GEMV kernels with Q6_K quantized weights.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Implements matrix multiplication where:

  • Activations (input): FP32
  • Weights: Q6_K (6-bit k-quant, int8 scales)
  • Output: FP32

Q6_K Format (256 weights per block):

  • d: FP16 super-block scale
  • ql: 128 bytes (low 4 bits of each weight)
  • qh: 64 bytes (high 2 bits of each weight)
  • scales: 16 int8 sub-block scales

Definition in file gemm_kernels_q6k.c.

Function Documentation

◆ dot_q6_k_ref()

static float dot_q6_k_ref ( const block_q6_K w,
const float *  x,
int  K 
)
static

Definition at line 39 of file gemm_kernels_q6k.c.

42 {
43  const int blocks_per_row = K / QK_K;
44  float sum = 0.0f;
45 
46  for (int b = 0; b < blocks_per_row; ++b) {
47  const block_q6_K *block = &w[b];
48  const float d = GGML_FP16_TO_FP32(block->d);
49 
50  const uint8_t *ql = block->ql;
51  const uint8_t *qh = block->qh;
52  const int8_t *sc = block->scales;
53  const float *xp = x + (size_t)b * QK_K;
54 
55  for (int n = 0; n < QK_K; n += 128) {
56  for (int l = 0; l < 32; ++l) {
57  const int is = l / 16;
58  const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
59  const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
60  const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
61  const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
62 
63  sum += (d * (float)sc[is + 0] * (float)q1) * xp[l + 0];
64  sum += (d * (float)sc[is + 2] * (float)q2) * xp[l + 32];
65  sum += (d * (float)sc[is + 4] * (float)q3) * xp[l + 64];
66  sum += (d * (float)sc[is + 6] * (float)q4) * xp[l + 96];
67  }
68  xp += 128;
69  ql += 64;
70  qh += 32;
71  sc += 8;
72  }
73  }
74 
75  return sum;
76 }
#define GGML_FP16_TO_FP32
#define QK_K
uint8_t ql[256/2]
int8_t scales[256/16]
uint8_t qh[256/4]

References block_q6_K::d, GGML_FP16_TO_FP32, block_q6_K::qh, QK_K, block_q6_K::ql, and block_q6_K::scales.

Referenced by gemv_q6_k().

◆ gemm_nt_q6_k()

void gemm_nt_q6_k ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 212 of file gemm_kernels_q6k.c.

/*
 * GEMM with FP32 activations A [M x K], transposed Q6_K weights B
 * (N rows of K quantized weights each), and optional FP32 bias [N].
 * Writes C as [M x N], row-major. Silently returns on NULL A/B/C or
 * non-positive dimensions.
 */
void gemm_nt_q6_k(const float *A, const void *B, const float *bias,
                  float *C, int M, int N, int K)
{
    if (A == NULL || B == NULL || C == NULL) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    /* gemm_q6_k produces Y as [batch x M_out] where batch = M (tokens)
     * and M_out = N (output channels). */
    gemm_q6_k(C, B, A, /*M_out=*/N, /*N_batch=*/M, K);

    if (bias == NULL) {
        return;
    }

    /* Broadcast the per-channel bias across every token row. */
    for (int t = 0; t < M; ++t) {
        float *out = C + (size_t)t * (size_t)N;
        for (int c = 0; c < N; ++c) {
            out[c] += bias[c];
        }
    }
}
void gemm_q6_k(float *Y, const void *W, const float *X, int M, int N, int K)
#define C(color)
Definition: show_config.c:39

References C, and gemm_q6_k().

Referenced by ck_gemm_nt_quant(), gemm_nt_q6_k_ref(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ gemm_nt_q6_k_ref()

void gemm_nt_q6_k_ref ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 243 of file gemm_kernels_q6k.c.

/*
 * Reference-path alias for gemm_nt_q6_k: forwards all arguments
 * unchanged. Kept so parity harnesses can target a stable "ref" symbol.
 */
void gemm_nt_q6_k_ref(const float *A, const void *B, const float *bias,
                      float *C, int M, int N, int K)
{
    gemm_nt_q6_k(A, B, bias, C, M, N, K);
}
void gemm_nt_q6_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

References C, and gemm_nt_q6_k().

Referenced by gemm_nt_q6_k_sse().

◆ gemm_q6_k()

void gemm_q6_k ( float *  Y,
const void *  W,
const float *  X,
int  M,
int  N,
int  K 
)

Definition at line 195 of file gemm_kernels_q6k.c.

/*
 * Batched GEMV over Q6_K weights.
 *
 * Y : output, N rows of M floats ([batch x M_out])
 * W : Q6_K weight matrix, M rows of K/QK_K blocks each
 * X : input, N rows of K floats
 * M : output channels per batch row
 * N : batch size (number of input rows)
 * K : inner dimension; assumed multiple of QK_K
 *
 * Silently returns on NULL pointers or non-positive dimensions.
 */
void gemm_q6_k(float *Y, const void *W, const float *X, int M, int N, int K)
{
    if (!Y || !W || !X) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    for (int n = 0; n < N; ++n) {
        /* Index arithmetic in size_t: n*M / n*K in int overflows (UB)
         * once batch * dim exceeds INT_MAX on large models. */
        gemv_q6_k(Y + (size_t)n * (size_t)M, W, X + (size_t)n * (size_t)K, M, K);
    }
}
void gemv_q6_k(float *y, const void *W, const float *x, int M, int K)

References gemv_q6_k().

Referenced by gemm_nt_q6_k().

◆ gemv_q6_k()

void gemv_q6_k ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Definition at line 169 of file gemm_kernels_q6k.c.

173 {
174  if (!y || !W || !x) {
175  return;
176  }
177  if (M <= 0 || K <= 0) {
178  return;
179  }
180  // TEMPORARILY DISABLE NEW AVX KERNELS - USE REFERENCE ONLY
181 
182  const block_q6_K *blocks = (const block_q6_K *)W;
183  const int blocks_per_row = K / QK_K;
184 
185  for (int row = 0; row < M; ++row) {
186  const block_q6_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
187 #if defined(__AVX__) && !defined(__AVX512F__)
188  y[row] = dot_q6_k_avx(w_row, x, K);
189 #else
190  y[row] = dot_q6_k_ref(w_row, x, K);
191 #endif
192  }
193 }
static float dot_q6_k_ref(const block_q6_K *w, const float *x, int K)

References dot_q6_k_ref(), and QK_K.

Referenced by gemm_q6_k().