← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q6k_q8k.c File Reference

Q6_K (weights) x Q8_K (activations) kernels for inference. More...

#include <assert.h>
#include <math.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

static float dot_q6_k_q8_k_ref (const block_q6_K *w, const block_q8_K *x, int K)
 Scalar dot product for Q6_K x Q8_K. More...
 
void gemm_nt_q6_k_q8_k (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
 NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K. More...
 
void gemm_q6_k_q8_k (float *Y, const void *W, const void *X_q8, int M, int N, int K)
 GEMM: Y = W @ X^T where W is Q6_K and X is Q8_K. More...
 
void gemv_q6_k_q8_k (float *y, const void *W, const void *x_q8, int M, int K)
 GEMV: y = W @ x where W is Q6_K and x is Q8_K. More...
 
void gemv_q6_k_q8_k_avx (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q6_k_q8_k_avx2 (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q6_k_q8_k_avx512 (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q6_k_q8_k_avx512_vbmi (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q6_k_q8_k_parallel (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth)
 Parallel reference GEMV for Q6_K × Q8_K. More...
 
void gemv_q6_k_q8_k_parallel_simd (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth)
 Parallel SIMD GEMV for Q6_K × Q8_K. More...
 
void gemv_q6_k_q8_k_ref (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q6_k_q8_k_sse (float *y, const void *W, const void *x_q8, int M, int K)
 
void vec_dot_q6_k_q8_k (int n, float *s, const void *vx, const void *vy)
 Q6_K x Q8_K dot product (single row) More...
 

Detailed Description

Q6_K (weights) x Q8_K (activations) kernels for inference.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Implements decode-style matvec/matmul where weights are Q6_K and the activations are quantized on-the-fly to Q8_K. This is inference-only; no backward pass is provided here.

Q6_K Format (256 weights per block):

  • d: FP16 super-block scale
  • ql: 128 bytes (low 4 bits of each weight)
  • qh: 64 bytes (high 2 bits of each weight)
  • scales: 16 int8 sub-block scales

Q8_K Format (256 weights per block):

  • d: FP32 scale
  • qs: 256 int8 values
  • bsums: 16 int16 block sums

Definition in file gemm_kernels_q6k_q8k.c.

Function Documentation

◆ dot_q6_k_q8_k_ref()

static float dot_q6_k_q8_k_ref ( const block_q6_K *  w,
const block_q8_K *  x,
int  K 
)
static

Scalar dot product for Q6_K x Q8_K.

Q6_K layout: 256 weights per block

  • ql[0..127]: low 4 bits for all 256 weights (packed 2 per byte)
  • qh[0..63]: high 2 bits for all 256 weights (packed 4 per byte)
  • scales[0..15]: int8 scale for each 16-weight sub-block
  • d: FP16 super-block scale

The dequantization formula for each weight is: weight = d * scale[sub] * (q6_value - 32) where q6_value is the 6-bit unsigned value (0..63).

Definition at line 67 of file gemm_kernels_q6k_q8k.c.

70 {
71  const int nb = K / QK_K;
72  float sumf = 0.0f;
73 
74  for (int i = 0; i < nb; ++i) {
75  const float d = GGML_FP16_TO_FP32(w[i].d) * x[i].d;
76 
77  const uint8_t *ql = w[i].ql;
78  const uint8_t *qh = w[i].qh;
79  const int8_t *sc = w[i].scales;
80  const int8_t *q8 = x[i].qs;
81 
82  /* Process 256 weights in 2 iterations of 128 */
83  for (int n = 0; n < QK_K; n += 128) {
84  /* Each iteration processes 128 weights:
85  * - ql[0..63] contains low 4 bits
86  * - qh[0..31] contains high 2 bits
87  * - Interleaved pattern: weights 0-31, 32-63, 64-95, 96-127
88  */
89  for (int l = 0; l < 32; ++l) {
90  /* Sub-block index: each scale covers 16 weights */
91  const int is = l / 16;
92 
93  /* Extract 6-bit values from packed format */
94  /* q1: weights l+0 (low nibble of ql[l], bits 0-1 of qh[l]) */
95  const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
96  /* q2: weights l+32 (low nibble of ql[l+32], bits 2-3 of qh[l]) */
97  const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
98  /* q3: weights l+64 (high nibble of ql[l], bits 4-5 of qh[l]) */
99  const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
100  /* q4: weights l+96 (high nibble of ql[l+32], bits 6-7 of qh[l]) */
101  const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
102 
103  /* Accumulate: d * scale * q6 * q8 */
104  sumf += d * (float)sc[is + 0] * (float)q1 * (float)q8[l + 0];
105  sumf += d * (float)sc[is + 2] * (float)q2 * (float)q8[l + 32];
106  sumf += d * (float)sc[is + 4] * (float)q3 * (float)q8[l + 64];
107  sumf += d * (float)sc[is + 6] * (float)q4 * (float)q8[l + 96];
108  }
109  q8 += 128;
110  ql += 64;
111  qh += 32;
112  sc += 8;
113  }
114  }
115 
116  return sumf;
117 }
#define GGML_FP16_TO_FP32
#define QK_K
uint8_t ql[256/2]
int8_t scales[256/16]
uint8_t qh[256/4]
int8_t qs[256]

References block_q8_K::d, GGML_FP16_TO_FP32, block_q6_K::qh, QK_K, block_q6_K::ql, block_q8_K::qs, and block_q6_K::scales.

Referenced by gemv_q6_k_q8_k_parallel(), gemv_q6_k_q8_k_parallel_simd(), gemv_q6_k_q8_k_ref(), and vec_dot_q6_k_q8_k().

◆ gemm_nt_q6_k_q8_k()

void gemm_nt_q6_k_q8_k ( const void *  A_q8,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K.

This is the typical inference pattern:

  • A: Activations in Q8_K format [M x K]
  • B: Weights in Q6_K format [N x K]
  • C: Output [M x N]
Parameters
A_q8Input activations in Q8_K format
BWeight matrix in Q6_K format
biasOptional bias vector [N]
COutput matrix
MBatch size (number of tokens)
NOutput dimension
KInput dimension

Definition at line 1144 of file gemm_kernels_q6k_q8k.c.

void gemm_nt_q6_k_q8_k(const void *A_q8, const void *B, const float *bias,
                       float *C, int M, int N, int K)
{
    /* NT GEMM: C[M x N] = A[M x K] @ B[N x K]^T with A in Q8_K and B in Q6_K.
     * This is exactly Y = W @ X^T with the operand roles swapped, so we
     * delegate to gemm_q6_k_q8_k with M_out = N and N_batch = M. */
    if (A_q8 == NULL || B == NULL || C == NULL) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    gemm_q6_k_q8_k(C, B, A_q8, /*M_out=*/N, /*N_batch=*/M, K);

    /* Optional per-column bias, broadcast across the M rows of C. */
    if (bias == NULL) {
        return;
    }

    for (int r = 0; r < M; ++r) {
        float *out_row = C + (size_t)r * (size_t)N;
        for (int c = 0; c < N; ++c) {
            out_row[c] += bias[c];
        }
    }
}
void gemm_q6_k_q8_k(float *Y, const void *W, const void *X_q8, int M, int N, int K)
GEMM: Y = W @ X^T where W is Q6_K and X is Q8_K.
References gemm_q6_k_q8_k().

Referenced by ck_test_gemm_q6_k(), gemm_nt_q8_k_mlp_dispatch(), and gemm_nt_q8_k_qkv_dispatch().

◆ gemm_q6_k_q8_k()

void gemm_q6_k_q8_k ( float *  Y,
const void *  W,
const void *  X_q8,
int  M,
int  N,
int  K 
)

GEMM: Y = W @ X^T where W is Q6_K and X is Q8_K.

Parameters
YOutput matrix [N x M] in row-major
WWeight matrix in Q6_K format [M x K]
X_q8Input matrix in Q8_K format [N x K]
MNumber of output rows (output dim)
NNumber of input vectors (batch size)
KInput dimension

Definition at line 1110 of file gemm_kernels_q6k_q8k.c.

1114 {
1115  if (!Y || !W || !X_q8 || M <= 0 || N <= 0 || K <= 0) {
1116  return;
1117  }
1118 
1119  const block_q8_K *X = (const block_q8_K *)X_q8;
1120  const int blocks_per_vec = K / QK_K;
1121 
1122  for (int n = 0; n < N; ++n) {
1123  const block_q8_K *x_row = X + (size_t)n * (size_t)blocks_per_vec;
1124  gemv_q6_k_q8_k(&Y[n * M], W, x_row, M, K);
1125  }
1126 }
void gemv_q6_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)
GEMV: y = W @ x where W is Q6_K and x is Q8_K.

References gemv_q6_k_q8_k(), and QK_K.

Referenced by gemm_nt_q6_k_q8_k().

◆ gemv_q6_k_q8_k()

void gemv_q6_k_q8_k ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

GEMV: y = W @ x where W is Q6_K and x is Q8_K.

Definition at line 980 of file gemm_kernels_q6k_q8k.c.

void gemv_q6_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)
{
    /* Compile-time dispatch to the widest SIMD variant the build targets.
     * AVX-512 uses the same algorithm as AVX2 (matches llama.cpp). */
#if defined(__AVX512F__) && defined(__AVX512BW__)
    gemv_q6_k_q8_k_avx512(y, W, x_q8, M, K);
#elif defined(__AVX2__)
    gemv_q6_k_q8_k_avx2(y, W, x_q8, M, K);
#elif defined(__AVX__)
    gemv_q6_k_q8_k_avx(y, W, x_q8, M, K);
#elif defined(__SSSE3__)
    gemv_q6_k_q8_k_sse(y, W, x_q8, M, K);
#else
    /* Portable scalar fallback. */
    gemv_q6_k_q8_k_ref(y, W, x_q8, M, K);
#endif
}
void gemv_q6_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_sse(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_avx(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_avx2(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_avx512(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q6_k_q8_k_avx(), gemv_q6_k_q8_k_avx2(), gemv_q6_k_q8_k_avx512(), gemv_q6_k_q8_k_ref(), and gemv_q6_k_q8_k_sse().

Referenced by gemm_q6_k_q8_k().

◆ gemv_q6_k_q8_k_avx()

void gemv_q6_k_q8_k_avx ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Referenced by gemv_q6_k_q8_k().

◆ gemv_q6_k_q8_k_avx2()

void gemv_q6_k_q8_k_avx2 ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Referenced by gemv_q6_k_q8_k().

◆ gemv_q6_k_q8_k_avx512()

void gemv_q6_k_q8_k_avx512 ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Referenced by gemv_q6_k_q8_k().

◆ gemv_q6_k_q8_k_avx512_vbmi()

void gemv_q6_k_q8_k_avx512_vbmi ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

◆ gemv_q6_k_q8_k_parallel()

void gemv_q6_k_q8_k_parallel ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K,
int  ith,
int  nth 
)

Parallel reference GEMV for Q6_K × Q8_K.

Caller provides ith (thread index) and nth (total threads). Each thread processes rows [r0, r1).

Definition at line 1014 of file gemm_kernels_q6k_q8k.c.

1019 {
1020  if (!y || !W || !x_q8 || M <= 0 || K <= 0) return;
1021  if (ith < 0 || nth <= 0 || ith >= nth) return;
1022 
1023  /* Compute row range for this thread */
1024  const int dr = (M + nth - 1) / nth;
1025  const int r0 = dr * ith;
1026  const int r1 = (r0 + dr < M) ? (r0 + dr) : M;
1027 
1028  if (r0 >= M) return;
1029 
1030  const block_q6_K *blocks = (const block_q6_K *)W;
1031  const block_q8_K *x = (const block_q8_K *)x_q8;
1032  const int blocks_per_row = K / QK_K;
1033 
1034  for (int row = r0; row < r1; ++row) {
1035  const block_q6_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
1036  y[row] = dot_q6_k_q8_k_ref(w_row, x, K);
1037  }
1038 }
static float dot_q6_k_q8_k_ref(const block_q6_K *w, const block_q8_K *x, int K)
Scalar dot product for Q6_K x Q8_K.

References dot_q6_k_q8_k_ref(), and QK_K.

◆ gemv_q6_k_q8_k_parallel_simd()

void gemv_q6_k_q8_k_parallel_simd ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K,
int  ith,
int  nth 
)

Parallel SIMD GEMV for Q6_K × Q8_K.

Uses best available SIMD (AVX/SSE) with row prefetching. Caller provides ith/nth from its own parallel region (per kernel rule 2, threading lives at the orchestrator layer, not inside this kernel).

Definition at line 1046 of file gemm_kernels_q6k_q8k.c.

1051 {
1052  if (!y || !W || !x_q8 || M <= 0 || K <= 0) return;
1053  if (ith < 0 || nth <= 0 || ith >= nth) return;
1054 
1055  const int dr = (M + nth - 1) / nth;
1056  const int r0 = dr * ith;
1057  const int r1 = (r0 + dr < M) ? (r0 + dr) : M;
1058 
1059  if (r0 >= M) return;
1060 
1061  const block_q6_K *blocks = (const block_q6_K *)W;
1062  const block_q8_K *x = (const block_q8_K *)x_q8;
1063  const int blocks_per_row = K / QK_K;
1064 
1065 #if defined(__AVX__) || defined(__SSSE3__)
1066  /* Prefetch first few rows */
1067  const int PREFETCH_ROWS = 4;
1068  for (int p = 0; p < PREFETCH_ROWS && r0 + p < r1; ++p) {
1069  const char *row_ptr = (const char *)(blocks + (r0 + p) * blocks_per_row);
1070  _mm_prefetch(row_ptr, _MM_HINT_T0);
1071  _mm_prefetch(row_ptr + 64, _MM_HINT_T0);
1072  }
1073 
1074  for (int row = r0; row < r1; ++row) {
1075  /* Prefetch rows ahead */
1076  if (row + PREFETCH_ROWS < r1) {
1077  const char *prefetch_ptr = (const char *)(blocks + (row + PREFETCH_ROWS) * blocks_per_row);
1078  _mm_prefetch(prefetch_ptr, _MM_HINT_T0);
1079  _mm_prefetch(prefetch_ptr + 64, _MM_HINT_T0);
1080  }
1081 
1082  const block_q6_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
1083 #if defined(__AVX2__)
1084  y[row] = dot_q6_k_q8_k_avx2(w_row, x, K);
1085 #elif defined(__AVX__)
1086  y[row] = dot_q6_k_q8_k_avx(w_row, x, K);
1087 #else
1088  y[row] = dot_q6_k_q8_k_sse(w_row, x, K);
1089 #endif
1090  }
1091 #else
1092  /* Fallback to reference */
1093  for (int row = r0; row < r1; ++row) {
1094  const block_q6_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
1095  y[row] = dot_q6_k_q8_k_ref(w_row, x, K);
1096  }
1097 #endif
1098 }

References dot_q6_k_q8_k_ref(), and QK_K.

◆ gemv_q6_k_q8_k_ref()

void gemv_q6_k_q8_k_ref ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Definition at line 119 of file gemm_kernels_q6k_q8k.c.

123 {
124  if (!y || !W || !x_q8 || M <= 0 || K <= 0) {
125  return;
126  }
127 
128  const block_q6_K *blocks = (const block_q6_K *)W;
129  const block_q8_K *x = (const block_q8_K *)x_q8;
130  const int blocks_per_row = K / QK_K;
131 
132  for (int row = 0; row < M; ++row) {
133  const block_q6_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
134  y[row] = dot_q6_k_q8_k_ref(w_row, x, K);
135  }
136 }

References dot_q6_k_q8_k_ref(), and QK_K.

Referenced by gemv_q6_k_q8_k().

◆ gemv_q6_k_q8_k_sse()

void gemv_q6_k_q8_k_sse ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Referenced by gemv_q6_k_q8_k().

◆ vec_dot_q6_k_q8_k()

void vec_dot_q6_k_q8_k ( int  n,
float *  s,
const void *  vx,
const void *  vy 
)

Q6_K x Q8_K dot product (single row)

Definition at line 954 of file gemm_kernels_q6k_q8k.c.

955 {
956  if (!s || !vx || !vy || n <= 0) {
957  return;
958  }
959 
960  const block_q6_K *x = (const block_q6_K *)vx;
961  const block_q8_K *y = (const block_q8_K *)vy;
962 
963  /* Dispatch based on available SIMD */
964 #if defined(__AVX512F__) && defined(__AVX512BW__)
965  *s = dot_q6_k_q8_k_avx512(x, y, n);
966 #elif defined(__AVX2__)
967  *s = dot_q6_k_q8_k_avx2(x, y, n);
968 #elif defined(__AVX__) && !defined(__AVX2__)
969  *s = dot_q6_k_q8_k_avx(x, y, n);
970 #elif defined(__SSSE3__)
971  *s = dot_q6_k_q8_k_sse(x, y, n);
972 #else
973  *s = dot_q6_k_q8_k_ref(x, y, n);
974 #endif
975 }

References dot_q6_k_q8_k_ref().