← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q6k_sse.c
Go to the documentation of this file.
1 /**
2  * @file gemm_kernels_q6k_sse.c
3  * @brief SSE-optimized GEMM kernels for Q6_K x Q8_K quantization
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  */
14 
#pragma GCC target("sse4.1,ssse3")
#include <alloca.h>
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

#include "ckernel_quant.h"
21 
22 /* Forward declarations */
23 void quantize_row_q8_k(const float *x, void *vy, int k);
24 
/**
 * @brief Dot product of one Q6_K weight block with one Q8_K activation
 *        block (QK_K = 256 values).
 *
 * Q6_K block layout (per 256 values):
 *   ql:     128 bytes - low 4 bits of each 6-bit weight
 *   qh:      64 bytes - high 2 bits of each 6-bit weight
 *   scales:  16 bytes - int8 sub-block scales (one per 16 values)
 *   d:       fp16 super-scale applied on top of the sub-block scales
 *
 * NOTE(review): despite the "_sse" name this routine is scalar (no
 * intrinsics are used).  It follows the reference unpack order - the block
 * is processed as two 128-value halves, each half as four interleaved
 * 32-value groups - which is intended to keep parity with the scalar
 * reference kernel (see the file-header parity test rule).  Accumulation
 * is done in double so the result is deterministic.
 *
 * @param bw  one Q6_K weight block
 * @param ba  one Q8_K activation block
 * @return    dot product of the 256 dequantized weights with the 256
 *            int8 activations, scaled by both blocks' scales
 */
static inline float dot_q6_k_q8_k_256_sse(const block_q6_K *bw, const block_q8_K *ba) {
    const uint8_t *ql = bw->ql;       /* low 4 bits of each weight */
    const uint8_t *qh = bw->qh;       /* high 2 bits of each weight */
    const int8_t *sc = bw->scales;    /* int8 scale per 16 values */
    const int8_t *qa = ba->qs;        /* int8 activations */

    double sum = 0.0;
    float d = CK_FP16_TO_FP32(bw->d); /* fp16 super-scale -> fp32 */

    /* Two halves of 128 values each; all block pointers advance per half. */
    for (int n = 0; n < QK_K; n += 128) {
        for (int l = 0; l < 32; ++l) {
            /* Scale sub-index: l in [0,15] -> is=0, l in [16,31] -> is=1. */
            const int is = l / 16;
            // Unpack 4 weights at a time to match scalar reference logic
            // 6-bit weight = 4 bits from ql | (2 bits from qh) << 4,
            // recentered by -32 into the signed range [-32, 31].
            const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
            const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
            const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
            const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;

            /* Each of the four interleaved 32-value groups uses its own
             * scale (stride 2 within sc) and its own activation offset. */
            sum += (double)(d * (float)sc[is + 0] * (float)q1) * (double)qa[l + 0];
            sum += (double)(d * (float)sc[is + 2] * (float)q2) * (double)qa[l + 32];
            sum += (double)(d * (float)sc[is + 4] * (float)q3) * (double)qa[l + 64];
            sum += (double)(d * (float)sc[is + 6] * (float)q4) * (double)qa[l + 96];
        }
        qa += 128;
        ql += 64;
        qh += 32;
        sc += 8;
    }

    /* Apply the Q8_K activation block scale once at the end. */
    return (float)(sum * ba->d);
}
64 
65 // Fallback to ref if K not aligned
66 void gemm_nt_q6_k_sse(const float *A,
67  const void *B,
68  const float *bias,
69  float *C,
70  int M, int N, int K)
71 {
72  if (K % QK_K != 0) {
73  gemm_nt_q6_k_ref(A, B, bias, C, M, N, K);
74  return;
75  }
76 
77  size_t q8_size = (K / QK_K) * sizeof(block_q8_K);
78  block_q8_K *A_q8 = (block_q8_K *)alloca(q8_size);
79 
80  const block_q6_K *weights = (const block_q6_K *)B;
81  const int blocks_per_row = K / QK_K;
82 
83  for (int m = 0; m < M; m++) {
84  quantize_row_q8_k(&A[m * K], A_q8, K);
85 
86  for (int n = 0; n < N; n++) {
87  float sumf = 0.0f;
88  const block_q6_K *w_row = weights + n * blocks_per_row;
89 
90  for (int b = 0; b < blocks_per_row; b++) {
91  sumf += dot_q6_k_q8_k_256_sse(&w_row[b], &A_q8[b]);
92  }
93 
94  C[m * N + n] = sumf + (bias ? bias[n] : 0.0f);
95  }
96  }
97 }
Quantization block structures for weight-only quantization.
void gemm_nt_q6_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
#define CK_FP16_TO_FP32(x)
#define QK_K
static float dot_q6_k_q8_k_256_sse(const block_q6_K *bw, const block_q8_K *ba)
void quantize_row_q8_k(const float *x, void *vy, int k)
void gemm_nt_q6_k_sse(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
#define C(color)
Definition: show_config.c:39
uint8_t ql[256/2]
int8_t scales[256/16]
uint8_t qh[256/4]
int8_t qs[256]