← Back to C-Kernel-Engine Docs | Doxygen Source Documentation
gemm_kernels_q6k_sse.c File Reference

SSE-optimized GEMM kernels for Q6_K x Q8_K quantization. More...

#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

static float dot_q6_k_q8_k_256_sse (const block_q6_K *bw, const block_q8_K *ba)
 
void gemm_nt_q6_k_sse (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void quantize_row_q8_k (const float *x, void *vy, int k)
 

Detailed Description

SSE-optimized GEMM kernels for Q6_K x Q8_K quantization.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Definition in file gemm_kernels_q6k_sse.c.

Function Documentation

◆ dot_q6_k_q8_k_256_sse()

static float dot_q6_k_q8_k_256_sse ( const block_q6_K *  bw,
const block_q8_K *  ba 
)
inline static

SSE-optimized dot product for Q6_K x Q8_K.

Q6_K layout:
  ql: 128 bytes (low 4 bits)
  qh: 64 bytes (high 2 bits)
  scales: 16 bytes (int8 scales)
  d: fp16 super-scale

Definition at line 33 of file gemm_kernels_q6k_sse.c.

33  {
34  const uint8_t *ql = bw->ql;
35  const uint8_t *qh = bw->qh;
36  const int8_t *sc = bw->scales;
37  const int8_t *qa = ba->qs;
38 
39  double sum = 0.0;
40  float d = CK_FP16_TO_FP32(bw->d);
41 
42  for (int n = 0; n < QK_K; n += 128) {
43  for (int l = 0; l < 32; ++l) {
44  const int is = l / 16;
45  // Unpack 4 weights at a time to match scalar reference logic
46  const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
47  const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
48  const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
49  const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
50 
51  sum += (double)(d * (float)sc[is + 0] * (float)q1) * (double)qa[l + 0];
52  sum += (double)(d * (float)sc[is + 2] * (float)q2) * (double)qa[l + 32];
53  sum += (double)(d * (float)sc[is + 4] * (float)q3) * (double)qa[l + 64];
54  sum += (double)(d * (float)sc[is + 6] * (float)q4) * (double)qa[l + 96];
55  }
56  qa += 128;
57  ql += 64;
58  qh += 32;
59  sc += 8;
60  }
61 
62  return (float)(sum * ba->d);
63 }
#define CK_FP16_TO_FP32(x)
#define QK_K
uint8_t ql[256/2]
int8_t scales[256/16]
uint8_t qh[256/4]
int8_t qs[256]

References CK_FP16_TO_FP32, block_q6_K::d, block_q8_K::d, block_q6_K::qh, QK_K, block_q6_K::ql, block_q8_K::qs, and block_q6_K::scales.

Referenced by gemm_nt_q6_k_sse().

◆ gemm_nt_q6_k_sse()

void gemm_nt_q6_k_sse ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 66 of file gemm_kernels_q6k_sse.c.

71 {
72  if (K % QK_K != 0) {
73  gemm_nt_q6_k_ref(A, B, bias, C, M, N, K);
74  return;
75  }
76 
77  size_t q8_size = (K / QK_K) * sizeof(block_q8_K);
78  block_q8_K *A_q8 = (block_q8_K *)alloca(q8_size);
79 
80  const block_q6_K *weights = (const block_q6_K *)B;
81  const int blocks_per_row = K / QK_K;
82 
83  for (int m = 0; m < M; m++) {
84  quantize_row_q8_k(&A[m * K], A_q8, K);
85 
86  for (int n = 0; n < N; n++) {
87  float sumf = 0.0f;
88  const block_q6_K *w_row = weights + n * blocks_per_row;
89 
90  for (int b = 0; b < blocks_per_row; b++) {
91  sumf += dot_q6_k_q8_k_256_sse(&w_row[b], &A_q8[b]);
92  }
93 
94  C[m * N + n] = sumf + (bias ? bias[n] : 0.0f);
95  }
96  }
97 }
void gemm_nt_q6_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
static float dot_q6_k_q8_k_256_sse(const block_q6_K *bw, const block_q8_K *ba)
void quantize_row_q8_k(const float *x, void *vy, int k)
#define C(color)
Definition: show_config.c:39

References C, dot_q6_k_q8_k_256_sse(), gemm_nt_q6_k_ref(), QK_K, and quantize_row_q8_k().

◆ quantize_row_q8_k()

void quantize_row_q8_k ( const float *  x,
void *  vy,
int  k 
)

Definition at line 107 of file gemm_kernels_q4k_q8k.c.

107  {
108 #if defined(__SSE4_1__)
109  quantize_row_q8_k_sse(x, vy, k);
110 #else
111  quantize_row_q8_k_ref(x, vy, k);
112 #endif
113 }
void quantize_row_q8_k_sse(const float *x, void *vy, int k)
void quantize_row_q8_k_ref(const float *x, void *vy, int k)

Referenced by gemm_nt_q6_k_sse().