← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q5_0_sse_v2.c File Reference

SSE-optimized GEMM kernels for Q5_0 x Q8_K quantization. More...

#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

static float dot_q5_0_q8_k_32_sse (const block_q5_0 *bw, const block_q8_K *ba, int q8_offset)
 
void gemm_nt_q5_0_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 GEMM with transposed Q5_0 weights: C = A @ B^T. More...
 
void gemm_nt_q5_0_sse_v2 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void quantize_row_q8_k (const float *x, void *vy, int k)
 

Detailed Description

SSE-optimized GEMM kernels for Q5_0 x Q8_K quantization.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Definition in file gemm_kernels_q5_0_sse_v2.c.

Function Documentation

◆ dot_q5_0_q8_k_32_sse()

static float dot_q5_0_q8_k_32_sse ( const block_q5_0 bw,
const block_q8_K ba,
int  q8_offset 
)
inlinestatic

Definition at line 25 of file gemm_kernels_q5_0_sse_v2.c.

25  {
26  const uint8_t *qs_w = bw->qs;
27  const int8_t *qs_a = ba->qs + q8_offset;
28 
29  uint32_t qh;
30  memcpy(&qh, bw->qh, sizeof(qh));
31 
32  // Vectorize bit extraction
33  // Load 16 bytes of low nibbles
34  __m128i qs_vec = _mm_loadu_si128((const __m128i *)qs_w);
35  __m128i mask_0f = _mm_set1_epi8(0x0F);
36 
37  __m128i w_lo = _mm_and_si128(qs_vec, mask_0f);
38  __m128i w_hi = _mm_and_si128(_mm_srli_epi16(qs_vec, 4), mask_0f);
39 
40  // Load high bits from qh
41  // This is still a bit scalar but we can use shuffle for some of it if needed
42  // For now, let's just make sure we handle the -16 offset correctly in SIMD.
43 
44  uint8_t w[32];
45  for (int j = 0; j < 16; j++) {
46  w[j] = (qs_w[j] & 0x0F) | (((qh >> (j + 0)) << 4) & 0x10);
47  w[j+16] = (qs_w[j] >> 4) | ((qh >> (j + 12)) & 0x10);
48  }
49 
50  __m128i vw0 = _mm_loadu_si128((const __m128i *)&w[0]);
51  __m128i vw1 = _mm_loadu_si128((const __m128i *)&w[16]);
52  __m128i va0 = _mm_loadu_si128((const __m128i *)&qs_a[0]);
53  __m128i va1 = _mm_loadu_si128((const __m128i *)&qs_a[16]);
54 
55  // Dot product: unsigned 8-bit * signed 8-bit -> signed 16-bit
56  __m128i p0 = _mm_maddubs_epi16(vw0, va0);
57  __m128i p1 = _mm_maddubs_epi16(vw1, va1);
58 
59  // Sum to i32
60  __m128i one = _mm_set1_epi16(1);
61  __m128i s0 = _mm_madd_epi16(p0, one);
62  __m128i s1 = _mm_madd_epi16(p1, one);
63  __m128i acc_i32 = _mm_add_epi32(s0, s1);
64 
65  // Horizontal sum of i32
66  acc_i32 = _mm_add_epi32(acc_i32, _mm_shuffle_epi32(acc_i32, _MM_SHUFFLE(1, 0, 3, 2)));
67  acc_i32 = _mm_add_epi32(acc_i32, _mm_shuffle_epi32(acc_i32, _MM_SHUFFLE(0, 1, 0, 1)));
68  int32_t dot_wa = _mm_cvtsi128_si32(acc_i32);
69 
70  // sum((w - 16) * a) = sum(w*a) - 16 * sum(a)
71  int32_t sum_a = (int32_t)ba->bsums[q8_offset/16] + (int32_t)ba->bsums[q8_offset/16 + 1];
72 
73  float result = ((float)dot_wa - 16.0f * (float)sum_a) * CK_FP16_TO_FP32(bw->d) * ba->d;
74  return result;
75 }
#define CK_FP16_TO_FP32(x)
ck_half d
Definition: ckernel_quant.h:70
uint8_t qh[4]
Definition: ckernel_quant.h:71
uint8_t qs[32/2]
Definition: ckernel_quant.h:72
int8_t qs[256]
int16_t bsums[256/16]

References block_q8_K::bsums, CK_FP16_TO_FP32, block_q5_0::d, block_q8_K::d, block_q5_0::qh, block_q8_K::qs, and block_q5_0::qs.

Referenced by gemm_nt_q5_0_sse_v2().

◆ gemm_nt_q5_0_ref()

void gemm_nt_q5_0_ref ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

GEMM with transposed Q5_0 weights: C = A @ B^T.

Parameters
AInput activations [M x K], row-major FP32
BWeight matrix in Q5_0 format [N x K], row-major quantized
biasOptional bias [N], NULL if not used
COutput [M x N], row-major FP32
MBatch size (number of tokens)
NOutput dimension (number of rows in B)
KInput dimension

Definition at line 788 of file gemm_kernels_q5_0.c.

793 {
794  const block_q5_0 *blocks = (const block_q5_0 *)B;
795  const int blocks_per_row = K / QK5_0;
796 
797  for (int m = 0; m < M; m++) {
798  const float *a_row = &A[m * K];
799 
800  for (int n = 0; n < N; n++) {
801  float sum = 0.0f;
802 
803  for (int b = 0; b < blocks_per_row; b++) {
804  const block_q5_0 *block = &blocks[n * blocks_per_row + b];
805  const float d = CK_FP16_TO_FP32(block->d);
806  const float *ap = &a_row[b * QK5_0];
807 
808  uint32_t qh;
809  memcpy(&qh, block->qh, sizeof(qh));
810 
811  /* llama.cpp Q5_0 layout - note j+12 for second weight high bit */
812  for (int j = 0; j < QK5_0 / 2; j++) {
813  const uint8_t packed = block->qs[j];
814  const int lo = (packed & 0x0F);
815  const int hi = (packed >> 4);
816  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
817  const int xh_1 = ((qh >> (j + 12))) & 0x10;
818  const int q0 = (lo | xh_0) - 16;
819  const int q1 = (hi | xh_1) - 16;
820 
821  sum += d * (float)q0 * ap[j];
822  sum += d * (float)q1 * ap[j + 16];
823  }
824  }
825 
826  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
827  }
828  }
829 }
#define QK5_0
Definition: ckernel_quant.h:67
#define C(color)
Definition: show_config.c:39

References C, CK_FP16_TO_FP32, block_q5_0::d, block_q5_0::qh, QK5_0, and block_q5_0::qs.

Referenced by gemm_nt_q5_0_sse_v2().

◆ gemm_nt_q5_0_sse_v2()

void gemm_nt_q5_0_sse_v2 ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 77 of file gemm_kernels_q5_0_sse_v2.c.

82 {
83  if (K % QK_K != 0) {
84  gemm_nt_q5_0_ref(A, B, bias, C, M, N, K);
85  return;
86  }
87 
88  size_t q8_size = (K / QK_K) * sizeof(block_q8_K);
89  block_q8_K *A_q8 = (block_q8_K *)alloca(q8_size);
90 
91  const block_q5_0 *weights = (const block_q5_0 *)B;
92  const int blocks_per_row = K / 32;
93 
94  for (int m = 0; m < M; m++) {
95  quantize_row_q8_k(&A[m * K], A_q8, K);
96 
97  for (int n = 0; n < N; n++) {
98  float sumf = 0.0f;
99  const block_q5_0 *w_row = weights + n * blocks_per_row;
100 
101  for (int b = 0; b < blocks_per_row; b++) {
102  int q8_block_idx = (b * 32) / QK_K;
103  int q8_offset = (b * 32) % QK_K;
104  sumf += dot_q5_0_q8_k_32_sse(&w_row[b], &A_q8[q8_block_idx], q8_offset);
105  }
106 
107  C[m * N + n] = sumf + (bias ? bias[n] : 0.0f);
108  }
109  }
110 }
#define QK_K
void quantize_row_q8_k(const float *x, void *vy, int k)
void gemm_nt_q5_0_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
GEMM with transposed Q5_0 weights: C = A @ B^T.
static float dot_q5_0_q8_k_32_sse(const block_q5_0 *bw, const block_q8_K *ba, int q8_offset)

References C, dot_q5_0_q8_k_32_sse(), gemm_nt_q5_0_ref(), QK_K, and quantize_row_q8_k().

◆ quantize_row_q8_k()

/**
 * Quantize one row of FP32 activations to Q8_K super-blocks.
 *
 * Compile-time dispatch: uses the SSE4.1 implementation when the target
 * supports it, otherwise the scalar reference.
 *
 * @param x  Input row of k FP32 values.
 * @param vy Output buffer of block_q8_K super-blocks (k / QK_K of them).
 * @param k  Number of elements; callers in this file pass a K that is a
 *           multiple of QK_K (checked in gemm_nt_q5_0_sse_v2).
 */
void quantize_row_q8_k(const float *x, void *vy, int k) {
#if defined(__SSE4_1__)
    quantize_row_q8_k_sse(x, vy, k);
#else
    quantize_row_q8_k_ref(x, vy, k);
#endif
}
void quantize_row_q8_k_sse(const float *x, void *vy, int k)
void quantize_row_q8_k_ref(const float *x, void *vy, int k)

Referenced by gemm_nt_q5_0_sse_v2().