← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q4k_sse.c File Reference

SSE4.1 Q4_K x Q8_K dot product kernels. More...

#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

void gemv_q4_k_q8_k_sse (float *y, const void *W, const void *x_q8, int M, int K)
 
static int32_t hsum_epi32_sse (__m128i v)
 

Detailed Description

SSE4.1 Q4_K x Q8_K dot product kernels.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Compatible with Sandy Bridge/Ivy Bridge and later.

Definition in file gemm_kernels_q4k_sse.c.

Function Documentation

◆ gemv_q4_k_q8_k_sse()

void gemv_q4_k_q8_k_sse ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Definition at line 33 of file gemm_kernels_q4k_sse.c.

37 {
38  const block_q4_K *blocks = (const block_q4_K *)W;
39  const block_q8_K *x = (const block_q8_K *)x_q8;
40  const int blocks_per_row = K / QK_K;
41 
42  const __m128i mask_low = _mm_set1_epi8(0x0F);
43 
44  for (int row = 0; row < M; ++row) {
45  float sumf = 0.0f;
46  const block_q4_K *w_row = blocks + row * blocks_per_row;
47 
48  for (int i = 0; i < blocks_per_row; ++i) {
49  const block_q4_K *b4 = &w_row[i];
50  const block_q8_K *b8 = &x[i];
51 
52  // Unpack scales (same as ref)
53  uint8_t sc[8], m_val[8];
54  unpack_q4_k_scales(b4->scales, sc, m_val);
55 
56  float d = CK_FP16_TO_FP32(b4->d) * b8->d;
57  float dmin = CK_FP16_TO_FP32(b4->dmin) * b8->d;
58 
59  int is = 0;
60  int q_offset = 0;
61 
62  // Process 4 chunks of 64 elements (256 total)
63  for (int j = 0; j < QK_K; j += 64) {
64  // We process 32 bytes of qs (covering 64 elements via low/high nibbles)
65  // We access qs[0..31] relative to q_offset
66 
67  // Accumulators for this 64-element chunk
68  __m128i acc_lo = _mm_setzero_si128();
69  __m128i acc_hi = _mm_setzero_si128();
70 
71  // Inner loop: 2 iters of 16 bytes (32 elements)
72  for (int l = 0; l < 32; l += 16) {
73  // Load 16 bytes of Q4
74  __m128i q4_vec = _mm_loadu_si128((const __m128i *)(b4->qs + q_offset + l));
75 
76  // Low nibbles -> correspond to q8_lo (elements j+l .. j+l+15)
77  __m128i q4_lo = _mm_and_si128(q4_vec, mask_low);
78 
79  // High nibbles -> correspond to q8_hi (elements j+32+l .. j+32+l+15)
80  __m128i q4_hi = _mm_and_si128(_mm_srli_epi16(q4_vec, 4), mask_low);
81 
82  // Load Q8
83  __m128i q8_lo_vec = _mm_loadu_si128((const __m128i *)(b8->qs + j + l));
84  __m128i q8_hi_vec = _mm_loadu_si128((const __m128i *)(b8->qs + j + 32 + l));
85 
86  // Expand and Multiply-Add: Q4(u8) * Q8(s8) -> i32
87  // Since Q4 is u8 and Q8 is s8, we use intermediate i16
88 
89  // LO PART
90  __m128i q4_lo_16_L = _mm_cvtepu8_epi16(q4_lo); // lower 8 -> 16
91  __m128i q8_lo_16_L = _mm_cvtepi8_epi16(q8_lo_vec);
92  __m128i prod_lo_L = _mm_madd_epi16(q4_lo_16_L, q8_lo_16_L); // i32
93  acc_lo = _mm_add_epi32(acc_lo, prod_lo_L);
94 
95  __m128i q4_lo_16_H = _mm_cvtepu8_epi16(_mm_srli_si128(q4_lo, 8)); // upper 8 -> 16
96  __m128i q8_lo_16_H = _mm_cvtepi8_epi16(_mm_srli_si128(q8_lo_vec, 8));
97  __m128i prod_lo_H = _mm_madd_epi16(q4_lo_16_H, q8_lo_16_H); // i32
98  acc_lo = _mm_add_epi32(acc_lo, prod_lo_H);
99 
100  // HI PART
101  __m128i q4_hi_16_L = _mm_cvtepu8_epi16(q4_hi);
102  __m128i q8_hi_16_L = _mm_cvtepi8_epi16(q8_hi_vec);
103  __m128i prod_hi_L = _mm_madd_epi16(q4_hi_16_L, q8_hi_16_L);
104  acc_hi = _mm_add_epi32(acc_hi, prod_hi_L);
105 
106  __m128i q4_hi_16_H = _mm_cvtepu8_epi16(_mm_srli_si128(q4_hi, 8));
107  __m128i q8_hi_16_H = _mm_cvtepi8_epi16(_mm_srli_si128(q8_hi_vec, 8));
108  __m128i prod_hi_H = _mm_madd_epi16(q4_hi_16_H, q8_hi_16_H);
109  acc_hi = _mm_add_epi32(acc_hi, prod_hi_H);
110  }
111 
112  int32_t sum_q4q8_lo = hsum_epi32_sse(acc_lo);
113  int32_t sum_q4q8_hi = hsum_epi32_sse(acc_hi);
114 
115  /* bsums: each bsum is 16 elements */
116  int32_t bsum_lo = (int32_t)b8->bsums[j / 16] +
117  (int32_t)b8->bsums[j / 16 + 1];
118  int32_t bsum_hi = (int32_t)b8->bsums[(j + 32) / 16] +
119  (int32_t)b8->bsums[(j + 32) / 16 + 1];
120 
121  sumf += d * (float)sc[is] * (float)sum_q4q8_lo;
122  sumf -= dmin * (float)m_val[is] * (float)bsum_lo;
123  sumf += d * (float)sc[is + 1] * (float)sum_q4q8_hi;
124  sumf -= dmin * (float)m_val[is + 1] * (float)bsum_hi;
125 
126  q_offset += 32;
127  is += 2;
128  }
129  }
130  y[row] = sumf;
131  }
132 }
#define CK_FP16_TO_FP32(x)
static void unpack_q4_k_scales(const uint8_t *scales, uint8_t *sc, uint8_t *m)
Unpack Q4_K sub-block scales and mins.
#define QK_K
static int32_t hsum_epi32_sse(__m128i v)
uint8_t scales[12]
uint8_t qs[256/2]
ck_half dmin
int8_t qs[256]
int16_t bsums[256/16]

References block_q8_K::bsums, CK_FP16_TO_FP32, block_q4_K::d, block_q8_K::d, block_q4_K::dmin, hsum_epi32_sse(), QK_K, block_q4_K::qs, block_q8_K::qs, block_q4_K::scales, and unpack_q4_k_scales().

Referenced by gemv_q4_k_q8_k().

◆ hsum_epi32_sse()

static int32_t hsum_epi32_sse ( __m128i  v)
inlinestatic

Definition at line 25 of file gemm_kernels_q4k_sse.c.

/* Horizontal sum of the four i32 lanes of v (SSE2-only idiom):
 * fold the high 64 bits onto the low, then swap the two remaining
 * 32-bit lanes and add once more. */
static inline int32_t hsum_epi32_sse(__m128i v)
{
    __m128i hi64  = _mm_unpackhi_epi64(v, v);
    __m128i sum64 = _mm_add_epi32(v, hi64);
    __m128i hi32  = _mm_shufflelo_epi16(sum64, _MM_SHUFFLE(1, 0, 3, 2));
    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
}

Referenced by gemv_q4_k_q8_k_sse().