← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q5_0_sse.c
Go to the documentation of this file.
1 /**
2  * @file gemm_kernels_q5_0_sse.c
3  * @brief SSE4.1 GEMM for Q5_0 quantized weights
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * Compatible with Sandy Bridge/Ivy Bridge and later.
15  */
16 
17 #include <immintrin.h>
18 #include <stdint.h>
19 #include <string.h>
20 
21 #include "ckernel_quant.h"
22 
23 void gemm_nt_q5_0_sse(const float *A,
24  const void *B,
25  const float *bias,
26  float *C,
27  int M, int N, int K)
28 {
29  const block_q5_0 *blocks = (const block_q5_0 *)B;
30  const int blocks_per_row = K / QK5_0;
31 
32  const __m128i mask_lo = _mm_set1_epi8(0x0F);
33  const __m128i sixteen = _mm_set1_epi8(16);
34 
35  for (int m = 0; m < M; m++) {
36  const float *a_row = &A[m * K];
37 
38  for (int n = 0; n < N; n++) {
39  __m128 sum_v = _mm_setzero_ps();
40 
41  for (int b = 0; b < blocks_per_row; b++) {
42  const block_q5_0 *block = &blocks[n * blocks_per_row + b];
43  float d_val = CK_FP16_TO_FP32(block->d);
44  __m128 d = _mm_set1_ps(d_val);
45  const float *ap = &a_row[b * QK5_0];
46 
47  uint32_t qh_val;
48  memcpy(&qh_val, block->qh, sizeof(qh_val));
49 
50  // Load 16 bytes (32 weights compressed)
51  __m128i qs = _mm_loadu_si128((const __m128i *)block->qs);
52 
53  // Low nibbles (0-15)
54  __m128i lo = _mm_and_si128(qs, mask_lo);
55  // High nibbles (16-31) - shift right by 4
56  __m128i hi = _mm_and_si128(_mm_srli_epi16(qs, 4), mask_lo);
57 
58  // Now we need to add the high bits from qh
59  // qh has 32 bits.
60  // For j=0..15: bit j of qh.
61  // For j=16..31: bit j-16+12 = j-4 of qh?
62  // Wait, ref code:
63  // j=0..15: qh >> j
64  // j=16..31: qh >> (j-16 + 12) = qh >> (j-4)
65 
66  // We will process in chunks of 16 weights (128-bit)?
67  // No, converting int8 to float takes space.
68  // We need to unpack to 32-bit integers to convert to float.
69 
70  // Strategy: Extract 32 bytes of weights.
71  uint8_t q_vals[32];
72  // Vectorized extraction is painful in SSE without AVX2 gathers.
73  // Scalar unpacking of 32 bytes is better than bit-level math inside the loop.
74  // Actually, let's use the SIMD for the float math (the heavy part)
75  // and scalar for the unpacking if needed, OR try to vectorize unpacking.
76 
77  // Vectorized unpacking:
78  // lo (16 bytes). We need to add (qh & (1<<j)) << 4.
79  // Construct a 16-byte mask from qh bits 0-15.
80  // Then another mask from qh bits 12-27.
81 
82  // Optimized bit extraction is hard in SSE.
83  // Let's do a hybrid: unpack to stack buffer, then load as floats.
84  // Or just do scalar unpack since loop count is small (32).
85  // But the main cost is the 32 muls.
86 
87  for (int j = 0; j < 16; j++) {
88  uint8_t v = (block->qs[j] & 0x0F) | (((qh_val >> j) & 1) << 4);
89  q_vals[j] = v;
90  }
91  for (int j = 0; j < 16; j++) {
92  uint8_t v = (block->qs[j] >> 4) | (((qh_val >> (j+12)) & 1) << 4);
93  q_vals[j+16] = v;
94  }
95 
96  // Now we have 32 uint8_t values 0..31.
97  // Subtract 16, convert to float, mul by d, mul by x.
98 
99  // Process 32 elements in 8x __m128 ops
100  for (int k=0; k<32; k+=4) {
101  // Load 4 bytes
102  // Convert to 4 floats
103  // (x - 16) * d * a
104 
105  float w0 = (float)((int)q_vals[k] - 16) * d_val;
106  float w1 = (float)((int)q_vals[k+1] - 16) * d_val;
107  float w2 = (float)((int)q_vals[k+2] - 16) * d_val;
108  float w3 = (float)((int)q_vals[k+3] - 16) * d_val;
109 
110  __m128 w = _mm_set_ps(w3, w2, w1, w0);
111  __m128 x = _mm_loadu_ps(&ap[k]);
112  sum_v = _mm_add_ps(sum_v, _mm_mul_ps(w, x));
113  }
114  }
115 
116  // Hsum
117  float output;
118  _mm_store_ss(&output, _mm_hadd_ps(_mm_hadd_ps(sum_v, sum_v), sum_v));
119  C[m * N + n] = output + (bias ? bias[n] : 0.0f);
120  }
121  }
122 }
Quantization block structures for weight-only quantization.
#define QK5_0
Definition: ckernel_quant.h:67
#define CK_FP16_TO_FP32(x)
void gemm_nt_q5_0_sse(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
#define C(color)
Definition: show_config.c:39
ck_half d
Definition: ckernel_quant.h:70
uint8_t qh[4]
Definition: ckernel_quant.h:71
uint8_t qs[32/2]
Definition: ckernel_quant.h:72