← Back to C-Kernel-Engine Docs Doxygen Source Documentation
quantize_row_q8_k_sse.c
Go to the documentation of this file.
1 /**
2  * @file quantize_row_q8_k_sse.c
3  * @brief SSE-optimized Q8_K row quantization kernel
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  */
14 
15 #include <assert.h>
16 #include <math.h>
17 #include <string.h>
18 #include <immintrin.h>
19 
20 #include "ckernel_quant.h"
21 
/**
 * Round a float to the nearest integer without calling into libm.
 *
 * Adding 12582912.0f (1.5 * 2^23) pushes the value into a binade where one
 * float ULP equals 1, so the addition itself performs the rounding and the
 * rounded integer (offset by 0x00400000) lands in the low 23 mantissa bits.
 * Rounding therefore follows the floating-point environment's current mode.
 *
 * The arithmetic only holds while the sum stays inside that binade, i.e. for
 * |fval| below 2^22; larger magnitudes are not checked here.
 *
 * NOTE(review): assumes float is IEEE-754 binary32 and int is 32 bits wide.
 */
static inline int ck_nearest_int(float fval) {
    enum {
        CK_MANTISSA_MASK = 0x007fffff, /* low 23 bits of the float encoding */
        CK_MAGIC_OFFSET = 0x00400000   /* mantissa bits of the magic constant */
    };
    union {
        float as_float;
        int as_bits;
    } pun;

    pun.as_float = fval + 12582912.f;
    return (pun.as_bits & CK_MANTISSA_MASK) - CK_MAGIC_OFFSET;
}
28 
29 void quantize_row_q8_k_sse(const float *x, void *vy, int k) {
30  if (!x || !vy || k <= 0) {
31  return;
32  }
33  assert(k % QK_K == 0);
34  const int nb = k / QK_K;
35  block_q8_K *y = (block_q8_K *)vy;
36 
37  for (int i = 0; i < nb; ++i) {
38  float max = 0.0f;
39 
40  // SSE max absolute value
41  __m128 v_max = _mm_setzero_ps();
42  for (int j = 0; j < QK_K; j += 4) {
43  __m128 v = _mm_loadu_ps(x + j);
44  __m128 v_abs = _mm_andnot_ps(_mm_set1_ps(-0.0f), v);
45  v_max = _mm_max_ps(v_max, v_abs);
46  }
47 
48  // Horizontal max
49  v_max = _mm_max_ps(v_max, _mm_shuffle_ps(v_max, v_max, _MM_SHUFFLE(1, 0, 3, 2)));
50  v_max = _mm_max_ps(v_max, _mm_shuffle_ps(v_max, v_max, _MM_SHUFFLE(0, 1, 0, 1)));
51  _mm_store_ss(&max, v_max);
52 
53  if (max == 0.0f) {
54  y[i].d = 0.0f;
55  memset(y[i].qs, 0, sizeof(y[i].qs));
56  memset(y[i].bsums, 0, sizeof(y[i].bsums));
57  x += QK_K;
58  continue;
59  }
60 
61  const float iscale = -127.0f / max;
62  __m128 v_iscale = _mm_set1_ps(iscale);
63 
64  // Quantize and compute bsums in SSE
65  for (int j = 0; j < QK_K; j += 16) {
66  __m128 x0 = _mm_loadu_ps(x + j + 0);
67  __m128 x1 = _mm_loadu_ps(x + j + 4);
68  __m128 x2 = _mm_loadu_ps(x + j + 8);
69  __m128 x3 = _mm_loadu_ps(x + j + 12);
70 
71  __m128i q0 = _mm_cvtps_epi32(_mm_mul_ps(x0, v_iscale));
72  __m128i q1 = _mm_cvtps_epi32(_mm_mul_ps(x1, v_iscale));
73  __m128i q2 = _mm_cvtps_epi32(_mm_mul_ps(x2, v_iscale));
74  __m128i q3 = _mm_cvtps_epi32(_mm_mul_ps(x3, v_iscale));
75 
76  // Pack i32 -> i16 -> i8
77  __m128i q01 = _mm_packs_epi32(q0, q1);
78  __m128i q23 = _mm_packs_epi32(q2, q3);
79  __m128i q0123 = _mm_packs_epi16(q01, q23);
80 
81  _mm_storeu_si128((__m128i *)(y[i].qs + j), q0123);
82 
83  // Compute bsum for these 16 elements
84  // Each bsum[j/16] covers 16 elements
85  __m128i p01 = _mm_add_epi16(q01, q23);
86  p01 = _mm_add_epi16(p01, _mm_shuffle_epi32(p01, _MM_SHUFFLE(1, 0, 3, 2)));
87  p01 = _mm_add_epi16(p01, _mm_shufflelo_epi16(p01, _MM_SHUFFLE(1, 0, 3, 2)));
88  int16_t bsum = (int16_t)_mm_extract_epi16(p01, 0) + (int16_t)_mm_extract_epi16(p01, 1);
89  y[i].bsums[j / 16] = bsum;
90  }
91 
92  y[i].d = 1.0f / iscale;
93  x += QK_K;
94  }
95 }
Quantization block structures for weight-only quantization.
#define QK_K
static int ck_nearest_int(float fval)
void quantize_row_q8_k_sse(const float *x, void *vy, int k)
int16_t bsums[256/16]