gemv_omp.c File Reference
#include <omp.h>
#include "ckernel_quant.h"


Functions

void gemv_fused_q5_0_bias_parallel_omp (float *y, const void *W, const float *x, const float *bias, int M, int K)

void gemv_q5_0_q8_0_parallel_omp (float *y, const void *W, const void *x_q8, int M, int K)

void gemv_q8_0_q8_0_parallel_omp (float *y, const void *W, const void *x_q8, int M, int K)

void quantize_row_q8_0 (const float *x, void *vy, int k)
 Quantize FP32 to Q8_0 format (scalar reference).

void vec_dot_q5_0_q8_0 (int n, float *s, const void *vx, const void *vy)
 Auto-dispatch quantized dot product Q5_0 x Q8_0.

void vec_dot_q8_0_q8_0 (int n, float *s, const void *vx, const void *vy)
 Auto-dispatch quantized dot product Q8_0 x Q8_0.


Function Documentation

◆ gemv_fused_q5_0_bias_parallel_omp()

void gemv_fused_q5_0_bias_parallel_omp (float *y, const void *W, const float *x, const float *bias, int M, int K)

Definition at line 96 of file gemv_omp.c.

{
    const block_q5_0 *w_blocks = (const block_q5_0 *)W;
    const int blocks_per_row = K / QK5_0;

    /* Quantize input ONCE (serial, fast: K=896 -> 28 blocks = 952 bytes) */
    block_q8_0 x_q8[K / QK8_0];
    quantize_row_q8_0(x, (void *)x_q8, K);

    /* Parallel GEMV over output rows */
    #pragma omp parallel for schedule(static)
    for (int row = 0; row < M; row++) {
        vec_dot_q5_0_q8_0(K, &y[row],
                          &w_blocks[row * blocks_per_row],
                          x_q8);
        if (bias) y[row] += bias[row];
    }
}

References QK5_0, QK8_0, quantize_row_q8_0(), and vec_dot_q5_0_q8_0().
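
A minimal usage sketch, under stated assumptions: the dimensions are hypothetical, W must already hold M rows of K/QK5_0 pre-quantized block_q5_0 blocks (the Q5_0 weight quantizer is not part of this file), and the prototype is declared directly since the declaring header is not shown here.

#include <stdlib.h>
#include "ckernel_quant.h"

void gemv_fused_q5_0_bias_parallel_omp(float *y, const void *W, const float *x,
                                       const float *bias, int M, int K);

int main(void)
{
    enum { M = 4, K = 64 };   /* hypothetical layer size; K must be a multiple of 32 */

    /* Zeroed placeholder weights: in practice W comes from the model file,
     * already quantized to Q5_0. */
    void *W = calloc((size_t)M * (K / QK5_0), sizeof(block_q5_0));
    float x[K], bias[M], y[M];
    for (int i = 0; i < K; i++) x[i] = 0.01f * (float)i;
    for (int i = 0; i < M; i++) bias[i] = 0.5f;

    gemv_fused_q5_0_bias_parallel_omp(y, W, x, bias, M, K);
    free(W);
    return 0;
}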

◆ gemv_q5_0_q8_0_parallel_omp()

void gemv_q5_0_q8_0_parallel_omp (float *y, const void *W, const void *x_q8, int M, int K)

Definition at line 72 of file gemv_omp.c.

{
    const block_q5_0 *w_blocks = (const block_q5_0 *)W;
    const block_q8_0 *x_blocks = (const block_q8_0 *)x_q8;
    const int blocks_per_row = K / QK5_0;

    #pragma omp parallel for schedule(static)
    for (int row = 0; row < M; row++) {
        vec_dot_q5_0_q8_0(K, &y[row],
                          &w_blocks[row * blocks_per_row],
                          x_blocks);
    }
}

References QK5_0, and vec_dot_q5_0_q8_0().
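
Because the input here arrives pre-quantized, this variant suits the common case where one activation vector feeds several weight matrices: quantize once, reuse the Q8_0 buffer. A sketch of that pattern (the attention_projections wrapper and the Wq/Wk/Wv matrices are hypothetical):

#include "ckernel_quant.h"

void quantize_row_q8_0(const float *x, void *vy, int k);
void gemv_q5_0_q8_0_parallel_omp(float *y, const void *W, const void *x_q8, int M, int K);

/* Run three projections against one activation, quantizing the input once. */
void attention_projections(float *yq, float *yk, float *yv,
                           const void *Wq, const void *Wk, const void *Wv,
                           const float *x, int M, int K)
{
    block_q8_0 x_q8[K / QK8_0];   /* VLA, same trick as the fused kernel */
    quantize_row_q8_0(x, (void *)x_q8, K);

    gemv_q5_0_q8_0_parallel_omp(yq, Wq, x_q8, M, K);
    gemv_q5_0_q8_0_parallel_omp(yk, Wk, x_q8, M, K);
    gemv_q5_0_q8_0_parallel_omp(yv, Wv, x_q8, M, K);
}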

◆ gemv_q8_0_q8_0_parallel_omp()

void gemv_q8_0_q8_0_parallel_omp (float *y, const void *W, const void *x_q8, int M, int K)

Definition at line 50 of file gemv_omp.c.

{
    const block_q8_0 *w_blocks = (const block_q8_0 *)W;
    const block_q8_0 *x_blocks = (const block_q8_0 *)x_q8;
    const int blocks_per_row = K / QK8_0;

    #pragma omp parallel for schedule(static)
    for (int row = 0; row < M; row++) {
        vec_dot_q8_0_q8_0(K, &y[row],
                          &w_blocks[row * blocks_per_row],
                          x_blocks);
    }
}

References QK8_0, and vec_dot_q8_0_q8_0().
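
Thread count follows the usual OpenMP rules (the OMP_NUM_THREADS environment variable, or omp_set_num_threads() before the call); for small M the per-row parallel loop may not pay off. A minimal sketch pinning the kernel to four threads (the wrapper name is hypothetical):

#include <omp.h>

void gemv_q8_0_q8_0_parallel_omp(float *y, const void *W, const void *x_q8, int M, int K);

void run_with_four_threads(float *y, const void *W, const void *x_q8, int M, int K)
{
    omp_set_num_threads(4);   /* overrides OMP_NUM_THREADS for later parallel regions */
    gemv_q8_0_q8_0_parallel_omp(y, W, x_q8, M, K);
}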

◆ quantize_row_q8_0()

void quantize_row_q8_0 (const float *x, void *vy, int k)

Quantize FP32 to Q8_0 format (scalar reference)

Parameters
    x    Input FP32 values
    vy   Output Q8_0 blocks
    k    Number of elements (must be a multiple of 32)

Definition at line 59 of file gemm_kernels_q8_0.c.

{
    block_q8_0 *y = (block_q8_0 *)vy;
    const int nb = k / QK8_0; /* QK8_0 = 32 */

#if defined(__AVX__)
    const __m256 sign_bit = _mm256_set1_ps(-0.0f);
    const __m256 v_half   = _mm256_set1_ps(0.5f);
    const __m256 v_min    = _mm256_set1_ps(-127.0f);
    const __m256 v_max    = _mm256_set1_ps(127.0f);

    for (int i = 0; i < nb; i++) {
        __m256 v0 = _mm256_loadu_ps(x + 0);
        __m256 v1 = _mm256_loadu_ps(x + 8);
        __m256 v2 = _mm256_loadu_ps(x + 16);
        __m256 v3 = _mm256_loadu_ps(x + 24);
        x += QK8_0;

        /* Max |x| over the 32-element block (andnot clears the sign bit) */
        __m256 max_abs = _mm256_andnot_ps(sign_bit, v0);
        max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v1));
        max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v2));
        max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v3));

        /* Horizontal reduction: 8 lanes -> 1 scalar max */
        __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(max_abs, 1),
                                 _mm256_castps256_ps128(max_abs));
        max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
        max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
        const float max_scalar = _mm_cvtss_f32(max4);

        const float d = max_scalar / 127.0f;
        const float id = max_scalar != 0.0f ? 127.0f / max_scalar : 0.0f;
        y[i].d = CK_FP32_TO_FP16(d);

        const __m256 mul = _mm256_set1_ps(id);
        v0 = _mm256_mul_ps(v0, mul);
        v1 = _mm256_mul_ps(v1, mul);
        v2 = _mm256_mul_ps(v2, mul);
        v3 = _mm256_mul_ps(v3, mul);

        v0 = _mm256_min_ps(_mm256_max_ps(v0, v_min), v_max);
        v1 = _mm256_min_ps(_mm256_max_ps(v1, v_min), v_max);
        v2 = _mm256_min_ps(_mm256_max_ps(v2, v_min), v_max);
        v3 = _mm256_min_ps(_mm256_max_ps(v3, v_min), v_max);

        /* Round half away from zero to match the scalar path */
        v0 = _mm256_add_ps(v0, _mm256_or_ps(_mm256_and_ps(v0, sign_bit), v_half));
        v1 = _mm256_add_ps(v1, _mm256_or_ps(_mm256_and_ps(v1, sign_bit), v_half));
        v2 = _mm256_add_ps(v2, _mm256_or_ps(_mm256_and_ps(v2, sign_bit), v_half));
        v3 = _mm256_add_ps(v3, _mm256_or_ps(_mm256_and_ps(v3, sign_bit), v_half));

        __m256i i0 = _mm256_cvttps_epi32(v0);
        __m256i i1 = _mm256_cvttps_epi32(v1);
        __m256i i2 = _mm256_cvttps_epi32(v2);
        __m256i i3 = _mm256_cvttps_epi32(v3);

#if defined(__AVX2__)
        /* Pack 32 x int32 -> 32 x int8, then undo the lane interleave */
        i0 = _mm256_packs_epi32(i0, i1);
        i2 = _mm256_packs_epi32(i2, i3);
        i0 = _mm256_packs_epi16(i0, i2);

        const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
        i0 = _mm256_permutevar8x32_epi32(i0, perm);
        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
#else
        /* AVX1 has no 256-bit integer packs: split into 128-bit halves */
        __m128i ni0 = _mm256_castsi256_si128(i0);
        __m128i ni1 = _mm256_extractf128_si256(i0, 1);
        __m128i ni2 = _mm256_castsi256_si128(i1);
        __m128i ni3 = _mm256_extractf128_si256(i1, 1);
        __m128i ni4 = _mm256_castsi256_si128(i2);
        __m128i ni5 = _mm256_extractf128_si256(i2, 1);
        __m128i ni6 = _mm256_castsi256_si128(i3);
        __m128i ni7 = _mm256_extractf128_si256(i3, 1);

        ni0 = _mm_packs_epi32(ni0, ni1);
        ni2 = _mm_packs_epi32(ni2, ni3);
        ni4 = _mm_packs_epi32(ni4, ni5);
        ni6 = _mm_packs_epi32(ni6, ni7);

        ni0 = _mm_packs_epi16(ni0, ni2);
        ni4 = _mm_packs_epi16(ni4, ni6);

        _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
    }
#else
    for (int i = 0; i < nb; i++) {
        const float *xb = x + i * QK8_0;

        /* Find max absolute value in block */
        float amax = 0.0f;
        for (int j = 0; j < QK8_0; j++) {
            float av = xb[j] >= 0 ? xb[j] : -xb[j];
            if (av > amax) amax = av;
        }

        /* Compute scale: d = max / 127 */
        float d = amax / 127.0f;
        float id = d != 0.0f ? 127.0f / amax : 0.0f;

        /* Store scale as FP16 */
        y[i].d = CK_FP32_TO_FP16(d);

        /* Quantize values */
        for (int j = 0; j < QK8_0; j++) {
            float v = xb[j] * id;
            /* Round to nearest int and clamp to [-127, 127] */
            int q = (int)(v + (v >= 0 ? 0.5f : -0.5f));
            if (q > 127) q = 127;
            if (q < -127) q = -127;
            y[i].qs[j] = (int8_t)q;
        }
    }
#endif
}

Referenced by gemv_fused_q5_0_bias_parallel_omp(), and quantize_batch_q8_0().
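
Worked example of the scalar path: if the largest magnitude in a block is amax = 2.54, the scale is d = 2.54 / 127 = 0.02, so an input of 1.27 quantizes to round(1.27 / 0.02) = 64 (half rounds away from zero) and dequantizes to 64 * 0.02 = 1.28. A round-trip sketch under one assumption: CK_FP16_TO_FP32 is taken to be the decoding counterpart of CK_FP32_TO_FP16 in ckernel_quant.h; adjust the name if it differs.

#include <stdio.h>
#include "ckernel_quant.h"

void quantize_row_q8_0(const float *x, void *vy, int k);

int main(void)
{
    float x[QK8_0], err = 0.0f;
    block_q8_0 blk;

    for (int j = 0; j < QK8_0; j++)
        x[j] = 0.1f * (float)(j - 16);            /* inputs in [-1.6, 1.5] */

    quantize_row_q8_0(x, &blk, QK8_0);

    /* Dequantize and measure the worst-case round-trip error. */
    const float d = CK_FP16_TO_FP32(blk.d);       /* assumed decode macro */
    for (int j = 0; j < QK8_0; j++) {
        float diff = x[j] - d * (float)blk.qs[j];
        if (diff < 0) diff = -diff;
        if (diff > err) err = diff;
    }
    printf("max abs round-trip error: %g\n", err);
    return 0;
}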

◆ vec_dot_q5_0_q8_0()

void vec_dot_q5_0_q8_0 (int n, float *s, const void *vx, const void *vy)

Auto-dispatch quantized dot product Q5_0 x Q8_0.

Dispatch priority:

  1. AVX512 (best performance on modern Intel/AMD)
  2. AVX (256-bit float ops, works on Sandy/Ivy Bridge and newer)
  3. SSSE3 (128-bit fallback)
  4. Reference scalar (last resort)

Definition at line 1498 of file gemm_kernels_q5_0.c.

{
#if defined(__AVX512F__)
    vec_dot_q5_0_q8_0_avx512(n, s, vx, vy);
#elif defined(__AVX__)
    /* AVX 256-bit float ops (Sandy/Ivy Bridge and newer) */
    vec_dot_q5_0_q8_0_avx(n, s, vx, vy);
#elif defined(__SSSE3__)
    /* SSSE3 128-bit fallback for older CPUs */
    vec_dot_q5_0_q8_0_sse(n, s, vx, vy);
#else
    vec_dot_q5_0_q8_0_ref(n, s, vx, vy);
#endif
}

Referenced by gemm_nt_q5_0_q8_0(), gemv_fused_q5_0_bias_parallel_omp(), gemv_q5_0_q8_0(), gemv_q5_0_q8_0_parallel_omp(), and gemv_q5_0_q8_0_parallel_simd().
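
The branch is fixed at build time by the compiler's target flags (e.g. gcc -mavx2 or -march=native define __AVX__ and friends); there is no runtime CPU detection. A small probe, mirroring the #if ladder above, shows which tier a given set of flags selects:

#include <stdio.h>

int main(void)
{
#if defined(__AVX512F__)
    puts("AVX512 path");
#elif defined(__AVX__)
    puts("AVX path");
#elif defined(__SSSE3__)
    puts("SSSE3 path");
#else
    puts("scalar reference path");
#endif
    return 0;
}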

◆ vec_dot_q8_0_q8_0()

void vec_dot_q8_0_q8_0 (int n, float *s, const void *vx, const void *vy)

Auto-dispatch quantized dot product Q8_0 x Q8_0.

Definition at line 1013 of file gemm_kernels_q8_0.c.

{
#if defined(__AVX512F__)
    vec_dot_q8_0_q8_0_avx512(n, s, vx, vy);
#elif defined(__AVX__)
    vec_dot_q8_0_q8_0_avx(n, s, vx, vy);
#elif defined(__SSE4_1__)
    vec_dot_q8_0_q8_0_sse(n, s, vx, vy);
#else
    vec_dot_q8_0_q8_0_ref(n, s, vx, vy);
#endif
}

Referenced by gemv_q8_0_q8_0(), gemv_q8_0_q8_0_parallel(), gemv_q8_0_q8_0_parallel_omp(), and gemv_q8_0_q8_0_parallel_simd().
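
Whichever tier is selected, the result should match the scalar reference: per 32-element block, multiply the int8 quants pairwise, accumulate in integer, and scale by the product of the two FP16 block scales. A sketch of that semantics (not the actual vec_dot_q8_0_q8_0_ref source; CK_FP16_TO_FP32 is again an assumed decode macro):

#include "ckernel_quant.h"

/* Reference semantics: s = sum over blocks of d_w * d_x * (qs_w . qs_x) */
void q8_q8_dot_sketch(int n, float *s, const block_q8_0 *w, const block_q8_0 *x)
{
    float sum = 0.0f;
    for (int b = 0; b < n / QK8_0; b++) {
        int isum = 0;
        for (int j = 0; j < QK8_0; j++)
            isum += (int)w[b].qs[j] * (int)x[b].qs[j];
        sum += CK_FP16_TO_FP32(w[b].d) * CK_FP16_TO_FP32(x[b].d) * (float)isum;
    }
    *s = sum;
}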