#include <assert.h>
#include <stdint.h>
#include <string.h>

#include <immintrin.h>
// Fast round-to-nearest-even of a float to int, without calling lrintf().
//
// Adding 12582912.0f (1.5 * 2^23) pushes the value into a range where the
// float mantissa's low bits hold the integer result directly; the FPU's
// default round-to-nearest-even mode performs the rounding as a side effect
// of the addition.  Valid for |fval| < 2^22.
//
// fval: value to round.
// Returns: nearest integer (ties round to even).
static int ck_nearest_int(float fval)
{
    float val = fval + 12582912.f;  // 1.5 * 2^23: moves the integer into the mantissa
    int i;                          // fixed: was used by memcpy without a declaration
    // Bit-cast through memcpy -- avoids strict-aliasing UB of *(int *)&val.
    memcpy(&i, &val, sizeof(int));
    // Low 23 mantissa bits hold the integer biased by 2^22; remove the bias.
    return (i & 0x007fffff) - 0x00400000;
}
// Quantize one row of floats into QK_K-sized int8 super-blocks (q8_K layout):
// per block, a dequantization scale d, QK_K int8 quants qs[], and per-16 sums
// bsums[] used later by dot-product kernels.
// NOTE(review): the guard body below (presumably `return;` and `}`) and the
// declarations of `y` (the output block array, cast from vy) and `max` are
// outside the visible chunk.
if (!x || !vy || k <= 0) {
// The row length must be a whole number of QK_K-sized super-blocks.
assert(k % QK_K == 0);
const int nb = k / QK_K;

// One iteration per super-block of QK_K floats.
for (int i = 0; i < nb; ++i) {
    // --- Pass 1: find max(|x[j]|) over the block, 4 lanes at a time. ---
    __m128 v_max = _mm_setzero_ps();
    for (int j = 0; j < QK_K; j += 4) {
        // NOTE(review): reads x + j, not x + i*QK_K + j -- confirm x is
        // advanced per block outside this view, otherwise every block
        // quantizes the same first QK_K values.
        __m128 v = _mm_loadu_ps(x + j);
        // andnot with -0.0f clears the sign bit: |v| without a branch.
        __m128 v_abs = _mm_andnot_ps(_mm_set1_ps(-0.0f), v);
        v_max = _mm_max_ps(v_max, v_abs);
    // Horizontal max across the 4 lanes; the result lands in lane 0.
    v_max = _mm_max_ps(v_max, _mm_shuffle_ps(v_max, v_max, _MM_SHUFFLE(1, 0, 3, 2)));
    v_max = _mm_max_ps(v_max, _mm_shuffle_ps(v_max, v_max, _MM_SHUFFLE(0, 1, 0, 1)));
    _mm_store_ss(&max, v_max);

    // Zero the quant and partial-sum storage for this block.
    memset(y[i].qs, 0, sizeof(y[i].qs));
    memset(y[i].bsums, 0, sizeof(y[i].bsums));

    // Scale mapping [-max, max] onto the signed int8 range (negated, as in
    // the reference q8_K quantizer).
    // NOTE(review): no visible max == 0 guard -- iscale becomes -inf for an
    // all-zero block; confirm that case is handled outside this view.
    const float iscale = -127.0f / max;
    __m128 v_iscale = _mm_set1_ps(iscale);

    // --- Pass 2: quantize 16 floats per iteration. ---
    for (int j = 0; j < QK_K; j += 16) {
        __m128 x0 = _mm_loadu_ps(x + j + 0);
        __m128 x1 = _mm_loadu_ps(x + j + 4);
        __m128 x2 = _mm_loadu_ps(x + j + 8);
        __m128 x3 = _mm_loadu_ps(x + j + 12);
        // Scale, then convert with the current FP rounding mode
        // (round-to-nearest-even by default).
        __m128i q0 = _mm_cvtps_epi32(_mm_mul_ps(x0, v_iscale));
        __m128i q1 = _mm_cvtps_epi32(_mm_mul_ps(x1, v_iscale));
        __m128i q2 = _mm_cvtps_epi32(_mm_mul_ps(x2, v_iscale));
        __m128i q3 = _mm_cvtps_epi32(_mm_mul_ps(x3, v_iscale));
        // Narrow 32 -> 16 -> 8 bits with signed saturation.
        __m128i q01 = _mm_packs_epi32(q0, q1);
        __m128i q23 = _mm_packs_epi32(q2, q3);
        __m128i q0123 = _mm_packs_epi16(q01, q23);
        // Store the 16 quantized int8 values.
        _mm_storeu_si128((__m128i *)(y[i].qs + j), q0123);

        // Horizontal sum of the 16 quants while still in int16 lanes
        // (16 * 127 fits comfortably in int16, so no overflow):
        // fold 8 lanes -> 4 -> 2, then add the two remaining scalars.
        __m128i p01 = _mm_add_epi16(q01, q23);
        p01 = _mm_add_epi16(p01, _mm_shuffle_epi32(p01, _MM_SHUFFLE(1, 0, 3, 2)));
        p01 = _mm_add_epi16(p01, _mm_shufflelo_epi16(p01, _MM_SHUFFLE(1, 0, 3, 2)));
        int16_t bsum = (int16_t)_mm_extract_epi16(p01, 0) + (int16_t)_mm_extract_epi16(p01, 1);
        // One partial sum per group of 16 quants.
        y[i].bsums[j / 16] = bsum;
    // Dequantization scale: original value ~= q * d.
    y[i].d = 1.0f / iscale;
/* Quantization block structures for weight-only quantization. */
static int ck_nearest_int(float fval)
void quantize_row_q8_k_sse(const float *x, void *vy, int k)