24 #include <immintrin.h>
42 for (
int i = 0; i <
QK4_0 / 2; i++) {
43 const uint8_t packed = block->
qs[i];
46 const int8_t q0 = (packed & 0x0F) - 8;
48 const int8_t q1 = (packed >> 4) - 8;
50 output[i] = d * (float)q0;
51 output[i +
QK4_0 / 2] = d * (float)q1;
64 const size_t n_blocks = n_elements /
QK4_0;
66 for (
size_t b = 0; b < n_blocks; b++) {
78 void dequant_q4_0_block_avx512(
const block_q4_0 *block,
79 __m512 *out_lo, __m512 *out_hi)
82 const __m512i offset = _mm512_set1_epi32(8);
85 __m128i packed = _mm_loadu_si128((
const __m128i *)block->
qs);
88 __m512i lo_nibbles = _mm512_cvtepu8_epi32(packed);
89 lo_nibbles = _mm512_and_epi32(lo_nibbles, _mm512_set1_epi32(0x0F));
90 lo_nibbles = _mm512_sub_epi32(lo_nibbles, offset);
93 __m512i hi_nibbles = _mm512_cvtepu8_epi32(packed);
94 hi_nibbles = _mm512_srli_epi32(hi_nibbles, 4);
95 hi_nibbles = _mm512_sub_epi32(hi_nibbles, offset);
98 *out_lo = _mm512_mul_ps(_mm512_cvtepi32_ps(lo_nibbles), scale);
99 *out_hi = _mm512_mul_ps(_mm512_cvtepi32_ps(hi_nibbles), scale);
122 for (
int i = 0; i <
QK4_1 / 2; i++) {
123 const uint8_t packed = block->
qs[i];
126 const int q0 = (packed & 0x0F);
128 const int q1 = (packed >> 4);
131 output[i] = d * (float)q0 + m;
132 output[i +
QK4_1 / 2] = d * (float)q1 + m;
142 const size_t n_blocks = n_elements /
QK4_1;
144 for (
size_t b = 0; b < n_blocks; b++) {
167 memcpy(&qh, block->
qh,
sizeof(qh));
173 for (
int j = 0; j <
QK5_0 / 2; j++) {
174 const uint8_t packed = block->
qs[j];
177 const int lo = (packed & 0x0F);
178 const int hi = (packed >> 4);
181 const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
182 const int xh_1 = ((qh >> (j + 12))) & 0x10;
185 const int q0 = (lo | xh_0) - 16;
186 const int q1 = (hi | xh_1) - 16;
188 output[j] = d * (float)q0;
189 output[j + 16] = d * (float)q1;
199 const size_t n_blocks = n_elements /
QK5_0;
201 for (
size_t b = 0; b < n_blocks; b++) {
225 memcpy(&qh, block->
qh,
sizeof(qh));
231 for (
int j = 0; j <
QK5_1 / 2; j++) {
232 const uint8_t packed = block->
qs[j];
235 const int lo = (packed & 0x0F);
236 const int hi = (packed >> 4);
239 const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
240 const int xh_1 = ((qh >> (j + 12))) & 0x10;
243 const int q0 = (lo | xh_0);
244 const int q1 = (hi | xh_1);
247 output[j] = d * (float)q0 + m;
248 output[j + 16] = d * (float)q1 + m;
258 const size_t n_blocks = n_elements /
QK5_1;
260 for (
size_t b = 0; b < n_blocks; b++) {
278 for (
int i = 0; i <
QK8_0; i++) {
279 output[i] = d * (float)block->
qs[i];
289 const size_t n_blocks = n_elements /
QK8_0;
291 for (
size_t b = 0; b < n_blocks; b++) {
300 void dequant_q8_0_block_avx512(
const block_q8_0 *block,
301 __m512 *out0, __m512 *out1)
306 __m128i q0 = _mm_loadu_si128((
const __m128i *)&block->
qs[0]);
307 __m128i q1 = _mm_loadu_si128((
const __m128i *)&block->
qs[16]);
310 __m512i i0 = _mm512_cvtepi8_epi32(q0);
311 __m512i i1 = _mm512_cvtepi8_epi32(q1);
313 *out0 = _mm512_mul_ps(_mm512_cvtepi32_ps(i0), scale);
314 *out1 = _mm512_mul_ps(_mm512_cvtepi32_ps(i1), scale);
344 for (
int iter = 0; iter < 4; iter++) {
345 const float d1 = d * (float)sc[2 * iter];
346 const float m1 = dmin * (float)m[2 * iter];
347 const float d2 = d * (float)sc[2 * iter + 1];
348 const float m2 = dmin * (float)m[2 * iter + 1];
350 const uint8_t *qs = &block->
qs[iter * 32];
351 float *out = &output[iter * 64];
354 for (
int l = 0; l < 32; l++) {
355 const int q = (qs[l] & 0x0F);
356 out[l] = d1 * (float)q - m1;
360 for (
int l = 0; l < 32; l++) {
361 const int q = (qs[l] >> 4);
362 out[32 + l] = d2 * (float)q - m2;
373 const size_t n_blocks = n_elements /
QK_K;
375 for (
size_t b = 0; b < n_blocks; b++) {
392 const uint8_t *ql = block->
ql;
393 const uint8_t *qh = block->
qh;
394 const int8_t *sc = block->
scales;
397 for (
int n = 0; n <
QK_K; n += 128) {
398 for (
int l = 0; l < 32; ++l) {
399 const int is = l / 16;
400 const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
401 const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
402 const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
403 const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
405 y[l + 0] = d * (float)sc[is + 0] * (
float)q1;
406 y[l + 32] = d * (float)sc[is + 2] * (
float)q2;
407 y[l + 64] = d * (float)sc[is + 4] * (
float)q3;
408 y[l + 96] = d * (float)sc[is + 6] * (
float)q4;
423 const size_t n_blocks = n_elements /
QK_K;
425 for (
size_t b = 0; b < n_blocks; b++) {
449 void dequant_q4_k_block_avx512(
const block_q4_K *block,
float *output)
457 const __m512i mask_lo = _mm512_set1_epi32(0x0F);
460 for (
int iter = 0; iter < 4; iter++) {
461 const float d1 = d * (float)sc[2 * iter];
462 const float m1 = dmin * (float)m[2 * iter];
463 const float d2 = d * (float)sc[2 * iter + 1];
464 const float m2 = dmin * (float)m[2 * iter + 1];
466 const __m512 vd1 = _mm512_set1_ps(d1);
467 const __m512 vm1 = _mm512_set1_ps(m1);
468 const __m512 vd2 = _mm512_set1_ps(d2);
469 const __m512 vm2 = _mm512_set1_ps(m2);
471 const uint8_t *qs = &block->
qs[iter * 32];
472 float *out = &output[iter * 64];
475 for (
int chunk = 0; chunk < 2; chunk++) {
476 __m128i packed = _mm_loadu_si128((
const __m128i *)&qs[chunk * 16]);
477 __m512i bytes = _mm512_cvtepu8_epi32(packed);
478 __m512i lo = _mm512_and_epi32(bytes, mask_lo);
480 __m512 w = _mm512_fnmadd_ps(_mm512_set1_ps(1.0f), vm1,
481 _mm512_mul_ps(_mm512_cvtepi32_ps(lo), vd1));
482 _mm512_storeu_ps(&out[chunk * 16], w);
486 for (
int chunk = 0; chunk < 2; chunk++) {
487 __m128i packed = _mm_loadu_si128((
const __m128i *)&qs[chunk * 16]);
488 __m512i bytes = _mm512_cvtepu8_epi32(packed);
489 __m512i hi = _mm512_srli_epi32(bytes, 4);
491 __m512 w = _mm512_fnmadd_ps(_mm512_set1_ps(1.0f), vm2,
492 _mm512_mul_ps(_mm512_cvtepi32_ps(hi), vd2));
493 _mm512_storeu_ps(&out[32 + chunk * 16], w);
CKDataType
Supported data types in C-Kernel-Engine.
Quantization block structures for weight-only quantization.
#define GGML_FP16_TO_FP32
static void unpack_q4_k_scales(const uint8_t *scales, uint8_t *sc, uint8_t *m)
Unpack Q4_K sub-block scales and mins.
void dequant_q4_0_row(const void *src, float *dst, size_t n_elements)
Dequantize a Q4_0 row (multiple blocks) to FP32.
void dequant_q5_0_block(const block_q5_0 *block, float *output)
Dequantize a single Q5_0 block to FP32.
void dequant_q5_0_row(const void *src, float *dst, size_t n_elements)
Dequantize a Q5_0 row (multiple blocks) to FP32.
void dequant_q8_0_block(const block_q8_0 *block, float *output)
Dequantize a single Q8_0 block to FP32.
void dequant_q4_1_block(const block_q4_1 *block, float *output)
Dequantize a single Q4_1 block to FP32.
void dequant_q6_k_block(const block_q6_K *block, float *output)
Dequantize a single Q6_K block to FP32.
void dequant_q4_k_block(const block_q4_K *block, float *output)
Dequantize a single Q4_K block to FP32.
void dequant_q4_0_block(const block_q4_0 *block, float *output)
Dequantize a single Q4_0 block to FP32.
void dequant_row(CKDataType dtype, const void *src, float *dst, size_t n_elements)
Dequantize a row of quantized data to FP32.
void dequant_q8_0_row(const void *src, float *dst, size_t n_elements)
Dequantize a Q8_0 row (multiple blocks) to FP32.
void dequant_q5_1_block(const block_q5_1 *block, float *output)
Dequantize a single Q5_1 block to FP32.
void dequant_q5_1_row(const void *src, float *dst, size_t n_elements)
Dequantize a Q5_1 row (multiple blocks) to FP32.
void dequant_q4_1_row(const void *src, float *dst, size_t n_elements)
Dequantize a Q4_1 row (multiple blocks) to FP32.
void dequant_q6_k_row(const void *src, float *dst, size_t n_elements)
Dequantize a Q6_K row (multiple blocks) to FP32.
void dequant_q4_k_row(const void *src, float *dst, size_t n_elements)
Dequantize a Q4_K row (multiple blocks) to FP32.