17 #include <immintrin.h>
30 const int blocks_per_row = K /
QK5_0;
32 const __m128i mask_lo = _mm_set1_epi8(0x0F);
33 const __m128i sixteen = _mm_set1_epi8(16);
35 for (
int m = 0; m < M; m++) {
36 const float *a_row = &A[m * K];
38 for (
int n = 0; n < N; n++) {
39 __m128 sum_v = _mm_setzero_ps();
41 for (
int b = 0; b < blocks_per_row; b++) {
42 const block_q5_0 *block = &blocks[n * blocks_per_row + b];
44 __m128 d = _mm_set1_ps(d_val);
45 const float *ap = &a_row[b *
QK5_0];
48 memcpy(&qh_val, block->
qh,
sizeof(qh_val));
51 __m128i qs = _mm_loadu_si128((
const __m128i *)block->
qs);
54 __m128i lo = _mm_and_si128(qs, mask_lo);
56 __m128i hi = _mm_and_si128(_mm_srli_epi16(qs, 4), mask_lo);
87 for (
int j = 0; j < 16; j++) {
88 uint8_t v = (block->
qs[j] & 0x0F) | (((qh_val >> j) & 1) << 4);
91 for (
int j = 0; j < 16; j++) {
/* The low-nibble loop uses qh_val bits 0..15, so the high nibbles take
 * bits 16..31 (bit j+16). The previous j+12 re-read bits 12..15 and
 * never reached bits 28..31. */
92 uint8_t v = (block->
qs[j] >> 4) | (((qh_val >> (j+16)) & 1) << 4);
100 for (
int k=0; k<32; k+=4) {
105 float w0 = (float)((
int)q_vals[k] - 16) * d_val;
106 float w1 = (float)((
int)q_vals[k+1] - 16) * d_val;
107 float w2 = (float)((
int)q_vals[k+2] - 16) * d_val;
108 float w3 = (float)((
int)q_vals[k+3] - 16) * d_val;
110 __m128 w = _mm_set_ps(w3, w2, w1, w0);
111 __m128 x = _mm_loadu_ps(&ap[k]);
112 sum_v = _mm_add_ps(sum_v, _mm_mul_ps(w, x));
118 _mm_store_ss(&output, _mm_hadd_ps(_mm_hadd_ps(sum_v, sum_v), sum_v));
119 C[m * N + n] = output + (bias ? bias[n] : 0.0f);
Quantization block structures for weight-only quantization.
#define CK_FP16_TO_FP32(x)
void gemm_nt_q5_0_sse(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)