#include <immintrin.h>
#include <stdint.h>
#include <string.h>
void gemm_nt_q5_0_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K);
const uint8_t *qs_w = bw->qs;
const int8_t  *qs_a = ba->qs + q8_offset;

/* Gather the 32 high bits (one per weight) into a single 32-bit word. */
uint32_t qh;
memcpy(&qh, bw->qh, sizeof(qh));
/* Nibble split of the 16 packed weight bytes: _mm_srli_epi16 shifts 16-bit
   lanes, so the 0x0F mask removes the bits smeared across byte boundaries.
   The scalar loop below rebuilds the same nibbles while folding in the high
   bits from qh. */
__m128i qs_vec  = _mm_loadu_si128((const __m128i *)qs_w);
__m128i mask_0f = _mm_set1_epi8(0x0F);
__m128i w_lo = _mm_and_si128(qs_vec, mask_0f);
__m128i w_hi = _mm_and_si128(_mm_srli_epi16(qs_vec, 4), mask_0f);
/* Reconstruct the 32 unsigned 5-bit weights (0..31): low nibble from qs,
   fifth bit from qh. Element j takes qh bit j; element j+16 takes qh bit
   j+16 (>> (j + 12) leaves that bit already at position 4). */
uint8_t w[32];
for (int j = 0; j < 16; j++) {
    w[j]      = (qs_w[j] & 0x0F) | (((qh >> (j +  0)) << 4) & 0x10);
    w[j + 16] = (qs_w[j] >>   4) | ( (qh >> (j + 12))       & 0x10);
}
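/* Worked example (added): for j = 0 with qs_w[0] = 0x1A and qh bit 0 set,
   w[0] = 0x0A | 0x10 = 26, i.e. quantized value 26 - 16 = 10 once the
   implicit -16 offset is applied via the bsum correction below. */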
/* Load the 32 reconstructed weights and the 32 activation bytes. */
__m128i vw0 = _mm_loadu_si128((const __m128i *)&w[0]);
__m128i vw1 = _mm_loadu_si128((const __m128i *)&w[16]);
__m128i va0 = _mm_loadu_si128((const __m128i *)&qs_a[0]);
__m128i va1 = _mm_loadu_si128((const __m128i *)&qs_a[16]);
/* maddubs: first operand unsigned (weights, 0..31), second signed (int8
   activations), pairwise-summed to i16. No saturation: 2 * 31 * 127 = 7874
   fits comfortably in int16. */
__m128i p0 = _mm_maddubs_epi16(vw0, va0);
__m128i p1 = _mm_maddubs_epi16(vw1, va1);
/* Widen to i32 by multiplying with 1 and pairwise-summing, then accumulate. */
__m128i one = _mm_set1_epi16(1);
__m128i s0 = _mm_madd_epi16(p0, one);
__m128i s1 = _mm_madd_epi16(p1, one);
__m128i acc_i32 = _mm_add_epi32(s0, s1);
/* Horizontal sum of the four i32 lanes. */
acc_i32 = _mm_add_epi32(acc_i32, _mm_shuffle_epi32(acc_i32, _MM_SHUFFLE(1, 0, 3, 2)));
acc_i32 = _mm_add_epi32(acc_i32, _mm_shuffle_epi32(acc_i32, _MM_SHUFFLE(0, 1, 0, 1)));
int32_t dot_wa = _mm_cvtsi128_si32(acc_i32);
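/* Lane trace (added): starting from {s0, s1, s2, s3}, the first shuffle-add
   yields {s0+s2, s1+s3, s0+s2, s1+s3}; the second folds lane 1 into lane 0,
   so lane 0 holds s0+s1+s2+s3 when _mm_cvtsi128_si32 extracts it. */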
/* Sum of the 32 activation bytes, taken from two adjacent precomputed
   16-wide bsums of the Q8_K super-block (q8_offset is a multiple of 32). */
int32_t sum_a = (int32_t)ba->bsums[q8_offset/16] + (int32_t)ba->bsums[q8_offset/16 + 1];
/* Weights were kept unsigned (w = q + 16), so undo the offset against the
   activation sum: dot(q, a) = dot(w, a) - 16 * sum(a). Scale by both block scales. */
float result = ((float)dot_wa - 16.0f * (float)sum_a) * CK_FP16_TO_FP32(bw->d) * ba->d;
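/* Added sketch: a scalar reference for the block dot product above, assuming
   the llama.cpp-style block_q5_0 / block_q8_K layouts sketched further below.
   Useful as ground truth when validating the SSE path; not part of the
   original file. It applies the -16 offset directly instead of deferring it
   to the bsum correction. */
static float dot_q5_0_q8_k_32_scalar(const block_q5_0 *bw, const block_q8_K *ba, int q8_offset)
{
    const int8_t *a = ba->qs + q8_offset;
    uint32_t qh;
    memcpy(&qh, bw->qh, sizeof(qh));

    int32_t dot = 0;
    for (int j = 0; j < 16; j++) {
        int w0 = (bw->qs[j] & 0x0F) | (((qh >> (j +  0)) << 4) & 0x10);
        int w1 = (bw->qs[j] >>   4) | ( (qh >> (j + 12))       & 0x10);
        dot += (w0 - 16) * a[j] + (w1 - 16) * a[j + 16]; /* signed weights directly */
    }
    return (float)dot * CK_FP16_TO_FP32(bw->d) * ba->d;
}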
const int blocks_per_row = K / 32;  /* one block_q5_0 covers 32 weights */
for (int m = 0; m < M; m++) {
    /* (Quantization of row m of A into Q8_K blocks is elided in this excerpt;
       the blocks are referred to as a_q8 below -- the name is assumed.) */
    for (int n = 0; n < N; n++) {
        const block_q5_0 *w_row = weights + n * blocks_per_row;
        float sumf = 0.0f;
        for (int b = 0; b < blocks_per_row; b++) {
            /* Locate the b-th 32-weight block inside the Q8_K super-blocks. */
            int q8_block_idx = (b * 32) / QK_K;
            int q8_offset    = (b * 32) % QK_K;
            sumf += dot_q5_0_q8_k_32_sse(&w_row[b], &a_q8[q8_block_idx], q8_offset);
        }
        C[m * N + n] = sumf + (bias ? bias[n] : 0.0f);
    }
}
/* Quantization block structures for weight-only quantization. */
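/* Added sketch: plausible definitions, modeled on the llama.cpp Q5_0 / Q8_K
   layouts the code above indexes into (qs, qh, d, bsums, QK_K). Field order,
   the ck_fp16_t type, and QK_K = 256 are assumptions, not taken from the file. */
#define QK_K 256

typedef uint16_t ck_fp16_t;       /* IEEE fp16 bits, decoded by CK_FP16_TO_FP32 */

typedef struct {
    ck_fp16_t d;                  /* per-block scale */
    uint8_t   qh[4];              /* high bit of each of the 32 weights */
    uint8_t   qs[16];             /* low nibbles, two weights per byte */
} block_q5_0;

typedef struct {
    float   d;                    /* per-super-block scale */
    int8_t  qs[QK_K];             /* 256 quantized activations */
    int16_t bsums[QK_K / 16];     /* sums of each 16-value group */
} block_q8_K;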
#define CK_FP16_TO_FP32(x)  /* fp16 -> fp32 decode (definition elided in this excerpt) */

void quantize_row_q8_k(const float *x, void *vy, int k);

/* GEMM with transposed Q5_0 weights: C = A @ B^T. See the usage sketch below. */
void gemm_nt_q5_0_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K);
void gemm_nt_q5_0_sse_v2(const float *A, const void *B, const float *bias, float *C, int M, int N, int K);

static float dot_q5_0_q8_k_32_sse(const block_q5_0 *bw, const block_q8_K *ba, int q8_offset);
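/* Added usage sketch: a minimal smoke test for the reference GEMM, assuming
   the declarations above and the struct sketch earlier. Zero-initialized
   Q5_0 blocks decode to zero weights with d = 0, so every output should
   equal the bias; the shapes here are illustrative. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    enum { M = 2, N = 4, K = 64 };            /* K must be a multiple of 32 */

    float A[M * K], C[M * N], bias[N] = { 1, 2, 3, 4 };
    for (int i = 0; i < M * K; i++) A[i] = (float)(i % 7) - 3.0f;

    /* N rows of K/32 Q5_0 blocks; calloc'd blocks decode to all-zero weights. */
    block_q5_0 *B = calloc(N * (K / 32), sizeof(block_q5_0));

    gemm_nt_q5_0_ref(A, B, bias, C, M, N, K);
    printf("C[0][0] = %f (expect %f)\n", C[0], bias[0]);

    free(B);
    return 0;
}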