15 #pragma GCC target("sse4.1,ssse3")
16 #include <immintrin.h>
34 const uint8_t *ql = bw->
ql;
35 const uint8_t *qh = bw->
qh;
36 const int8_t *sc = bw->
scales;
37 const int8_t *qa = ba->
qs;
42 for (
int n = 0; n <
QK_K; n += 128) {
43 for (
int l = 0; l < 32; ++l) {
44 const int is = l / 16;
46 const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
47 const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
48 const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
49 const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
51 sum += (double)(d * (
float)sc[is + 0] * (float)q1) * (double)qa[l + 0];
52 sum += (double)(d * (
float)sc[is + 2] * (float)q2) * (double)qa[l + 32];
53 sum += (double)(d * (
float)sc[is + 4] * (float)q3) * (double)qa[l + 64];
54 sum += (double)(d * (
float)sc[is + 6] * (float)q4) * (double)qa[l + 96];
62 return (
float)(sum * ba->
d);
81 const int blocks_per_row = K /
QK_K;
83 for (
int m = 0; m < M; m++) {
86 for (
int n = 0; n < N; n++) {
88 const block_q6_K *w_row = weights + n * blocks_per_row;
90 for (
int b = 0; b < blocks_per_row; b++) {
94 C[m * N + n] = sumf + (bias ? bias[n] : 0.0f);
Quantization block structures for weight-only quantization.
void gemm_nt_q6_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
#define CK_FP16_TO_FP32(x)
static float dot_q6_k_q8_k_256_sse(const block_q6_K *bw, const block_q8_K *ba)
void quantize_row_q8_k(const float *x, void *vy, int k)
void gemm_nt_q6_k_sse(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)