ckernel_quant.h File Reference

Quantization block structures for weight-only quantization.

#include <stdint.h>
#include <stddef.h>
#include "ckernel_dtype.h"


Data Structures

struct  block_q4_0
 
struct  block_q4_1
 
struct  block_q4_K
 
struct  block_q5_0
 
struct  block_q5_1
 
struct  block_q5_K
 
struct  block_q6_K
 
struct  block_q8_0
 
struct  block_q8_K
 

Macros

#define CK_FP16_TO_FP32(x)   ck_fp16_to_fp32(x)
 
#define CK_FP16_TO_FP32_SIMD(x)   ck_fp16_to_fp32_simd(x)
 
#define CK_FP16_TO_FP32_SOFT(x)   ck_fp16_to_fp32_soft(x)
 
#define CK_FP32_TO_FP16(x)   ck_fp32_to_fp16(x)
 
#define CK_FP32_TO_FP16_SIMD(x)   ck_fp32_to_fp16_simd(x)
 
#define CK_FP32_TO_FP16_SOFT(x)   ck_fp32_to_fp16_soft(x)
 
#define ggml_fp16_to_fp32   ck_fp16_to_fp32
 
#define GGML_FP16_TO_FP32   CK_FP16_TO_FP32
 
#define ggml_fp32_to_fp16   ck_fp32_to_fp16
 
#define GGML_FP32_TO_FP16   CK_FP32_TO_FP16
 
#define K_SCALE_SIZE   12
 
#define QK4_0   32
 
#define QK4_1   32
 
#define QK5_0   32
 
#define QK5_1   32
 
#define QK8_0   32
 
#define QK_K   256
 

Typedefs

typedef uint16_t ck_half
 
typedef ck_half ggml_half
 

Functions

static float ck_fp16_to_fp32 (ck_half h)
 
static float ck_fp16_to_fp32_soft (ck_half h)
 Convert FP16 (ck_half) to FP32 — software implementation.
 
static ck_half ck_fp32_to_fp16 (float f)
 
static ck_half ck_fp32_to_fp16_soft (float f)
 Convert FP32 to FP16 (ck_half) — software implementation.
 
static size_t ck_quant_block_size (int type)
 Get the block size (number of weights per block) for a quant type.
 
static size_t ck_quant_row_size (int type, int64_t n_elements)
 Calculate total bytes needed for n_elements with given quant type.
 
static size_t ck_quant_type_size (int type)
 Get the byte size per block for a quant type.
 
void gemm_nt_q5_0_q8_0 (const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)
 Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.
 
void gemm_nt_q5_0_q8_0_unroll_avx (const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q5_0_sse_v2 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q5_k (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q5_k_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q6_k_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q6_k_sse (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemv_q4_k_q8_k_sse (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q5_k (float *y, const void *W, const float *x, int M, int K)
 
void gemv_q5_k_ref (float *y, const void *W, const float *x, int M, int K)
 
void quantize_row_q8_0 (const float *x, void *vy, int k)
 Quantize FP32 to Q8_0 format (scalar reference).
 
void quantize_row_q8_k_sse (const float *x, void *vy, int k)
 
void rmsnorm_q8_k_fused (const float *input, const float *gamma, void *vy, int tokens, int d_model, int aligned_embed_dim, float eps)
 
static void unpack_q4_k_scales (const uint8_t *scales, uint8_t *sc, uint8_t *m)
 Unpack Q4_K sub-block scales and mins.
 
static void unpack_q5_k_scales (const uint8_t *scales, uint8_t *sc, uint8_t *m)
 Unpack Q5_K sub-block scales and mins.
 
void vec_dot_q5_0_q8_0 (int n, float *s, const void *vx, const void *vy)
 Auto-dispatch quantized dot product Q5_0 x Q8_0.
 
void vec_dot_q8_0_q8_0 (int n, float *s, const void *vx, const void *vy)
 Auto-dispatch quantized dot product Q8_0 x Q8_0.
 

Detailed Description

Quantization block structures for weight-only quantization.

Defines block structures for various quantization formats used in LLM inference. The primary focus is Q4_K_M, which is commonly used for LLM weight compression.

Block structures are compatible with llama.cpp/GGML for model loading.

Definition in file ckernel_quant.h.
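
For orientation, a minimal sketch of two of the block layouts, reconstructed from the structure documentation on this page (field names and widths follow the GGML convention; see the individual struct pages for the authoritative definitions):

/* Q8_0: 32 weights per block, one FP16 scale */
typedef struct {
    ck_half d;       /* block scale (FP16) */
    int8_t  qs[32];  /* one signed byte per weight */
} block_q8_0;        /* 34 bytes per 32 weights -> 8.5 bits/weight */

/* Q5_K: 256-weight super-block, 8 sub-blocks of 32 */
typedef struct {
    ck_half d;            /* super-block scale */
    ck_half dmin;         /* super-block min scale */
    uint8_t scales[12];   /* packed 6-bit sub-block scales and mins */
    uint8_t qh[256 / 8];  /* high bit of each 5-bit weight */
    uint8_t qs[256 / 2];  /* low 4 bits, two weights per byte */
} block_q5_K;             /* 176 bytes per 256 weights -> 5.5 bits/weight */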

Macro Definition Documentation

◆ CK_FP16_TO_FP32

#define CK_FP16_TO_FP32(x)   ck_fp16_to_fp32(x)

Definition at line 400 of file ckernel_quant.h.

◆ CK_FP16_TO_FP32_SIMD

#define CK_FP16_TO_FP32_SIMD(x)   ck_fp16_to_fp32_simd(x)

Definition at line 402 of file ckernel_quant.h.

◆ CK_FP16_TO_FP32_SOFT

#define CK_FP16_TO_FP32_SOFT(x)   ck_fp16_to_fp32_soft(x)

Definition at line 404 of file ckernel_quant.h.

◆ CK_FP32_TO_FP16

#define CK_FP32_TO_FP16(x)   ck_fp32_to_fp16(x)

Definition at line 401 of file ckernel_quant.h.

◆ CK_FP32_TO_FP16_SIMD

#define CK_FP32_TO_FP16_SIMD(x)   ck_fp32_to_fp16_simd(x)

Definition at line 403 of file ckernel_quant.h.

◆ CK_FP32_TO_FP16_SOFT

#define CK_FP32_TO_FP16_SOFT(x)   ck_fp32_to_fp16_soft(x)

Definition at line 405 of file ckernel_quant.h.

◆ ggml_fp16_to_fp32

#define ggml_fp16_to_fp32   ck_fp16_to_fp32

Definition at line 409 of file ckernel_quant.h.

◆ GGML_FP16_TO_FP32

#define GGML_FP16_TO_FP32   CK_FP16_TO_FP32

Definition at line 411 of file ckernel_quant.h.

◆ ggml_fp32_to_fp16

#define ggml_fp32_to_fp16   ck_fp32_to_fp16

Definition at line 410 of file ckernel_quant.h.

◆ GGML_FP32_TO_FP16

#define GGML_FP32_TO_FP16   CK_FP32_TO_FP16

Definition at line 412 of file ckernel_quant.h.

◆ K_SCALE_SIZE

#define K_SCALE_SIZE   12

Definition at line 121 of file ckernel_quant.h.

◆ QK4_0

#define QK4_0   32

Definition at line 35 of file ckernel_quant.h.

◆ QK4_1

#define QK4_1   32

Definition at line 50 of file ckernel_quant.h.

◆ QK5_0

#define QK5_0   32

Definition at line 67 of file ckernel_quant.h.

◆ QK5_1

#define QK5_1   32

Definition at line 84 of file ckernel_quant.h.

◆ QK8_0

#define QK8_0   32

Definition at line 101 of file ckernel_quant.h.

◆ QK_K

#define QK_K   256

Definition at line 120 of file ckernel_quant.h.

Typedef Documentation

◆ ck_half

typedef uint16_t ck_half

Definition at line 26 of file ckernel_quant.h.

◆ ggml_half

typedef ck_half ggml_half

Definition at line 408 of file ckernel_quant.h.

Function Documentation

◆ ck_fp16_to_fp32()

static inline float ck_fp16_to_fp32(ck_half h)

Definition at line 383 of file ckernel_quant.h.

{
#if defined(__F16C__)
    return ck_fp16_to_fp32_simd(h);
#else
    return ck_fp16_to_fp32_soft(h);
#endif
}

References ck_fp16_to_fp32_soft().

◆ ck_fp16_to_fp32_soft()

static inline float ck_fp16_to_fp32_soft(ck_half h)

Convert FP16 (ck_half) to FP32 — software implementation.

Definition at line 303 of file ckernel_quant.h.

{
    uint32_t sign = (h & 0x8000) << 16;
    uint32_t exp = (h >> 10) & 0x1F;
    uint32_t mant = h & 0x3FF;

    uint32_t result;

    if (exp == 0) {
        if (mant == 0) {
            result = sign;
        } else {
            /* Denormalized - convert to normalized FP32 */
            exp = 1;
            while ((mant & 0x400) == 0) {
                mant <<= 1;
                exp--;
            }
            mant &= 0x3FF;
            result = sign | ((exp + 127 - 15) << 23) | (mant << 13);
        }
    } else if (exp == 31) {
        result = sign | 0x7F800000 | (mant << 13);
    } else {
        result = sign | ((exp + 127 - 15) << 23) | (mant << 13);
    }

    union { uint32_t u; float f; } u;
    u.u = result;
    return u.f;
}

Referenced by ck_fp16_to_fp32().
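
As a quick sanity check of the bit manipulation above, a few hand-computable values (a hedged test sketch; the expected results follow directly from the IEEE-754 half-precision encoding):

#include <assert.h>

/* 0x3C00: sign 0, biased exp 15, mantissa 0 -> 1.0f  */
/* 0xC000: sign 1, biased exp 16, mantissa 0 -> -2.0f */
assert(ck_fp16_to_fp32_soft(0x3C00) == 1.0f);
assert(ck_fp16_to_fp32_soft(0xC000) == -2.0f);
assert(ck_fp16_to_fp32_soft(0x0000) == 0.0f);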

◆ ck_fp32_to_fp16()

static inline ck_half ck_fp32_to_fp16(float f)

Definition at line 391 of file ckernel_quant.h.

{
#if defined(__F16C__)
    return ck_fp32_to_fp16_simd(f);
#else
    return ck_fp32_to_fp16_soft(f);
#endif
}

References ck_fp32_to_fp16_soft().

◆ ck_fp32_to_fp16_soft()

static inline ck_half ck_fp32_to_fp16_soft(float f)

Convert FP32 to FP16 (ck_half) — software implementation.

Definition at line 337 of file ckernel_quant.h.

{
    union { uint32_t u; float f; } u;
    u.f = f;

    uint32_t sign = (u.u >> 16) & 0x8000;
    int32_t exp = ((u.u >> 23) & 0xFF) - 127 + 15;
    uint32_t mant = (u.u >> 13) & 0x3FF;

    if (exp <= 0) {
        if (exp < -10) {
            return sign;
        }
        mant = (mant | 0x400) >> (1 - exp);
        return sign | mant;
    } else if (exp >= 31) {
        return sign | 0x7C00;
    }

    return sign | (exp << 10) | mant;
}

Referenced by ck_fp32_to_fp16().
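
Note that this software path truncates the mantissa instead of rounding to nearest, and maps any input with rebased exponent >= 31 (including NaN) to infinity, so it can differ from the F16C hardware path by up to one FP16 ULP. A hedged round-trip sketch:

float x = 3.14159265f;
ck_half h = ck_fp32_to_fp16_soft(x);
float y = ck_fp16_to_fp32_soft(h);
/* |x - y| is bounded by one FP16 ULP at this magnitude (~0.002) */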

◆ ck_quant_block_size()

static inline size_t ck_quant_block_size(int type)

Get the block size (number of weights per block) for a quant type.

Definition at line 184 of file ckernel_quant.h.

{
    switch (type) {
    case 0: return QK4_0; /* Q4_0 */
    case 1: return QK8_0; /* Q8_0 */
    case 2: return QK_K;  /* Q4_K */
    case 3: return QK_K;  /* Q8_K */
    case CK_DT_Q4_1: return QK4_1;
    case CK_DT_Q5_0: return QK5_0;
    case CK_DT_Q5_1: return QK5_1;
    case CK_DT_Q5_K: return QK_K;
    case CK_DT_Q6_K: return QK_K;
    default: return 1;
    }
}

References CK_DT_Q4_1, CK_DT_Q5_0, CK_DT_Q5_1, CK_DT_Q5_K, CK_DT_Q6_K, QK4_0, QK4_1, QK5_0, QK5_1, QK8_0, and QK_K.

Referenced by ck_quant_row_size().

◆ ck_quant_row_size()

static inline size_t ck_quant_row_size(int type, int64_t n_elements)

Calculate total bytes needed for n_elements with given quant type.

Definition at line 220 of file ckernel_quant.h.

{
    size_t block_size = ck_quant_block_size(type);
    size_t type_size = ck_quant_type_size(type);
    return (n_elements / block_size) * type_size;
}

References ck_quant_block_size(), and ck_quant_type_size().
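
A worked example (hedged: the integer division above assumes n_elements is a multiple of the block size, and the 18-byte block_q4_0 figure assumes the standard GGML layout of one ck_half scale plus 16 packed nibble bytes):

/* One 4096-wide row of Q4_0 weights (type 0):
 *   4096 / QK4_0 = 128 blocks x sizeof(block_q4_0) = 18 bytes
 *   -> 2304 bytes total, i.e. 4.5 bits per weight            */
size_t bytes = ck_quant_row_size(0, 4096);  /* 2304 */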

◆ ck_quant_type_size()

static inline size_t ck_quant_type_size(int type)

Get the byte size per block for a quant type.

Definition at line 202 of file ckernel_quant.h.

{
    switch (type) {
    case 0: return sizeof(block_q4_0);
    case 1: return sizeof(block_q8_0);
    case 2: return sizeof(block_q4_K);
    case 3: return sizeof(block_q8_K);
    case CK_DT_Q4_1: return sizeof(block_q4_1);
    case CK_DT_Q5_0: return sizeof(block_q5_0);
    case CK_DT_Q5_1: return sizeof(block_q5_1);
    case CK_DT_Q5_K: return sizeof(block_q5_K);
    case CK_DT_Q6_K: return sizeof(block_q6_K);
    default: return 4; /* FP32 */
    }
}

References CK_DT_Q4_1, CK_DT_Q5_0, CK_DT_Q5_1, CK_DT_Q5_K, and CK_DT_Q6_K.

Referenced by ck_quant_row_size().

◆ gemm_nt_q5_0_q8_0()

void gemm_nt_q5_0_q8_0(const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)

Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.

Computes C = A @ B^T + bias, where:

  • A: [M x K] Q8_0 quantized activations (M tokens, K features)
  • B: [N x K] Q5_0 quantized weights (N outputs, K features)
  • C: [M x N] FP32 output

This is the INT8 batch kernel for prefill, using pre-quantized activations to avoid FP32->Q8_0 conversion overhead per operation.

Parameters
  A_q8  Input activations in Q8_0 format [M rows of K/32 blocks each]
  B_q5  Weights in Q5_0 format [N rows of K/32 blocks each]
  bias  Optional bias vector [N], NULL if not used
  C     Output matrix [M x N], row-major FP32
  M     Batch size (number of tokens)
  N     Output dimension (number of output features)
  K     Input dimension (must be multiple of 32)

Definition at line 1617 of file gemm_kernels_q5_0.c.

{
    const block_q5_0 *weights = (const block_q5_0 *)B_q5;
    const block_q8_0 *inputs = (const block_q8_0 *)A_q8;
    const int blocks_per_row = K / QK5_0;

    for (int m = 0; m < M; m++) {
        const block_q8_0 *input_row = &inputs[m * blocks_per_row];

        for (int n = 0; n < N; n++) {
            const block_q5_0 *weight_row = &weights[n * blocks_per_row];
            float *out = &C[m * N + n];

            /* Dispatches to vec_dot_q5_0_q8_0_avx (2x block unrolled) on AVX */
            vec_dot_q5_0_q8_0(K, out, weight_row, input_row);

            if (bias) {
                *out += bias[n];
            }
        }
    }
}

References QK5_0 and vec_dot_q5_0_q8_0().

Referenced by gemm_nt_q8_0_dispatch(), and gemm_nt_q8_0_mlp_dispatch().
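
A hedged usage sketch for one prefill pass (activations, W_q5, bias, C and the dimensions are assumed to exist; K must be a multiple of 32, and W_q5 is assumed to already hold N rows of Q5_0 blocks, e.g. loaded from a GGUF file):

/* Quantize M rows of FP32 activations to Q8_0 once, then batch-GEMM */
const int blocks_per_row = K / QK8_0;
block_q8_0 *A_q8 = malloc((size_t)M * blocks_per_row * sizeof(block_q8_0));

for (int m = 0; m < M; m++)
    quantize_row_q8_0(&activations[m * K], &A_q8[m * blocks_per_row], K);

gemm_nt_q5_0_q8_0(A_q8, W_q5, bias, C, M, N, K);
free(A_q8);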

◆ gemm_nt_q5_0_q8_0_unroll_avx()

void gemm_nt_q5_0_q8_0_unroll_avx(const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)

◆ gemm_nt_q5_0_sse_v2()

void gemm_nt_q5_0_sse_v2(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

Definition at line 77 of file gemm_kernels_q5_0_sse_v2.c.

{
    if (K % QK_K != 0) {
        gemm_nt_q5_0_ref(A, B, bias, C, M, N, K);
        return;
    }

    size_t q8_size = (K / QK_K) * sizeof(block_q8_K);
    block_q8_K *A_q8 = (block_q8_K *)alloca(q8_size);

    const block_q5_0 *weights = (const block_q5_0 *)B;
    const int blocks_per_row = K / 32;

    for (int m = 0; m < M; m++) {
        quantize_row_q8_k(&A[m * K], A_q8, K);

        for (int n = 0; n < N; n++) {
            float sumf = 0.0f;
            const block_q5_0 *w_row = weights + n * blocks_per_row;

            for (int b = 0; b < blocks_per_row; b++) {
                int q8_block_idx = (b * 32) / QK_K;
                int q8_offset = (b * 32) % QK_K;
                sumf += dot_q5_0_q8_k_32_sse(&w_row[b], &A_q8[q8_block_idx], q8_offset);
            }

            C[m * N + n] = sumf + (bias ? bias[n] : 0.0f);
        }
    }
}

References dot_q5_0_q8_k_32_sse(), gemm_nt_q5_0_ref(), QK_K, and quantize_row_q8_k().

◆ gemm_nt_q5_k()

void gemm_nt_q5_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

Definition at line 218 of file gemm_kernels_q5_k.c.

{
#if defined(__AVX512F__)
    /* TODO: AVX-512 implementation */
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
#elif defined(__AVX2__)
    /* TODO: AVX2 implementation */
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
#elif defined(__AVX__)
    /* TODO: AVX implementation */
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
#elif defined(__SSE4_1__)
    /* TODO: SSE4.1 implementation */
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
#else
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
#endif
}

◆ gemm_nt_q5_k_ref()

void gemm_nt_q5_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

Definition at line 145 of file gemm_kernels_q5_k.c.

{
    const block_q5_K *blocks = (const block_q5_K *)B;
    const int blocks_per_col = K / QK_K;

    for (int m = 0; m < M; m++) {
        const float *a_row = &A[m * K];

        for (int n = 0; n < N; n++) {
            float sum = 0.0f;

            for (int b = 0; b < blocks_per_col; b++) {
                const block_q5_K *block = &blocks[n * blocks_per_col + b];
                const float d = CK_FP16_TO_FP32(block->d);
                const float dmin = CK_FP16_TO_FP32(block->dmin);
                const uint8_t *scales = block->scales;
                const uint8_t *qh = block->qh;
                const uint8_t *qs = block->qs;

                /* Process 8 sub-blocks of 32 weights each */
                for (int sb = 0; sb < 8; sb++) {
                    uint8_t sc, m;
                    get_q5_k_scale_min(sb, scales, &sc, &m);

                    const float d_sub = d * (float)sc / 64.0f;
                    const float m_sub = dmin * (float)m / 64.0f;

                    const int qs_offset = sb * 16;
                    const int qh_offset = sb * 4;

                    for (int i = 0; i < 32; i++) {
                        uint8_t qs_val = (qs[qs_offset + i/2] >> (4 * (i % 2))) & 0xF;
                        uint8_t qh_bit = (qh[qh_offset + i/8] >> (i % 8)) & 1;
                        uint8_t q = qs_val | (qh_bit << 4);

                        float w = d_sub * (float)q - m_sub;
                        sum += w * a_row[b * QK_K + sb * 32 + i];
                    }
                }
            }

            C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
        }
    }
}

References CK_FP16_TO_FP32, block_q5_K::d, block_q5_K::dmin, get_q5_k_scale_min(), block_q5_K::qh, QK_K, block_q5_K::qs, and block_q5_K::scales.

Referenced by gemm_nt_q5_k().

◆ gemm_nt_q6_k_ref()

void gemm_nt_q6_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

Definition at line 243 of file gemm_kernels_q6k.c.

{
    gemm_nt_q6_k(A, B, bias, C, M, N, K);
}

References gemm_nt_q6_k().

Referenced by gemm_nt_q6_k_sse().

◆ gemm_nt_q6_k_sse()

void gemm_nt_q6_k_sse(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

Definition at line 66 of file gemm_kernels_q6k_sse.c.

{
    if (K % QK_K != 0) {
        gemm_nt_q6_k_ref(A, B, bias, C, M, N, K);
        return;
    }

    size_t q8_size = (K / QK_K) * sizeof(block_q8_K);
    block_q8_K *A_q8 = (block_q8_K *)alloca(q8_size);

    const block_q6_K *weights = (const block_q6_K *)B;
    const int blocks_per_row = K / QK_K;

    for (int m = 0; m < M; m++) {
        quantize_row_q8_k(&A[m * K], A_q8, K);

        for (int n = 0; n < N; n++) {
            float sumf = 0.0f;
            const block_q6_K *w_row = weights + n * blocks_per_row;

            for (int b = 0; b < blocks_per_row; b++) {
                sumf += dot_q6_k_q8_k_256_sse(&w_row[b], &A_q8[b]);
            }

            C[m * N + n] = sumf + (bias ? bias[n] : 0.0f);
        }
    }
}

References dot_q6_k_q8_k_256_sse(), gemm_nt_q6_k_ref(), QK_K, and quantize_row_q8_k().

◆ gemv_q4_k_q8_k_sse()

void gemv_q4_k_q8_k_sse(float *y, const void *W, const void *x_q8, int M, int K)

Definition at line 33 of file gemm_kernels_q4k_sse.c.

{
    const block_q4_K *blocks = (const block_q4_K *)W;
    const block_q8_K *x = (const block_q8_K *)x_q8;
    const int blocks_per_row = K / QK_K;

    const __m128i mask_low = _mm_set1_epi8(0x0F);

    for (int row = 0; row < M; ++row) {
        float sumf = 0.0f;
        const block_q4_K *w_row = blocks + row * blocks_per_row;

        for (int i = 0; i < blocks_per_row; ++i) {
            const block_q4_K *b4 = &w_row[i];
            const block_q8_K *b8 = &x[i];

            // Unpack scales (same as ref)
            uint8_t sc[8], m_val[8];
            unpack_q4_k_scales(b4->scales, sc, m_val);

            float d = CK_FP16_TO_FP32(b4->d) * b8->d;
            float dmin = CK_FP16_TO_FP32(b4->dmin) * b8->d;

            int is = 0;
            int q_offset = 0;

            // Process 4 chunks of 64 elements (256 total)
            for (int j = 0; j < QK_K; j += 64) {
                // We process 32 bytes of qs (covering 64 elements via low/high nibbles)
                // We access qs[0..31] relative to q_offset

                // Accumulators for this 64-element chunk
                __m128i acc_lo = _mm_setzero_si128();
                __m128i acc_hi = _mm_setzero_si128();

                // Inner loop: 2 iters of 16 bytes (32 elements)
                for (int l = 0; l < 32; l += 16) {
                    // Load 16 bytes of Q4
                    __m128i q4_vec = _mm_loadu_si128((const __m128i *)(b4->qs + q_offset + l));

                    // Low nibbles -> correspond to q8_lo (elements j+l .. j+l+15)
                    __m128i q4_lo = _mm_and_si128(q4_vec, mask_low);

                    // High nibbles -> correspond to q8_hi (elements j+32+l .. j+32+l+15)
                    __m128i q4_hi = _mm_and_si128(_mm_srli_epi16(q4_vec, 4), mask_low);

                    // Load Q8
                    __m128i q8_lo_vec = _mm_loadu_si128((const __m128i *)(b8->qs + j + l));
                    __m128i q8_hi_vec = _mm_loadu_si128((const __m128i *)(b8->qs + j + 32 + l));

                    // Expand and Multiply-Add: Q4(u8) * Q8(s8) -> i32
                    // Since Q4 is u8 and Q8 is s8, we use intermediate i16

                    // LO PART
                    __m128i q4_lo_16_L = _mm_cvtepu8_epi16(q4_lo); // lower 8 -> 16
                    __m128i q8_lo_16_L = _mm_cvtepi8_epi16(q8_lo_vec);
                    __m128i prod_lo_L = _mm_madd_epi16(q4_lo_16_L, q8_lo_16_L); // i32
                    acc_lo = _mm_add_epi32(acc_lo, prod_lo_L);

                    __m128i q4_lo_16_H = _mm_cvtepu8_epi16(_mm_srli_si128(q4_lo, 8)); // upper 8 -> 16
                    __m128i q8_lo_16_H = _mm_cvtepi8_epi16(_mm_srli_si128(q8_lo_vec, 8));
                    __m128i prod_lo_H = _mm_madd_epi16(q4_lo_16_H, q8_lo_16_H); // i32
                    acc_lo = _mm_add_epi32(acc_lo, prod_lo_H);

                    // HI PART
                    __m128i q4_hi_16_L = _mm_cvtepu8_epi16(q4_hi);
                    __m128i q8_hi_16_L = _mm_cvtepi8_epi16(q8_hi_vec);
                    __m128i prod_hi_L = _mm_madd_epi16(q4_hi_16_L, q8_hi_16_L);
                    acc_hi = _mm_add_epi32(acc_hi, prod_hi_L);

                    __m128i q4_hi_16_H = _mm_cvtepu8_epi16(_mm_srli_si128(q4_hi, 8));
                    __m128i q8_hi_16_H = _mm_cvtepi8_epi16(_mm_srli_si128(q8_hi_vec, 8));
                    __m128i prod_hi_H = _mm_madd_epi16(q4_hi_16_H, q8_hi_16_H);
                    acc_hi = _mm_add_epi32(acc_hi, prod_hi_H);
                }

                int32_t sum_q4q8_lo = hsum_epi32_sse(acc_lo);
                int32_t sum_q4q8_hi = hsum_epi32_sse(acc_hi);

                /* bsums: each bsum is 16 elements */
                int32_t bsum_lo = (int32_t)b8->bsums[j / 16] +
                                  (int32_t)b8->bsums[j / 16 + 1];
                int32_t bsum_hi = (int32_t)b8->bsums[(j + 32) / 16] +
                                  (int32_t)b8->bsums[(j + 32) / 16 + 1];

                sumf += d * (float)sc[is] * (float)sum_q4q8_lo;
                sumf -= dmin * (float)m_val[is] * (float)bsum_lo;
                sumf += d * (float)sc[is + 1] * (float)sum_q4q8_hi;
                sumf -= dmin * (float)m_val[is + 1] * (float)bsum_hi;

                q_offset += 32;
                is += 2;
            }
        }
        y[row] = sumf;
    }
}

References block_q8_K::bsums, CK_FP16_TO_FP32, block_q4_K::d, block_q8_K::d, block_q4_K::dmin, hsum_epi32_sse(), QK_K, block_q4_K::qs, block_q8_K::qs, block_q4_K::scales, and unpack_q4_k_scales().
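
A hedged usage sketch for one decode step (y, W_q4k and x are assumed; K must be a multiple of QK_K = 256, and W_q4k must hold M rows of Q4_K blocks):

/* Quantize the FP32 input vector once, then run the INT8 GEMV */
block_q8_K x_q8[K / QK_K];  /* one Q8_K block per 256 inputs */
quantize_row_q8_k_sse(x, x_q8, K);
gemv_q4_k_q8_k_sse(y, W_q4k, x_q8, M, K);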

◆ gemv_q5_k()

void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)

Definition at line 199 of file gemm_kernels_q5_k.c.

{
#if defined(__AVX512F__)
    /* TODO: AVX-512 implementation */
    gemv_q5_k_ref(y, W, x, M, K);
#elif defined(__AVX2__)
    /* TODO: AVX2 implementation */
    gemv_q5_k_ref(y, W, x, M, K);
#elif defined(__AVX__)
    /* TODO: AVX implementation */
    gemv_q5_k_ref(y, W, x, M, K);
#elif defined(__SSE4_1__)
    /* TODO: SSE4.1 implementation */
    gemv_q5_k_ref(y, W, x, M, K);
#else
    gemv_q5_k_ref(y, W, x, M, K);
#endif
}

◆ gemv_q5_k_ref()

void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K)

Definition at line 92 of file gemm_kernels_q5_k.c.

{
    const block_q5_K *blocks = (const block_q5_K *)W;
    const int blocks_per_row = K / QK_K;

    for (int m = 0; m < M; m++) {
        const float *x_row = x;
        float sum = 0.0f;

        for (int b = 0; b < blocks_per_row; b++) {
            const block_q5_K *block = &blocks[m * blocks_per_row + b];
            const float d = CK_FP16_TO_FP32(block->d);
            const float dmin = CK_FP16_TO_FP32(block->dmin);
            const uint8_t *scales = block->scales;
            const uint8_t *qh = block->qh;
            const uint8_t *qs = block->qs;

            /* Process 8 sub-blocks of 32 weights each */
            for (int sb = 0; sb < 8; sb++) {
                uint8_t sc, m;
                get_q5_k_scale_min(sb, scales, &sc, &m);

                const float d_sub = d * (float)sc / 64.0f;
                const float m_sub = dmin * (float)m / 64.0f;

                /* Each sub-block has 32 weights: low 4 bits in qs, high 1 bit in qh */
                const int qs_offset = sb * 16; /* 16 bytes per sub-block */
                const int qh_offset = sb * 4;  /* 4 bytes per sub-block */

                for (int i = 0; i < 32; i++) {
                    uint8_t qs_val = (qs[qs_offset + i/2] >> (4 * (i % 2))) & 0xF;
                    uint8_t qh_bit = (qh[qh_offset + i/8] >> (i % 8)) & 1;
                    uint8_t q = qs_val | (qh_bit << 4);

                    /* Q5_K dequantization: w = d * sc/64 * q - dmin * m/64 */
                    float w = d_sub * (float)q - m_sub;
                    sum += w * x_row[b * QK_K + sb * 32 + i];
                }
            }
        }

        y[m] = sum;
    }
}

References CK_FP16_TO_FP32, block_q5_K::d, block_q5_K::dmin, get_q5_k_scale_min(), block_q5_K::qh, QK_K, block_q5_K::qs, and block_q5_K::scales.

Referenced by gemv_q5_k().

◆ quantize_row_q8_0()

void quantize_row_q8_0(const float *x, void *vy, int k)

Quantize FP32 to Q8_0 format (scalar reference)

Parameters
  x   Input FP32 values
  vy  Output Q8_0 blocks
  k   Number of elements (must be multiple of 32)

Definition at line 59 of file gemm_kernels_q8_0.c.

{
    block_q8_0 *y = (block_q8_0 *)vy;
    const int nb = k / QK8_0; /* QK8_0 = 32 */

#if defined(__AVX__)
    const __m256 sign_bit = _mm256_set1_ps(-0.0f);
    const __m256 v_half = _mm256_set1_ps(0.5f);
    const __m256 v_min = _mm256_set1_ps(-127.0f);
    const __m256 v_max = _mm256_set1_ps(127.0f);

    for (int i = 0; i < nb; i++) {
        __m256 v0 = _mm256_loadu_ps(x + 0);
        __m256 v1 = _mm256_loadu_ps(x + 8);
        __m256 v2 = _mm256_loadu_ps(x + 16);
        __m256 v3 = _mm256_loadu_ps(x + 24);
        x += QK8_0;

        __m256 max_abs = _mm256_andnot_ps(sign_bit, v0);
        max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v1));
        max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v2));
        max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v3));

        __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(max_abs, 1),
                                 _mm256_castps256_ps128(max_abs));
        max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
        max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
        const float max_scalar = _mm_cvtss_f32(max4);

        const float d = max_scalar / 127.0f;
        const float id = max_scalar != 0.0f ? 127.0f / max_scalar : 0.0f;
        y[i].d = CK_FP32_TO_FP16(d);

        const __m256 mul = _mm256_set1_ps(id);
        v0 = _mm256_mul_ps(v0, mul);
        v1 = _mm256_mul_ps(v1, mul);
        v2 = _mm256_mul_ps(v2, mul);
        v3 = _mm256_mul_ps(v3, mul);

        v0 = _mm256_min_ps(_mm256_max_ps(v0, v_min), v_max);
        v1 = _mm256_min_ps(_mm256_max_ps(v1, v_min), v_max);
        v2 = _mm256_min_ps(_mm256_max_ps(v2, v_min), v_max);
        v3 = _mm256_min_ps(_mm256_max_ps(v3, v_min), v_max);

        /* Round half away from zero to match the scalar path */
        v0 = _mm256_add_ps(v0, _mm256_or_ps(_mm256_and_ps(v0, sign_bit), v_half));
        v1 = _mm256_add_ps(v1, _mm256_or_ps(_mm256_and_ps(v1, sign_bit), v_half));
        v2 = _mm256_add_ps(v2, _mm256_or_ps(_mm256_and_ps(v2, sign_bit), v_half));
        v3 = _mm256_add_ps(v3, _mm256_or_ps(_mm256_and_ps(v3, sign_bit), v_half));

        __m256i i0 = _mm256_cvttps_epi32(v0);
        __m256i i1 = _mm256_cvttps_epi32(v1);
        __m256i i2 = _mm256_cvttps_epi32(v2);
        __m256i i3 = _mm256_cvttps_epi32(v3);

#if defined(__AVX2__)
        i0 = _mm256_packs_epi32(i0, i1);
        i2 = _mm256_packs_epi32(i2, i3);
        i0 = _mm256_packs_epi16(i0, i2);

        const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
        i0 = _mm256_permutevar8x32_epi32(i0, perm);
        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
#else
        __m128i ni0 = _mm256_castsi256_si128(i0);
        __m128i ni1 = _mm256_extractf128_si256(i0, 1);
        __m128i ni2 = _mm256_castsi256_si128(i1);
        __m128i ni3 = _mm256_extractf128_si256(i1, 1);
        __m128i ni4 = _mm256_castsi256_si128(i2);
        __m128i ni5 = _mm256_extractf128_si256(i2, 1);
        __m128i ni6 = _mm256_castsi256_si128(i3);
        __m128i ni7 = _mm256_extractf128_si256(i3, 1);

        ni0 = _mm_packs_epi32(ni0, ni1);
        ni2 = _mm_packs_epi32(ni2, ni3);
        ni4 = _mm_packs_epi32(ni4, ni5);
        ni6 = _mm_packs_epi32(ni6, ni7);

        ni0 = _mm_packs_epi16(ni0, ni2);
        ni4 = _mm_packs_epi16(ni4, ni6);

        _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
    }
#else
    for (int i = 0; i < nb; i++) {
        const float *xb = x + i * QK8_0;

        /* Find max absolute value in block */
        float amax = 0.0f;
        for (int j = 0; j < QK8_0; j++) {
            float av = xb[j] >= 0 ? xb[j] : -xb[j];
            if (av > amax) amax = av;
        }

        /* Compute scale: d = max / 127 */
        float d = amax / 127.0f;
        float id = d != 0.0f ? 127.0f / amax : 0.0f;

        /* Store scale as FP16 */
        y[i].d = CK_FP32_TO_FP16(d);

        /* Quantize values */
        for (int j = 0; j < QK8_0; j++) {
            float v = xb[j] * id;
            /* Round to nearest int and clamp to [-127, 127] */
            int q = (int)(v + (v >= 0 ? 0.5f : -0.5f));
            if (q > 127) q = 127;
            if (q < -127) q = -127;
            y[i].qs[j] = (int8_t)q;
        }
    }
#endif
}
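
A minimal round-trip sketch showing the Q8_0 format in action (hedged: the field names follow the block_q8_0 layout sketched in the detailed description above):

float x[32];
for (int j = 0; j < 32; j++) x[j] = 0.1f * (j - 16);  /* amax = 1.6 */

block_q8_0 blk;
quantize_row_q8_0(x, &blk, 32);

/* Dequantize: value j is qs[j] * d, accurate to about d/2 */
float d = CK_FP16_TO_FP32(blk.d);   /* ~1.6 / 127 */
float x0 = d * (float)blk.qs[0];    /* ~-1.6f     */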

◆ quantize_row_q8_k_sse()

void quantize_row_q8_k_sse(const float *x, void *vy, int k)

Definition at line 29 of file quantize_row_q8_k_sse.c.

{
    if (!x || !vy || k <= 0) {
        return;
    }
    assert(k % QK_K == 0);
    const int nb = k / QK_K;
    block_q8_K *y = (block_q8_K *)vy;

    for (int i = 0; i < nb; ++i) {
        float max = 0.0f;

        // SSE max absolute value
        __m128 v_max = _mm_setzero_ps();
        for (int j = 0; j < QK_K; j += 4) {
            __m128 v = _mm_loadu_ps(x + j);
            __m128 v_abs = _mm_andnot_ps(_mm_set1_ps(-0.0f), v);
            v_max = _mm_max_ps(v_max, v_abs);
        }

        // Horizontal max
        v_max = _mm_max_ps(v_max, _mm_shuffle_ps(v_max, v_max, _MM_SHUFFLE(1, 0, 3, 2)));
        v_max = _mm_max_ps(v_max, _mm_shuffle_ps(v_max, v_max, _MM_SHUFFLE(0, 1, 0, 1)));
        _mm_store_ss(&max, v_max);

        if (max == 0.0f) {
            y[i].d = 0.0f;
            memset(y[i].qs, 0, sizeof(y[i].qs));
            memset(y[i].bsums, 0, sizeof(y[i].bsums));
            x += QK_K;
            continue;
        }

        const float iscale = -127.0f / max;
        __m128 v_iscale = _mm_set1_ps(iscale);

        // Quantize and compute bsums in SSE
        for (int j = 0; j < QK_K; j += 16) {
            __m128 x0 = _mm_loadu_ps(x + j + 0);
            __m128 x1 = _mm_loadu_ps(x + j + 4);
            __m128 x2 = _mm_loadu_ps(x + j + 8);
            __m128 x3 = _mm_loadu_ps(x + j + 12);

            __m128i q0 = _mm_cvtps_epi32(_mm_mul_ps(x0, v_iscale));
            __m128i q1 = _mm_cvtps_epi32(_mm_mul_ps(x1, v_iscale));
            __m128i q2 = _mm_cvtps_epi32(_mm_mul_ps(x2, v_iscale));
            __m128i q3 = _mm_cvtps_epi32(_mm_mul_ps(x3, v_iscale));

            // Pack i32 -> i16 -> i8
            __m128i q01 = _mm_packs_epi32(q0, q1);
            __m128i q23 = _mm_packs_epi32(q2, q3);
            __m128i q0123 = _mm_packs_epi16(q01, q23);

            _mm_storeu_si128((__m128i *)(y[i].qs + j), q0123);

            // Compute bsum for these 16 elements
            // Each bsum[j/16] covers 16 elements
            __m128i p01 = _mm_add_epi16(q01, q23);
            p01 = _mm_add_epi16(p01, _mm_shuffle_epi32(p01, _MM_SHUFFLE(1, 0, 3, 2)));
            p01 = _mm_add_epi16(p01, _mm_shufflelo_epi16(p01, _MM_SHUFFLE(1, 0, 3, 2)));
            int16_t bsum = (int16_t)_mm_extract_epi16(p01, 0) + (int16_t)_mm_extract_epi16(p01, 1);
            y[i].bsums[j / 16] = bsum;
        }

        y[i].d = 1.0f / iscale;
        x += QK_K;
    }
}

References block_q8_K::bsums, block_q8_K::d, and QK_K.
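
Note the sign convention: iscale = -127.0f / max, so the stored scale d = 1.0f / iscale is negative whenever the block is nonzero. Dequantization is still simply q * d, as this hedged check illustrates:

/* For any element j of block i, up to rounding error: */
float approx = (float)y[i].qs[j] * y[i].d;  /* ~ original x[j] */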

◆ rmsnorm_q8_k_fused()

void rmsnorm_q8_k_fused(const float *input, const float *gamma, void *vy, int tokens, int d_model, int aligned_embed_dim, float eps)

Fused RMSNorm + Q8_K Quantization

Benefits:

  1. Single pass over input data (reduces DRAM pressure)
  2. Normalization results stay in registers for quantization
  3. Keeps hot data in L1/L2 cache

Definition at line 54 of file rmsnorm_q8_k_fused.c.

{
    const int T = tokens;
    const int D = d_model;
    block_q8_K *y = (block_q8_K *)vy;

    for (int t = 0; t < T; ++t) {
        const float *x = input + (size_t)t * aligned_embed_dim;

        // 1. Compute sum of squares using AVX
        __m256 sum_sq_vec = _mm256_setzero_ps();
        for (int d = 0; d < D; d += 8) {
            __m256 xv = _mm256_loadu_ps(&x[d]);
            sum_sq_vec = _mm256_add_ps(sum_sq_vec, _mm256_mul_ps(xv, xv));
        }
        float sum_sq = hsum256_ps_fused(sum_sq_vec);
        float rstd = 1.0f / sqrtf(sum_sq / (float)D + eps);
        __m256 vrstd = _mm256_set1_ps(rstd);

        // 2. We need the max absolute value of the NORMALIZED data for quantization
        //    y_i = gamma_i * (x_i * rstd)
        //    We do this in blocks of QK_K (256) to match Q8_K layout
        for (int b = 0; b < D / QK_K; ++b) {
            const float *xb = x + b * QK_K;
            const float *gb = gamma + b * QK_K;
            block_q8_K *out_block = &y[t * (D / QK_K) + b];

            // Local normalization and max search
            __m256 v_max_abs = _mm256_setzero_ps();
            float norm_buf[QK_K];

            for (int d = 0; d < QK_K; d += 8) {
                __m256 xv = _mm256_loadu_ps(&xb[d]);
                __m256 gv = _mm256_loadu_ps(&gb[d]);
                __m256 normalized = _mm256_mul_ps(_mm256_mul_ps(xv, vrstd), gv);

                _mm256_storeu_ps(&norm_buf[d], normalized);

                __m256 v_abs = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), normalized);
                v_max_abs = _mm256_max_ps(v_max_abs, v_abs);
            }

            float max_val = hmax256_ps_fused(v_max_abs);
            if (max_val == 0.0f) {
                out_block->d = 0.0f;
                memset(out_block->qs, 0, QK_K);
                memset(out_block->bsums, 0, sizeof(out_block->bsums));
                continue;
            }

            // 3. Quantize to Q8_K
            float iscale = -127.0f / max_val;
            __m256 v_iscale = _mm256_set1_ps(iscale);
            out_block->d = 1.0f / iscale;

            for (int j = 0; j < QK_K; j += 16) {
                // AVX1 doesn't have 256-bit integer conversion, so we use 128-bit SSE for packing
                __m128 n0 = _mm_loadu_ps(&norm_buf[j + 0]);
                __m128 n1 = _mm_loadu_ps(&norm_buf[j + 4]);
                __m128 n2 = _mm_loadu_ps(&norm_buf[j + 8]);
                __m128 n3 = _mm_loadu_ps(&norm_buf[j + 12]);

                __m128i q0 = _mm_cvtps_epi32(_mm_mul_ps(n0, _mm256_castps256_ps128(v_iscale)));
                __m128i q1 = _mm_cvtps_epi32(_mm_mul_ps(n1, _mm256_castps256_ps128(v_iscale)));
                __m128i q2 = _mm_cvtps_epi32(_mm_mul_ps(n2, _mm256_castps256_ps128(v_iscale)));
                __m128i q3 = _mm_cvtps_epi32(_mm_mul_ps(n3, _mm256_castps256_ps128(v_iscale)));

                __m128i q01 = _mm_packs_epi32(q0, q1);
                __m128i q23 = _mm_packs_epi32(q2, q3);
                __m128i q0123 = _mm_packs_epi16(q01, q23);

                _mm_storeu_si128((__m128i *)(out_block->qs + j), q0123);

                // Compute bsum for 16 elements
                __m128i p01 = _mm_add_epi16(q01, q23);
                p01 = _mm_add_epi16(p01, _mm_shuffle_epi32(p01, _MM_SHUFFLE(1, 0, 3, 2)));
                p01 = _mm_add_epi16(p01, _mm_shufflelo_epi16(p01, _MM_SHUFFLE(1, 0, 3, 2)));
                int16_t bsum = (int16_t)_mm_extract_epi16(p01, 0) + (int16_t)_mm_extract_epi16(p01, 1);
                out_block->bsums[j / 16] = bsum;
            }
        }
    }
}

References block_q8_K::bsums, block_q8_K::d, hmax256_ps_fused(), hsum256_ps_fused(), QK_K, and block_q8_K::qs.
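
A hedged usage sketch for a pre-attention norm (hidden and gamma are assumed; d_model must be a multiple of QK_K = 256, and each token row is stored with stride aligned_embed_dim):

const int blocks_per_token = d_model / QK_K;
block_q8_K *normed_q8 = malloc((size_t)tokens * blocks_per_token * sizeof(block_q8_K));

/* One pass per token: x -> RMSNorm(x) * gamma -> Q8_K blocks */
rmsnorm_q8_k_fused(hidden, gamma, normed_q8, tokens, d_model,
                   aligned_embed_dim, 1e-5f);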

◆ unpack_q4_k_scales()

static inline void unpack_q4_k_scales(const uint8_t *scales, uint8_t *sc, uint8_t *m)

Unpack Q4_K sub-block scales and mins.

Parameters
  scales  The packed scales[12] array from block_q4_K
  sc      Output: 8 unpacked scale values (multiply by super-block d)
  m       Output: 8 unpacked min values (multiply by super-block dmin)

This matches llama.cpp's get_scale_min_k4() function exactly. The 12-byte scales array layout:

  • bytes 0-3: 6-bit scales[0-3] (high 2 bits used for scales[4-7])
  • bytes 4-7: 6-bit mins[0-3] (high 2 bits used for mins[4-7])
  • bytes 8-11: low 4 bits for scales[4-7], high 4 bits for mins[4-7]

Definition at line 246 of file ckernel_quant.h.

{
    /* Direct 6-bit values for indices 0-3 */
    sc[0] = scales[0] & 0x3F;
    sc[1] = scales[1] & 0x3F;
    sc[2] = scales[2] & 0x3F;
    sc[3] = scales[3] & 0x3F;

    m[0] = scales[4] & 0x3F;
    m[1] = scales[5] & 0x3F;
    m[2] = scales[6] & 0x3F;
    m[3] = scales[7] & 0x3F;

    /* 6-bit values for indices 4-7: low 4 bits from bytes 8-11,
     * high 2 bits from upper bits of bytes 0-3 (scales) and 4-7 (mins) */
    sc[4] = (scales[8] & 0x0F) | ((scales[0] >> 6) << 4);
    sc[5] = (scales[9] & 0x0F) | ((scales[1] >> 6) << 4);
    sc[6] = (scales[10] & 0x0F) | ((scales[2] >> 6) << 4);
    sc[7] = (scales[11] & 0x0F) | ((scales[3] >> 6) << 4);

    m[4] = (scales[8] >> 4) | ((scales[4] >> 6) << 4);
    m[5] = (scales[9] >> 4) | ((scales[5] >> 6) << 4);
    m[6] = (scales[10] >> 4) | ((scales[6] >> 6) << 4);
    m[7] = (scales[11] >> 4) | ((scales[7] >> 6) << 4);
}

Referenced by dequant_q4_k_block(), dot_q4_k_q8_k_ref(), gemv_q4_k_backward_ref(), gemv_q4_k_q8_k_sse(), gemv_q4_k_ref(), and unpack_q5_k_scales().
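
A hedged sketch of how the unpacked values are consumed when dequantizing one Q4_K sub-block (mirroring the usage in gemv_q4_k_q8_k_sse() above; blk is an assumed block_q4_K pointer, j the sub-block index, and q a quantized 4-bit value):

uint8_t sc[8], mn[8];
unpack_q4_k_scales(blk->scales, sc, mn);

float d    = CK_FP16_TO_FP32(blk->d);
float dmin = CK_FP16_TO_FP32(blk->dmin);

/* Effective weight of quantized value q in sub-block j:
 *   w = d * sc[j] * q - dmin * mn[j]                     */
float w = d * (float)sc[j] * (float)q - dmin * (float)mn[j];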

◆ unpack_q5_k_scales()

static inline void unpack_q5_k_scales(const uint8_t *scales, uint8_t *sc, uint8_t *m)

Unpack Q5_K sub-block scales and mins.

Parameters
  scales  The packed scales[12] array from block_q5_K
  sc      Output: 8 unpacked scale values (multiply by super-block d)
  m       Output: 8 unpacked min values (multiply by super-block dmin)

Q5_K uses the same 6-bit packed format as Q4_K for scales/mins. The 12-byte scales array layout is identical:

  • bytes 0-3: 6-bit scales[0-3] (high 2 bits used for scales[4-7])
  • bytes 4-7: 6-bit mins[0-3] (high 2 bits used for mins[4-7])
  • bytes 8-11: low 4 bits for scales[4-7], high 4 bits for mins[4-7]

Definition at line 285 of file ckernel_quant.h.

{
    /* Q5_K uses identical packing as Q4_K for scales/mins */
    unpack_q4_k_scales(scales, sc, m);
}

References unpack_q4_k_scales().

◆ vec_dot_q5_0_q8_0()

void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)

Auto-dispatch quantized dot product Q5_0 x Q8_0.

Dispatch priority:

  1. AVX512 (best performance on modern Intel/AMD)
  2. AVX (256-bit float ops, works on Sandy/Ivy Bridge and newer)
  3. SSSE3 (128-bit fallback)
  4. Reference scalar (last resort)

Definition at line 1498 of file gemm_kernels_q5_0.c.

{
#if defined(__AVX512F__)
    vec_dot_q5_0_q8_0_avx512(n, s, vx, vy);
#elif defined(__AVX__)
    /* AVX for 256-bit float ops (works on Sandy/Ivy Bridge and newer) */
    vec_dot_q5_0_q8_0_avx(n, s, vx, vy);
#elif defined(__SSSE3__)
    /* SSSE3 - most efficient on older CPUs */
    vec_dot_q5_0_q8_0_sse(n, s, vx, vy);
#else
    vec_dot_q5_0_q8_0_ref(n, s, vx, vy);
#endif
}

References vec_dot_q5_0_q8_0_ref().

Referenced by out_proj_head_major_q5_0_q8_0().

◆ vec_dot_q8_0_q8_0()

void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy)

Auto-dispatch quantized dot product Q8_0 x Q8_0.

Definition at line 1013 of file gemm_kernels_q8_0.c.

{
#ifdef __AVX512F__
    vec_dot_q8_0_q8_0_avx512(n, s, vx, vy);
#elif defined(__AVX__)
    vec_dot_q8_0_q8_0_avx(n, s, vx, vy);
#elif defined(__SSE4_1__)
    vec_dot_q8_0_q8_0_sse(n, s, vx, vy);
#else
    vec_dot_q8_0_q8_0_ref(n, s, vx, vy);
#endif
}

References vec_dot_q8_0_q8_0_ref().

Referenced by out_proj_head_major_q8_0_q8_0().
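
Since both operands use the documented quantize_row_q8_0() format, a fully self-contained (if still hedged) dot-product sketch:

float a[32], b[32];
for (int j = 0; j < 32; j++) { a[j] = 1.0f; b[j] = 2.0f; }

block_q8_0 a_q8, b_q8;
quantize_row_q8_0(a, &a_q8, 32);
quantize_row_q8_0(b, &b_q8, 32);

float s = 0.0f;
vec_dot_q8_0_q8_0(32, &s, &a_q8, &b_q8);  /* s ~ 64.0f (dot of ones and twos) */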