Quantization block structures for weight-only quantization. More...
Go to the source code of this file.
Data Structures | |
| struct | block_q4_0 |
| struct | block_q4_1 |
| struct | block_q4_K |
| struct | block_q5_0 |
| struct | block_q5_1 |
| struct | block_q5_K |
| struct | block_q6_K |
| struct | block_q8_0 |
| struct | block_q8_K |
Macros | |
| #define | CK_FP16_TO_FP32(x) ck_fp16_to_fp32(x) |
| #define | CK_FP16_TO_FP32_SIMD(x) ck_fp16_to_fp32_simd(x) |
| #define | CK_FP16_TO_FP32_SOFT(x) ck_fp16_to_fp32_soft(x) |
| #define | CK_FP32_TO_FP16(x) ck_fp32_to_fp16(x) |
| #define | CK_FP32_TO_FP16_SIMD(x) ck_fp32_to_fp16_simd(x) |
| #define | CK_FP32_TO_FP16_SOFT(x) ck_fp32_to_fp16_soft(x) |
| #define | ggml_fp16_to_fp32 ck_fp16_to_fp32 |
| #define | GGML_FP16_TO_FP32 CK_FP16_TO_FP32 |
| #define | ggml_fp32_to_fp16 ck_fp32_to_fp16 |
| #define | GGML_FP32_TO_FP16 CK_FP32_TO_FP16 |
| #define | K_SCALE_SIZE 12 |
| #define | QK4_0 32 |
| #define | QK4_1 32 |
| #define | QK5_0 32 |
| #define | QK5_1 32 |
| #define | QK8_0 32 |
| #define | QK_K 256 |
Typedefs | |
| typedef uint16_t | ck_half |
| typedef ck_half | ggml_half |
Functions | |
| static float | ck_fp16_to_fp32 (ck_half h) |
| static float | ck_fp16_to_fp32_soft (ck_half h) |
| Convert FP16 (ck_half) to FP32 — software implementation. More... | |
| static ck_half | ck_fp32_to_fp16 (float f) |
| static ck_half | ck_fp32_to_fp16_soft (float f) |
| Convert FP32 to FP16 (ck_half) — software implementation. More... | |
| static size_t | ck_quant_block_size (int type) |
| Get the block size (number of weights per block) for a quant type. More... | |
| static size_t | ck_quant_row_size (int type, int64_t n_elements) |
| Calculate total bytes needed for n_elements with given quant type. More... | |
| static size_t | ck_quant_type_size (int type) |
| Get the byte size per block for a quant type. More... | |
| void | gemm_nt_q5_0_q8_0 (const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K) |
| Batch GEMM with Q5_0 weights and Q8_0 activations for prefill. More... | |
| void | gemm_nt_q5_0_q8_0_unroll_avx (const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K) |
| void | gemm_nt_q5_0_sse_v2 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K) |
| void | gemm_nt_q5_k (const float *A, const void *B, const float *bias, float *C, int M, int N, int K) |
| void | gemm_nt_q5_k_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K) |
| void | gemm_nt_q6_k_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K) |
| void | gemm_nt_q6_k_sse (const float *A, const void *B, const float *bias, float *C, int M, int N, int K) |
| void | gemv_q4_k_q8_k_sse (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q5_k (float *y, const void *W, const float *x, int M, int K) |
| void | gemv_q5_k_ref (float *y, const void *W, const float *x, int M, int K) |
| void | quantize_row_q8_0 (const float *x, void *vy, int k) |
| Quantize FP32 to Q8_0 format (scalar reference) More... | |
| void | quantize_row_q8_k_sse (const float *x, void *vy, int k) |
| void | rmsnorm_q8_k_fused (const float *input, const float *gamma, void *vy, int tokens, int d_model, int aligned_embed_dim, float eps) |
| static void | unpack_q4_k_scales (const uint8_t *scales, uint8_t *sc, uint8_t *m) |
| Unpack Q4_K sub-block scales and mins. More... | |
| static void | unpack_q5_k_scales (const uint8_t *scales, uint8_t *sc, uint8_t *m) |
| Unpack Q5_K sub-block scales and mins. More... | |
| void | vec_dot_q5_0_q8_0 (int n, float *s, const void *vx, const void *vy) |
| Auto-dispatch quantized dot product Q5_0 x Q8_0. More... | |
| void | vec_dot_q8_0_q8_0 (int n, float *s, const void *vx, const void *vy) |
| Auto-dispatch quantized dot product Q8_0 x Q8_0. More... | |
Quantization block structures for weight-only quantization.
Defines block structures for various quantization formats used in LLM inference. The primary focus is Q4_K_M, which is commonly used for LLM weight compression.
Block structures are compatible with llama.cpp/GGML for model loading.
Definition in file ckernel_quant.h.
| #define CK_FP16_TO_FP32 | ( | x | ) | ck_fp16_to_fp32(x) |
Definition at line 400 of file ckernel_quant.h.
| #define CK_FP16_TO_FP32_SIMD | ( | x | ) | ck_fp16_to_fp32_simd(x) |
Definition at line 402 of file ckernel_quant.h.
| #define CK_FP16_TO_FP32_SOFT | ( | x | ) | ck_fp16_to_fp32_soft(x) |
Definition at line 404 of file ckernel_quant.h.
| #define CK_FP32_TO_FP16 | ( | x | ) | ck_fp32_to_fp16(x) |
Definition at line 401 of file ckernel_quant.h.
| #define CK_FP32_TO_FP16_SIMD | ( | x | ) | ck_fp32_to_fp16_simd(x) |
Definition at line 403 of file ckernel_quant.h.
| #define CK_FP32_TO_FP16_SOFT | ( | x | ) | ck_fp32_to_fp16_soft(x) |
Definition at line 405 of file ckernel_quant.h.
| #define ggml_fp16_to_fp32 ck_fp16_to_fp32 |
Definition at line 409 of file ckernel_quant.h.
| #define GGML_FP16_TO_FP32 CK_FP16_TO_FP32 |
Definition at line 411 of file ckernel_quant.h.
| #define ggml_fp32_to_fp16 ck_fp32_to_fp16 |
Definition at line 410 of file ckernel_quant.h.
| #define GGML_FP32_TO_FP16 CK_FP32_TO_FP16 |
Definition at line 412 of file ckernel_quant.h.
| #define K_SCALE_SIZE 12 |
Definition at line 121 of file ckernel_quant.h.
| #define QK4_0 32 |
Definition at line 35 of file ckernel_quant.h.
| #define QK4_1 32 |
Definition at line 50 of file ckernel_quant.h.
| #define QK5_0 32 |
Definition at line 67 of file ckernel_quant.h.
| #define QK5_1 32 |
Definition at line 84 of file ckernel_quant.h.
| #define QK8_0 32 |
Definition at line 101 of file ckernel_quant.h.
| #define QK_K 256 |
Definition at line 120 of file ckernel_quant.h.
| typedef uint16_t ck_half |
Definition at line 26 of file ckernel_quant.h.
Definition at line 408 of file ckernel_quant.h.
|
inline static
Definition at line 383 of file ckernel_quant.h.
References ck_fp16_to_fp32_soft().
|
inline static
Convert FP16 (ck_half) to FP32 — software implementation.
Definition at line 303 of file ckernel_quant.h.
Referenced by ck_fp16_to_fp32().
|
inline static
Definition at line 391 of file ckernel_quant.h.
References ck_fp32_to_fp16_soft().
|
inline static
Convert FP32 to FP16 (ck_half) — software implementation.
Definition at line 337 of file ckernel_quant.h.
Referenced by ck_fp32_to_fp16().
|
inline static
Get the block size (number of weights per block) for a quant type.
Definition at line 184 of file ckernel_quant.h.
References CK_DT_Q4_1, CK_DT_Q5_0, CK_DT_Q5_1, CK_DT_Q5_K, CK_DT_Q6_K, QK4_0, QK4_1, QK5_0, QK5_1, QK8_0, and QK_K.
Referenced by ck_quant_row_size().
|
inline static
Calculate total bytes needed for n_elements with given quant type.
Definition at line 220 of file ckernel_quant.h.
References ck_quant_block_size(), and ck_quant_type_size().
|
inline static
Get the byte size per block for a quant type.
Definition at line 202 of file ckernel_quant.h.
References CK_DT_Q4_1, CK_DT_Q5_0, CK_DT_Q5_1, CK_DT_Q5_K, and CK_DT_Q6_K.
Referenced by ck_quant_row_size().
| void gemm_nt_q5_0_q8_0 | ( | const void * | A_q8, |
| const void * | B_q5, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.
Computes C = A @ B^T + bias, where A is an [M x K] matrix of Q8_0 quantized activations (M tokens, K features), B is an [N x K] matrix of Q5_0 quantized weights (N outputs, K features), and C is the [M x N] FP32 output.
This is the INT8 batch kernel for prefill, using pre-quantized activations to avoid FP32->Q8_0 conversion overhead per operation.
| A_q8 | Input activations in Q8_0 format [M rows of K/32 blocks each] |
| B_q5 | Weights in Q5_0 format [N rows of K/32 blocks each] |
| bias | Optional bias vector [N], NULL if not used |
| C | Output matrix [M x N], row-major FP32 |
| M | Batch size (number of tokens) |
| N | Output dimension (number of output features) |
| K | Input dimension (must be multiple of 32) |
Definition at line 1617 of file gemm_kernels_q5_0.c.
References C, QK5_0, and vec_dot_q5_0_q8_0().
Referenced by gemm_nt_q8_0_dispatch(), and gemm_nt_q8_0_mlp_dispatch().
| void gemm_nt_q5_0_q8_0_unroll_avx | ( | const void * | A_q8, |
| const void * | B_q5, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
| void gemm_nt_q5_0_sse_v2 | ( | const float * | A, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Definition at line 77 of file gemm_kernels_q5_0_sse_v2.c.
References C, dot_q5_0_q8_k_32_sse(), gemm_nt_q5_0_ref(), QK_K, and quantize_row_q8_k().
| void gemm_nt_q5_k | ( | const float * | A, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Definition at line 218 of file gemm_kernels_q5_k.c.
| void gemm_nt_q5_k_ref | ( | const float * | A, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Definition at line 145 of file gemm_kernels_q5_k.c.
References C, CK_FP16_TO_FP32, block_q5_K::d, block_q5_K::dmin, get_q5_k_scale_min(), block_q5_K::qh, QK_K, block_q5_K::qs, and block_q5_K::scales.
Referenced by gemm_nt_q5_k().
| void gemm_nt_q6_k_ref | ( | const float * | A, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Definition at line 243 of file gemm_kernels_q6k.c.
References C, and gemm_nt_q6_k().
Referenced by gemm_nt_q6_k_sse().
| void gemm_nt_q6_k_sse | ( | const float * | A, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Definition at line 66 of file gemm_kernels_q6k_sse.c.
References C, dot_q6_k_q8_k_256_sse(), gemm_nt_q6_k_ref(), QK_K, and quantize_row_q8_k().
| void gemv_q4_k_q8_k_sse | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Definition at line 33 of file gemm_kernels_q4k_sse.c.
References block_q8_K::bsums, CK_FP16_TO_FP32, block_q4_K::d, block_q8_K::d, block_q4_K::dmin, hsum_epi32_sse(), QK_K, block_q4_K::qs, block_q8_K::qs, block_q4_K::scales, and unpack_q4_k_scales().
| void gemv_q5_k | ( | float * | y, |
| const void * | W, | ||
| const float * | x, | ||
| int | M, | ||
| int | K | ||
| ) |
Definition at line 199 of file gemm_kernels_q5_k.c.
| void gemv_q5_k_ref | ( | float * | y, |
| const void * | W, | ||
| const float * | x, | ||
| int | M, | ||
| int | K | ||
| ) |
Definition at line 92 of file gemm_kernels_q5_k.c.
References CK_FP16_TO_FP32, block_q5_K::d, block_q5_K::dmin, get_q5_k_scale_min(), block_q5_K::qh, QK_K, block_q5_K::qs, and block_q5_K::scales.
Referenced by gemv_q5_k().
| void quantize_row_q8_0 | ( | const float * | x, |
| void * | vy, | ||
| int | k | ||
| ) |
Quantize FP32 to Q8_0 format (scalar reference).
| x | Input FP32 values |
| vy | Output Q8_0 blocks |
| k | Number of elements (must be multiple of 32) |
Definition at line 59 of file gemm_kernels_q8_0.c.
| void quantize_row_q8_k_sse | ( | const float * | x, |
| void * | vy, | ||
| int | k | ||
| ) |
Definition at line 29 of file quantize_row_q8_k_sse.c.
References block_q8_K::bsums, block_q8_K::d, and QK_K.
| void rmsnorm_q8_k_fused | ( | const float * | input, |
| const float * | gamma, | ||
| void * | vy, | ||
| int | tokens, | ||
| int | d_model, | ||
| int | aligned_embed_dim, | ||
| float | eps | ||
| ) |
Fused RMSNorm + Q8_K Quantization
Benefits:
Definition at line 54 of file rmsnorm_q8_k_fused.c.
References block_q8_K::bsums, block_q8_K::d, hmax256_ps_fused(), hsum256_ps_fused(), QK_K, and block_q8_K::qs.
|
inline static
Unpack Q4_K sub-block scales and mins.
| scales | The packed scales[12] array from block_q4_K |
| sc | Output: 8 unpacked scale values (multiply by super-block d) |
| m | Output: 8 unpacked min values (multiply by super-block dmin) |
This matches llama.cpp's get_scale_min_k4() function exactly. The 12-byte scales array layout:
Definition at line 246 of file ckernel_quant.h.
Referenced by dequant_q4_k_block(), dot_q4_k_q8_k_ref(), gemv_q4_k_backward_ref(), gemv_q4_k_q8_k_sse(), gemv_q4_k_ref(), and unpack_q5_k_scales().
|
inline static
Unpack Q5_K sub-block scales and mins.
| scales | The packed scales[12] array from block_q5_K |
| sc | Output: 8 unpacked scale values (multiply by super-block d) |
| m | Output: 8 unpacked min values (multiply by super-block dmin) |
Q5_K uses the same 6-bit packed format as Q4_K for scales/mins. The 12-byte scales array layout is identical:
Definition at line 285 of file ckernel_quant.h.
References unpack_q4_k_scales().
| void vec_dot_q5_0_q8_0 | ( | int | n, |
| float * | s, | ||
| const void * | vx, | ||
| const void * | vy | ||
| ) |
Auto-dispatch quantized dot product Q5_0 x Q8_0.
Dispatch priority:
Definition at line 1498 of file gemm_kernels_q5_0.c.
References vec_dot_q5_0_q8_0_ref().
Referenced by out_proj_head_major_q5_0_q8_0().
| void vec_dot_q8_0_q8_0 | ( | int | n, |
| float * | s, | ||
| const void * | vx, | ||
| const void * | vy | ||
| ) |
Auto-dispatch quantized dot product Q8_0 x Q8_0.
Definition at line 1013 of file gemm_kernels_q8_0.c.
References vec_dot_q8_0_q8_0_ref().
Referenced by out_proj_head_major_q8_0_q8_0().