GEMM/GEMV kernels with Q8_0 quantized weights. More...
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "ckernel_quant.h"
#include "ck_features.h"
Go to the source code of this file.
Functions | |
| float | dot_q8_0 (const void *w_q8_0, const float *x, int K) |
| void | gemm_nt_q8_0 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K) |
| Matrix-matrix multiply: C[M,N] = A[M,K] @ B[N,K]^T + bias. More... | |
| void | gemm_q8_0 (float *Y, const void *W, const float *X, int M, int N, int K) |
| Matrix-matrix multiply with Q8_0 weights. More... | |
| void | gemm_q8_0_backward (float *dX, const void *W, const float *dY, int M, int N, int K) |
| Batched backward pass. More... | |
| void | gemv_q8_0 (float *y, const void *W, const float *x, int M, int K) |
| Auto-dispatch GEMV for Q8_0 weights based on CPU features. More... | |
| void | gemv_q8_0_backward (float *dX, const void *W, const float *dY, int M, int K) |
| Auto-dispatch backward. More... | |
| void | gemv_q8_0_backward_ref (float *dX, const void *W, const float *dY, int M, int K) |
| Backward pass: compute input gradient (scalar reference) More... | |
| void | gemv_q8_0_parallel_simd (float *y, const void *W, const float *x, int M, int K, int ith, int nth) |
| Parallel SIMD GEMV for Q8_0 weights x FP32 input with prefetching. More... | |
| void | gemv_q8_0_q8_0 (float *y, const void *W, const void *x_q8, int M, int K) |
| Matrix-vector multiply with Q8_0 weights and Q8_0 input. More... | |
| void | gemv_q8_0_q8_0_parallel (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth) |
| Parallel reference GEMV for Q8_0 x Q8_0. More... | |
| void | gemv_q8_0_q8_0_parallel_simd (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth) |
| Parallel SIMD GEMV for Q8_0 x Q8_0 with prefetching. More... | |
| void | gemv_q8_0_ref (float *y, const void *W, const float *x, int M, int K) |
| Matrix-vector multiply with Q8_0 weights (scalar reference) More... | |
| void | quantize_batch_q8_0 (const float *x, void *vy, int num_rows, int k) |
| Batch quantize FP32 to Q8_0 format (row-major output) More... | |
| void | quantize_batch_q8_k (const float *x, void *vy, int num_rows, int k) |
| Batch quantize FP32 to Q8_K format (row-major output) More... | |
| void | quantize_row_q8_0 (const float *x, void *vy, int k) |
| Quantize FP32 to Q8_0 format (scalar reference) More... | |
| void | quantize_row_q8_k (const float *x, void *vy, int k) |
| void | vec_dot_q8_0_q8_0 (int n, float *s, const void *vx, const void *vy) |
| Auto-dispatch quantized dot product Q8_0 x Q8_0. More... | |
| void | vec_dot_q8_0_q8_0_ref (int n, float *s, const void *vx, const void *vy) |
| Quantized dot product: Q8_0 weights x Q8_0 input (scalar reference) More... | |
GEMM/GEMV kernels with Q8_0 quantized weights.
After changes: make test && make llamacpp-parity-full
Q8_0 Format: values are stored in blocks of 32 (QK8_0) int8 quantized values (block_q8_0::qs) with one FP16 scale per block (block_q8_0::d), giving 34 bytes per block.
Operations: Forward: Y = W @ X (W is Q8_0, X and Y are FP32) Backward: dX = W^T @ dY (gradient w.r.t. input)
Note: Q8_0 is often used for activation quantization or as an intermediate format. Higher precision than Q4_0/Q4_K.
Definition in file gemm_kernels_q8_0.c.
| float dot_q8_0 | ( | const void * | w_q8_0, |
| const float * | x, | ||
| int | K | ||
| ) |
Definition at line 834 of file gemm_kernels_q8_0.c.
References gemv_q8_0().
| void gemm_nt_q8_0 | ( | const float * | A, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Matrix-matrix multiply: C[M,N] = A[M,K] @ B[N,K]^T + bias.
| A | Input matrix [M x K], row-major FP32 |
| B | Weight matrix in Q8_0 format, [N x K] stored row-major |
| bias | Optional bias [N], NULL if not used |
| C | Output [M x N], row-major FP32 |
| M | Batch size (number of tokens) |
| N | Output dimension (number of rows in B) |
| K | Input dimension |
Definition at line 681 of file gemm_kernels_q8_0.c.
References C, CK_FP16_TO_FP32, block_q8_0::d, gemv_q8_0(), QK8_0, and block_q8_0::qs.
Referenced by ck_gemm_nt_quant(), qwen2_0_5b_decode_decode_token(), qwen2_0_5b_decode_forward_prefill_impl(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().
| void gemm_q8_0 | ( | float * | Y, |
| const void * | W, | ||
| const float * | X, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Matrix-matrix multiply with Q8_0 weights.
Definition at line 656 of file gemm_kernels_q8_0.c.
References gemv_q8_0().
| void gemm_q8_0_backward | ( | float * | dX, |
| const void * | W, | ||
| const float * | dY, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Batched backward pass.
Definition at line 820 of file gemm_kernels_q8_0.c.
References gemv_q8_0_backward().
| void gemv_q8_0 | ( | float * | y, |
| const void * | W, | ||
| const float * | x, | ||
| int | M, | ||
| int | K | ||
| ) |
Auto-dispatch GEMV for Q8_0 weights based on CPU features.
Dispatch priority: the best available SIMD implementation for the detected CPU is selected, falling back to the scalar reference (gemv_q8_0_ref) when no SIMD path is available.
Uses ck_features.h for standardized feature detection.
| y | Output vector [M] |
| W | Weight matrix in Q8_0 format [M x K] |
| x | Input vector [K] |
| M | Number of output rows |
| K | Number of input columns (hidden dimension) |
Definition at line 630 of file gemm_kernels_q8_0.c.
References gemv_q8_0_ref().
Referenced by dot_q8_0(), gemm_nt_q8_0(), and gemm_q8_0().
| void gemv_q8_0_backward | ( | float * | dX, |
| const void * | W, | ||
| const float * | dY, | ||
| int | M, | ||
| int | K | ||
| ) |
Auto-dispatch backward.
Definition at line 805 of file gemm_kernels_q8_0.c.
References gemv_q8_0_backward_ref().
Referenced by gemm_q8_0_backward().
| void gemv_q8_0_backward_ref | ( | float * | dX, |
| const void * | W, | ||
| const float * | dY, | ||
| int | M, | ||
| int | K | ||
| ) |
Backward pass: compute input gradient (scalar reference)
| dX | Output gradient w.r.t. input [K] |
| W | Weight matrix in Q8_0 format [M x K] |
| dY | Gradient w.r.t. output [M] |
| M | Number of output rows |
| K | Number of columns (input dimension) |
Definition at line 733 of file gemm_kernels_q8_0.c.
References CK_FP16_TO_FP32, block_q8_0::d, QK8_0, and block_q8_0::qs.
Referenced by gemv_q8_0_backward().
| void gemv_q8_0_parallel_simd | ( | float * | y, |
| const void * | W, | ||
| const float * | x, | ||
| int | M, | ||
| int | K, | ||
| int | ith, | ||
| int | nth | ||
| ) |
Parallel SIMD GEMV for Q8_0 weights x FP32 input with prefetching.
Definition at line 1153 of file gemm_kernels_q8_0.c.
References gemv_q8_0_ref(), and QK8_0.
| void gemv_q8_0_q8_0 | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Matrix-vector multiply with Q8_0 weights and Q8_0 input.
| y | Output vector [M] |
| W | Weight matrix in Q8_0 format [M x K] |
| x_q8 | Input vector in Q8_0 format [K] |
| M | Number of output rows |
| K | Number of columns (must be multiple of 32) |
Definition at line 1042 of file gemm_kernels_q8_0.c.
References QK8_0, and vec_dot_q8_0_q8_0().
Referenced by ck_test_gemv_q8_0(), and ck_test_gemv_q8_0_q8_0().
| void gemv_q8_0_q8_0_parallel | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K, | ||
| int | ith, | ||
| int | nth | ||
| ) |
Parallel reference GEMV for Q8_0 x Q8_0.
Definition at line 1068 of file gemm_kernels_q8_0.c.
References QK8_0, and vec_dot_q8_0_q8_0().
| void gemv_q8_0_q8_0_parallel_simd | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K, | ||
| int | ith, | ||
| int | nth | ||
| ) |
Parallel SIMD GEMV for Q8_0 x Q8_0 with prefetching.
Each thread processes rows [r0, r1) where r0 = ith * ceil(M/nth). Prefetches upcoming weight rows to hide memory latency.
Definition at line 1100 of file gemm_kernels_q8_0.c.
References QK8_0, and vec_dot_q8_0_q8_0().
| void gemv_q8_0_ref | ( | float * | y, |
| const void * | W, | ||
| const float * | x, | ||
| int | M, | ||
| int | K | ||
| ) |
Matrix-vector multiply with Q8_0 weights (scalar reference)
| y | Output vector [M] |
| W | Weight matrix in Q8_0 format [M x K] |
| x | Input vector [K] |
| M | Number of output rows |
| K | Number of columns (must be multiple of 32) |
Definition at line 252 of file gemm_kernels_q8_0.c.
References CK_FP16_TO_FP32, block_q8_0::d, QK8_0, and block_q8_0::qs.
Referenced by gemv_q8_0(), and gemv_q8_0_parallel_simd().
| void quantize_batch_q8_0 | ( | const float * | x, |
| void * | vy, | ||
| int | num_rows, | ||
| int | k | ||
| ) |
Batch quantize FP32 to Q8_0 format (row-major output)
Quantizes multiple rows of FP32 data to Q8_0 format, placing each row's Q8_0 output at the correct byte offset for GEMM compatibility.
Memory layout: Input: [num_rows, k] FP32, row-major (stride = k * sizeof(float)) Output: [num_rows, q8_row_bytes] Q8_0, row-major (stride = q8_row_bytes)
where q8_row_bytes = (k / 32) * sizeof(block_q8_0) = (k / 32) * 34
| x | Input FP32 values [num_rows * k] |
| vy | Output Q8_0 blocks [num_rows * (k/32) blocks] |
| num_rows | Number of rows (batch size / tokens) |
| k | Elements per row (must be multiple of 32) |
Definition at line 192 of file gemm_kernels_q8_0.c.
References QK8_0, and quantize_row_q8_0().
| void quantize_batch_q8_k | ( | const float * | x, |
| void * | vy, | ||
| int | num_rows, | ||
| int | k | ||
| ) |
Batch quantize FP32 to Q8_K format (row-major output)
Same as quantize_batch_q8_0 but for Q8_K format (super-blocks).
| x | Input FP32 values [num_rows * k] |
| vy | Output Q8_K blocks |
| num_rows | Number of rows (batch size / tokens) |
| k | Elements per row (must be multiple of 256) |
Definition at line 219 of file gemm_kernels_q8_0.c.
References quantize_row_q8_k().
| void quantize_row_q8_0 | ( | const float * | x, |
| void * | vy, | ||
| int | k | ||
| ) |
Quantize FP32 to Q8_0 format (scalar reference)
| x | Input FP32 values |
| vy | Output Q8_0 blocks |
| k | Number of elements (must be multiple of 32) |
Definition at line 59 of file gemm_kernels_q8_0.c.
References CK_FP32_TO_FP16, block_q8_0::d, id, QK8_0, and block_q8_0::qs.
Referenced by ck_test_gemm_q5_0(), ck_test_gemm_q8_0(), ck_test_gemv_q5_0(), ck_test_gemv_q5_0_q8_0(), ck_test_gemv_q8_0(), ck_test_gemv_q8_0_q8_0(), fused_mlp_swiglu_prefill_w1w2_quant(), fused_rmsnorm_qkv_prefill_head_major_quant(), gemv_fused_q5_0_bias_parallel_omp(), gemv_q5_0_from_fp32(), gemv_q8_0_from_fp32(), mega_fused_attention_decode_q5_0(), mega_fused_attention_decode_q5_0_parallel_simd(), quantize_attn_out_head_major_q8_0(), and quantize_batch_q8_0().
| void quantize_row_q8_k | ( | const float * | x, |
| void * | vy, | ||
| int | k | ||
| ) |
Definition at line 107 of file gemm_kernels_q4k_q8k.c.
Referenced by quantize_batch_q8_k().
| void vec_dot_q8_0_q8_0 | ( | int | n, |
| float * | s, | ||
| const void * | vx, | ||
| const void * | vy | ||
| ) |
Auto-dispatch quantized dot product Q8_0 x Q8_0.
Definition at line 1013 of file gemm_kernels_q8_0.c.
References vec_dot_q8_0_q8_0_ref().
Referenced by ck_test_vec_dot_q8_0_q8_0(), gemv_q8_0_from_fp32(), gemv_q8_0_q8_0(), gemv_q8_0_q8_0_parallel(), gemv_q8_0_q8_0_parallel_omp(), gemv_q8_0_q8_0_parallel_simd(), and out_proj_head_major_q8_0_q8_0().
| void vec_dot_q8_0_q8_0_ref | ( | int | n, |
| float * | s, | ||
| const void * | vx, | ||
| const void * | vy | ||
| ) |
Quantized dot product: Q8_0 weights x Q8_0 input (scalar reference)
| n | Number of elements (must be multiple of 32) |
| s | Output: scalar dot product result |
| vx | Q8_0 quantized weights |
| vy | Q8_0 quantized input |
Definition at line 863 of file gemm_kernels_q8_0.c.
References CK_FP16_TO_FP32, QK8_0, and block_q8_0::qs.
Referenced by vec_dot_q8_0_q8_0().