GEMM/GEMV kernels with Q5_0 quantized weights. More...
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include "ckernel_quant.h"
#include "ck_features.h"
Go to the source code of this file.
Functions | |
| void | dequant_q5_0_block (const block_q5_0 *block, float *output) |
| Dequantize a single Q5_0 block to FP32. More... | |
| void | dequant_q5_0_row (const void *src, float *dst, size_t n_elements) |
| Dequantize Q5_0 row (multiple blocks) More... | |
| float | dot_q5_0 (const void *w_q5_0, const float *x, int K) |
| void | gemm_nt_q5_0 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K) |
| void | gemm_nt_q5_0_q8_0 (const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K) |
| Batch GEMM with Q5_0 weights and Q8_0 activations for prefill. More... | |
| void | gemm_nt_q5_0_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K) |
| GEMM with transposed Q5_0 weights: C = A @ B^T. More... | |
| void | gemm_nt_q5_0_sse_v2 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K) |
| void | gemm_q5_0 (float *Y, const void *W, const float *X, int M, int N, int K) |
| Matrix-matrix multiply with Q5_0 weights. More... | |
| void | gemm_q5_0_backward (float *dX, const void *W, const float *dY, int M, int N, int K) |
| Batched backward pass. More... | |
| void | gemv_q5_0 (float *y, const void *W, const float *x, int M, int K) |
| Auto-dispatch GEMV for Q5_0 weights based on CPU features. More... | |
| void | gemv_q5_0_backward (float *dX, const void *W, const float *dY, int M, int K) |
| Auto-dispatch backward. More... | |
| void | gemv_q5_0_backward_ref (float *dX, const void *W, const float *dY, int M, int K) |
| Backward pass: compute input gradient. More... | |
| void | gemv_q5_0_parallel (float *y, const void *W, const float *x, int M, int K, int ith, int nth) |
| Parallel reference GEMV for Q5_0 × FP32. More... | |
| void | gemv_q5_0_parallel_simd (float *y, const void *W, const float *x, int M, int K, int ith, int nth) |
| Parallel SIMD GEMV for Q5_0 × FP32 with prefetching. More... | |
| void | gemv_q5_0_q8_0 (float *y, const void *W, const void *x_q8, int M, int K) |
| Matrix-vector multiply with Q5_0 weights and Q8_0 input. More... | |
| void | gemv_q5_0_q8_0_parallel_simd (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth) |
| Parallel SIMD GEMV for Q5_0 x Q8_0 with prefetching. More... | |
| void | gemv_q5_0_ref (float *y, const void *W, const float *x, int M, int K) |
| Matrix-vector multiply with Q5_0 weights (scalar reference) More... | |
| void | vec_dot_q5_0_q8_0 (int n, float *s, const void *vx, const void *vy) |
| Auto-dispatch quantized dot product Q5_0 x Q8_0. More... | |
| void | vec_dot_q5_0_q8_0_ref (int n, float *s, const void *vx, const void *vy) |
| Quantized dot product: Q5_0 weights x Q8_0 input (scalar reference) More... | |
GEMM/GEMV kernels with Q5_0 quantized weights.
After changes: make test && make llamacpp-parity-full
Q5_0 Format:
Dequantization: w = scale * (q5 - 16), where q5 = low_nibble | (high_bit << 4). The assembled 5-bit value q5 spans 0-31; subtracting 16 maps it to the signed range [-16, +15].
Operations: Forward: Y = W @ X (W is Q5_0, X and Y are FP32) Backward: dX = W^T @ dY (gradient w.r.t. input)
Definition in file gemm_kernels_q5_0.c.
| void dequant_q5_0_block | ( | const block_q5_0 * | block, |
| float * | output | ||
| ) |
Dequantize a single Q5_0 block to FP32.
| block | Pointer to Q5_0 block (22 bytes) |
| output | Output FP32 array (32 floats) |
Definition at line 161 of file dequant_kernels.c.
References block_q5_0::d, GGML_FP16_TO_FP32, block_q5_0::qh, QK5_0, and block_q5_0::qs.
Referenced by dequant_q5_0_row().
| void dequant_q5_0_row | ( | const void * | src, |
| float * | dst, | ||
| size_t | n_elements | ||
| ) |
Dequantize Q5_0 row (multiple blocks)
Definition at line 196 of file dequant_kernels.c.
Referenced by dequant_row().
| float dot_q5_0 | ( | const void * | w_q5_0, |
| const float * | x, | ||
| int | K | ||
| ) |
Definition at line 870 of file gemm_kernels_q5_0.c.
References gemv_q5_0().
| void gemm_nt_q5_0 | ( | const float * | A, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Definition at line 831 of file gemm_kernels_q5_0.c.
References C, gemm_q5_0(), and gemv_q5_0().
Referenced by ck_gemm_nt_quant(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().
| void gemm_nt_q5_0_q8_0 | ( | const void * | A_q8, |
| const void * | B_q5, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.
Computes C = A @ B^T + bias where: A: [M x K] Q8_0 quantized activations (M tokens, K features) B: [N x K] Q5_0 quantized weights (N outputs, K features) C: [M x N] FP32 output
This is the INT8 batch kernel for prefill, using pre-quantized activations to avoid FP32->Q8_0 conversion overhead per operation.
| A_q8 | Input activations in Q8_0 format [M rows of K/32 blocks each] |
| B_q5 | Weights in Q5_0 format [N rows of K/32 blocks each] |
| bias | Optional bias vector [N], NULL if not used |
| C | Output matrix [M x N], row-major FP32 |
| M | Batch size (number of tokens) |
| N | Output dimension (number of output features) |
| K | Input dimension (must be multiple of 32) |
Definition at line 1617 of file gemm_kernels_q5_0.c.
References C, QK5_0, and vec_dot_q5_0_q8_0().
Referenced by ck_test_gemm_q5_0(), gemm_nt_q8_0_dispatch(), and gemm_nt_q8_0_mlp_dispatch().
| void gemm_nt_q5_0_ref | ( | const float * | A, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
GEMM with transposed Q5_0 weights: C = A @ B^T.
| A | Input activations [M x K], row-major FP32 |
| B | Weight matrix in Q5_0 format [N x K], row-major quantized |
| bias | Optional bias [N], NULL if not used |
| C | Output [M x N], row-major FP32 |
| M | Batch size (number of tokens) |
| N | Output dimension (number of rows in B) |
| K | Input dimension |
Definition at line 788 of file gemm_kernels_q5_0.c.
References C, CK_FP16_TO_FP32, block_q5_0::d, block_q5_0::qh, QK5_0, and block_q5_0::qs.
Referenced by gemm_nt_q5_0_sse_v2().
| void gemm_nt_q5_0_sse_v2 | ( | const float * | A, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Definition at line 77 of file gemm_kernels_q5_0_sse_v2.c.
| void gemm_q5_0 | ( | float * | Y, |
| const void * | W, | ||
| const float * | X, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Matrix-matrix multiply with Q5_0 weights.
Definition at line 682 of file gemm_kernels_q5_0.c.
References gemv_q5_0().
Referenced by gemm_nt_q5_0().
| void gemm_q5_0_backward | ( | float * | dX, |
| const void * | W, | ||
| const float * | dY, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Batched backward pass.
Definition at line 762 of file gemm_kernels_q5_0.c.
References gemv_q5_0_backward().
| void gemv_q5_0 | ( | float * | y, |
| const void * | W, | ||
| const float * | x, | ||
| int | M, | ||
| int | K | ||
| ) |
Auto-dispatch GEMV for Q5_0 weights based on CPU features.
Dispatch priority (best available):
Uses ck_features.h for standardized feature detection.
| y | Output vector [M] |
| W | Weight matrix in Q5_0 format [M x K] |
| x | Input vector [K] |
| M | Number of output rows |
| K | Number of input columns (hidden dimension) |
Definition at line 547 of file gemm_kernels_q5_0.c.
References gemv_q5_0_ref().
Referenced by dot_q5_0(), gemm_nt_q5_0(), and gemm_q5_0().
| void gemv_q5_0_backward | ( | float * | dX, |
| const void * | W, | ||
| const float * | dY, | ||
| int | M, | ||
| int | K | ||
| ) |
Auto-dispatch backward.
Definition at line 751 of file gemm_kernels_q5_0.c.
References gemv_q5_0_backward_ref().
Referenced by gemm_q5_0_backward().
| void gemv_q5_0_backward_ref | ( | float * | dX, |
| const void * | W, | ||
| const float * | dY, | ||
| int | M, | ||
| int | K | ||
| ) |
Backward pass: compute input gradient.
| dX | Output gradient w.r.t. input [K] |
| W | Weight matrix in Q5_0 format [M x K] |
| dY | Gradient w.r.t. output [M] |
| M | Number of output rows |
| K | Number of columns (input dimension) |
Definition at line 705 of file gemm_kernels_q5_0.c.
References CK_FP16_TO_FP32, block_q5_0::d, block_q5_0::qh, QK5_0, and block_q5_0::qs.
Referenced by gemv_q5_0_backward().
| void gemv_q5_0_parallel | ( | float * | y, |
| const void * | W, | ||
| const float * | x, | ||
| int | M, | ||
| int | K, | ||
| int | ith, | ||
| int | nth | ||
| ) |
Parallel reference GEMV for Q5_0 × FP32.
Definition at line 576 of file gemm_kernels_q5_0.c.
References CK_FP16_TO_FP32, block_q5_0::d, block_q5_0::qh, QK5_0, and block_q5_0::qs.
Referenced by gemv_q5_0_parallel_simd().
| void gemv_q5_0_parallel_simd | ( | float * | y, |
| const void * | W, | ||
| const float * | x, | ||
| int | M, | ||
| int | K, | ||
| int | ith, | ||
| int | nth | ||
| ) |
Parallel SIMD GEMV for Q5_0 × FP32 with prefetching.
Definition at line 622 of file gemm_kernels_q5_0.c.
References gemv_q5_0_parallel(), and QK5_0.
| void gemv_q5_0_q8_0 | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Matrix-vector multiply with Q5_0 weights and Q8_0 input.
| y | Output vector [M] |
| W | Weight matrix in Q5_0 format [M x K] |
| x_q8 | Input vector in Q8_0 format [K] |
| M | Number of output rows |
| K | Number of columns (must be multiple of 32) |
Definition at line 1529 of file gemm_kernels_q5_0.c.
References QK5_0, and vec_dot_q5_0_q8_0().
Referenced by ck_test_gemv_q5_0(), and ck_test_gemv_q5_0_q8_0().
| void gemv_q5_0_q8_0_parallel_simd | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K, | ||
| int | ith, | ||
| int | nth | ||
| ) |
Parallel SIMD GEMV for Q5_0 x Q8_0 with prefetching.
Each thread processes rows [r0, r1) where r0 = ith * ceil(M/nth). Uses vec_dot_q5_0_q8_0 dispatch (auto-selects AVX512/AVX/SSE/scalar).
Definition at line 1551 of file gemm_kernels_q5_0.c.
References QK5_0, and vec_dot_q5_0_q8_0().
| void gemv_q5_0_ref | ( | float * | y, |
| const void * | W, | ||
| const float * | x, | ||
| int | M, | ||
| int | K | ||
| ) |
Matrix-vector multiply with Q5_0 weights (scalar reference)
| y | Output vector [M] |
| W | Weight matrix in Q5_0 format [M x K] |
| x | Input vector [K] |
| M | Number of output rows |
| K | Number of columns (must be multiple of 32) |
Definition at line 64 of file gemm_kernels_q5_0.c.
References CK_FP16_TO_FP32, block_q5_0::d, block_q5_0::qh, QK5_0, and block_q5_0::qs.
Referenced by gemv_q5_0().
| void vec_dot_q5_0_q8_0 | ( | int | n, |
| float * | s, | ||
| const void * | vx, | ||
| const void * | vy | ||
| ) |
Auto-dispatch quantized dot product Q5_0 x Q8_0.
Dispatch priority:
Definition at line 1498 of file gemm_kernels_q5_0.c.
References vec_dot_q5_0_q8_0_ref().
Referenced by ck_test_vec_dot_q5_0_q8_0(), gemm_nt_q5_0_q8_0(), gemv_fused_q5_0_bias_parallel_omp(), gemv_q5_0_from_fp32(), gemv_q5_0_q8_0(), gemv_q5_0_q8_0_parallel_omp(), gemv_q5_0_q8_0_parallel_simd(), mega_fused_attention_decode_q5_0(), mega_fused_attention_decode_q5_0_parallel_simd(), and out_proj_head_major_q5_0_q8_0().
| void vec_dot_q5_0_q8_0_ref | ( | int | n, |
| float * | s, | ||
| const void * | vx, | ||
| const void * | vy | ||
| ) |
Quantized dot product: Q5_0 weights x Q8_0 input (scalar reference)
| n | Number of elements (must be multiple of 32) |
| s | Output: scalar dot product result |
| vx | Q5_0 quantized weights |
| vy | Q8_0 quantized input |
Definition at line 899 of file gemm_kernels_q5_0.c.
References CK_FP16_TO_FP32, QK5_0, and block_q8_0::qs.
Referenced by vec_dot_q5_0_q8_0().