Q4_K (weights) x Q8_K (activations) kernels for inference.
Go to the source code of this file.
Functions
| static int | ck_nearest_int (float fval) |
| static float | dot_q4_k_q8_k_ref (const block_q4_K *w, const block_q8_K *x, int k) |
| void | gemm_nt_q4_k_q8_k (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K) |
| void | gemm_q4_k_q8_k (float *Y, const void *W, const void *X_q8, int M, int N, int K) |
| void | gemm_q4_k_q8_k_ref (float *Y, const void *W, const void *X_q8, int M, int N, int K) |
| void | gemv_q4_k_q8_k (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q4_k_q8_k_avx (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q4_k_q8_k_avx2 (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q4_k_q8_k_parallel (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth) |
| void | gemv_q4_k_q8_k_ref (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q4_k_q8_k_sse (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q4_k_q8_k_vnni (float *y, const void *W, const void *x_q8, int M, int K) |
| void | quantize_row_q8_k (const float *x, void *vy, int k) |
| void | quantize_row_q8_k_ref (const float *x, void *vy, int k) |
| void | quantize_row_q8_k_sse (const float *x, void *vy, int k) |
Q4_K (weights) x Q8_K (activations) kernels for inference.
After changing this file, run: make test && make llamacpp-parity-full
Implements decode-style matvec/matmul where weights are Q4_K and the activations are quantized on-the-fly to Q8_K. This is inference-only; no backward pass is provided here.
Definition in file gemm_kernels_q4k_q8k.c.
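| static int ck_nearest_int | ( | float | fval | ) | inline static |
| static float dot_q4_k_q8_k_ref | ( | const block_q4_K * | w, |
| const block_q8_K * | x, | ||
| int | k | ||
| ) | static |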
Definition at line 115 of file gemm_kernels_q4k_q8k.c.
References block_q8_K::bsums, CK_FP16_TO_FP32, block_q8_K::d, QK_K, block_q4_K::qs, block_q8_K::qs, and unpack_q4_k_scales().
Referenced by gemv_q4_k_q8_k_parallel(), and gemv_q4_k_q8_k_ref().
| void gemm_nt_q4_k_q8_k | ( | const void * | A_q8, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Definition at line 295 of file gemm_kernels_q4k_q8k.c.
References C, and gemm_q4_k_q8_k().
Referenced by ck_attention_project_head_major_q4_k_q8_k(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), ck_mlp_swiglu_forward_q4_k_q8_k(), ck_mlp_swiglu_forward_q4_k_q8_k_prefill(), ck_qkv_project_head_major_token_q4_k_q8_k(), ck_test_gemm_q4_k(), gemm_nt_q8_k_mlp_dispatch(), gemm_nt_q8_k_qkv_dispatch(), model_forward_prefill_impl(), and qwen2_0_5b_decode_forward_prefill_impl().
| void gemm_q4_k_q8_k | ( | float * | Y, |
| const void * | W, | ||
| const void * | X_q8, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Definition at line 277 of file gemm_kernels_q4k_q8k.c.
References gemv_q4_k_q8_k(), and QK_K.
Referenced by gemm_nt_q4_k_q8_k().
| void gemm_q4_k_q8_k_ref | ( | float * | Y, |
| const void * | W, | ||
| const void * | X_q8, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Definition at line 259 of file gemm_kernels_q4k_q8k.c.
References gemv_q4_k_q8_k_ref(), and QK_K.
| void gemv_q4_k_q8_k | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Definition at line 239 of file gemm_kernels_q4k_q8k.c.
References gemv_q4_k_q8_k_avx(), gemv_q4_k_q8_k_avx2(), gemv_q4_k_q8_k_ref(), gemv_q4_k_q8_k_sse(), and gemv_q4_k_q8_k_vnni().
Referenced by ck_test_gemv_q4_k(), fused_rmsnorm_linear_q4k(), gemm_q4_k_q8_k(), model_decode_token(), model_layer_0_decode(), model_layer_10_decode(), model_layer_11_decode(), model_layer_12_decode(), model_layer_13_decode(), model_layer_14_decode(), model_layer_15_decode(), model_layer_16_decode(), model_layer_17_decode(), model_layer_18_decode(), model_layer_19_decode(), model_layer_1_decode(), model_layer_20_decode(), model_layer_21_decode(), model_layer_22_decode(), model_layer_23_decode(), model_layer_2_decode(), model_layer_3_decode(), model_layer_4_decode(), model_layer_5_decode(), model_layer_6_decode(), model_layer_7_decode(), model_layer_8_decode(), model_layer_9_decode(), qwen2_0_5b_decode_decode_token(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), qwen2_0_5b_decode_layer_9_decode(), and unfused_rmsnorm_linear_q4k_ref().
| void gemv_q4_k_q8_k_avx | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Definition at line 251 of file gemm_kernels_q4k_avx.c.
Referenced by gemv_q4_k_q8_k().
| void gemv_q4_k_q8_k_avx2 | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Definition at line 89 of file gemm_kernels_q4k_q8k_avx2.c.
Referenced by gemv_q4_k_q8_k().
| void gemv_q4_k_q8_k_parallel | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K, | ||
| int | ith, | ||
| int | nth | ||
| ) |
Definition at line 206 of file gemm_kernels_q4k_q8k.c.
References dot_q4_k_q8_k_ref(), and QK_K.
Referenced by gemv_q4_k_q8_k_parallel_simd().
| void gemv_q4_k_q8_k_ref | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Definition at line 177 of file gemm_kernels_q4k_q8k.c.
References dot_q4_k_q8_k_ref(), and QK_K.
Referenced by gemm_q4_k_q8_k_ref(), gemv_q4_k_q8_k(), gemv_q4_k_q8_k_amx(), gemv_q4_k_q8_k_avx(), gemv_q4_k_q8_k_avx2(), and gemv_q4_k_q8_k_vnni().
| void gemv_q4_k_q8_k_sse | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
| void gemv_q4_k_q8_k_vnni | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Definition at line 95 of file gemm_kernels_q4k_q8k_vnni.c.
Referenced by gemv_q4_k_q8_k().
| void quantize_row_q8_k | ( | const float * | x, |
| void * | vy, | ||
| int | k | ||
| ) |
Definition at line 107 of file gemm_kernels_q4k_q8k.c.
References quantize_row_q8_k_ref(), and quantize_row_q8_k_sse().
Referenced by ck_attention_project_head_major_q4_k_q8_k(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), ck_mlp_swiglu_forward_q4_k_q8_k(), ck_mlp_swiglu_forward_q4_k_q8_k_prefill(), ck_qkv_project_head_major_q4_k_q8_k(), ck_test_gemm_q4_k(), ck_test_gemm_q6_k(), ck_test_gemv_q4_k(), ck_test_quantize_q8_k(), decode_layer_parallel(), fused_mlp_swiglu_prefill_w1w2_quant(), fused_rmsnorm_qkv_prefill_head_major_quant(), gemm_nt_q5_0_sse_v2(), gemm_nt_q6_k_sse(), mlp_parallel(), model_decode_token(), model_forward_prefill_impl(), model_layer_0_decode(), model_layer_10_decode(), model_layer_11_decode(), model_layer_12_decode(), model_layer_13_decode(), model_layer_14_decode(), model_layer_15_decode(), model_layer_16_decode(), model_layer_17_decode(), model_layer_18_decode(), model_layer_19_decode(), model_layer_1_decode(), model_layer_20_decode(), model_layer_21_decode(), model_layer_22_decode(), model_layer_23_decode(), model_layer_2_decode(), model_layer_3_decode(), model_layer_4_decode(), model_layer_5_decode(), model_layer_6_decode(), model_layer_7_decode(), model_layer_8_decode(), model_layer_9_decode(), quantize_batch_q8_k(), qwen2_0_5b_decode_decode_token(), qwen2_0_5b_decode_forward_prefill_impl(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), 
qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), qwen2_0_5b_decode_layer_9_decode(), and unfused_rmsnorm_linear_q4k_ref().
| void quantize_row_q8_k_ref | ( | const float * | x, |
| void * | vy, | ||
| int | k | ||
| ) |
Definition at line 53 of file gemm_kernels_q4k_q8k.c.
References block_q8_K::bsums, ck_nearest_int(), block_q8_K::d, QK_K, and block_q8_K::qs.
Referenced by quantize_row_q8_k().
| void quantize_row_q8_k_sse | ( | const float * | x, |
| void * | vy, | ||
| int | k | ||
| ) |