Q6_K (weights) x Q8_K (activations) kernels for inference. More...
#include <assert.h>
#include <math.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>
#include "ckernel_quant.h"
Go to the source code of this file.
Functions | |
| static float | dot_q6_k_q8_k_ref (const block_q6_K *w, const block_q8_K *x, int K) |
| Scalar dot product for Q6_K x Q8_K. More... | |
| void | gemm_nt_q6_k_q8_k (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K) |
| NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K. More... | |
| void | gemm_q6_k_q8_k (float *Y, const void *W, const void *X_q8, int M, int N, int K) |
| GEMM: Y = W @ X^T where W is Q6_K and X is Q8_K. More... | |
| void | gemv_q6_k_q8_k (float *y, const void *W, const void *x_q8, int M, int K) |
| GEMV: y = W @ x where W is Q6_K and x is Q8_K. More... | |
| void | gemv_q6_k_q8_k_avx (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q6_k_q8_k_avx2 (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q6_k_q8_k_avx512 (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q6_k_q8_k_avx512_vbmi (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q6_k_q8_k_parallel (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth) |
| Parallel reference GEMV for Q6_K × Q8_K. More... | |
| void | gemv_q6_k_q8_k_parallel_simd (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth) |
| Parallel SIMD GEMV for Q6_K × Q8_K. More... | |
| void | gemv_q6_k_q8_k_ref (float *y, const void *W, const void *x_q8, int M, int K) |
| void | gemv_q6_k_q8_k_sse (float *y, const void *W, const void *x_q8, int M, int K) |
| void | vec_dot_q6_k_q8_k (int n, float *s, const void *vx, const void *vy) |
| Q6_K x Q8_K dot product (single row) More... | |
Q6_K (weights) x Q8_K (activations) kernels for inference.
After making changes, validate with: make test && make llamacpp-parity-full
Implements decode-style matvec/matmul where weights are Q6_K and the activations are quantized on-the-fly to Q8_K. This is inference-only; no backward pass is provided here.
Q6_K Format (256 weights per block):
Q8_K Format (256 weights per block):
Definition in file gemm_kernels_q6k_q8k.c.
|
static |
Scalar dot product for Q6_K x Q8_K.
Q6_K layout: 256 weights per block
The dequantization formula for each weight is: weight = d * scale[sub] * (q6_value - 32), where q6_value is the 6-bit unsigned quantized value (range 0..63) and sub selects the per-sub-block scale.
Definition at line 67 of file gemm_kernels_q6k_q8k.c.
References block_q8_K::d, GGML_FP16_TO_FP32, block_q6_K::qh, QK_K, block_q6_K::ql, block_q8_K::qs, and block_q6_K::scales.
Referenced by gemv_q6_k_q8_k_parallel(), gemv_q6_k_q8_k_parallel_simd(), gemv_q6_k_q8_k_ref(), and vec_dot_q6_k_q8_k().
| void gemm_nt_q6_k_q8_k | ( | const void * | A_q8, |
| const void * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K.
This is the typical inference pattern:
| A_q8 | Input activations in Q8_K format |
| B | Weight matrix in Q6_K format |
| bias | Optional bias vector [N] |
| C | Output matrix |
| M | Batch size (number of tokens) |
| N | Output dimension |
| K | Input dimension |
Definition at line 1144 of file gemm_kernels_q6k_q8k.c.
References C, and gemm_q6_k_q8_k().
Referenced by ck_test_gemm_q6_k(), gemm_nt_q8_k_mlp_dispatch(), and gemm_nt_q8_k_qkv_dispatch().
| void gemm_q6_k_q8_k | ( | float * | Y, |
| const void * | W, | ||
| const void * | X_q8, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
GEMM: Y = W @ X^T where W is Q6_K and X is Q8_K.
| Y | Output matrix [N x M] in row-major |
| W | Weight matrix in Q6_K format [M x K] |
| X_q8 | Input matrix in Q8_K format [N x K] |
| M | Number of output rows (output dim) |
| N | Number of input vectors (batch size) |
| K | Input dimension |
Definition at line 1110 of file gemm_kernels_q6k_q8k.c.
References gemv_q6_k_q8_k(), and QK_K.
Referenced by gemm_nt_q6_k_q8_k().
| void gemv_q6_k_q8_k | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
GEMV: y = W @ x where W is Q6_K and x is Q8_K.
Definition at line 980 of file gemm_kernels_q6k_q8k.c.
References gemv_q6_k_q8_k_avx(), gemv_q6_k_q8_k_avx2(), gemv_q6_k_q8_k_avx512(), gemv_q6_k_q8_k_ref(), and gemv_q6_k_q8_k_sse().
Referenced by gemm_q6_k_q8_k().
| void gemv_q6_k_q8_k_avx | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Referenced by gemv_q6_k_q8_k().
| void gemv_q6_k_q8_k_avx2 | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Referenced by gemv_q6_k_q8_k().
| void gemv_q6_k_q8_k_avx512 | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Referenced by gemv_q6_k_q8_k().
| void gemv_q6_k_q8_k_avx512_vbmi | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
| void gemv_q6_k_q8_k_parallel | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K, | ||
| int | ith, | ||
| int | nth | ||
| ) |
Parallel reference GEMV for Q6_K × Q8_K.
Caller provides ith (thread index) and nth (total threads). Each thread processes rows [r0, r1).
Definition at line 1014 of file gemm_kernels_q6k_q8k.c.
References dot_q6_k_q8_k_ref(), and QK_K.
| void gemv_q6_k_q8_k_parallel_simd | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K, | ||
| int | ith, | ||
| int | nth | ||
| ) |
Parallel SIMD GEMV for Q6_K × Q8_K.
Uses best available SIMD (AVX/SSE) with row prefetching. Caller provides ith/nth from OpenMP region.
Definition at line 1046 of file gemm_kernels_q6k_q8k.c.
References dot_q6_k_q8_k_ref(), and QK_K.
| void gemv_q6_k_q8_k_ref | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Definition at line 119 of file gemm_kernels_q6k_q8k.c.
References dot_q6_k_q8_k_ref(), and QK_K.
Referenced by gemv_q6_k_q8_k().
| void gemv_q6_k_q8_k_sse | ( | float * | y, |
| const void * | W, | ||
| const void * | x_q8, | ||
| int | M, | ||
| int | K | ||
| ) |
Referenced by gemv_q6_k_q8_k().
| void vec_dot_q6_k_q8_k | ( | int | n, |
| float * | s, | ||
| const void * | vx, | ||
| const void * | vy | ||
| ) |
Q6_K x Q8_K dot product (single row)
Definition at line 954 of file gemm_kernels_q6k_q8k.c.
References dot_q6_k_q8_k_ref().