← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q5_k.c
Go to the documentation of this file.
1 /**
2  * @file gemm_kernels_q5_k.c
3  * @brief GEMM/GEMV kernels with Q5_K quantized weights
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * Implements matrix multiplication where:
15  * - Activations (input): FP32
16  * - Weights: Q5_K (5-bit super-block quant)
17  * - Output: FP32
18  *
19  * Q5_K Format (256 weights per super-block):
20  * - d: FP16 super-block scale
21  * - dmin: FP16 super-block minimum
22  * - scales[12]: 8 sub-block scales + 8 sub-block mins (6 bits each, packed)
23  * - qh[32]: high bits for 256 weights (1 bit each)
24  * - qs[128]: low 4 bits for 256 weights (4 bits each)
25  *
26  * Total: 2 + 2 + 12 + 32 + 128 = 176 bytes per 256 weights = 5.5 bits/weight
27  *
28  * Dequantization formula (matches llama.cpp):
29  * w = d * (scale/64) * q - dmin * (mins/64)
30  * where q = qs_val | (qh_bit << 4) = 5-bit value [0, 31]
31  */
32 
33 #include <stdint.h>
34 #include <stddef.h>
35 #include <string.h>
36 #include "ckernel_quant.h"
37 
38 /* Include SIMD headers based on available extensions */
39 #if defined(__AVX512F__) || defined(__AVX2__) || defined(__AVX__) || defined(__SSE4_1__)
40 #include <immintrin.h>
41 #endif
42 
43 /* Q5_K constants */
44 #define QK_K 256
45 
46 /*
47  * Q5_K / Q4_K scale unpacking.
48  *
49  * K-quant super-blocks pack 8 sub-block scales and 8 sub-block mins
50  * into 12 bytes using a 6-bit encoding:
51  *
52  * scales[0..3]: low 6 bits of sub-block scales 0-3
53  * scales[4..7]: low 6 bits of sub-block mins 0-3
54  * scales[8..11]: low 4 bits = scales 4-7, high 4 bits = mins 4-7
55  * ...with the top 2 bits of scales/mins 4-7 stored
56  * in the top 2 bits of scales[0..3] and scales[4..7]
57  *
 * WHY THIS IS ERROR-PRONE:
 * The mapping between array index j and which bytes to read from
 * scales[] changes across three ranges (j<4, j<8, j>=8). In the
 * j=4..7 case the scale line and the min line pull their top 2 bits
 * from DIFFERENT bytes: the scale's top 2 bits live in scales[j-4]
 * (bytes 0-3), while the min's top 2 bits live in scales[j]
 * (bytes 4-7). It is tempting to write scales[j-4] on both lines
 * for symmetry, but that reads the scale's spare bits twice and
 * corrupts the mins of sub-blocks 4-7.
66  *
67  * Reference: llama.cpp ggml-quants.c get_scale_min_k4()
68  */
69 
/* Helper: extract the 6-bit scale and 6-bit min for sub-block j from the
 * packed 12-byte scales[] array. Matches llama.cpp's get_scale_min_k4().
 *
 * Packing (shared by Q4_K and Q5_K super-blocks):
 *   scales[0..3]:  low 6 bits of scales 0-3; bits 6-7 = top 2 bits of scales 4-7
 *   scales[4..7]:  low 6 bits of mins   0-3; bits 6-7 = top 2 bits of mins   4-7
 *   scales[8..11]: low nibble = low 4 bits of scales 4-7,
 *                  high nibble = low 4 bits of mins 4-7
 *
 * Valid sub-block indices for Q5_K are j in [0, 8).
 */
static inline void get_q5_k_scale_min(int j, const uint8_t *scales,
                                      uint8_t *scale, uint8_t *min) {
    if (j < 4) {
        *scale = scales[j] & 63;
        *min = scales[j + 4] & 63;
    } else if (j < 8) {
        *scale = (scales[j + 4] & 0x0F) | ((scales[j - 4] >> 6) << 4);
        /* BUG FIX: the min's top 2 bits come from scales[j] (bytes 4-7),
         * NOT scales[j-4] (bytes 0-3), which holds the scale's top bits.
         * See llama.cpp ggml-quants.c get_scale_min_k4(). */
        *min = (scales[j + 4] >> 4) | ((scales[j] >> 6) << 4);
    } else {
        /* Unreachable for Q5_K dequant (callers pass sb = 0..7). */
        *scale = (scales[j - 4] & 0x0F) | ((scales[j - 8] >> 6) << 4);
        *min = (scales[j - 4] >> 4) | ((scales[j - 8] >> 6) << 4);
    }
}
87 
88 /* ============================================================================
89  * GEMV Reference: y = W @ x (W is Q5_K, x and y are FP32)
90  * ============================================================================ */
91 
92 void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K)
93 {
94  const block_q5_K *blocks = (const block_q5_K *)W;
95  const int blocks_per_row = K / QK_K;
96 
97  for (int m = 0; m < M; m++) {
98  const float *x_row = x;
99  float sum = 0.0f;
100 
101  for (int b = 0; b < blocks_per_row; b++) {
102  const block_q5_K *block = &blocks[m * blocks_per_row + b];
103  const float d = CK_FP16_TO_FP32(block->d);
104  const float dmin = CK_FP16_TO_FP32(block->dmin);
105  const uint8_t *scales = block->scales;
106  const uint8_t *qh = block->qh;
107  const uint8_t *qs = block->qs;
108 
109  /* Process 8 sub-blocks of 32 weights each */
110  for (int sb = 0; sb < 8; sb++) {
111  uint8_t sc, m;
112  get_q5_k_scale_min(sb, scales, &sc, &m);
113 
114  const float d_sub = d * (float)sc / 64.0f;
115  const float m_sub = dmin * (float)m / 64.0f;
116 
117  /* Each sub-block has 32 weights: low 4 bits in qs, high 1 bit in qh */
118  const int qs_offset = sb * 16; /* 16 bytes per sub-block */
119  const int qh_offset = sb * 4; /* 4 bytes per sub-block */
120 
121  for (int i = 0; i < 32; i++) {
122  uint8_t qs_val = (qs[qs_offset + i/2] >> (4 * (i % 2))) & 0xF;
123  uint8_t qh_bit = (qh[qh_offset + i/8] >> (i % 8)) & 1;
124  uint8_t q = qs_val | (qh_bit << 4);
125 
126  /* Q5_K dequantization: w = d * sc/64 * q - dmin * m/64 */
127  float w = d_sub * (float)q - m_sub;
128  sum += w * x_row[b * QK_K + sb * 32 + i];
129  }
130  }
131  }
132 
133  y[m] = sum;
134  }
135 }
136 
137 /* ============================================================================
138  * GEMM NT Reference: C = A @ B^T + bias
139  * - A: FP32 activation matrix [M, K]
140  * - B: Q5_K weight matrix [N, K] (stored transposed, accessed as [N, K])
141  * - bias: Optional FP32 bias [N]
142  * - C: FP32 output matrix [M, N]
143  * ============================================================================ */
144 
145 void gemm_nt_q5_k_ref(const float *A,
146  const void *B,
147  const float *bias,
148  float *C,
149  int M, int N, int K)
150 {
151  const block_q5_K *blocks = (const block_q5_K *)B;
152  const int blocks_per_col = K / QK_K;
153 
154  for (int m = 0; m < M; m++) {
155  const float *a_row = &A[m * K];
156 
157  for (int n = 0; n < N; n++) {
158  float sum = 0.0f;
159 
160  for (int b = 0; b < blocks_per_col; b++) {
161  const block_q5_K *block = &blocks[n * blocks_per_col + b];
162  const float d = CK_FP16_TO_FP32(block->d);
163  const float dmin = CK_FP16_TO_FP32(block->dmin);
164  const uint8_t *scales = block->scales;
165  const uint8_t *qh = block->qh;
166  const uint8_t *qs = block->qs;
167 
168  /* Process 8 sub-blocks of 32 weights each */
169  for (int sb = 0; sb < 8; sb++) {
170  uint8_t sc, m;
171  get_q5_k_scale_min(sb, scales, &sc, &m);
172 
173  const float d_sub = d * (float)sc / 64.0f;
174  const float m_sub = dmin * (float)m / 64.0f;
175 
176  const int qs_offset = sb * 16;
177  const int qh_offset = sb * 4;
178 
179  for (int i = 0; i < 32; i++) {
180  uint8_t qs_val = (qs[qs_offset + i/2] >> (4 * (i % 2))) & 0xF;
181  uint8_t qh_bit = (qh[qh_offset + i/8] >> (i % 8)) & 1;
182  uint8_t q = qs_val | (qh_bit << 4);
183 
184  float w = d_sub * (float)q - m_sub;
185  sum += w * a_row[b * QK_K + sb * 32 + i];
186  }
187  }
188  }
189 
190  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
191  }
192  }
193 }
194 
195 /* ============================================================================
196  * Dispatch wrappers - select best available implementation
197  * ============================================================================ */
198 
/**
 * GEMV dispatch wrapper: select the best available kernel for y = W @ x.
 *
 * No SIMD specializations exist yet (AVX-512 / AVX2 / AVX / SSE4.1 are
 * all TODO), so every build currently routes to the scalar reference
 * kernel. Add #if defined(...) tiers here as vectorized kernels land.
 */
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)
{
    gemv_q5_k_ref(y, W, x, M, K);
}
217 
/**
 * GEMM NT dispatch wrapper: select the best available kernel for
 * C = A @ B^T + bias.
 *
 * No SIMD specializations exist yet (AVX-512 / AVX2 / AVX / SSE4.1 are
 * all TODO), so every build currently routes to the scalar reference
 * kernel. Add #if defined(...) tiers here as vectorized kernels land.
 */
void gemm_nt_q5_k(const float *A,
                  const void *B,
                  const float *bias,
                  float *C,
                  int M, int N, int K)
{
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
}
Quantization block structures for weight-only quantization.
#define CK_FP16_TO_FP32(x)
void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K)
static void get_q5_k_scale_min(int j, const uint8_t *scales, uint8_t *scale, uint8_t *min)
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)
void gemm_nt_q5_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void gemm_nt_q5_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
#define QK_K
#define C(color)
Definition: show_config.c:39
ck_half dmin
uint8_t qh[256/8]
uint8_t qs[256/2]
uint8_t scales[12]