← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q5_1.c
Go to the documentation of this file.
1 /**
2  * @file gemm_kernels_q5_1.c
3  * @brief GEMM/GEMV kernels with Q5_1 quantized weights
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * Q5_1 Format:
15  * - 32 weights per block
16  * - 1 FP16 scale (d) per block
17  * - 1 FP16 minimum (m) per block
18  * - Low 4-bits stored like Q4_1 (16 bytes)
19  * - High 1-bit packed separately (4 bytes)
20  * - 24 bytes per 32 weights = 6.0 bits/weight
21  *
22  * Dequantization: w = d * q5 + m
23  * where q5 = low4bit | (highbit << 4), giving values 0-31
24  *
25  * Operations:
26  * Forward: Y = W @ X (W is Q5_1, X and Y are FP32)
27  * Backward: dX = W^T @ dY (gradient w.r.t. input)
28  */
29 
30 #include <stdint.h>
31 #include <stddef.h>
32 #include <string.h>
33 #include "ckernel_quant.h"
34 
35 #ifdef __AVX512F__
36 #include <immintrin.h>
37 #endif
38 
39 /* ============================================================================
40  * Forward Pass: GEMV y = W @ x
41  * ============================================================================ */
42 
43 /**
44  * @brief Matrix-vector multiply with Q5_1 weights (scalar reference)
45  *
46  * @param y Output vector [M]
47  * @param W Weight matrix in Q5_1 format [M x K]
48  * @param x Input vector [K]
49  * @param M Number of output rows
50  * @param K Number of columns (must be multiple of 32)
51  */
52 void gemv_q5_1_ref(float *y,
53  const void *W,
54  const float *x,
55  int M, int K)
56 {
57  const block_q5_1 *blocks = (const block_q5_1 *)W;
58  const int blocks_per_row = K / QK5_1;
59 
60  for (int row = 0; row < M; row++) {
61  float sum = 0.0f;
62 
63  for (int b = 0; b < blocks_per_row; b++) {
64  const block_q5_1 *block = &blocks[row * blocks_per_row + b];
65  const float d = CK_FP16_TO_FP32(block->d);
66  const float m = CK_FP16_TO_FP32(block->m);
67  const float *xp = &x[b * QK5_1];
68 
69  /* Get high bits as 32-bit integer */
70  uint32_t qh;
71  memcpy(&qh, block->qh, sizeof(qh));
72 
73  /* GGML Q5_1 layout: weights 0-15 from LOW nibbles, 16-31 from HIGH nibbles.
74  * High bits: bits 0-15 of qh → first half, bits 16-31 → second half.
75  * NOT interleaved like Q4_0/Q4_1. */
76 
77  /* First 16 weights: low nibbles of qs[j], high bit from qh bits 0-15 */
78  for (int j = 0; j < QK5_1 / 2; j++) {
79  const int lo = (block->qs[j] & 0x0F);
80  const int hi = ((qh >> j) & 1) << 4;
81  const float w = d * (float)(lo | hi) + m;
82  sum += w * xp[j];
83  }
84 
85  /* Second 16 weights: high nibbles of qs[j], high bit from qh bits 16-31 */
86  for (int j = 0; j < QK5_1 / 2; j++) {
87  const int lo = (block->qs[j] >> 4);
88  const int hi = ((qh >> (j + 16)) & 1) << 4;
89  const float w = d * (float)(lo | hi) + m;
90  sum += w * xp[j + QK5_1 / 2];
91  }
92  }
93 
94  y[row] = sum;
95  }
96 }
97 
#ifdef __AVX512F__
/**
 * @brief Matrix-vector multiply with Q5_1 weights (AVX-512)
 *
 * GGML Q5_1 layout per block (32 weights):
 * - Weights 0-15: low nibbles of qs[0..15], high bits from qh bits 0-15
 * - Weights 16-31: high nibbles of qs[0..15], high bits from qh bits 16-31
 *
 * Strategy: zero-extend the 16 packed bytes to 16 x 32-bit lanes, split
 * into low/high nibbles (two vectors of 16 quants each), OR in the 5th
 * bit extracted lane-by-lane from qh, then dequantize both halves with a
 * single FMA each (w = d * q + m) and accumulate against the sequential
 * input.  The horizontal reduction happens once per output row.
 *
 * @param y Output vector [M]
 * @param W Weight matrix in Q5_1 format [M x K]
 * @param x Input vector [K]
 * @param M Number of output rows
 * @param K Number of columns (must be a multiple of 32)
 */
void gemv_q5_1_avx512(float *y,
                      const void *W,
                      const float *x,
                      int M, int K)
{
    const block_q5_1 *blocks = (const block_q5_1 *)W;
    const int blocks_per_row = K / QK5_1;
    const __m512i mask_lo = _mm512_set1_epi32(0x0F);

    for (int row = 0; row < M; row++) {
        __m512 acc = _mm512_setzero_ps();

        for (int b = 0; b < blocks_per_row; b++) {
            const block_q5_1 *block = &blocks[row * blocks_per_row + b];
            const __m512 vscale = _mm512_set1_ps(CK_FP16_TO_FP32(block->d));
            const __m512 vmin = _mm512_set1_ps(CK_FP16_TO_FP32(block->m));
            const float *xp = &x[b * QK5_1];

            /* Load high bits; memcpy sidesteps alignment/aliasing issues
             * with the byte array qh[4]. */
            uint32_t qh;
            memcpy(&qh, block->qh, sizeof(qh));

            /* Load 16 bytes = 32 x 4-bit weights, zero-extend each byte
             * to a 32-bit lane (lane i holds qs[i]). */
            __m128i packed = _mm_loadu_si128((const __m128i *)block->qs);
            __m512i bytes = _mm512_cvtepu8_epi32(packed);

            /* Extract low nibbles (weights 0-15) and high nibbles (weights 16-31) */
            __m512i lo = _mm512_and_epi32(bytes, mask_lo);
            __m512i hi_shift = _mm512_srli_epi32(bytes, 4);

            /* 5th-bit contribution for weights 0-15: qh bits 0-15,
             * pre-shifted into bit position 4.  _mm512_set_epi32 takes
             * arguments highest-lane first, hence the descending order:
             * lane j receives bit j of qh. */
            __m512i qh_first = _mm512_set_epi32(
                ((qh >> 15) & 1) << 4, ((qh >> 14) & 1) << 4,
                ((qh >> 13) & 1) << 4, ((qh >> 12) & 1) << 4,
                ((qh >> 11) & 1) << 4, ((qh >> 10) & 1) << 4,
                ((qh >> 9) & 1) << 4, ((qh >> 8) & 1) << 4,
                ((qh >> 7) & 1) << 4, ((qh >> 6) & 1) << 4,
                ((qh >> 5) & 1) << 4, ((qh >> 4) & 1) << 4,
                ((qh >> 3) & 1) << 4, ((qh >> 2) & 1) << 4,
                ((qh >> 1) & 1) << 4, ((qh >> 0) & 1) << 4
            );

            /* 5th-bit contribution for weights 16-31: qh bits 16-31,
             * same descending lane convention (lane j gets bit j+16). */
            __m512i qh_second = _mm512_set_epi32(
                ((qh >> 31) & 1) << 4, ((qh >> 30) & 1) << 4,
                ((qh >> 29) & 1) << 4, ((qh >> 28) & 1) << 4,
                ((qh >> 27) & 1) << 4, ((qh >> 26) & 1) << 4,
                ((qh >> 25) & 1) << 4, ((qh >> 24) & 1) << 4,
                ((qh >> 23) & 1) << 4, ((qh >> 22) & 1) << 4,
                ((qh >> 21) & 1) << 4, ((qh >> 20) & 1) << 4,
                ((qh >> 19) & 1) << 4, ((qh >> 18) & 1) << 4,
                ((qh >> 17) & 1) << 4, ((qh >> 16) & 1) << 4
            );

            /* Combine low 4 bits + 5th bit -> full 5-bit quant (0-31) */
            __m512i q_first = _mm512_or_epi32(lo, qh_first);
            __m512i q_second = _mm512_or_epi32(hi_shift, qh_second);

            /* Dequantize: w = d * q + m */
            __m512 w_first = _mm512_fmadd_ps(_mm512_cvtepi32_ps(q_first), vscale, vmin);
            __m512 w_second = _mm512_fmadd_ps(_mm512_cvtepi32_ps(q_second), vscale, vmin);

            /* Input is consumed sequentially: first 16 elements pair with
             * the low-nibble weights, next 16 with the high-nibble weights. */
            __m512 x_first = _mm512_loadu_ps(&xp[0]);
            __m512 x_second = _mm512_loadu_ps(&xp[16]);

            acc = _mm512_fmadd_ps(w_first, x_first, acc);
            acc = _mm512_fmadd_ps(w_second, x_second, acc);
        }

        /* Horizontal sum of the 16 partial accumulators */
        y[row] = _mm512_reduce_add_ps(acc);
    }
}
#endif
180 
181 /**
182  * @brief Auto-dispatch GEMV
183  */
/**
 * @brief Auto-dispatch GEMV: uses the AVX-512 kernel when the file is
 *        compiled with AVX-512F support, the scalar reference otherwise.
 *        Selection happens at compile time; both paths compute the same
 *        result y = W @ x.
 */
void gemv_q5_1(float *y,
               const void *W,
               const float *x,
               int M, int K)
{
#if defined(__AVX512F__)
    gemv_q5_1_avx512(y, W, x, M, K);
#else
    gemv_q5_1_ref(y, W, x, M, K);
#endif
}
195 
196 /* ============================================================================
197  * Forward Pass: GEMM Y = W @ X
198  * ============================================================================ */
199 
200 /**
201  * @brief Matrix-matrix multiply with Q5_1 weights
202  */
/**
 * @brief Matrix-matrix multiply with Q5_1 weights: Y = W @ X
 *
 * Treats each of the N columns as an independent GEMV: column n of X
 * (K floats) produces column n of Y (M floats).  X and Y are stored
 * column-contiguous, so both pointers simply step by their column size.
 */
void gemm_q5_1(float *Y,
               const void *W,
               const float *X,
               int M, int N, int K)
{
    const float *x_col = X;
    float *y_col = Y;

    for (int col = 0; col < N; col++, x_col += K, y_col += M) {
        gemv_q5_1(y_col, W, x_col, M, K);
    }
}
212 
213 /* ============================================================================
214  * Backward Pass: Gradient w.r.t. Input
215  * ============================================================================ */
216 
217 /**
218  * @brief Backward pass: compute input gradient
219  *
220  * @param dX Output gradient w.r.t. input [K]
221  * @param W Weight matrix in Q5_1 format [M x K]
222  * @param dY Gradient w.r.t. output [M]
223  * @param M Number of output rows
224  * @param K Number of columns (input dimension)
225  */
226 void gemv_q5_1_backward_ref(float *dX,
227  const void *W,
228  const float *dY,
229  int M, int K)
230 {
231  const block_q5_1 *blocks = (const block_q5_1 *)W;
232  const int blocks_per_row = K / QK5_1;
233 
234  /* Zero output gradient */
235  memset(dX, 0, K * sizeof(float));
236 
237  /* Accumulate: dX += W^T @ dY */
238  for (int row = 0; row < M; row++) {
239  const float dy = dY[row];
240 
241  for (int b = 0; b < blocks_per_row; b++) {
242  const block_q5_1 *block = &blocks[row * blocks_per_row + b];
243  const float d = CK_FP16_TO_FP32(block->d);
244  const float m = CK_FP16_TO_FP32(block->m);
245  float *dxp = &dX[b * QK5_1];
246 
247  /* Get high bits */
248  uint32_t qh;
249  memcpy(&qh, block->qh, sizeof(qh));
250 
251  /* First 16 weights: low nibbles, high bits from qh[0:15] */
252  for (int j = 0; j < QK5_1 / 2; j++) {
253  const int lo = (block->qs[j] & 0x0F);
254  const int hi = ((qh >> j) & 1) << 4;
255  const float w = d * (float)(lo | hi) + m;
256  dxp[j] += w * dy;
257  }
258 
259  /* Second 16 weights: high nibbles, high bits from qh[16:31] */
260  for (int j = 0; j < QK5_1 / 2; j++) {
261  const int lo = (block->qs[j] >> 4);
262  const int hi = ((qh >> (j + 16)) & 1) << 4;
263  const float w = d * (float)(lo | hi) + m;
264  dxp[j + QK5_1 / 2] += w * dy;
265  }
266  }
267  }
268 }
269 
270 /**
271  * @brief Auto-dispatch backward
272  */
/**
 * @brief Auto-dispatch backward pass: dX = W^T @ dY
 *
 * Currently always forwards to the scalar reference implementation
 * (no SIMD backward kernel exists yet); kept as a separate entry point
 * so callers are unaffected when a vectorized variant is added.
 *
 * @param dX Output gradient w.r.t. input [K]
 * @param W Weight matrix in Q5_1 format [M x K]
 * @param dY Gradient w.r.t. output [M]
 * @param M Number of output rows
 * @param K Number of columns (input dimension)
 */
void gemv_q5_1_backward(float *dX,
                        const void *W,
                        const float *dY,
                        int M, int K)
{
    gemv_q5_1_backward_ref(dX, W, dY, M, K);
}
280 
281 /**
282  * @brief Batched backward pass
283  */
/**
 * @brief Batched backward pass: one gemv backward per batch column.
 *
 * Column n of dY (M floats) produces column n of dX (K floats); both
 * buffers are column-contiguous, so the pointers step by the column
 * sizes between iterations.
 */
void gemm_q5_1_backward(float *dX,
                        const void *W,
                        const float *dY,
                        int M, int N, int K)
{
    float *dx_col = dX;
    const float *dy_col = dY;

    for (int col = 0; col < N; col++, dx_col += K, dy_col += M) {
        gemv_q5_1_backward(dx_col, W, dy_col, M, K);
    }
}
293 
294 /* ============================================================================
295  * GEMM NT (Non-Transpose A, Transpose B) - C = A @ B^T
296  * ============================================================================ */
297 
298 /**
299  * @brief GEMM with transposed Q5_1 weights: C = A @ B^T
300  *
301  * @param A Input activations [M x K], row-major FP32
302  * @param B Weight matrix in Q5_1 format [N x K], row-major quantized
303  * @param bias Optional bias [N], NULL if not used
304  * @param C Output [M x N], row-major FP32
305  * @param M Batch size (number of tokens)
306  * @param N Output dimension
307  * @param K Input dimension
308  */
309 void gemm_nt_q5_1(const float *A,
310  const void *B,
311  const float *bias,
312  float *C,
313  int M, int N, int K)
314 {
315  const block_q5_1 *blocks = (const block_q5_1 *)B;
316  const int blocks_per_row = K / QK5_1;
317 
318  for (int m = 0; m < M; m++) {
319  const float *a_row = &A[m * K];
320 
321  for (int n = 0; n < N; n++) {
322  float sum = 0.0f;
323 
324  for (int b = 0; b < blocks_per_row; b++) {
325  const block_q5_1 *block = &blocks[n * blocks_per_row + b];
326  const float d = CK_FP16_TO_FP32(block->d);
327  const float min = CK_FP16_TO_FP32(block->m);
328  const float *ap = &a_row[b * QK5_1];
329 
330  uint32_t qh;
331  memcpy(&qh, block->qh, sizeof(qh));
332 
333  /* First 16 weights: low nibbles, high bits from qh[0:15] */
334  for (int j = 0; j < QK5_1 / 2; j++) {
335  const int lo = (block->qs[j] & 0x0F);
336  const int hi = ((qh >> j) & 1) << 4;
337  sum += (d * (float)(lo | hi) + min) * ap[j];
338  }
339 
340  /* Second 16 weights: high nibbles, high bits from qh[16:31] */
341  for (int j = 0; j < QK5_1 / 2; j++) {
342  const int lo = (block->qs[j] >> 4);
343  const int hi = ((qh >> (j + 16)) & 1) << 4;
344  sum += (d * (float)(lo | hi) + min) * ap[j + QK5_1 / 2];
345  }
346  }
347 
348  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
349  }
350  }
351 }
352 
353 /* ============================================================================
354  * Dot Product Utility
355  * ============================================================================ */
356 
/**
 * @brief Dot product of one Q5_1-quantized weight row with an FP32 vector.
 *
 * Implemented as a 1-row GEMV so it automatically benefits from whatever
 * kernel gemv_q5_1 dispatches to.
 *
 * @param w_q5_1 Quantized weight row in Q5_1 format [K]
 * @param x FP32 input vector [K]
 * @param K Vector length (must be a multiple of 32)
 * @return The dot product as a float
 */
float dot_q5_1(const void *w_q5_1, const float *x, int K)
{
    float out;
    gemv_q5_1(&out, w_q5_1, x, 1, K);
    return out;
}
Quantization block structures for weight-only quantization.
#define CK_FP16_TO_FP32(x)
#define QK5_1
Definition: ckernel_quant.h:84
void gemm_q5_1_backward(float *dX, const void *W, const float *dY, int M, int N, int K)
Batched backward pass.
float dot_q5_1(const void *w_q5_1, const float *x, int K)
void gemv_q5_1(float *y, const void *W, const float *x, int M, int K)
Auto-dispatch GEMV.
void gemv_q5_1_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q5_1 weights (scalar reference)
void gemm_nt_q5_1(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
GEMM with transposed Q5_1 weights: C = A @ B^T.
void gemv_q5_1_backward_ref(float *dX, const void *W, const float *dY, int M, int K)
Backward pass: compute input gradient.
void gemv_q5_1_backward(float *dX, const void *W, const float *dY, int M, int K)
Auto-dispatch backward.
void gemm_q5_1(float *Y, const void *W, const float *X, int M, int N, int K)
Matrix-matrix multiply with Q5_1 weights.
#define C(color)
Definition: show_config.c:39
uint8_t qs[32/2]
Definition: ckernel_quant.h:90
uint8_t qh[4]
Definition: ckernel_quant.h:89
ck_half m
Definition: ckernel_quant.h:88
ck_half d
Definition: ckernel_quant.h:87