← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_q5_0.c File Reference

GEMM/GEMV kernels with Q5_0 quantized weights. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include "ckernel_quant.h"
#include "ck_features.h"

Go to the source code of this file.

Functions

void dequant_q5_0_block (const block_q5_0 *block, float *output)
 Dequantize a single Q5_0 block to FP32. More...
 
void dequant_q5_0_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q5_0 row (multiple blocks) More...
 
float dot_q5_0 (const void *w_q5_0, const float *x, int K)
 
void gemm_nt_q5_0 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q5_0_q8_0 (const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)
 Batch GEMM with Q5_0 weights and Q8_0 activations for prefill. More...
 
void gemm_nt_q5_0_ref (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 GEMM with transposed Q5_0 weights: C = A @ B^T. More...
 
void gemm_nt_q5_0_sse_v2 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_q5_0 (float *Y, const void *W, const float *X, int M, int N, int K)
 Matrix-matrix multiply with Q5_0 weights. More...
 
void gemm_q5_0_backward (float *dX, const void *W, const float *dY, int M, int N, int K)
 Batched backward pass. More...
 
void gemv_q5_0 (float *y, const void *W, const float *x, int M, int K)
 Auto-dispatch GEMV for Q5_0 weights based on CPU features. More...
 
void gemv_q5_0_backward (float *dX, const void *W, const float *dY, int M, int K)
 Auto-dispatch backward. More...
 
void gemv_q5_0_backward_ref (float *dX, const void *W, const float *dY, int M, int K)
 Backward pass: compute input gradient. More...
 
void gemv_q5_0_parallel (float *y, const void *W, const float *x, int M, int K, int ith, int nth)
 Parallel reference GEMV for Q5_0 × FP32. More...
 
void gemv_q5_0_parallel_simd (float *y, const void *W, const float *x, int M, int K, int ith, int nth)
 Parallel SIMD GEMV for Q5_0 × FP32 with prefetching. More...
 
void gemv_q5_0_q8_0 (float *y, const void *W, const void *x_q8, int M, int K)
 Matrix-vector multiply with Q5_0 weights and Q8_0 input. More...
 
void gemv_q5_0_q8_0_parallel_simd (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth)
 Parallel SIMD GEMV for Q5_0 x Q8_0 with prefetching. More...
 
void gemv_q5_0_ref (float *y, const void *W, const float *x, int M, int K)
 Matrix-vector multiply with Q5_0 weights (scalar reference) More...
 
void vec_dot_q5_0_q8_0 (int n, float *s, const void *vx, const void *vy)
 Auto-dispatch quantized dot product Q5_0 x Q8_0. More...
 
void vec_dot_q5_0_q8_0_ref (int n, float *s, const void *vx, const void *vy)
 Quantized dot product: Q5_0 weights x Q8_0 input (scalar reference) More...
 

Detailed Description

GEMM/GEMV kernels with Q5_0 quantized weights.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Q5_0 Format:

  • 32 weights per block
  • 1 FP16 scale per block
  • Low 4-bits stored like Q4_0 (16 bytes)
  • High 1-bit packed separately (4 bytes)
  • 22 bytes per 32 weights = 5.5 bits/weight

Dequantization: w = scale * (q5 - 16), where q5 = low4bit | (highbit << 4) yields an unsigned value in 0-31; subtracting 16 gives the signed range -16 to +15.

Operations: Forward: Y = W @ X (W is Q5_0, X and Y are FP32) Backward: dX = W^T @ dY (gradient w.r.t. input)

Definition in file gemm_kernels_q5_0.c.

Function Documentation

◆ dequant_q5_0_block()

void dequant_q5_0_block ( const block_q5_0 *  block,
float *  output 
)

Dequantize a single Q5_0 block to FP32.

Parameters
block  Pointer to Q5_0 block (22 bytes)
output  Output FP32 array (32 floats)

Definition at line 161 of file dequant_kernels.c.

162 {
163  const float d = GGML_FP16_TO_FP32(block->d);
164 
165  /* Get high bits as a 32-bit integer */
166  uint32_t qh;
167  memcpy(&qh, block->qh, sizeof(qh));
168 
169  /* llama.cpp Q5_0 layout:
170  * - Weight j uses: low nibble of qs[j], high bit from qh bit j
171  * - Weight j+16 uses: high nibble of qs[j], high bit from qh bit (j+16),
172  *   extracted with a shift of (j+12) so the bit lands at position 4 (mask 0x10)
172  */
173  for (int j = 0; j < QK5_0 / 2; j++) {
174  const uint8_t packed = block->qs[j];
175 
176  /* Extract low 4 bits for two weights */
177  const int lo = (packed & 0x0F);
178  const int hi = (packed >> 4);
179 
180  /* Extract high bits from qh - matches llama.cpp exactly */
181  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
182  const int xh_1 = ((qh >> (j + 12))) & 0x10;
183 
184  /* Combine: 5-bit value, range 0-31, then subtract 16 */
185  const int q0 = (lo | xh_0) - 16;
186  const int q1 = (hi | xh_1) - 16;
187 
188  output[j] = d * (float)q0;
189  output[j + 16] = d * (float)q1;
190  }
191 }
#define QK5_0
Definition: ckernel_quant.h:67
#define GGML_FP16_TO_FP32
ck_half d
Definition: ckernel_quant.h:70
uint8_t qh[4]
Definition: ckernel_quant.h:71
uint8_t qs[32/2]
Definition: ckernel_quant.h:72

References block_q5_0::d, GGML_FP16_TO_FP32, block_q5_0::qh, QK5_0, and block_q5_0::qs.

Referenced by dequant_q5_0_row().

◆ dequant_q5_0_row()

void dequant_q5_0_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q5_0 row (multiple blocks)

Definition at line 196 of file dequant_kernels.c.

197 {
198  const block_q5_0 *blocks = (const block_q5_0 *)src;
199  const size_t n_blocks = n_elements / QK5_0;
200 
201  for (size_t b = 0; b < n_blocks; b++) {
202  dequant_q5_0_block(&blocks[b], &dst[b * QK5_0]);
203  }
204 }
void dequant_q5_0_block(const block_q5_0 *block, float *output)
Dequantize a single Q5_0 block to FP32.

Referenced by dequant_row().

◆ dot_q5_0()

float dot_q5_0 ( const void *  w_q5_0,
const float *  x,
int  K 
)

Definition at line 870 of file gemm_kernels_q5_0.c.

871 {
872  float result;
873  gemv_q5_0(&result, w_q5_0, x, 1, K);
874  return result;
875 }
void gemv_q5_0(float *y, const void *W, const float *x, int M, int K)
Auto-dispatch GEMV for Q5_0 weights based on CPU features.

References gemv_q5_0().

◆ gemm_nt_q5_0()

void gemm_nt_q5_0 ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 831 of file gemm_kernels_q5_0.c.

836 {
837  /* For decode (M=1), use direct GEMV which has AVX optimization */
838  if (M == 1) {
839  /* gemm_q5_0 expects column-major output, but we need row-major
840  * So we call gemv_q5_0 directly for each output element */
841  gemv_q5_0(C, B, A, N, K);
842  if (bias) {
843  for (int n = 0; n < N; n++) {
844  C[n] += bias[n];
845  }
846  }
847  return;
848  }
849 
850  /* For prefill (M>1), use GEMM which dispatches to GEMV with AVX/AVX512 */
851  /* gemm_q5_0 produces Y as [batch x M_out]. Here:
852  * batch = M (tokens)
853  * M_out = N (output channels) */
854  gemm_q5_0(C, B, A, /*M_out=*/N, /*N_batch=*/M, K);
855 
856  if (bias) {
857  for (int m = 0; m < M; m++) {
858  float *row = C + (size_t)m * (size_t)N;
859  for (int n = 0; n < N; n++) {
860  row[n] += bias[n];
861  }
862  }
863  }
864 }
void gemm_q5_0(float *Y, const void *W, const float *X, int M, int N, int K)
Matrix-matrix multiply with Q5_0 weights.
#define C(color)
Definition: show_config.c:39

References C, gemm_q5_0(), and gemv_q5_0().

Referenced by ck_gemm_nt_quant(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ gemm_nt_q5_0_q8_0()

void gemm_nt_q5_0_q8_0 ( const void *  A_q8,
const void *  B_q5,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.

Computes C = A @ B^T + bias where: A: [M x K] Q8_0 quantized activations (M tokens, K features) B: [N x K] Q5_0 quantized weights (N outputs, K features) C: [M x N] FP32 output

This is the INT8 batch kernel for prefill, using pre-quantized activations to avoid FP32->Q8_0 conversion overhead per operation.

Parameters
A_q8  Input activations in Q8_0 format [M rows of K/32 blocks each]
B_q5  Weights in Q5_0 format [N rows of K/32 blocks each]
bias  Optional bias vector [N], NULL if not used
C  Output matrix [M x N], row-major FP32
M  Batch size (number of tokens)
N  Output dimension (number of output features)
K  Input dimension (must be multiple of 32)

Definition at line 1617 of file gemm_kernels_q5_0.c.

1625 {
1626  const block_q5_0 *weights = (const block_q5_0 *)B_q5;
1627  const block_q8_0 *inputs = (const block_q8_0 *)A_q8;
1628  const int blocks_per_row = K / QK5_0;
1629 
1630  for (int m = 0; m < M; m++) {
1631  const block_q8_0 *input_row = &inputs[m * blocks_per_row];
1632 
1633  for (int n = 0; n < N; n++) {
1634  const block_q5_0 *weight_row = &weights[n * blocks_per_row];
1635  float *out = &C[m * N + n];
1636 
1637  /* Dispatches to vec_dot_q5_0_q8_0_avx (2x block unrolled) on AVX */
1638  vec_dot_q5_0_q8_0(K, out, weight_row, input_row);
1639 
1640  if (bias) {
1641  *out += bias[n];
1642  }
1643  }
1644  }
1645 }
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q5_0 x Q8_0.

References C, QK5_0, and vec_dot_q5_0_q8_0().

Referenced by ck_test_gemm_q5_0(), gemm_nt_q8_0_dispatch(), and gemm_nt_q8_0_mlp_dispatch().

◆ gemm_nt_q5_0_ref()

void gemm_nt_q5_0_ref ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

GEMM with transposed Q5_0 weights: C = A @ B^T.

Parameters
A  Input activations [M x K], row-major FP32
B  Weight matrix in Q5_0 format [N x K], row-major quantized
bias  Optional bias [N], NULL if not used
C  Output [M x N], row-major FP32
M  Batch size (number of tokens)
N  Output dimension (number of rows in B)
K  Input dimension

Definition at line 788 of file gemm_kernels_q5_0.c.

793 {
794  const block_q5_0 *blocks = (const block_q5_0 *)B;
795  const int blocks_per_row = K / QK5_0;
796 
797  for (int m = 0; m < M; m++) {
798  const float *a_row = &A[m * K];
799 
800  for (int n = 0; n < N; n++) {
801  float sum = 0.0f;
802 
803  for (int b = 0; b < blocks_per_row; b++) {
804  const block_q5_0 *block = &blocks[n * blocks_per_row + b];
805  const float d = CK_FP16_TO_FP32(block->d);
806  const float *ap = &a_row[b * QK5_0];
807 
808  uint32_t qh;
809  memcpy(&qh, block->qh, sizeof(qh));
810 
811  /* llama.cpp Q5_0 layout - second weight's high bit is qh bit (j+16); the shift is j+12 so it lands at mask 0x10 */
812  for (int j = 0; j < QK5_0 / 2; j++) {
813  const uint8_t packed = block->qs[j];
814  const int lo = (packed & 0x0F);
815  const int hi = (packed >> 4);
816  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
817  const int xh_1 = ((qh >> (j + 12))) & 0x10;
818  const int q0 = (lo | xh_0) - 16;
819  const int q1 = (hi | xh_1) - 16;
820 
821  sum += d * (float)q0 * ap[j];
822  sum += d * (float)q1 * ap[j + 16];
823  }
824  }
825 
826  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
827  }
828  }
829 }
#define CK_FP16_TO_FP32(x)

References C, CK_FP16_TO_FP32, block_q5_0::d, block_q5_0::qh, QK5_0, and block_q5_0::qs.

Referenced by gemm_nt_q5_0_sse_v2().

◆ gemm_nt_q5_0_sse_v2()

void gemm_nt_q5_0_sse_v2 ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 77 of file gemm_kernels_q5_0_sse_v2.c.

82 {
83  if (K % QK_K != 0) {
84  gemm_nt_q5_0_ref(A, B, bias, C, M, N, K);
85  return;
86  }
87 
88  size_t q8_size = (K / QK_K) * sizeof(block_q8_K);
89  block_q8_K *A_q8 = (block_q8_K *)alloca(q8_size);
90 
91  const block_q5_0 *weights = (const block_q5_0 *)B;
92  const int blocks_per_row = K / 32;
93 
94  for (int m = 0; m < M; m++) {
95  quantize_row_q8_k(&A[m * K], A_q8, K);
96 
97  for (int n = 0; n < N; n++) {
98  float sumf = 0.0f;
99  const block_q5_0 *w_row = weights + n * blocks_per_row;
100 
101  for (int b = 0; b < blocks_per_row; b++) {
102  int q8_block_idx = (b * 32) / QK_K;
103  int q8_offset = (b * 32) % QK_K;
104  sumf += dot_q5_0_q8_k_32_sse(&w_row[b], &A_q8[q8_block_idx], q8_offset);
105  }
106 
107  C[m * N + n] = sumf + (bias ? bias[n] : 0.0f);
108  }
109  }
110 }
#define QK_K
void quantize_row_q8_k(const float *x, void *vy, int k)
void gemm_nt_q5_0_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
GEMM with transposed Q5_0 weights: C = A @ B^T.
static float dot_q5_0_q8_k_32_sse(const block_q5_0 *bw, const block_q8_K *ba, int q8_offset)

◆ gemm_q5_0()

void gemm_q5_0 ( float *  Y,
const void *  W,
const float *  X,
int  M,
int  N,
int  K 
)

Matrix-matrix multiply with Q5_0 weights.

Definition at line 682 of file gemm_kernels_q5_0.c.

686 {
687  for (int n = 0; n < N; n++) {
688  gemv_q5_0(&Y[n * M], W, &X[n * K], M, K);
689  }
690 }

References gemv_q5_0().

Referenced by gemm_nt_q5_0().

◆ gemm_q5_0_backward()

void gemm_q5_0_backward ( float *  dX,
const void *  W,
const float *  dY,
int  M,
int  N,
int  K 
)

Batched backward pass.

Definition at line 762 of file gemm_kernels_q5_0.c.

766 {
767  for (int n = 0; n < N; n++) {
768  gemv_q5_0_backward(&dX[n * K], W, &dY[n * M], M, K);
769  }
770 }
void gemv_q5_0_backward(float *dX, const void *W, const float *dY, int M, int K)
Auto-dispatch backward.

References gemv_q5_0_backward().

◆ gemv_q5_0()

void gemv_q5_0 ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Auto-dispatch GEMV for Q5_0 weights based on CPU features.

Dispatch priority (best available):

  1. AVX-512 (512-bit vectors) - Intel Skylake-X+
  2. AVX2+FMA (256-bit vectors) - Intel Haswell+
  3. AVX (256-bit vectors) - Intel Sandy Bridge+
  4. SSE4.1 (128-bit vectors) - Intel Nehalem+
  5. Reference (scalar) - Fallback

Uses ck_features.h for standardized feature detection.

Parameters
y  Output vector [M]
W  Weight matrix in Q5_0 format [M x K]
x  Input vector [K]
M  Number of output rows
K  Number of input columns (hidden dimension)

Definition at line 547 of file gemm_kernels_q5_0.c.

551 {
552 // Dispatch order: AVX512 > AVX2 > AVX > SSE > ref
553 #if defined(__AVX512F__)
554  gemv_q5_0_avx512(y, W, x, M, K);
555 #elif defined(__AVX2__)
556  gemv_q5_0_avx2(y, W, x, M, K);
557 #elif defined(__AVX__)
558  gemv_q5_0_avx(y, W, x, M, K);
559 #elif defined(__SSE4_1__)
560  gemv_q5_0_sse_v2(y, W, x, M, K);
561 #else
562  gemv_q5_0_ref(y, W, x, M, K);
563 #endif
564 }
void gemv_q5_0_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q5_0 weights (scalar reference)

References gemv_q5_0_ref().

Referenced by dot_q5_0(), gemm_nt_q5_0(), and gemm_q5_0().

◆ gemv_q5_0_backward()

void gemv_q5_0_backward ( float *  dX,
const void *  W,
const float *  dY,
int  M,
int  K 
)

Auto-dispatch backward.

Definition at line 751 of file gemm_kernels_q5_0.c.

755 {
756  gemv_q5_0_backward_ref(dX, W, dY, M, K);
757 }
void gemv_q5_0_backward_ref(float *dX, const void *W, const float *dY, int M, int K)
Backward pass: compute input gradient.

References gemv_q5_0_backward_ref().

Referenced by gemm_q5_0_backward().

◆ gemv_q5_0_backward_ref()

void gemv_q5_0_backward_ref ( float *  dX,
const void *  W,
const float *  dY,
int  M,
int  K 
)

Backward pass: compute input gradient.

Parameters
dX  Output gradient w.r.t. input [K]
W  Weight matrix in Q5_0 format [M x K]
dY  Gradient w.r.t. output [M]
M  Number of output rows
K  Number of columns (input dimension)

Definition at line 705 of file gemm_kernels_q5_0.c.

709 {
710  const block_q5_0 *blocks = (const block_q5_0 *)W;
711  const int blocks_per_row = K / QK5_0;
712 
713  /* Zero output gradient */
714  memset(dX, 0, K * sizeof(float));
715 
716  /* Accumulate: dX += W^T @ dY */
717  for (int row = 0; row < M; row++) {
718  const float dy = dY[row];
719 
720  for (int b = 0; b < blocks_per_row; b++) {
721  const block_q5_0 *block = &blocks[row * blocks_per_row + b];
722  const float d = CK_FP16_TO_FP32(block->d);
723  float *dxp = &dX[b * QK5_0];
724 
725  /* Get high bits */
726  uint32_t qh;
727  memcpy(&qh, block->qh, sizeof(qh));
728 
729  /* llama.cpp Q5_0 layout - second weight's high bit is qh bit (j+16); the shift is j+12 so it lands at mask 0x10 */
730  for (int j = 0; j < QK5_0 / 2; j++) {
731  const uint8_t packed = block->qs[j];
732 
733  /* Extract and reconstruct 5-bit values */
734  const int lo = (packed & 0x0F);
735  const int hi = (packed >> 4);
736  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
737  const int xh_1 = ((qh >> (j + 12))) & 0x10;
738  const int q0 = (lo | xh_0) - 16;
739  const int q1 = (hi | xh_1) - 16;
740 
741  dxp[j] += d * (float)q0 * dy;
742  dxp[j + 16] += d * (float)q1 * dy;
743  }
744  }
745  }
746 }

References CK_FP16_TO_FP32, block_q5_0::d, block_q5_0::qh, QK5_0, and block_q5_0::qs.

Referenced by gemv_q5_0_backward().

◆ gemv_q5_0_parallel()

void gemv_q5_0_parallel ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K,
int  ith,
int  nth 
)

Parallel reference GEMV for Q5_0 × FP32.

Definition at line 576 of file gemm_kernels_q5_0.c.

581 {
582  if (!y || !W || !x || M <= 0 || K <= 0) return;
583  if (ith < 0 || nth <= 0 || ith >= nth) return;
584 
585  const int dr = (M + nth - 1) / nth;
586  const int r0 = dr * ith;
587  const int r1 = (r0 + dr < M) ? (r0 + dr) : M;
588 
589  if (r0 >= M) return;
590 
591  const block_q5_0 *blocks = (const block_q5_0 *)W;
592  const int blocks_per_row = K / QK5_0;
593 
594  for (int row = r0; row < r1; row++) {
595  float sum = 0.0f;
596  for (int b = 0; b < blocks_per_row; b++) {
597  const block_q5_0 *block = &blocks[row * blocks_per_row + b];
598  const float d = CK_FP16_TO_FP32(block->d);
599  const float *xp = &x[b * QK5_0];
600 
601  uint32_t qh;
602  memcpy(&qh, block->qh, sizeof(qh));
603 
604  for (int j = 0; j < QK5_0 / 2; j++) {
605  const uint8_t packed = block->qs[j];
606  const int lo = (packed & 0x0F);
607  const int hi = (packed >> 4);
608  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
609  const int xh_1 = ((qh >> (j + 12))) & 0x10;
610  const int w0 = (lo | xh_0) - 16;
611  const int w1 = (hi | xh_1) - 16;
612  sum += d * (w0 * xp[j] + w1 * xp[j + QK5_0/2]);
613  }
614  }
615  y[row] = sum;
616  }
617 }

References CK_FP16_TO_FP32, block_q5_0::d, block_q5_0::qh, QK5_0, and block_q5_0::qs.

Referenced by gemv_q5_0_parallel_simd().

◆ gemv_q5_0_parallel_simd()

void gemv_q5_0_parallel_simd ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K,
int  ith,
int  nth 
)

Parallel SIMD GEMV for Q5_0 × FP32 with prefetching.

Definition at line 622 of file gemm_kernels_q5_0.c.

627 {
628  if (!y || !W || !x || M <= 0 || K <= 0) return;
629  if (ith < 0 || nth <= 0 || ith >= nth) return;
630 
631  const int dr = (M + nth - 1) / nth;
632  const int r0 = dr * ith;
633  const int r1 = (r0 + dr < M) ? (r0 + dr) : M;
634 
635  if (r0 >= M) return;
636 
637  const block_q5_0 *blocks = (const block_q5_0 *)W;
638  const int blocks_per_row = K / QK5_0;
639 
640 #if defined(__AVX__) || defined(__SSE4_1__)
641  /* Prefetch first few rows */
642  const int PREFETCH_ROWS = 4;
643  for (int p = 0; p < PREFETCH_ROWS && r0 + p < r1; ++p) {
644  const char *row_ptr = (const char *)(blocks + (r0 + p) * blocks_per_row);
645  _mm_prefetch(row_ptr, _MM_HINT_T0);
646  _mm_prefetch(row_ptr + 64, _MM_HINT_T0);
647  }
648 
649  for (int row = r0; row < r1; ++row) {
650  /* Prefetch rows ahead */
651  if (row + PREFETCH_ROWS < r1) {
652  const char *prefetch_ptr = (const char *)(blocks + (row + PREFETCH_ROWS) * blocks_per_row);
653  _mm_prefetch(prefetch_ptr, _MM_HINT_T0);
654  _mm_prefetch(prefetch_ptr + 64, _MM_HINT_T0);
655  }
656 
657  /* Use SIMD dot product for this row */
658 #if defined(__AVX512F__)
659  /* Call single-row AVX512 implementation */
660  gemv_q5_0_avx512(&y[row], (const char *)blocks + row * blocks_per_row * sizeof(block_q5_0), x, 1, K);
661 #elif defined(__AVX2__)
662  gemv_q5_0_avx2(&y[row], (const char *)blocks + row * blocks_per_row * sizeof(block_q5_0), x, 1, K);
663 #elif defined(__AVX__)
664  gemv_q5_0_avx(&y[row], (const char *)blocks + row * blocks_per_row * sizeof(block_q5_0), x, 1, K);
665 #else
666  gemv_q5_0_sse_v2(&y[row], (const char *)blocks + row * blocks_per_row * sizeof(block_q5_0), x, 1, K);
667 #endif
668  }
669 #else
670  /* Fallback to reference parallel */
671  gemv_q5_0_parallel(y, W, x, M, K, ith, nth);
672 #endif
673 }
void gemv_q5_0_parallel(float *y, const void *W, const float *x, int M, int K, int ith, int nth)
Parallel reference GEMV for Q5_0 × FP32.

References gemv_q5_0_parallel(), and QK5_0.

◆ gemv_q5_0_q8_0()

void gemv_q5_0_q8_0 ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Matrix-vector multiply with Q5_0 weights and Q8_0 input.

Parameters
y  Output vector [M]
W  Weight matrix in Q5_0 format [M x K]
x_q8  Input vector in Q8_0 format [K]
M  Number of output rows
K  Number of columns (must be multiple of 32)

Definition at line 1529 of file gemm_kernels_q5_0.c.

1533 {
1534  const block_q5_0 *w_blocks = (const block_q5_0 *)W;
1535  const block_q8_0 *x_blocks = (const block_q8_0 *)x_q8;
1536  const int blocks_per_row = K / QK5_0;
1537 
1538  for (int row = 0; row < M; row++) {
1539  vec_dot_q5_0_q8_0(K, &y[row],
1540  &w_blocks[row * blocks_per_row],
1541  x_blocks);
1542  }
1543 }

References QK5_0, and vec_dot_q5_0_q8_0().

Referenced by ck_test_gemv_q5_0(), and ck_test_gemv_q5_0_q8_0().

◆ gemv_q5_0_q8_0_parallel_simd()

void gemv_q5_0_q8_0_parallel_simd ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K,
int  ith,
int  nth 
)

Parallel SIMD GEMV for Q5_0 x Q8_0 with prefetching.

Each thread processes rows [r0, r1) where r0 = ith * ceil(M/nth). Uses vec_dot_q5_0_q8_0 dispatch (auto-selects AVX512/AVX/SSE/scalar).

Definition at line 1551 of file gemm_kernels_q5_0.c.

1556 {
1557  if (!y || !W || !x_q8 || M <= 0 || K <= 0) return;
1558  if (ith < 0 || nth <= 0 || ith >= nth) return;
1559 
1560  const int dr = (M + nth - 1) / nth;
1561  const int r0 = dr * ith;
1562  const int r1 = (r0 + dr < M) ? (r0 + dr) : M;
1563 
1564  if (r0 >= M) return;
1565 
1566  const block_q5_0 *w_blocks = (const block_q5_0 *)W;
1567  const block_q8_0 *x_blocks = (const block_q8_0 *)x_q8;
1568  const int blocks_per_row = K / QK5_0;
1569 
1570 #if defined(__AVX__) || defined(__SSE4_1__)
1571  const int PREFETCH_ROWS = 4;
1572  for (int p = 0; p < PREFETCH_ROWS && r0 + p < r1; ++p) {
1573  const char *row_ptr = (const char *)(w_blocks + (r0 + p) * blocks_per_row);
1574  _mm_prefetch(row_ptr, _MM_HINT_T0);
1575  _mm_prefetch(row_ptr + 64, _MM_HINT_T0);
1576  }
1577 
1578  for (int row = r0; row < r1; ++row) {
1579  if (row + PREFETCH_ROWS < r1) {
1580  const char *pf = (const char *)(w_blocks + (row + PREFETCH_ROWS) * blocks_per_row);
1581  _mm_prefetch(pf, _MM_HINT_T0);
1582  _mm_prefetch(pf + 64, _MM_HINT_T0);
1583  }
1584 
1585  vec_dot_q5_0_q8_0(K, &y[row],
1586  &w_blocks[row * blocks_per_row],
1587  x_blocks);
1588  }
1589 #else
1590  for (int row = r0; row < r1; row++) {
1591  vec_dot_q5_0_q8_0(K, &y[row],
1592  &w_blocks[row * blocks_per_row],
1593  x_blocks);
1594  }
1595 #endif
1596 }

References QK5_0, and vec_dot_q5_0_q8_0().

◆ gemv_q5_0_ref()

void gemv_q5_0_ref ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Matrix-vector multiply with Q5_0 weights (scalar reference)

Parameters
y  Output vector [M]
W  Weight matrix in Q5_0 format [M x K]
x  Input vector [K]
M  Number of output rows
K  Number of columns (must be multiple of 32)

Definition at line 64 of file gemm_kernels_q5_0.c.

68 {
69  const block_q5_0 *blocks = (const block_q5_0 *)W;
70  const int blocks_per_row = K / QK5_0;
71 
72  for (int row = 0; row < M; row++) {
73  float sum = 0.0f;
74 
75  for (int b = 0; b < blocks_per_row; b++) {
76  const block_q5_0 *block = &blocks[row * blocks_per_row + b];
77  const float d = CK_FP16_TO_FP32(block->d);
78  const float *xp = &x[b * QK5_0];
79 
80  /* Get high bits as 32-bit integer */
81  uint32_t qh;
82  memcpy(&qh, block->qh, sizeof(qh));
83 
84  /* llama.cpp Q5_0 layout:
85  * - Weight j uses: low nibble of qs[j], high bit from qh bit j
86  * - Weight j+16 uses: high nibble of qs[j], high bit from qh bit (j+16)
87  * Note: the shift amount is j+12 (not j+16) because the extracted bit
88  * must land at position 4 to match the 0x10 mask.
89  */
89  for (int j = 0; j < QK5_0 / 2; j++) {
90  const uint8_t packed = block->qs[j];
91 
92  /* Extract nibbles */
93  const int lo = (packed & 0x0F);
94  const int hi = (packed >> 4);
95 
96  /* Extract high bits - matches llama.cpp exactly */
97  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
98  const int xh_1 = ((qh >> (j + 12))) & 0x10;
99 
100  /* Combine to 5-bit signed value */
101  const int q0 = (lo | xh_0) - 16;
102  const int q1 = (hi | xh_1) - 16;
103 
104  /* Weights at indices j and j+16 */
105  sum += d * (float)q0 * xp[j];
106  sum += d * (float)q1 * xp[j + 16];
107  }
108  }
109 
110  y[row] = sum;
111  }
112 }

References CK_FP16_TO_FP32, block_q5_0::d, block_q5_0::qh, QK5_0, and block_q5_0::qs.

Referenced by gemv_q5_0().

◆ vec_dot_q5_0_q8_0()

void vec_dot_q5_0_q8_0 ( int  n,
float *  s,
const void *  vx,
const void *  vy 
)

Auto-dispatch quantized dot product Q5_0 x Q8_0.

Dispatch priority:

  1. AVX512 (best performance on modern Intel/AMD)
  2. AVX (256-bit float ops, works on Sandy/Ivy Bridge and newer)
  3. SSSE3 (128-bit fallback)
  4. Reference scalar (last resort)

Definition at line 1498 of file gemm_kernels_q5_0.c.

1499 {
1500 #if defined(__AVX512F__)
1501  vec_dot_q5_0_q8_0_avx512(n, s, vx, vy);
1502 #elif defined(__AVX__)
1503  /* AVX for 256-bit float ops (works on Ivy Bridge and newer) */
1504  vec_dot_q5_0_q8_0_avx(n, s, vx, vy);
1505 #elif defined(__SSSE3__)
1506  /* SSSE3 - most efficient on older CPUs */
1507  vec_dot_q5_0_q8_0_sse(n, s, vx, vy);
1508 #else
1509  vec_dot_q5_0_q8_0_ref(n, s, vx, vy);
1510 #endif
1511 }
void vec_dot_q5_0_q8_0_ref(int n, float *s, const void *vx, const void *vy)
Quantized dot product: Q5_0 weights x Q8_0 input (scalar reference)

References vec_dot_q5_0_q8_0_ref().

Referenced by ck_test_vec_dot_q5_0_q8_0(), gemm_nt_q5_0_q8_0(), gemv_fused_q5_0_bias_parallel_omp(), gemv_q5_0_from_fp32(), gemv_q5_0_q8_0(), gemv_q5_0_q8_0_parallel_omp(), gemv_q5_0_q8_0_parallel_simd(), mega_fused_attention_decode_q5_0(), mega_fused_attention_decode_q5_0_parallel_simd(), and out_proj_head_major_q5_0_q8_0().

◆ vec_dot_q5_0_q8_0_ref()

void vec_dot_q5_0_q8_0_ref ( int  n,
float *  s,
const void *  vx,
const void *  vy 
)

Quantized dot product: Q5_0 weights x Q8_0 input (scalar reference)

Parameters
n  Number of elements (must be multiple of 32)
s  Output: scalar dot product result
vx  Q5_0 quantized weights
vy  Q8_0 quantized input

Definition at line 899 of file gemm_kernels_q5_0.c.

900 {
901  const int qk = QK5_0; /* 32 */
902  const int nb = n / qk;
903 
904  const block_q5_0 *x = (const block_q5_0 *)vx;
905  const block_q8_0 *y = (const block_q8_0 *)vy;
906 
907  float sumf = 0.0f;
908 
909  for (int ib = 0; ib < nb; ib++) {
910  /* Load high bits for this block */
911  uint32_t qh;
912  memcpy(&qh, x[ib].qh, sizeof(qh));
913 
914  int sumi0 = 0;
915  int sumi1 = 0;
916 
917  for (int j = 0; j < qk / 2; j++) {
918  /* Extract high bits - matches llama.cpp exactly */
919  const uint8_t xh_0 = ((qh & (1u << (j + 0))) >> (j + 0)) << 4;
920  const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
921 
922  /* Reconstruct 5-bit signed values (-16 to +15) */
923  const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
924  const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
925 
926  /* Integer dot product with Q8_0 values */
927  sumi0 += x0 * y[ib].qs[j];
928  sumi1 += x1 * y[ib].qs[j + qk / 2];
929  }
930 
931  int sumi = sumi0 + sumi1;
932  sumf += (CK_FP16_TO_FP32(x[ib].d) * CK_FP16_TO_FP32(y[ib].d)) * sumi;
933  }
934 
935  *s = sumf;
936 }
int8_t qs[32]

References CK_FP16_TO_FP32, QK5_0, and block_q8_0::qs.

Referenced by vec_dot_q5_0_q8_0().