← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemv_fused_quant_bias.c File Reference

Fused GEMV kernels with online quantization and bias. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <math.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

static int ck_round_nearest (float v)
 Round to nearest int, half away from zero (matches quantize_row_q8_0) More...
 
static float dot_fp32_q5_0_block (const float *x, const block_q5_0 *block)
 Compute dot product of FP32 input with Q5_0 weight block, with online Q8 quantization. More...
 
static float dot_fp32_q8_0_block (const float *x, const block_q8_0 *block)
 Compute dot product of FP32 input with Q8_0 weight block, with online Q8 quantization. More...
 
void gemv_fused_q5_0_bias (float *y, const void *W, const float *x, const float *bias, int M, int K)
 
void gemv_fused_q5_0_bias_dispatch (float *y, const void *W, const float *x, const float *bias, int M, int K)
 
void gemv_fused_q8_0_bias (float *y, const void *W, const float *x, const float *bias, int M, int K)
 
void gemv_fused_q8_0_bias_dispatch (float *y, const void *W, const float *x, const float *bias, int M, int K)
 

Detailed Description

Fused GEMV kernels with online quantization and bias.

These kernels fuse:

  1. Quantize FP32 input to Q8_0/Q8_K (no memory write)
  2. GEMV with quantized weights
  3. Bias add

Benefits:

  • Eliminates memory traffic for quantized activations
  • Better cache utilization
  • Reduces total ops in IR from 3 to 1

Kernel signature (shared by the Q5_0 and Q8_0 variants): gemv_fused_q5_0_bias(y, W, x, bias, M, K)

  • x: FP32 input [K]
  • W: Q5_0 weights [M, K]
  • bias: FP32 bias [M] (can be NULL)
  • y: FP32 output [M]

Definition in file gemv_fused_quant_bias.c.

Function Documentation

◆ ck_round_nearest()

static int ck_round_nearest ( float  v)
static inline

Round to nearest int, half away from zero (matches quantize_row_q8_0)

Definition at line 40 of file gemv_fused_quant_bias.c.

/**
 * Round a float to the nearest int, with halfway cases rounded away
 * from zero (e.g. 2.5 -> 3, -2.5 -> -3).  This matches the rounding
 * used by quantize_row_q8_0 so the fused kernels reproduce the
 * non-fused quantize-then-gemv results bit-for-bit.
 */
static inline int ck_round_nearest(float v)
{
    /* Shift by +/-0.5 toward the sign of v, then truncate toward zero. */
    const float half = (v < 0.0f) ? -0.5f : 0.5f;
    return (int)(v + half);
}

Referenced by dot_fp32_q5_0_block(), and dot_fp32_q8_0_block().

◆ dot_fp32_q5_0_block()

static float dot_fp32_q5_0_block ( const float *  x,
const block_q5_0 block 
)
static inline

Compute dot product of FP32 input with Q5_0 weight block, with online Q8 quantization.

Definition at line 375 of file gemv_fused_quant_bias.c.

375  {
376  const float d_w = CK_FP16_TO_FP32(block->d);
377 
378  float amax = 0.0f;
379  for (int j = 0; j < 32; j++) {
380  float ax = x[j] >= 0 ? x[j] : -x[j];
381  if (ax > amax) amax = ax;
382  }
383 
384  float d_x = amax / 127.0f;
385  d_x = CK_FP16_TO_FP32(CK_FP32_TO_FP16(d_x));
386  const float id_x = (amax != 0.0f) ? 127.0f / amax : 0.0f;
387  const float d = d_w * d_x;
388 
389  uint32_t qh;
390  memcpy(&qh, block->qh, sizeof(qh));
391 
392  int32_t sumi = 0;
393  for (int j = 0; j < 16; j++) {
394  const uint8_t packed = block->qs[j];
395  const int lo = (packed & 0x0F);
396  const int hi = (packed >> 4);
397  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
398  const int xh_1 = ((qh >> (j + 12))) & 0x10;
399  const int w0 = (lo | xh_0) - 16;
400  const int w1 = (hi | xh_1) - 16;
401 
402  float v0 = x[j] * id_x;
403  float v1 = x[j + 16] * id_x;
404  int q0 = ck_round_nearest(v0);
405  int q1 = ck_round_nearest(v1);
406  if (q0 > 127) q0 = 127; if (q0 < -127) q0 = -127;
407  if (q1 > 127) q1 = 127; if (q1 < -127) q1 = -127;
408 
409  sumi += q0 * w0 + q1 * w1;
410  }
411 
412  return d * (float)sumi;
413 }
#define CK_FP16_TO_FP32(x)
#define CK_FP32_TO_FP16(x)
static int ck_round_nearest(float v)
Round to nearest int, half away from zero (matches quantize_row_q8_0)
ck_half d
Definition: ckernel_quant.h:70
uint8_t qh[4]
Definition: ckernel_quant.h:71
uint8_t qs[32/2]
Definition: ckernel_quant.h:72

References CK_FP16_TO_FP32, CK_FP32_TO_FP16, ck_round_nearest(), block_q5_0::d, block_q5_0::qh, and block_q5_0::qs.

Referenced by gemv_fused_q5_0_bias().

◆ dot_fp32_q8_0_block()

static float dot_fp32_q8_0_block ( const float *  x,
const block_q8_0 block 
)
static inline

Compute dot product of FP32 input with Q8_0 weight block, with online Q8 quantization.

Definition at line 418 of file gemv_fused_quant_bias.c.

418  {
419  const float d_w = CK_FP16_TO_FP32(block->d);
420 
421  float amax = 0.0f;
422  for (int j = 0; j < 32; j++) {
423  float ax = x[j] >= 0 ? x[j] : -x[j];
424  if (ax > amax) amax = ax;
425  }
426 
427  float d_x = amax / 127.0f;
428  d_x = CK_FP16_TO_FP32(CK_FP32_TO_FP16(d_x));
429  const float id_x = (amax != 0.0f) ? 127.0f / amax : 0.0f;
430  const float d = d_w * d_x;
431 
432  int32_t sumi = 0;
433  for (int j = 0; j < 32; j++) {
434  float v = x[j] * id_x;
435  int q = ck_round_nearest(v);
436  if (q > 127) q = 127;
437  if (q < -127) q = -127;
438  sumi += q * (int32_t)block->qs[j];
439  }
440 
441  return d * (float)sumi;
442 }
int8_t qs[32]

References CK_FP16_TO_FP32, CK_FP32_TO_FP16, ck_round_nearest(), block_q8_0::d, and block_q8_0::qs.

Referenced by gemv_fused_q8_0_bias().

◆ gemv_fused_q5_0_bias()

void gemv_fused_q5_0_bias ( float *  y,
const void *  W,
const float *  x,
const float *  bias,
int  M,
int  K 
)

Definition at line 448 of file gemv_fused_quant_bias.c.

455 {
456  const block_q5_0 *blocks = (const block_q5_0 *)W;
457  const int blocks_per_row = K / QK5_0;
458 
459  for (int row = 0; row < M; row++) {
460  float sum = 0.0f;
461 
462  for (int b = 0; b < blocks_per_row; b++) {
463  const block_q5_0 *block = &blocks[row * blocks_per_row + b];
464  const float *xp = &x[b * QK5_0];
465  sum += dot_fp32_q5_0_block(xp, block);
466  }
467 
468  if (bias) {
469  sum += bias[row];
470  }
471 
472  y[row] = sum;
473  }
474 }
#define QK5_0
Definition: ckernel_quant.h:67
static float dot_fp32_q5_0_block(const float *x, const block_q5_0 *block)
Compute dot product of FP32 input with Q5_0 weight block, with online Q8 quantization.

References dot_fp32_q5_0_block(), and QK5_0.

Referenced by gemv_fused_q5_0_bias_dispatch().

◆ gemv_fused_q5_0_bias_dispatch()

void gemv_fused_q5_0_bias_dispatch ( float *  y,
const void *  W,
const float *  x,
const float *  bias,
int  M,
int  K 
)

Definition at line 508 of file gemv_fused_quant_bias.c.

/**
 * Dispatch wrapper for the fused Q5_0 GEMV: selects the AVX kernel when
 * compiled with AVX support, otherwise the portable scalar kernel.
 * Same contract as gemv_fused_q5_0_bias().
 */
void gemv_fused_q5_0_bias_dispatch(float *y, const void *W, const float *x,
                                   const float *bias, int M, int K)
{
#if !defined(__AVX__)
    /* Portable scalar fallback. */
    gemv_fused_q5_0_bias(y, W, x, bias, M, K);
#else
    /* AVX-accelerated path. */
    gemv_fused_q5_0_bias_avx(y, W, x, bias, M, K);
#endif
}
void gemv_fused_q5_0_bias(float *y, const void *W, const float *x, const float *bias, int M, int K)

References gemv_fused_q5_0_bias().

◆ gemv_fused_q8_0_bias()

void gemv_fused_q8_0_bias ( float *  y,
const void *  W,
const float *  x,
const float *  bias,
int  M,
int  K 
)

Definition at line 476 of file gemv_fused_quant_bias.c.

483 {
484  const block_q8_0 *blocks = (const block_q8_0 *)W;
485  const int blocks_per_row = K / QK8_0;
486 
487  for (int row = 0; row < M; row++) {
488  float sum = 0.0f;
489 
490  for (int b = 0; b < blocks_per_row; b++) {
491  const block_q8_0 *block = &blocks[row * blocks_per_row + b];
492  const float *xp = &x[b * QK8_0];
493  sum += dot_fp32_q8_0_block(xp, block);
494  }
495 
496  if (bias) {
497  sum += bias[row];
498  }
499 
500  y[row] = sum;
501  }
502 }
#define QK8_0
static float dot_fp32_q8_0_block(const float *x, const block_q8_0 *block)
Compute dot product of FP32 input with Q8_0 weight block, with online Q8 quantization.

References dot_fp32_q8_0_block(), and QK8_0.

Referenced by gemv_fused_q8_0_bias_dispatch().

◆ gemv_fused_q8_0_bias_dispatch()

void gemv_fused_q8_0_bias_dispatch ( float *  y,
const void *  W,
const float *  x,
const float *  bias,
int  M,
int  K 
)

Definition at line 523 of file gemv_fused_quant_bias.c.

/**
 * Dispatch wrapper for the fused Q8_0 GEMV: selects the AVX kernel when
 * compiled with AVX support, otherwise the portable scalar kernel.
 * Same contract as gemv_fused_q8_0_bias().
 */
void gemv_fused_q8_0_bias_dispatch(float *y, const void *W, const float *x,
                                   const float *bias, int M, int K)
{
#if !defined(__AVX__)
    /* Portable scalar fallback. */
    gemv_fused_q8_0_bias(y, W, x, bias, M, K);
#else
    /* AVX-accelerated path. */
    gemv_fused_q8_0_bias_avx(y, W, x, bias, M, K);
#endif
}
void gemv_fused_q8_0_bias(float *y, const void *W, const float *x, const float *bias, int M, int K)

References gemv_fused_q8_0_bias().