← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemv_fused_quant_bias.c File Reference

Fused GEMV kernels with online quantization and bias. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <math.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

static int ck_round_nearest (float v)
 Round to nearest int, half away from zero (matches quantize_row_q8_0) More...
 
static float dot_fp32_q5_0_block (const float *x, const block_q5_0 *block)
 Compute dot product of FP32 input with Q5_0 weight block, with online Q8 quantization. More...
 
static float dot_fp32_q8_0_block (const float *x, const block_q8_0 *block)
 Compute dot product of FP32 input with Q8_0 weight block, with online Q8 quantization. More...
 
void gemv_fused_q5_0_bias (float *y, const void *W, const float *x, const float *bias, int M, int K)
 
void gemv_fused_q5_0_bias_dispatch (float *y, const void *W, const float *x, const float *bias, int M, int K)
 
void gemv_fused_q8_0_bias (float *y, const void *W, const float *x, const float *bias, int M, int K)
 
void gemv_fused_q8_0_bias_dispatch (float *y, const void *W, const float *x, const float *bias, int M, int K)
 

Detailed Description

Fused GEMV kernels with online quantization and bias.

These kernels fuse:

  1. Quantize FP32 input to Q8_0/Q8_K (no memory write)
  2. GEMV with quantized weights
  3. Bias add

Benefits:

  • Eliminates memory traffic for quantized activations
  • Better cache utilization
  • Reduces total ops in IR from 3 to 1

Kernel signature (shared by the Q5_0 and Q8_0 variants): gemv_fused_q5_0_bias(y, W, x, bias, M, K)

  • x: FP32 input [K]
  • W: Q5_0 weights [M, K]
  • bias: FP32 bias [M] (can be NULL)
  • y: FP32 output [M]

Definition in file gemv_fused_quant_bias.c.

Function Documentation

◆ ck_round_nearest()

static int ck_round_nearest ( float  v)
static inline

Round to nearest int, half away from zero (matches quantize_row_q8_0)

Definition at line 40 of file gemv_fused_quant_bias.c.

/**
 * Round a float to the nearest int, with halfway cases rounded away
 * from zero (e.g. 2.5 -> 3, -2.5 -> -3).  This matches the rounding
 * used by quantize_row_q8_0 so the fused kernels reproduce the
 * non-fused quantize-then-gemv results bit-for-bit.
 */
static inline int ck_round_nearest(float v)
{
    /* Shift by +/-0.5 toward the sign of v, then truncate toward zero. */
    const float half = (v < 0.0f) ? -0.5f : 0.5f;
    return (int)(v + half);
}

Referenced by dot_fp32_q5_0_block(), and dot_fp32_q8_0_block().

◆ dot_fp32_q5_0_block()

static float dot_fp32_q5_0_block ( const float *  x,
const block_q5_0 block 
)
static inline

Compute dot product of FP32 input with Q5_0 weight block, with online Q8 quantization.

Definition at line 375 of file gemv_fused_quant_bias.c.

375  {
376  const float d_w = CK_FP16_TO_FP32(block->d);
377 
378  float amax = 0.0f;
379  for (int j = 0; j < 32; j++) {
380  float ax = x[j] >= 0 ? x[j] : -x[j];
381  if (ax > amax) amax = ax;
382  }
383 
384  float d_x = amax / 127.0f;
385  d_x = CK_FP16_TO_FP32(CK_FP32_TO_FP16(d_x));
386  const float id_x = (amax != 0.0f) ? 127.0f / amax : 0.0f;
387  const float d = d_w * d_x;
388 
389  uint32_t qh;
390  memcpy(&qh, block->qh, sizeof(qh));
391 
392  int32_t sumi = 0;
393  for (int j = 0; j < 16; j++) {
394  const uint8_t packed = block->qs[j];
395  const int lo = (packed & 0x0F);
396  const int hi = (packed >> 4);
397  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
398  const int xh_1 = ((qh >> (j + 12))) & 0x10;
399  const int w0 = (lo | xh_0) - 16;
400  const int w1 = (hi | xh_1) - 16;
401 
402  float v0 = x[j] * id_x;
403  float v1 = x[j + 16] * id_x;
404  int q0 = ck_round_nearest(v0);
405  int q1 = ck_round_nearest(v1);
406  if (q0 > 127) q0 = 127; if (q0 < -127) q0 = -127;
407  if (q1 > 127) q1 = 127; if (q1 < -127) q1 = -127;
408 
409  sumi += q0 * w0 + q1 * w1;
410  }
411 
412  return d * (float)sumi;
413 }
#define CK_FP16_TO_FP32(x)
#define CK_FP32_TO_FP16(x)
static int ck_round_nearest(float v)
Round to nearest int, half away from zero (matches quantize_row_q8_0)
ck_half d
Definition: ckernel_quant.h:70
uint8_t qh[4]
Definition: ckernel_quant.h:71
uint8_t qs[32/2]
Definition: ckernel_quant.h:72

References CK_FP16_TO_FP32, CK_FP32_TO_FP16, ck_round_nearest(), block_q5_0::d, block_q5_0::qh, and block_q5_0::qs.

Referenced by gemv_fused_q5_0_bias().

◆ dot_fp32_q8_0_block()

static float dot_fp32_q8_0_block ( const float *  x,
const block_q8_0 block 
)
static inline

Compute dot product of FP32 input with Q8_0 weight block, with online Q8 quantization.

Definition at line 418 of file gemv_fused_quant_bias.c.

418  {
419  const float d_w = CK_FP16_TO_FP32(block->d);
420 
421  float amax = 0.0f;
422  for (int j = 0; j < 32; j++) {
423  float ax = x[j] >= 0 ? x[j] : -x[j];
424  if (ax > amax) amax = ax;
425  }
426 
427  float d_x = amax / 127.0f;
428  d_x = CK_FP16_TO_FP32(CK_FP32_TO_FP16(d_x));
429  const float id_x = (amax != 0.0f) ? 127.0f / amax : 0.0f;
430  const float d = d_w * d_x;
431 
432  int32_t sumi = 0;
433  for (int j = 0; j < 32; j++) {
434  float v = x[j] * id_x;
435  int q = ck_round_nearest(v);
436  if (q > 127) q = 127;
437  if (q < -127) q = -127;
438  sumi += q * (int32_t)block->qs[j];
439  }
440 
441  return d * (float)sumi;
442 }
int8_t qs[32]

References CK_FP16_TO_FP32, CK_FP32_TO_FP16, ck_round_nearest(), block_q8_0::d, and block_q8_0::qs.

Referenced by gemv_fused_q8_0_bias().

◆ gemv_fused_q5_0_bias()

void gemv_fused_q5_0_bias ( float *  y,
const void *  W,
const float *  x,
const float *  bias,
int  M,
int  K 
)

Definition at line 448 of file gemv_fused_quant_bias.c.

455 {
456  const block_q5_0 *blocks = (const block_q5_0 *)W;
457  const int blocks_per_row = K / QK5_0;
458 
459  for (int row = 0; row < M; row++) {
460  float sum = 0.0f;
461 
462  for (int b = 0; b < blocks_per_row; b++) {
463  const block_q5_0 *block = &blocks[row * blocks_per_row + b];
464  const float *xp = &x[b * QK5_0];
465  sum += dot_fp32_q5_0_block(xp, block);
466  }
467 
468  if (bias) {
469  sum += bias[row];
470  }
471 
472  y[row] = sum;
473  }
474 }
#define QK5_0
Definition: ckernel_quant.h:67
static float dot_fp32_q5_0_block(const float *x, const block_q5_0 *block)
Compute dot product of FP32 input with Q5_0 weight block, with online Q8 quantization.

References dot_fp32_q5_0_block(), and QK5_0.

Referenced by gemv_fused_q5_0_bias_dispatch().

◆ gemv_fused_q5_0_bias_dispatch()

void gemv_fused_q5_0_bias_dispatch ( float *  y,
const void *  W,
const float *  x,
const float *  bias,
int  M,
int  K 
)

Definition at line 508 of file gemv_fused_quant_bias.c.

/**
 * Dispatch wrapper for the fused Q5_0 GEMV: selects the AVX kernel when
 * compiled with AVX support, otherwise the portable scalar kernel.
 * Same contract as gemv_fused_q5_0_bias().
 */
void gemv_fused_q5_0_bias_dispatch(float *y, const void *W, const float *x,
                                   const float *bias, int M, int K)
{
#if !defined(__AVX__)
    /* Portable scalar fallback. */
    gemv_fused_q5_0_bias(y, W, x, bias, M, K);
#else
    /* AVX-accelerated path. */
    gemv_fused_q5_0_bias_avx(y, W, x, bias, M, K);
#endif
}
void gemv_fused_q5_0_bias(float *y, const void *W, const float *x, const float *bias, int M, int K)

References gemv_fused_q5_0_bias().

◆ gemv_fused_q8_0_bias()

void gemv_fused_q8_0_bias ( float *  y,
const void *  W,
const float *  x,
const float *  bias,
int  M,
int  K 
)

Definition at line 476 of file gemv_fused_quant_bias.c.

483 {
484  const block_q8_0 *blocks = (const block_q8_0 *)W;
485  const int blocks_per_row = K / QK8_0;
486 
487  for (int row = 0; row < M; row++) {
488  float sum = 0.0f;
489 
490  for (int b = 0; b < blocks_per_row; b++) {
491  const block_q8_0 *block = &blocks[row * blocks_per_row + b];
492  const float *xp = &x[b * QK8_0];
493  sum += dot_fp32_q8_0_block(xp, block);
494  }
495 
496  if (bias) {
497  sum += bias[row];
498  }
499 
500  y[row] = sum;
501  }
502 }
#define QK8_0
static float dot_fp32_q8_0_block(const float *x, const block_q8_0 *block)
Compute dot product of FP32 input with Q8_0 weight block, with online Q8 quantization.

References dot_fp32_q8_0_block(), and QK8_0.

Referenced by gemv_fused_q8_0_bias_dispatch().

◆ gemv_fused_q8_0_bias_dispatch()

void gemv_fused_q8_0_bias_dispatch ( float *  y,
const void *  W,
const float *  x,
const float *  bias,
int  M,
int  K 
)

Definition at line 523 of file gemv_fused_quant_bias.c.

/**
 * Dispatch wrapper for the fused Q8_0 GEMV: selects the AVX kernel when
 * compiled with AVX support, otherwise the portable scalar kernel.
 * Same contract as gemv_fused_q8_0_bias().
 */
void gemv_fused_q8_0_bias_dispatch(float *y, const void *W, const float *x,
                                   const float *bias, int M, int K)
{
#if !defined(__AVX__)
    /* Portable scalar fallback. */
    gemv_fused_q8_0_bias(y, W, x, bias, M, K);
#else
    /* AVX-accelerated path. */
    gemv_fused_q8_0_bias_avx(y, W, x, bias, M, K);
#endif
}
void gemv_fused_q8_0_bias(float *y, const void *W, const float *x, const float *bias, int M, int K)

References gemv_fused_q8_0_bias().