← Back to C-Kernel-Engine Docs Doxygen Source Documentation
fused_rmsnorm_linear.c File Reference

Fused RMSNorm + Linear (GEMV) kernel. More...

#include <assert.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

static int ck_nearest_int_fused (float fval)
 
void fused_rmsnorm_linear_q4k (float *y, const float *x, const float *gamma, const void *W_q4k, int M, int K, float eps)
 Fused RMSNorm + Q4_K Linear projection. More...
 
void gemv_q4_k_q8_k (float *y, const void *W, const void *x_q8, int M, int K)
 
void unfused_rmsnorm_linear_q4k_ref (float *y, const float *x, const float *gamma, const void *W_q4k, int M, int K, float eps)
 Reference (unfused) implementation for correctness testing. More...
 

Detailed Description

Fused RMSNorm + Linear (GEMV) kernel.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. NO memcpy for layout - use strided access, not copies
  4. API must define: inputs, outputs, workspace, and memory layouts
  5. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

VIOLATION: This file contains free() calls and memcpy usage in the test/benchmark code at the end of the file. TODO: Move the test code to unittest/ and remove free()/memcpy from this kernel file.

FUSION BENEFIT:

Unfused: RMSNorm(x) → [DRAM write: norm_out] → Quantize → [DRAM write: q8] → GEMV Total DRAM: 2 writes + 2 reads = 4 * hidden_size bytes

Fused: RMSNorm(x) → [registers] → Quantize → [stack/L1: q8] → GEMV Total DRAM: 0 intermediate writes/reads

Expected: 2-4x memory traffic reduction for this operation

Definition in file fused_rmsnorm_linear.c.

Function Documentation

◆ ck_nearest_int_fused()

static int ck_nearest_int_fused ( float  fval)
inlinestatic

Definition at line 48 of file fused_rmsnorm_linear.c.

/*
 * Fast round-to-nearest(-even) float -> int conversion.
 *
 * Adding 1.5 * 2^23 (12582912.0f) forces the rounded integer into the low
 * mantissa bits of the float; masking and re-biasing recovers it. Valid only
 * for inputs well inside +/- 2^22, which holds for Q8_K quantization values
 * (iscale * norm is bounded by ~127).
 *
 * Uses union type punning (legal in C, C11 6.5.2.3) instead of memcpy so the
 * kernel file contains no memcpy calls, per CK-ENGINE kernel rule 3.
 *
 * @param fval  Value to round (|fval| must be << 2^22).
 * @return      fval rounded to the nearest integer (ties to even).
 */
static inline int ck_nearest_int_fused(float fval) {
    union { float f; int32_t i; } pun;
    pun.f = fval + 12582912.f; /* 1.5 * 2^23: pins the float ulp to exactly 1.0 */
    return (pun.i & 0x007fffff) - 0x00400000;
}

Referenced by fused_rmsnorm_linear_q4k().

◆ fused_rmsnorm_linear_q4k()

void fused_rmsnorm_linear_q4k ( float *  y,
const float *  x,
const float *  gamma,
const void *  W_q4k,
int  M,
int  K,
float  eps 
)

Fused RMSNorm + Q4_K Linear projection.

Computes: y = Linear(RMSNorm(x)) where Linear uses Q4_K weights and Q8_K activations internally.

The key optimization is that the normalized values never touch DRAM - they go directly from RMSNorm computation to Q8_K quantization to GEMV.

Parameters
yOutput (FP32), shape [M]
xInput hidden state (FP32), shape [K]
gammaRMSNorm scale weights (FP32), shape [K]
W_q4kLinear weights in Q4_K format, shape [M, K]
MOutput dimension (e.g., 3 * hidden for QKV)
KInput dimension (hidden_size)
epsRMSNorm epsilon (typically 1e-5 or 1e-6)

Definition at line 83 of file fused_rmsnorm_linear.c.

89 {
/* Guard: reject NULL pointers and non-positive dims; y is left untouched. */
90  if (!y || !x || !gamma || !W_q4k || M <= 0 || K <= 0) {
91  return;
92  }
93 
94  assert(K % QK_K == 0);
95  const int nb = K / QK_K; /* Number of Q8_K blocks */
96 
97  /* Stack-allocated Q8_K buffer - stays in L1/L2 cache */
98  /* Max supported K = 8192 (32 blocks of 256) */
99  block_q8_K q8_buffer[32]; /* 32 * ~260 bytes = ~8KB on stack */
/* NOTE(review): in release builds (NDEBUG) this assert compiles out and
 * nb > 32 would overflow the stack buffer — verify callers bound K. */
100  assert(nb <= 32 && "K too large for stack buffer");
101 
102  /* ================================================================
103  * PHASE 1: Compute RMSNorm and quantize to Q8_K
104  * Result stays in stack (L1/L2), never touches DRAM
105  * ================================================================ */
106 
107 #if defined(__AVX512F__)
108  /* AVX-512: Compute sum of squares */
109  __m512 sum_sq_vec = _mm512_setzero_ps();
110  int d = 0;
111  for (; d + 16 <= K; d += 16) {
112  __m512 xv = _mm512_loadu_ps(&x[d]);
113  sum_sq_vec = _mm512_fmadd_ps(xv, xv, sum_sq_vec);
114  }
115  float sum_sq = _mm512_reduce_add_ps(sum_sq_vec);
/* Scalar tail for K not a multiple of 16 (unreachable when K % QK_K == 0,
 * since QK_K is a multiple of 16, but kept for safety). */
116  for (; d < K; ++d) {
117  sum_sq += x[d] * x[d];
118  }
119 
120 #elif defined(__AVX__)
121  /* AVX: Compute sum of squares */
122  __m256 sum_sq_vec = _mm256_setzero_ps();
123  int d = 0;
124  for (; d + 8 <= K; d += 8) {
125  __m256 xv = _mm256_loadu_ps(&x[d]);
126  __m256 xv_sq = _mm256_mul_ps(xv, xv);
127  sum_sq_vec = _mm256_add_ps(sum_sq_vec, xv_sq);
128  }
129  float sum_sq = hsum256_ps_fused(sum_sq_vec);
130  for (; d < K; ++d) {
131  sum_sq += x[d] * x[d];
132  }
133 
134 #else
/* Scalar fallback accumulates in double; the SIMD paths accumulate in
 * float, so results may differ slightly across build configurations. */
135  /* Scalar fallback */
136  double sum_sq = 0.0;
137  for (int d = 0; d < K; ++d) {
138  double v = (double)x[d];
139  sum_sq += v * v;
140  }
141 #endif
142 
/* rstd = 1 / sqrt(mean(x^2) + eps): the RMSNorm reciprocal std factor. */
143  float mean_sq = (float)sum_sq / (float)K;
144  float rstd = 1.0f / sqrtf(mean_sq + eps);
145 
146  /* ================================================================
147  * PHASE 2: Apply RMSNorm and quantize to Q8_K in one pass
148  * Normalized values go directly to Q8_K blocks
149  * ================================================================ */
150 
151  for (int i = 0; i < nb; ++i) {
152  const float *x_block = x + i * QK_K;
153  const float *g_block = gamma + i * QK_K;
154 
155  /* Find max absolute value for this block's normalized output */
/* max_val keeps the SIGN of the extreme element (needed for the -127
 * scale convention below); amax is its absolute value. */
156  float max_val = 0.0f;
157  float amax = 0.0f;
158 
159 #if defined(__AVX512F__)
160  __m512 rstd_vec = _mm512_set1_ps(rstd);
161  __m512 max_vec = _mm512_setzero_ps();
162  __m512 sign_mask = _mm512_set1_ps(-0.0f);
163 
164  for (int j = 0; j < QK_K; j += 16) {
165  __m512 xv = _mm512_loadu_ps(&x_block[j]);
166  __m512 gv = _mm512_loadu_ps(&g_block[j]);
167  __m512 norm = _mm512_mul_ps(_mm512_mul_ps(xv, rstd_vec), gv);
/* andnot with -0.0f clears the sign bit: abs_norm = |norm| */
168  __m512 abs_norm = _mm512_andnot_ps(sign_mask, norm);
169  max_vec = _mm512_max_ps(max_vec, abs_norm);
170 
171  /* Track max with sign for scale computation */
/* NOTE(review): this signed-max recovery is convoluted — it rescans the
 * current 16-lane chunk scalar-side with a 1e-6 tolerance, which could
 * pick a near-tie element whose sign differs from the true max; that
 * would flip the sign of q8_buffer[i].d. Consider the simpler scalar
 * tracking used in the AVX/scalar paths — verify against parity tests. */
172  __mmask16 gt_mask = _mm512_cmp_ps_mask(abs_norm, _mm512_set1_ps(amax), _CMP_GT_OQ);
173  if (gt_mask) {
174  float temp_amax = _mm512_reduce_max_ps(abs_norm);
175  if (temp_amax > amax) {
176  amax = temp_amax;
177  /* Find the actual max value with sign */
178  for (int k = 0; k < 16; ++k) {
179  float v = x_block[j + k] * rstd * g_block[j + k];
180  if (fabsf(v) >= amax - 1e-6f) {
181  max_val = v;
182  break;
183  }
184  }
185  }
186  }
187  }
/* Final amax from the vector accumulator (same value the loop tracked). */
188  amax = _mm512_reduce_max_ps(max_vec);
189 
190 #elif defined(__AVX__)
191  __m256 rstd_vec = _mm256_set1_ps(rstd);
192 
193  for (int j = 0; j < QK_K; j += 8) {
194  __m256 xv = _mm256_loadu_ps(&x_block[j]);
195  __m256 gv = _mm256_loadu_ps(&g_block[j]);
196  __m256 norm = _mm256_mul_ps(_mm256_mul_ps(xv, rstd_vec), gv);
197 
198  /* Check each element for max */
199  float norm_arr[8];
200  _mm256_storeu_ps(norm_arr, norm);
201  for (int k = 0; k < 8; ++k) {
202  float av = fabsf(norm_arr[k]);
203  if (av > amax) {
204  amax = av;
205  max_val = norm_arr[k];
206  }
207  }
208  }
209 
210 #else
211  for (int j = 0; j < QK_K; ++j) {
212  float norm = x_block[j] * rstd * g_block[j];
213  float av = fabsf(norm);
214  if (av > amax) {
215  amax = av;
216  max_val = norm;
217  }
218  }
219 #endif
220 
221  /* Handle zero block */
222  if (amax < 1e-10f) {
223  q8_buffer[i].d = 0.0f;
224  memset(q8_buffer[i].qs, 0, sizeof(q8_buffer[i].qs));
225  memset(q8_buffer[i].bsums, 0, sizeof(q8_buffer[i].bsums));
226  continue;
227  }
228 
229  /* Compute scale and quantize */
/* -127/max_val maps the signed extreme to -127; d = 1/iscale is the
 * dequantization scale — presumably matching the llama.cpp Q8_K
 * convention used by gemv_q4_k_q8_k; confirm against parity tests. */
230  const float iscale = -127.0f / max_val;
231  q8_buffer[i].d = 1.0f / iscale;
232 
233  /* Quantize and compute bsums */
234  for (int j = 0; j < QK_K; ++j) {
235  float norm = x_block[j] * rstd * g_block[j];
236  int v = ck_nearest_int_fused(iscale * norm);
/* Clamp to int8 range before narrowing. */
237  v = (v > 127) ? 127 : ((v < -128) ? -128 : v);
238  q8_buffer[i].qs[j] = (int8_t)v;
239  }
240 
241  /* Compute block sums (16 elements each) */
242  for (int j = 0; j < QK_K / 16; ++j) {
243  int sum = 0;
244  const int8_t *qs = &q8_buffer[i].qs[j * 16];
245  for (int k = 0; k < 16; ++k) {
246  sum += qs[k];
247  }
248  q8_buffer[i].bsums[j] = (int16_t)sum;
249  }
250  }
251 
252  /* ================================================================
253  * PHASE 3: GEMV with Q4_K weights and Q8_K activations
254  * Q8_K data is in stack (L1/L2), not DRAM
255  * ================================================================ */
256 
257  gemv_q4_k_q8_k(y, W_q4k, q8_buffer, M, K);
258 }
#define QK_K
static int ck_nearest_int_fused(float fval)
void gemv_q4_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)
static float hsum256_ps_fused(__m256 v)
int8_t qs[256]
int16_t bsums[256/16]

References block_q8_K::bsums, ck_nearest_int_fused(), block_q8_K::d, gemv_q4_k_q8_k(), hsum256_ps_fused(), QK_K, and block_q8_K::qs.

◆ gemv_q4_k_q8_k()

void gemv_q4_k_q8_k ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Definition at line 239 of file gemm_kernels_q4k_q8k.c.

/* Compile-time dispatcher: selects the best GEMV implementation available
 * for the target ISA (VNNI > AVX2 > AVX > SSE4.1 > scalar reference).
 * All variants share the same contract: y[M] = W[M,K] (Q4_K) * x_q8 (Q8_K). */
243 {
244 #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
245  /* VNNI: Best for decode (single token) - INT8 dot product acceleration */
246  gemv_q4_k_q8_k_vnni(y, W, x_q8, M, K);
247 #elif defined(__AVX2__)
248  gemv_q4_k_q8_k_avx2(y, W, x_q8, M, K);
249 #elif defined(__AVX__)
250  /* AVX version uses maddubs_epi16 (more efficient than SSE) */
251  gemv_q4_k_q8_k_avx(y, W, x_q8, M, K);
252 #elif defined(__SSE4_1__)
253  gemv_q4_k_q8_k_sse(y, W, x_q8, M, K);
254 #else
/* Portable scalar fallback — also the correctness reference. */
255  gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
256 #endif
257 }
void gemv_q4_k_q8_k_avx2(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_vnni(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_avx(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_sse(float *y, const void *W, const void *x_q8, int M, int K)

Referenced by fused_rmsnorm_linear_q4k(), gemm_q4_k_q8_k(), and unfused_rmsnorm_linear_q4k_ref().

◆ unfused_rmsnorm_linear_q4k_ref()

void unfused_rmsnorm_linear_q4k_ref ( float *  y,
const float *  x,
const float *  gamma,
const void *  W_q4k,
int  M,
int  K,
float  eps 
)

Reference (unfused) implementation for correctness testing.

This is the SLOW version that does separate RMSNorm and GEMV calls, with intermediate results going to DRAM.

Definition at line 266 of file fused_rmsnorm_linear.c.

272 {
/* Guard: reject NULL pointers and non-positive dims; y is left untouched. */
273  if (!y || !x || !gamma || !W_q4k || M <= 0 || K <= 0) {
274  return;
275  }
276 
277  assert(K % QK_K == 0);
278 
279  /* Stack-allocated buffers (no malloc!) - stays in L1/L2 cache */
280  /* Max supported: K=4096 (16KB), 16 blocks (~5KB) */
/* NOTE(review): silent failure — when K > 4096 the function returns with
 * y unmodified and no error indication. Since this is test-only reference
 * code, consider an assert so misuse fails loudly. */
281  if (K > 4096) return;
282 
283  float norm_out[4096];
284  block_q8_K q8_buffer[16]; /* 16 blocks for K=4096, K/QK_K */
285 
286  /* Step 1: RMSNorm (stays in cache via stack buffer) */
/* Sum of squares in double for a high-precision reference value. */
287  double sum_sq = 0.0;
288  for (int d = 0; d < K; ++d) {
289  sum_sq += (double)x[d] * (double)x[d];
290  }
291  float rstd = 1.0f / sqrtf((float)(sum_sq / K) + eps);
292 
/* The DRAM READ/WRITE labels below model the memory traffic of a real
 * unfused pipeline (heap intermediates); here the buffers are on the
 * stack, so the labels are conceptual, not literal. */
293  for (int d = 0; d < K; ++d) {
294  norm_out[d] = x[d] * rstd * gamma[d]; /* DRAM WRITE */
295  }
296 
297  /* Step 2: Quantize (reads DRAM, writes DRAM) */
/* NOTE(review): function-local extern declaration — prefer declaring
 * quantize_row_q8_k in ckernel_quant.h so the signature is checked. */
298  extern void quantize_row_q8_k(const float *x, void *vy, int k);
299  quantize_row_q8_k(norm_out, q8_buffer, K); /* DRAM READ + WRITE */
300 
301  /* Step 3: GEMV (reads Q8_K from cache) */
302  gemv_q4_k_q8_k(y, W_q4k, q8_buffer, M, K);
303 
304  /* No free needed - stack buffers auto-deallocate */
305 }
void quantize_row_q8_k(const float *x, void *y, int k)

References gemv_q4_k_q8_k(), QK_K, and quantize_row_q8_k().