mega_fused_attention_decode_q5_0.c File Reference

Mega-fused attention decode with Q5_0 weights. More...

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include "ckernel_quant.h"


Functions

static void apply_rope_inline (float *q, float *k, const float *rope_cos, const float *rope_sin, int pos, int H, int KV, int AD)
 
void attention_forward_decode_head_major_gqa_flash (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
 
static void gemv_q5_0_from_fp32 (float *out, const void *W_q5_0, const float *x_fp32, const float *bias, int M, int K, block_q8_0 *x_q8_scratch)
 
static void gemv_q8_0_from_fp32 (float *out, const void *W_q8_0, const float *x_fp32, const float *bias, int M, int K, block_q8_0 *x_q8_scratch)
 
void mega_fused_attention_decode_q5_0 (float *output, const float *input, const float *residual, const void *wq_q5_0, const void *wk_q5_0, const void *wv_q8_0, const void *wo_q5_0, const float *ln_gamma, const float *bq, const float *bk, const float *bv, const float *bo, float *kv_cache_k, float *kv_cache_v, const float *rope_cos, const float *rope_sin, int pos, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, int cache_capacity, float eps, void *scratch)
 Serial mega-fused attention decode kernel. More...
 
void mega_fused_attention_decode_q5_0_parallel_simd (float *output, const float *input, const float *residual, const void *wq_q5_0, const void *wk_q5_0, const void *wv_q8_0, const void *wo_q5_0, const float *ln_gamma, const float *bq, const float *bk, const float *bv, const float *bo, float *kv_cache_k, float *kv_cache_v, const float *rope_cos, const float *rope_sin, int pos, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, int cache_capacity, float eps, void *scratch, int ith, int nth)
 Parallel SIMD mega-fused attention decode kernel (threadpool-aware) More...
 
int mega_fused_attention_decode_scratch_size (int AE, int H, int KV, int AD)
 Calculate scratch buffer size needed for the kernel. More...
 
void quantize_row_q8_0 (const float *x, void *vy, int k)
 Quantize FP32 to Q8_0 format (scalar reference) More...
 
void rmsnorm_forward (const float *input, const float *gamma, float *output, float *rstd, int T, int D, int AD, float eps)
 
void vec_dot_q5_0_q8_0 (int n, float *s, const void *vx, const void *vy)
 Auto-dispatch quantized dot product Q5_0 x Q8_0. More...
 
void vec_dot_q8_0_q8_0 (int n, float *s, const void *vx, const void *vy)
 Auto-dispatch quantized dot product Q8_0 x Q8_0. More...
 

Detailed Description

Mega-fused attention decode with Q5_0 weights.

STATUS: Serial kernel complete and correct. Parallel variant is a prototype that requires threadpool barrier support (not yet available). The non-fused decode path (ck_parallel_decode.h) already parallelizes each GEMV via row-splitting, so this fused kernel is not on the critical path. It can be enabled once threadpool parallelization is resolved (see PARALLELIZATION NOTES below).

FUSION: Combines 9 operations to minimize memory traffic. All intermediate data stays in scratch buffer (L1/L2 cache).

Operations fused:

  1. RMSNorm
  2. Q projection (Q5_0) with bias
  3. K projection (Q5_0) with bias
  4. V projection (Q8_0) with bias
  5. RoPE application
  6. KV cache store
  7. Flash attention decode (GQA-aware)
  8. O projection (Q5_0) with bias
  9. Residual add

PARALLELIZATION NOTES: The parallel_simd variant below documents the intended threading model but cannot run with the current threadpool (single dispatch, no mid-dispatch barrier). Three approaches were evaluated:

(A) Multi-dispatch (RECOMMENDED): Break into 3 ck_threadpool_dispatch() calls per layer:

  Dispatch 1: Row-split Q proj across threads. Thread 0 also does RMSNorm, K/V proj, RoPE, and the KV store (small ops that fit within the Q proj wall time).
  Dispatch 2: Split attention across heads (h_start..h_end per thread).
  Dispatch 3: Row-split O proj across threads. Thread 0 does the residual add after its rows.

Cost: ~1us total for 2 extra barrier round-trips (negligible vs ~100us GEMV). Intermediates stay in shared scratch, so the cache benefit is preserved.

(B) Redundant compute (single dispatch, no barrier): All threads redundantly compute RMSNorm + K/V proj + RoPE (~4us wasted per thread). Avoids barrier but wastes cycles on small ops. Only viable if Q/O proj dominate (true for short contexts).

(C) Skip fusion, use existing parallel GEMV: The non-fused decode path already parallelizes each GEMV call via ck_parallel_decode.h. For decode (M=1), intermediates are small (~3.5KB), so DRAM bandwidth savings from fusion are minimal. This is the current production path.

TESTING:

  make test-mega-fused-parity   # Numerical parity
  make test-mega-fused-speed    # Performance benchmark

Definition in file mega_fused_attention_decode_q5_0.c.

Function Documentation

◆ apply_rope_inline()

static void apply_rope_inline ( float *  q,
float *  k,
const float *  rope_cos,
const float *  rope_sin,
int  pos,
int  H,
int  KV,
int  AD 
)
inlinestatic

Definition at line 135 of file mega_fused_attention_decode_q5_0.c.

144 {
145  const int D = AD / 2;
146  const float *cos_row = &rope_cos[pos * D];
147  const float *sin_row = &rope_sin[pos * D];
148 
149  /* Q heads */
150  for (int h = 0; h < H; h++) {
151  float *q_head = &q[h * AD];
152  for (int d = 0; d < D; d++) {
153  float q0 = q_head[d];
154  float q1 = q_head[d + D];
155  q_head[d] = q0 * cos_row[d] - q1 * sin_row[d];
156  q_head[d + D] = q0 * sin_row[d] + q1 * cos_row[d];
157  }
158  }
159 
160  /* K heads */
161  for (int kv = 0; kv < KV; kv++) {
162  float *k_head = &k[kv * AD];
163  for (int d = 0; d < D; d++) {
164  float k0 = k_head[d];
165  float k1 = k_head[d + D];
166  k_head[d] = k0 * cos_row[d] - k1 * sin_row[d];
167  k_head[d + D] = k0 * sin_row[d] + k1 * cos_row[d];
168  }
169  }
170 }

Referenced by mega_fused_attention_decode_q5_0(), and mega_fused_attention_decode_q5_0_parallel_simd().

◆ attention_forward_decode_head_major_gqa_flash()

void attention_forward_decode_head_major_gqa_flash ( const float *  q_token,
const float *  k_cache,
const float *  v_cache,
float *  out_token,
int  num_heads,
int  num_kv_heads,
int  kv_tokens,
int  cache_capacity,
int  head_dim,
int  aligned_head_dim 
)

Flash attention decode (single token attends to KV cache)

Test:

test_flash_attention.py::TestFlashAttention::test_flash_decode

test_kv_cache_attention.py::TestKVCacheAttention::test_flash_decode

test_fused_attention_decode.py::TestFusedAttentionDecode::test_flash_decode

test_attention.py::TestAttentionForward::test_flash_decode

Single query token attends to kv_tokens in KV cache. Uses true flash attention from attention_flash_true.c.

After changes: make test && make llamacpp-parity-full

Definition at line 1467 of file attention_kernels.c.

1477 {
1478  if (!q_token || !k_cache || !v_cache || !out_token) {
1479  return;
1480  }
1481  if (num_heads <= 0 || num_kv_heads <= 0 || kv_tokens <= 0 || cache_capacity <= 0) {
1482  return;
1483  }
1484  if (kv_tokens > cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) {
1485  return;
1486  }
1487 
1488  const float scale = 1.0f / sqrtf((float)head_dim);
1489  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
1490 
1491  for (int h = 0; h < num_heads; ++h) {
1492  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1493  const float *q_head = q_token + (size_t)h * (size_t)aligned_head_dim;
1494  const float *k_head = k_cache + (size_t)kv_head * head_stride;
1495  const float *v_head = v_cache + (size_t)kv_head * head_stride;
1496  float *out_head = out_token + (size_t)h * (size_t)aligned_head_dim;
1497 
1498  attention_flash_decode(out_head,
1499  q_head,
1500  k_head,
1501  v_head,
1502  1,
1503  kv_tokens,
1504  1,
1505  aligned_head_dim,
1506  scale);
1507  }
1508 }

Referenced by mega_fused_attention_decode_q5_0(), and mega_fused_attention_decode_q5_0_parallel_simd().

◆ gemv_q5_0_from_fp32()

static void gemv_q5_0_from_fp32 ( float *  out,
const void *  W_q5_0,
const float *  x_fp32,
const float *  bias,
int  M,
int  K,
block_q8_0 x_q8_scratch 
)
inlinestatic

Definition at line 84 of file mega_fused_attention_decode_q5_0.c.

92 {
93  const block_q5_0 *w_blocks = (const block_q5_0 *)W_q5_0;
94  const int blocks_per_row = K / QK5_0;
95 
96  /* Quantize input to Q8_0 (reuse existing kernel, scratch buffer) */
97  quantize_row_q8_0(x_fp32, x_q8_scratch, K);
98 
99  /* Compute dot products using optimized kernel */
100  for (int row = 0; row < M; row++) {
101  float dot;
102  vec_dot_q5_0_q8_0(K, &dot, &w_blocks[row * blocks_per_row], x_q8_scratch);
103  out[row] = dot + (bias ? bias[row] : 0.0f);
104  }
105 }

References QK5_0, quantize_row_q8_0(), and vec_dot_q5_0_q8_0().

Referenced by mega_fused_attention_decode_q5_0(), and mega_fused_attention_decode_q5_0_parallel_simd().

◆ gemv_q8_0_from_fp32()

static void gemv_q8_0_from_fp32 ( float *  out,
const void *  W_q8_0,
const float *  x_fp32,
const float *  bias,
int  M,
int  K,
block_q8_0 x_q8_scratch 
)
inlinestatic

Definition at line 108 of file mega_fused_attention_decode_q5_0.c.

116 {
117  const block_q8_0 *w_blocks = (const block_q8_0 *)W_q8_0;
118  const int blocks_per_row = K / QK8_0;
119 
120  /* Quantize input to Q8_0 (reuse existing kernel, scratch buffer) */
121  quantize_row_q8_0(x_fp32, x_q8_scratch, K);
122 
123  /* Compute dot products */
124  for (int row = 0; row < M; row++) {
125  float dot;
126  vec_dot_q8_0_q8_0(K, &dot, &w_blocks[row * blocks_per_row], x_q8_scratch);
127  out[row] = dot + (bias ? bias[row] : 0.0f);
128  }
129 }

References QK8_0, quantize_row_q8_0(), and vec_dot_q8_0_q8_0().

Referenced by mega_fused_attention_decode_q5_0(), and mega_fused_attention_decode_q5_0_parallel_simd().

◆ mega_fused_attention_decode_q5_0()

void mega_fused_attention_decode_q5_0 ( float *  output,
const float *  input,
const float *  residual,
const void *  wq_q5_0,
const void *  wk_q5_0,
const void *  wv_q8_0,
const void *  wo_q5_0,
const float *  ln_gamma,
const float *  bq,
const float *  bk,
const float *  bv,
const float *  bo,
float *  kv_cache_k,
float *  kv_cache_v,
const float *  rope_cos,
const float *  rope_sin,
int  pos,
int  embed_dim,
int  aligned_embed_dim,
int  num_heads,
int  num_kv_heads,
int  head_dim,
int  aligned_head_dim,
int  cache_capacity,
float  eps,
void *  scratch 
)

Serial mega-fused attention decode kernel.

Parameters
output               Output [AE] (final result, after residual add)
input                Input activation [AE]
residual             Residual input for add [AE]
wq_q5_0              Q projection weights [H*AD, AE] Q5_0
wk_q5_0              K projection weights [KV*AD, AE] Q5_0
wv_q8_0              V projection weights [KV*AD, AE] Q8_0
wo_q5_0              O projection weights [AE, H*AD] Q5_0
ln_gamma             RMSNorm gamma [AE]
bq                   Q bias [H*AD] or NULL
bk                   K bias [KV*AD] or NULL
bv                   V bias [KV*AD] or NULL
bo                   O bias [AE] or NULL
kv_cache_k           K cache [KV, max_T, AD]
kv_cache_v           V cache [KV, max_T, AD]
rope_cos             RoPE cos [max_T, D]
rope_sin             RoPE sin [max_T, D]
pos                  Current position (0-indexed)
embed_dim            Original embedding dimension E
aligned_embed_dim    Aligned embedding dimension AE
num_heads            Number of query heads H
num_kv_heads         Number of key/value heads KV
head_dim             Head dimension AD
aligned_head_dim     Aligned head dimension AAD
cache_capacity       Maximum cache capacity max_T
eps                  RMSNorm epsilon
scratch              Scratch buffer (>= scratch_size bytes)

Definition at line 222 of file mega_fused_attention_decode_q5_0.c.

249 {
250  const int H = num_heads;
251  const int KV = num_kv_heads;
252  const int AD = head_dim;
253  const int AE = aligned_embed_dim;
254  (void)embed_dim; /* Unused but kept for API consistency */
255 
256  /* Parse scratch buffer - all allocations from scratch, no VLAs */
257  float *scratch_ptr = (float *)scratch;
258 
259  float *rmsnorm_out = scratch_ptr;
260  scratch_ptr += AE;
261 
262  float *rstd_scratch = scratch_ptr; /* For rmsnorm rstd output - avoids VLA */
263  scratch_ptr += AE;
264 
265  float *q = scratch_ptr;
266  scratch_ptr += H * AD;
267 
268  float *k = scratch_ptr;
269  scratch_ptr += KV * AD;
270 
271  float *v = scratch_ptr;
272  scratch_ptr += KV * AD;
273 
274  float *attn_out = scratch_ptr;
275  scratch_ptr += H * AD;
276 
277  block_q8_0 *x_q8_scratch = (block_q8_0 *)scratch_ptr;
278 
279  const int q_size = H * AD;
280  const int k_size = KV * AD;
281  const int v_size = KV * AD;
282 
283  /* ========================================================================
284  * STEP 1: RMSNorm
285  * Correct signature: rmsnorm_forward(in, gamma, out, rstd, T, D, AD, eps)
286  * T=1 (single token), D=AE (full embed dim for norm)
287  * ======================================================================== */
288  rmsnorm_forward(input, ln_gamma, rmsnorm_out, rstd_scratch, 1, AE, AD, eps);
289 
290  /* ========================================================================
291  * STEP 2-4: Q, K, V projections (fused with quantization)
292  * Use scratch buffer for quantized input
293  * ======================================================================== */
294  gemv_q5_0_from_fp32(q, wq_q5_0, rmsnorm_out, bq, q_size, AE, x_q8_scratch);
295  gemv_q5_0_from_fp32(k, wk_q5_0, rmsnorm_out, bk, k_size, AE, x_q8_scratch);
296  gemv_q8_0_from_fp32(v, wv_q8_0, rmsnorm_out, bv, v_size, AE, x_q8_scratch);
297 
298  /* ========================================================================
299  * STEP 5: Apply RoPE
300  * ======================================================================== */
301  apply_rope_inline(q, k, rope_cos, rope_sin, pos, H, KV, AD);
302 
303  /* ========================================================================
304  * STEP 6: Store K and V to cache
305  * Cache layout: [KV, cache_capacity, AD]
306  * ======================================================================== */
307  const size_t kv_stride = (size_t)cache_capacity * AD;
308  for (int kv = 0; kv < KV; kv++) {
309  float *k_cache = &kv_cache_k[kv * kv_stride];
310  float *v_cache = &kv_cache_v[kv * kv_stride];
311  const float *k_src = &k[kv * AD];
312  const float *v_src = &v[kv * AD];
313  const int offset = pos * AD;
314  for (int d = 0; d < AD; d++) {
315  k_cache[offset + d] = k_src[d];
316  v_cache[offset + d] = v_src[d];
317  }
318  }
319 
320  /* ========================================================================
321  * STEP 7: Flash attention decode (GQA-aware variant)
322  * attention_forward_decode_head_major_gqa_flash handles H != KV correctly
323  * It maps each of H heads to one of KV KV heads via: kv_head = h * KV / H
324  * ======================================================================== */
 325  attention_forward_decode_head_major_gqa_flash(
 326  q, kv_cache_k, kv_cache_v,
 327  attn_out, H, KV, pos + 1, cache_capacity, AD, aligned_head_dim);
328 
329  /* ========================================================================
330  * STEP 8: O projection (Q5_0 weights) with bias and residual add
331  *
332  * attn_out layout: [H * AD] flattened
333  * wo_q5_0 layout: [AE, H*AD] - row e has H*AD input features
334  *
335  * O projection: output[e] = dot(wo[e], attn_out) + bias[e] + residual[e]
336  *
337  * Using vec_dot_q5_0_q8_0 for efficient quantized dot product.
338  * ======================================================================== */
339 
340  /* Quantize attention output to Q8_0 for GEMV */
341  quantize_row_q8_0(attn_out, x_q8_scratch, H * AD);
342 
343  const block_q5_0 *wo = (const block_q5_0 *)wo_q5_0;
344  const int blocks_per_row = (H * AD) / QK5_0;
345 
346  for (int e = 0; e < AE; e++) {
347  float dot;
348  vec_dot_q5_0_q8_0(H * AD, &dot, &wo[e * blocks_per_row], x_q8_scratch);
349  output[e] = dot + (bo ? bo[e] : 0.0f) + residual[e];
350  }
351 }

References apply_rope_inline(), attention_forward_decode_head_major_gqa_flash(), gemv_q5_0_from_fp32(), gemv_q8_0_from_fp32(), QK5_0, quantize_row_q8_0(), rmsnorm_forward(), and vec_dot_q5_0_q8_0().

◆ mega_fused_attention_decode_q5_0_parallel_simd()

void mega_fused_attention_decode_q5_0_parallel_simd ( float *  output,
const float *  input,
const float *  residual,
const void *  wq_q5_0,
const void *  wk_q5_0,
const void *  wv_q8_0,
const void *  wo_q5_0,
const float *  ln_gamma,
const float *  bq,
const float *  bk,
const float *  bv,
const float *  bo,
float *  kv_cache_k,
float *  kv_cache_v,
const float *  rope_cos,
const float *  rope_sin,
int  pos,
int  embed_dim,
int  aligned_embed_dim,
int  num_heads,
int  num_kv_heads,
int  head_dim,
int  aligned_head_dim,
int  cache_capacity,
float  eps,
void *  scratch,
int  ith,
int  nth 
)

Parallel SIMD mega-fused attention decode kernel (threadpool-aware)

Parallelizes across attention heads using (ith, nth) pattern. Each thread processes a subset of heads.

IMPORTANT: Caller must ensure barrier sync between phases:

  Phase 1 (ith==0 only): RMSNorm, Q/K/V projection, RoPE, KV cache store
  -- BARRIER --
  Phase 2 (all threads): Attention for assigned heads
  -- BARRIER --
  Phase 3 (ith==0 only): O projection and residual add

Parameters
ith    Thread index (0 to nth-1)
nth    Total number of threads

Other parameters are the same as for the serial version.

Definition at line 367 of file mega_fused_attention_decode_q5_0.c.

396 {
397  const int H = num_heads;
398  const int KV = num_kv_heads;
399  const int AD = head_dim;
400  const int AE = aligned_embed_dim;
401  (void)embed_dim;
402 
403  /* Each thread handles a subset of heads */
404  const int heads_per_thread = (H + nth - 1) / nth;
405  const int h_start = ith * heads_per_thread;
406  const int h_end = (h_start + heads_per_thread < H) ? h_start + heads_per_thread : H;
407  const int my_heads = h_end - h_start;
408 
409  if (h_start >= H) return;
410 
411  /* Parse scratch buffer (shared across threads) */
412  float *scratch_ptr = (float *)scratch;
413 
414  float *rmsnorm_out = scratch_ptr;
415  scratch_ptr += AE;
416 
417  float *rstd_scratch = scratch_ptr;
418  scratch_ptr += AE;
419 
420  float *q = scratch_ptr;
421  scratch_ptr += H * AD;
422 
423  float *k = scratch_ptr;
424  scratch_ptr += KV * AD;
425 
426  float *v = scratch_ptr;
427  scratch_ptr += KV * AD;
428 
429  float *attn_out = scratch_ptr;
430  scratch_ptr += H * AD;
431 
432  block_q8_0 *x_q8_scratch = (block_q8_0 *)scratch_ptr;
433 
434  /* ========================================================================
435  * PHASE 1: Only thread 0 does RMSNorm and K/V projections
436  * These are shared across all heads.
437  * CALLER MUST BARRIER AFTER THIS PHASE.
438  * ======================================================================== */
439  if (ith == 0) {
440  rmsnorm_forward(input, ln_gamma, rmsnorm_out, rstd_scratch, 1, AE, AD, eps);
441 
442  gemv_q5_0_from_fp32(q, wq_q5_0, rmsnorm_out, bq, H * AD, AE, x_q8_scratch);
443  gemv_q5_0_from_fp32(k, wk_q5_0, rmsnorm_out, bk, KV * AD, AE, x_q8_scratch);
444  gemv_q8_0_from_fp32(v, wv_q8_0, rmsnorm_out, bv, KV * AD, AE, x_q8_scratch);
445 
446  apply_rope_inline(q, k, rope_cos, rope_sin, pos, H, KV, AD);
447 
448  /* Store K/V to cache */
449  const size_t kv_stride = (size_t)cache_capacity * AD;
450  for (int kv_idx = 0; kv_idx < KV; kv_idx++) {
451  float *k_cache = &kv_cache_k[kv_idx * kv_stride];
452  float *v_cache = &kv_cache_v[kv_idx * kv_stride];
453  const int offset = pos * AD;
454  for (int d = 0; d < AD; d++) {
455  k_cache[offset + d] = k[kv_idx * AD + d];
456  v_cache[offset + d] = v[kv_idx * AD + d];
457  }
458  }
459  }
460 
461  /* ========================================================================
462  * CALLER MUST BARRIER HERE
463  * All threads need to wait for thread 0 to finish projections
464  * ======================================================================== */
465 
466  /* ========================================================================
467  * PHASE 2: Each thread does attention for its heads only
468  * attention_forward_decode_head_major_gqa_flash expects:
469  * - q_token: pointer to start of Q for these heads
470  * - out_token: pointer to start of output for these heads
471  * - num_heads: number of heads THIS THREAD is processing
472  * ======================================================================== */
473  if (my_heads > 0) {
 474  attention_forward_decode_head_major_gqa_flash(
 475  &q[h_start * AD], /* Q for this thread's heads */
476  kv_cache_k, kv_cache_v,
477  &attn_out[h_start * AD], /* Output for this thread's heads */
478  my_heads, /* Only my_heads, not H */
479  KV, /* Still need all KV heads for GQA */
480  pos + 1, cache_capacity, AD, aligned_head_dim);
481  }
482 
483  /* ========================================================================
484  * CALLER MUST BARRIER HERE
485  * Thread 0 needs all threads to finish attention before O projection
486  * ======================================================================== */
487 
488  /* ========================================================================
489  * PHASE 3: Thread 0 does O projection and residual add
490  * ======================================================================== */
491  if (ith == 0) {
492  /* Quantize full attention output for O projection */
493  quantize_row_q8_0(attn_out, x_q8_scratch, H * AD);
494 
495  const block_q5_0 *wo = (const block_q5_0 *)wo_q5_0;
496  const int blocks_per_row = (H * AD) / QK5_0;
497 
498  for (int e = 0; e < AE; e++) {
499  float dot;
500  vec_dot_q5_0_q8_0(H * AD, &dot, &wo[e * blocks_per_row], x_q8_scratch);
501  output[e] = dot + (bo ? bo[e] : 0.0f) + residual[e];
502  }
503  }
504 }

References apply_rope_inline(), attention_forward_decode_head_major_gqa_flash(), gemv_q5_0_from_fp32(), gemv_q8_0_from_fp32(), QK5_0, quantize_row_q8_0(), rmsnorm_forward(), and vec_dot_q5_0_q8_0().

◆ mega_fused_attention_decode_scratch_size()

int mega_fused_attention_decode_scratch_size ( int  AE,
int  H,
int  KV,
int  AD 
)

Calculate scratch buffer size needed for the kernel.

Parameters
AE    Aligned embedding dimension (multiple of 64)
H     Number of query heads
KV    Number of key/value heads
AD    Head dimension
Returns
Size in bytes needed for scratch buffer

Definition at line 176 of file mega_fused_attention_decode_q5_0.c.

176  {
177  /* Need: 1x AE for RMSNorm output
178  1x AE for RMSNorm rstd (avoid VLA)
179  1x H*AD for Q
180  1x KV*AD for K
181  1x KV*AD for V
182  1x H*AD for attention output
183  1x max(AE, H*AD)/QK8_0 * sizeof(block_q8_0) for GEMV scratch
184  */
185  int max_input_dim = (AE > H * AD) ? AE : H * AD;
186  int q8_blocks = (max_input_dim + QK8_0 - 1) / QK8_0;
187  return (int)(sizeof(float) * (AE + AE + H * AD + 2 * KV * AD + H * AD)
188  + q8_blocks * sizeof(block_q8_0));
189 }

References QK8_0.

◆ quantize_row_q8_0()

void quantize_row_q8_0 ( const float *  x,
void *  vy,
int  k 
)

Quantize FP32 to Q8_0 format (scalar reference)

Parameters
x     Input FP32 values
vy    Output Q8_0 blocks
k     Number of elements (must be multiple of 32)

Definition at line 59 of file gemm_kernels_q8_0.c.

60 {
61  block_q8_0 *y = (block_q8_0 *)vy;
62  const int nb = k / QK8_0; /* QK8_0 = 32 */
63 
64 #if defined(__AVX__)
65  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
66  const __m256 v_half = _mm256_set1_ps(0.5f);
67  const __m256 v_min = _mm256_set1_ps(-127.0f);
68  const __m256 v_max = _mm256_set1_ps(127.0f);
69 
70  for (int i = 0; i < nb; i++) {
71  __m256 v0 = _mm256_loadu_ps(x + 0);
72  __m256 v1 = _mm256_loadu_ps(x + 8);
73  __m256 v2 = _mm256_loadu_ps(x + 16);
74  __m256 v3 = _mm256_loadu_ps(x + 24);
75  x += QK8_0;
76 
77  __m256 max_abs = _mm256_andnot_ps(sign_bit, v0);
78  max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v1));
79  max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v2));
80  max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v3));
81 
82  __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(max_abs, 1),
83  _mm256_castps256_ps128(max_abs));
84  max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
85  max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
86  const float max_scalar = _mm_cvtss_f32(max4);
87 
88  const float d = max_scalar / 127.0f;
89  const float id = max_scalar != 0.0f ? 127.0f / max_scalar : 0.0f;
90  y[i].d = CK_FP32_TO_FP16(d);
91 
92  const __m256 mul = _mm256_set1_ps(id);
93  v0 = _mm256_mul_ps(v0, mul);
94  v1 = _mm256_mul_ps(v1, mul);
95  v2 = _mm256_mul_ps(v2, mul);
96  v3 = _mm256_mul_ps(v3, mul);
97 
98  v0 = _mm256_min_ps(_mm256_max_ps(v0, v_min), v_max);
99  v1 = _mm256_min_ps(_mm256_max_ps(v1, v_min), v_max);
100  v2 = _mm256_min_ps(_mm256_max_ps(v2, v_min), v_max);
101  v3 = _mm256_min_ps(_mm256_max_ps(v3, v_min), v_max);
102 
103  /* Round half away from zero to match the scalar path */
104  v0 = _mm256_add_ps(v0, _mm256_or_ps(_mm256_and_ps(v0, sign_bit), v_half));
105  v1 = _mm256_add_ps(v1, _mm256_or_ps(_mm256_and_ps(v1, sign_bit), v_half));
106  v2 = _mm256_add_ps(v2, _mm256_or_ps(_mm256_and_ps(v2, sign_bit), v_half));
107  v3 = _mm256_add_ps(v3, _mm256_or_ps(_mm256_and_ps(v3, sign_bit), v_half));
108 
109  __m256i i0 = _mm256_cvttps_epi32(v0);
110  __m256i i1 = _mm256_cvttps_epi32(v1);
111  __m256i i2 = _mm256_cvttps_epi32(v2);
112  __m256i i3 = _mm256_cvttps_epi32(v3);
113 
114 #if defined(__AVX2__)
115  i0 = _mm256_packs_epi32(i0, i1);
116  i2 = _mm256_packs_epi32(i2, i3);
117  i0 = _mm256_packs_epi16(i0, i2);
118 
119  const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
120  i0 = _mm256_permutevar8x32_epi32(i0, perm);
121  _mm256_storeu_si256((__m256i *)y[i].qs, i0);
122 #else
123  __m128i ni0 = _mm256_castsi256_si128(i0);
124  __m128i ni1 = _mm256_extractf128_si256(i0, 1);
125  __m128i ni2 = _mm256_castsi256_si128(i1);
126  __m128i ni3 = _mm256_extractf128_si256(i1, 1);
127  __m128i ni4 = _mm256_castsi256_si128(i2);
128  __m128i ni5 = _mm256_extractf128_si256(i2, 1);
129  __m128i ni6 = _mm256_castsi256_si128(i3);
130  __m128i ni7 = _mm256_extractf128_si256(i3, 1);
131 
132  ni0 = _mm_packs_epi32(ni0, ni1);
133  ni2 = _mm_packs_epi32(ni2, ni3);
134  ni4 = _mm_packs_epi32(ni4, ni5);
135  ni6 = _mm_packs_epi32(ni6, ni7);
136 
137  ni0 = _mm_packs_epi16(ni0, ni2);
138  ni4 = _mm_packs_epi16(ni4, ni6);
139 
140  _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
141  _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
142 #endif
143  }
144 #else
145  for (int i = 0; i < nb; i++) {
146  const float *xb = x + i * QK8_0;
147 
148  /* Find max absolute value in block */
149  float amax = 0.0f;
150  for (int j = 0; j < QK8_0; j++) {
151  float av = xb[j] >= 0 ? xb[j] : -xb[j];
152  if (av > amax) amax = av;
153  }
154 
155  /* Compute scale: d = max / 127 */
156  float d = amax / 127.0f;
157  float id = d != 0.0f ? 127.0f / amax : 0.0f;
158 
159  /* Store scale as FP16 */
160  y[i].d = CK_FP32_TO_FP16(d);
161 
162  /* Quantize values */
163  for (int j = 0; j < QK8_0; j++) {
164  float v = xb[j] * id;
165  /* Round to nearest int and clamp to [-127, 127] */
166  int q = (int)(v + (v >= 0 ? 0.5f : -0.5f));
167  if (q > 127) q = 127;
168  if (q < -127) q = -127;
169  y[i].qs[j] = (int8_t)q;
170  }
171  }
172 #endif
173 }

Referenced by gemv_q5_0_from_fp32(), gemv_q8_0_from_fp32(), mega_fused_attention_decode_q5_0(), and mega_fused_attention_decode_q5_0_parallel_simd().

◆ rmsnorm_forward()

void rmsnorm_forward ( const float *  input,
const float *  gamma,
float *  output,
float *  rstd_cache,
int  tokens,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

RMSNorm forward pass

Test:

test_rmsnorm.py::TestRMSNormForward::test_fp32_tokens

test_rmsnorm.py::TestRMSNormForward::test_fp32_single

test_rmsnorm.py::TestRMSNormForward::test_perf_rolled

test_layernorm.py::TestLayerNormForward::test_rmsnorm_compat

test_parity.py::test_rmsnorm_parity

RMSNorm: y[i] = gamma[i] * x[i] / sqrt(mean(x^2) + eps)

After changes: make test && make llamacpp-parity-full

Definition at line 50 of file rmsnorm_kernels.c.

58 {
59  int T = tokens;
60  int D = d_model;
61  int aligned = aligned_embed_dim;
62 
63  for (int t = 0; t < T; ++t) {
64  const float *x = input + (size_t)t * aligned;
65  float *y = output + (size_t)t * aligned;
66 
67 #if defined(__AVX512F__)
68  // AVX-512: Process 16 floats at a time
69  __m512 sum_sq_vec = _mm512_setzero_ps();
70  int d = 0;
71 
72  // Vectorized sum of squares
73  for (; d + 16 <= D; d += 16) {
74  __m512 xv = _mm512_loadu_ps(&x[d]);
75  sum_sq_vec = _mm512_fmadd_ps(xv, xv, sum_sq_vec);
76  }
77  float sum_sq = _mm512_reduce_add_ps(sum_sq_vec);
78 
79  // Handle remaining elements
80  for (; d < D; ++d) {
81  sum_sq += x[d] * x[d];
82  }
83 
84  float mean_sq = sum_sq / (float)D;
85  float rstd = 1.0f / sqrtf(mean_sq + eps);
86  if (rstd_cache) {
87  rstd_cache[t] = rstd;
88  }
89 
90  // Apply normalization and scale (vectorized)
91  __m512 rstd_vec = _mm512_set1_ps(rstd);
92  d = 0;
93  for (; d + 16 <= D; d += 16) {
94  __m512 xv = _mm512_loadu_ps(&x[d]);
95  __m512 gv = _mm512_loadu_ps(&gamma[d]);
96  __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);
97  __m512 yv = _mm512_mul_ps(x_hat, gv);
98  _mm512_storeu_ps(&y[d], yv);
99  }
100  // Handle remaining elements
101  for (; d < D; ++d) {
102  y[d] = x[d] * rstd * gamma[d];
103  }
104 
105 #elif defined(__AVX__)
106  // AVX: Process 8 floats at a time
107  __m256 sum_sq_vec = _mm256_setzero_ps();
108  int d = 0;
109 
110  // Vectorized sum of squares (no FMA in AVX1, use mul + add)
111  for (; d + 8 <= D; d += 8) {
112  __m256 xv = _mm256_loadu_ps(&x[d]);
113  __m256 xv_sq = _mm256_mul_ps(xv, xv);
114  sum_sq_vec = _mm256_add_ps(sum_sq_vec, xv_sq);
115  }
116  float sum_sq = hsum256_ps_rmsnorm(sum_sq_vec);
117 
118  // Handle remaining elements
119  for (; d < D; ++d) {
120  sum_sq += x[d] * x[d];
121  }
122 
123  float mean_sq = sum_sq / (float)D;
124  float rstd = 1.0f / sqrtf(mean_sq + eps);
125  if (rstd_cache) {
126  rstd_cache[t] = rstd;
127  }
128 
129  // Apply normalization and scale (vectorized)
130  __m256 rstd_vec = _mm256_set1_ps(rstd);
131  d = 0;
132  for (; d + 8 <= D; d += 8) {
133  __m256 xv = _mm256_loadu_ps(&x[d]);
134  __m256 gv = _mm256_loadu_ps(&gamma[d]);
135  __m256 x_hat = _mm256_mul_ps(xv, rstd_vec);
136  __m256 yv = _mm256_mul_ps(x_hat, gv);
137  _mm256_storeu_ps(&y[d], yv);
138  }
139  // Handle remaining elements
140  for (; d < D; ++d) {
141  y[d] = x[d] * rstd * gamma[d];
142  }
143 
144 #else
145  // Scalar fallback
146  double sum_sq = 0.0;
147  for (int d = 0; d < D; ++d) {
148  double v = (double)x[d];
149  sum_sq += v * v;
150  }
151  double mean_sq = sum_sq / (double)D;
152  double r = sqrt(mean_sq + (double)eps);
153  float rstd = (float)(1.0 / r);
154  if (rstd_cache) {
155  rstd_cache[t] = rstd;
156  }
157 
158  // Apply normalization and scale
159  for (int d = 0; d < D; ++d) {
160  float x_hat = x[d] * rstd;
161  y[d] = x_hat * gamma[d];
162  }
163 #endif
164 
165  // Zero padding (if any)
166  for (int d = D; d < aligned; ++d) {
167  y[d] = 0.0f;
168  }
169  }
170 }

Referenced by mega_fused_attention_decode_q5_0(), and mega_fused_attention_decode_q5_0_parallel_simd().

◆ vec_dot_q5_0_q8_0()

void vec_dot_q5_0_q8_0 ( int  n,
float *  s,
const void *  vx,
const void *  vy 
)

Auto-dispatch quantized dot product Q5_0 x Q8_0.

Dispatch priority:

  1. AVX512 (best performance on modern Intel/AMD)
  2. AVX (256-bit float ops, works on Sandy/Ivy Bridge and newer)
  3. SSSE3 (128-bit fallback)
  4. Reference scalar (last resort)

Definition at line 1498 of file gemm_kernels_q5_0.c.

1499 {
1500 #if defined(__AVX512F__)
1501  vec_dot_q5_0_q8_0_avx512(n, s, vx, vy);
1502 #elif defined(__AVX__)
1503  /* AVX for 256-bit float ops (works on Ivy Bridge and newer) */
1504  vec_dot_q5_0_q8_0_avx(n, s, vx, vy);
1505 #elif defined(__SSSE3__)
1506  /* SSSE3 - most efficient on older CPUs */
1507  vec_dot_q5_0_q8_0_sse(n, s, vx, vy);
1508 #else
1509  vec_dot_q5_0_q8_0_ref(n, s, vx, vy);
1510 #endif
1511 }

Referenced by gemv_q5_0_from_fp32(), mega_fused_attention_decode_q5_0(), and mega_fused_attention_decode_q5_0_parallel_simd().

◆ vec_dot_q8_0_q8_0()

void vec_dot_q8_0_q8_0 ( int  n,
float *  s,
const void *  vx,
const void *  vy 
)

Auto-dispatch quantized dot product Q8_0 x Q8_0.

Definition at line 1013 of file gemm_kernels_q8_0.c.

1014 {
1015 #ifdef __AVX512F__
1016  vec_dot_q8_0_q8_0_avx512(n, s, vx, vy);
1017 #elif defined(__AVX__)
1018  vec_dot_q8_0_q8_0_avx(n, s, vx, vy);
1019 #elif defined(__SSE4_1__)
1020  vec_dot_q8_0_q8_0_sse(n, s, vx, vy);
1021 #else
1022  vec_dot_q8_0_q8_0_ref(n, s, vx, vy);
1023 #endif
1024 }

Referenced by gemv_q8_0_from_fp32().