← Back to C-Kernel-Engine Docs Doxygen Source Documentation
mega_fused_attention_decode_q5_0.h File Reference

Mega-fused attention decode with Q5_0 weights - Header. More...

Go to the source code of this file.

Functions

void mega_fused_attention_decode_q5_0 (float *output, const float *input, const float *residual, const void *wq_q5_0, const void *wk_q5_0, const void *wv_q8_0, const void *wo_q5_0, const float *ln_gamma, const float *bq, const float *bk, const float *bv, const float *bo, float *kv_cache_k, float *kv_cache_v, const float *rope_cos, const float *rope_sin, int pos, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, int cache_capacity, float eps, void *scratch)
 Serial mega-fused attention decode kernel. More...
 
void mega_fused_attention_decode_q5_0_parallel_simd (float *output, const float *input, const float *residual, const void *wq_q5_0, const void *wk_q5_0, const void *wv_q8_0, const void *wo_q5_0, const float *ln_gamma, const float *bq, const float *bk, const float *bv, const float *bo, float *kv_cache_k, float *kv_cache_v, const float *rope_cos, const float *rope_sin, int pos, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, int cache_capacity, float eps, void *scratch, int ith, int nth)
 Parallel SIMD mega-fused attention decode kernel (threadpool-aware) More...
 
int mega_fused_attention_decode_scratch_size (int AE, int H, int KV, int AD)
 Calculate scratch buffer size needed for the kernel. More...
 

Detailed Description

Mega-fused attention decode with Q5_0 weights - Header.

This header declares the mega-fused attention decode kernel that combines 9 separate operations into a single fused kernel call:

  1. RMSNorm
  2. Q projection (Q5_0) with bias
  3. K projection (Q5_0) with bias
  4. V projection (Q8_0) with bias
  5. RoPE application
  6. KV cache store
  7. Flash attention decode (GQA-aware)
  8. O projection (Q5_0) with bias
  9. Residual add

Definition in file mega_fused_attention_decode_q5_0.h.

Function Documentation

◆ mega_fused_attention_decode_q5_0()

void mega_fused_attention_decode_q5_0 ( float *  output,
const float *  input,
const float *  residual,
const void *  wq_q5_0,
const void *  wk_q5_0,
const void *  wv_q8_0,
const void *  wo_q5_0,
const float *  ln_gamma,
const float *  bq,
const float *  bk,
const float *  bv,
const float *  bo,
float *  kv_cache_k,
float *  kv_cache_v,
const float *  rope_cos,
const float *  rope_sin,
int  pos,
int  embed_dim,
int  aligned_embed_dim,
int  num_heads,
int  num_kv_heads,
int  head_dim,
int  aligned_head_dim,
int  cache_capacity,
float  eps,
void *  scratch 
)

Serial mega-fused attention decode kernel.

Parameters
output — Output [AE] (final result, after residual add)
input — Input activation [AE]
residual — Residual input for add [AE]
wq_q5_0 — Q projection weights [H*AD, AE] Q5_0
wk_q5_0 — K projection weights [KV*AD, AE] Q5_0
wv_q8_0 — V projection weights [KV*AD, AE] Q8_0
wo_q5_0 — O projection weights [AE, H*AD] Q5_0
ln_gamma — RMSNorm gamma [AE]
bq — Q bias [H*AD] or NULL
bk — K bias [KV*AD] or NULL
bv — V bias [KV*AD] or NULL
bo — O bias [AE] or NULL
kv_cache_k — K cache [KV, max_T, AD]
kv_cache_v — V cache [KV, max_T, AD]
rope_cos — RoPE cos [max_T, D]
rope_sin — RoPE sin [max_T, D]
pos — Current position (0-indexed)
embed_dim — Original embedding dimension E
aligned_embed_dim — Aligned embedding dimension AE
num_heads — Number of query heads H
num_kv_heads — Number of key/value heads KV
head_dim — Head dimension AD
aligned_head_dim — Aligned head dimension AAD
cache_capacity — Maximum cache capacity max_T
eps — RMSNorm epsilon
scratch — Scratch buffer (>= scratch_size bytes)

Definition at line 222 of file mega_fused_attention_decode_q5_0.c.

249 {
250  const int H = num_heads;
251  const int KV = num_kv_heads;
252  const int AD = head_dim;
253  const int AE = aligned_embed_dim;
254  (void)embed_dim; /* Unused but kept for API consistency */
255 
256  /* Parse scratch buffer - all allocations from scratch, no VLAs */
257  float *scratch_ptr = (float *)scratch;
258 
259  float *rmsnorm_out = scratch_ptr;
260  scratch_ptr += AE;
261 
262  float *rstd_scratch = scratch_ptr; /* For rmsnorm rstd output - avoids VLA */
263  scratch_ptr += AE;
264 
265  float *q = scratch_ptr;
266  scratch_ptr += H * AD;
267 
268  float *k = scratch_ptr;
269  scratch_ptr += KV * AD;
270 
271  float *v = scratch_ptr;
272  scratch_ptr += KV * AD;
273 
274  float *attn_out = scratch_ptr;
275  scratch_ptr += H * AD;
276 
277  block_q8_0 *x_q8_scratch = (block_q8_0 *)scratch_ptr;
278 
279  const int q_size = H * AD;
280  const int k_size = KV * AD;
281  const int v_size = KV * AD;
282 
283  /* ========================================================================
284  * STEP 1: RMSNorm
285  * Correct signature: rmsnorm_forward(in, gamma, out, rstd, T, D, AD, eps)
286  * T=1 (single token), D=AE (full embed dim for norm)
287  * ======================================================================== */
288  rmsnorm_forward(input, ln_gamma, rmsnorm_out, rstd_scratch, 1, AE, AD, eps);
289 
290  /* ========================================================================
291  * STEP 2-4: Q, K, V projections (fused with quantization)
292  * Use scratch buffer for quantized input
293  * ======================================================================== */
294  gemv_q5_0_from_fp32(q, wq_q5_0, rmsnorm_out, bq, q_size, AE, x_q8_scratch);
295  gemv_q5_0_from_fp32(k, wk_q5_0, rmsnorm_out, bk, k_size, AE, x_q8_scratch);
296  gemv_q8_0_from_fp32(v, wv_q8_0, rmsnorm_out, bv, v_size, AE, x_q8_scratch);
297 
298  /* ========================================================================
299  * STEP 5: Apply RoPE
300  * ======================================================================== */
301  apply_rope_inline(q, k, rope_cos, rope_sin, pos, H, KV, AD);
302 
303  /* ========================================================================
304  * STEP 6: Store K and V to cache
305  * Cache layout: [KV, cache_capacity, AD]
306  * ======================================================================== */
307  const size_t kv_stride = (size_t)cache_capacity * AD;
308  for (int kv = 0; kv < KV; kv++) {
309  float *k_cache = &kv_cache_k[kv * kv_stride];
310  float *v_cache = &kv_cache_v[kv * kv_stride];
311  const float *k_src = &k[kv * AD];
312  const float *v_src = &v[kv * AD];
313  const int offset = pos * AD;
314  for (int d = 0; d < AD; d++) {
315  k_cache[offset + d] = k_src[d];
316  v_cache[offset + d] = v_src[d];
317  }
318  }
319 
320  /* ========================================================================
321  * STEP 7: Flash attention decode (GQA-aware variant)
322  * attention_forward_decode_head_major_gqa_flash handles H != KV correctly
323  * It maps each of H heads to one of KV KV heads via: kv_head = h * KV / H
324  * ======================================================================== */
325  attention_forward_decode_head_major_gqa_flash(
326  q, kv_cache_k, kv_cache_v,
327  attn_out, H, KV, pos + 1, cache_capacity, AD, aligned_head_dim);
328 
329  /* ========================================================================
330  * STEP 8: O projection (Q5_0 weights) with bias and residual add
331  *
332  * attn_out layout: [H * AD] flattened
333  * wo_q5_0 layout: [AE, H*AD] - row e has H*AD input features
334  *
335  * O projection: output[e] = dot(wo[e], attn_out) + bias[e] + residual[e]
336  *
337  * Using vec_dot_q5_0_q8_0 for efficient quantized dot product.
338  * ======================================================================== */
339 
340  /* Quantize attention output to Q8_0 for GEMV */
341  quantize_row_q8_0(attn_out, x_q8_scratch, H * AD);
342 
343  const block_q5_0 *wo = (const block_q5_0 *)wo_q5_0;
344  const int blocks_per_row = (H * AD) / QK5_0;
345 
346  for (int e = 0; e < AE; e++) {
347  float dot;
348  vec_dot_q5_0_q8_0(H * AD, &dot, &wo[e * blocks_per_row], x_q8_scratch);
349  output[e] = dot + (bo ? bo[e] : 0.0f) + residual[e];
350  }
351 }
#define QK5_0
Definition: ckernel_quant.h:67
static void gemv_q5_0_from_fp32(float *out, const void *W_q5_0, const float *x_fp32, const float *bias, int M, int K, block_q8_0 *x_q8_scratch)
static void apply_rope_inline(float *q, float *k, const float *rope_cos, const float *rope_sin, int pos, int H, int KV, int AD)
void attention_forward_decode_head_major_gqa_flash(const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q5_0 x Q8_0.
void rmsnorm_forward(const float *input, const float *gamma, float *output, float *rstd, int T, int D, int AD, float eps)
static void gemv_q8_0_from_fp32(float *out, const void *W_q8_0, const float *x_fp32, const float *bias, int M, int K, block_q8_0 *x_q8_scratch)
void quantize_row_q8_0(const float *x, void *vy, int k)
Quantize FP32 to Q8_0 format (scalar reference)

References apply_rope_inline(), attention_forward_decode_head_major_gqa_flash(), gemv_q5_0_from_fp32(), gemv_q8_0_from_fp32(), QK5_0, quantize_row_q8_0(), rmsnorm_forward(), and vec_dot_q5_0_q8_0().

◆ mega_fused_attention_decode_q5_0_parallel_simd()

void mega_fused_attention_decode_q5_0_parallel_simd ( float *  output,
const float *  input,
const float *  residual,
const void *  wq_q5_0,
const void *  wk_q5_0,
const void *  wv_q8_0,
const void *  wo_q5_0,
const float *  ln_gamma,
const float *  bq,
const float *  bk,
const float *  bv,
const float *  bo,
float *  kv_cache_k,
float *  kv_cache_v,
const float *  rope_cos,
const float *  rope_sin,
int  pos,
int  embed_dim,
int  aligned_embed_dim,
int  num_heads,
int  num_kv_heads,
int  head_dim,
int  aligned_head_dim,
int  cache_capacity,
float  eps,
void *  scratch,
int  ith,
int  nth 
)

Parallel SIMD mega-fused attention decode kernel (threadpool-aware)

Parallelizes across attention heads using (ith, nth) pattern. Each thread processes a subset of heads.

IMPORTANT: Caller must ensure barrier sync between phases: Phase 1 (ith==0 only): RMSNorm, Q/K/V projection, RoPE, KV cache store – BARRIER – Phase 2 (all threads): Attention for assigned heads – BARRIER – Phase 3 (ith==0 only): O projection and residual add

Parameters
ith — Thread index (0 to nth-1)
nth — Total number of threads (other parameters same as serial version)

Definition at line 367 of file mega_fused_attention_decode_q5_0.c.

396 {
397  const int H = num_heads;
398  const int KV = num_kv_heads;
399  const int AD = head_dim;
400  const int AE = aligned_embed_dim;
401  (void)embed_dim;
402 
403  /* Each thread handles a subset of heads */
404  const int heads_per_thread = (H + nth - 1) / nth;
405  const int h_start = ith * heads_per_thread;
406  const int h_end = (h_start + heads_per_thread < H) ? h_start + heads_per_thread : H;
407  const int my_heads = h_end - h_start;
408 
409  if (h_start >= H) return;
410 
411  /* Parse scratch buffer (shared across threads) */
412  float *scratch_ptr = (float *)scratch;
413 
414  float *rmsnorm_out = scratch_ptr;
415  scratch_ptr += AE;
416 
417  float *rstd_scratch = scratch_ptr;
418  scratch_ptr += AE;
419 
420  float *q = scratch_ptr;
421  scratch_ptr += H * AD;
422 
423  float *k = scratch_ptr;
424  scratch_ptr += KV * AD;
425 
426  float *v = scratch_ptr;
427  scratch_ptr += KV * AD;
428 
429  float *attn_out = scratch_ptr;
430  scratch_ptr += H * AD;
431 
432  block_q8_0 *x_q8_scratch = (block_q8_0 *)scratch_ptr;
433 
434  /* ========================================================================
435  * PHASE 1: Only thread 0 does RMSNorm and K/V projections
436  * These are shared across all heads.
437  * CALLER MUST BARRIER AFTER THIS PHASE.
438  * ======================================================================== */
439  if (ith == 0) {
440  rmsnorm_forward(input, ln_gamma, rmsnorm_out, rstd_scratch, 1, AE, AD, eps);
441 
442  gemv_q5_0_from_fp32(q, wq_q5_0, rmsnorm_out, bq, H * AD, AE, x_q8_scratch);
443  gemv_q5_0_from_fp32(k, wk_q5_0, rmsnorm_out, bk, KV * AD, AE, x_q8_scratch);
444  gemv_q8_0_from_fp32(v, wv_q8_0, rmsnorm_out, bv, KV * AD, AE, x_q8_scratch);
445 
446  apply_rope_inline(q, k, rope_cos, rope_sin, pos, H, KV, AD);
447 
448  /* Store K/V to cache */
449  const size_t kv_stride = (size_t)cache_capacity * AD;
450  for (int kv_idx = 0; kv_idx < KV; kv_idx++) {
451  float *k_cache = &kv_cache_k[kv_idx * kv_stride];
452  float *v_cache = &kv_cache_v[kv_idx * kv_stride];
453  const int offset = pos * AD;
454  for (int d = 0; d < AD; d++) {
455  k_cache[offset + d] = k[kv_idx * AD + d];
456  v_cache[offset + d] = v[kv_idx * AD + d];
457  }
458  }
459  }
460 
461  /* ========================================================================
462  * CALLER MUST BARRIER HERE
463  * All threads need to wait for thread 0 to finish projections
464  * ======================================================================== */
465 
466  /* ========================================================================
467  * PHASE 2: Each thread does attention for its heads only
468  * attention_forward_decode_head_major_gqa_flash expects:
469  * - q_token: pointer to start of Q for these heads
470  * - out_token: pointer to start of output for these heads
471  * - num_heads: number of heads THIS THREAD is processing
472  * ======================================================================== */
473  if (my_heads > 0) {
474  attention_forward_decode_head_major_gqa_flash(
475  &q[h_start * AD], /* Q for this thread's heads */
476  kv_cache_k, kv_cache_v,
477  &attn_out[h_start * AD], /* Output for this thread's heads */
478  my_heads, /* Only my_heads, not H */
479  KV, /* Still need all KV heads for GQA */
480  pos + 1, cache_capacity, AD, aligned_head_dim);
481  }
482 
483  /* ========================================================================
484  * CALLER MUST BARRIER HERE
485  * Thread 0 needs all threads to finish attention before O projection
486  * ======================================================================== */
487 
488  /* ========================================================================
489  * PHASE 3: Thread 0 does O projection and residual add
490  * ======================================================================== */
491  if (ith == 0) {
492  /* Quantize full attention output for O projection */
493  quantize_row_q8_0(attn_out, x_q8_scratch, H * AD);
494 
495  const block_q5_0 *wo = (const block_q5_0 *)wo_q5_0;
496  const int blocks_per_row = (H * AD) / QK5_0;
497 
498  for (int e = 0; e < AE; e++) {
499  float dot;
500  vec_dot_q5_0_q8_0(H * AD, &dot, &wo[e * blocks_per_row], x_q8_scratch);
501  output[e] = dot + (bo ? bo[e] : 0.0f) + residual[e];
502  }
503  }
504 }

References apply_rope_inline(), attention_forward_decode_head_major_gqa_flash(), gemv_q5_0_from_fp32(), gemv_q8_0_from_fp32(), QK5_0, quantize_row_q8_0(), rmsnorm_forward(), and vec_dot_q5_0_q8_0().

◆ mega_fused_attention_decode_scratch_size()

int mega_fused_attention_decode_scratch_size ( int  AE,
int  H,
int  KV,
int  AD 
)

Calculate scratch buffer size needed for the kernel.

Parameters
AE — Aligned embedding dimension (multiple of 64)
H — Number of query heads
KV — Number of key/value heads
AD — Head dimension
Returns
Size in bytes needed for scratch buffer

Definition at line 176 of file mega_fused_attention_decode_q5_0.c.

176  {
177  /* Need: 1x AE for RMSNorm output
178  1x AE for RMSNorm rstd (avoid VLA)
179  1x H*AD for Q
180  1x KV*AD for K
181  1x KV*AD for V
182  1x H*AD for attention output
183  1x max(AE, H*AD)/QK8_0 * sizeof(block_q8_0) for GEMV scratch
184  */
185  int max_input_dim = (AE > H * AD) ? AE : H * AD;
186  int q8_blocks = (max_input_dim + QK8_0 - 1) / QK8_0;
187  return (int)(sizeof(float) * (AE + AE + H * AD + 2 * KV * AD + H * AD)
188  + q8_blocks * sizeof(block_q8_0));
189 }
#define QK8_0

References QK8_0.