← Back to C-Kernel-Engine Docs Doxygen Source Documentation
mega_fused_attention_avx.c File Reference

Mega-Fused Attention for AVX (256-bit) and AVX-512 (512-bit) More...

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "ckernel_engine.h"
#include "ckernel_quant.h"
#include "ck_features.h"

Go to the source code of this file.

Macros

#define MEGA_KV_TILE   32
 
#define MEGA_Q_TILE   32
 
#define MEGA_REGS   16 /* 16 YMM registers */
 
#define MEGA_STACK_MAX   8192
 
#define MEGA_VLEN   8 /* 256 / 32 */
 
#define REG_K_TILE   "YMM8-YMM11" /* 4 regs for K tile */
 
#define REG_O_ACCUM   "Stack+L1" /* O in L1 cache */
 
#define REG_Q_ACCUM   "YMM0-YMM7" /* 8 regs for Q tile */
 
#define REG_SOFTMAX   "YMM0-YMM1" /* 2 regs for m, l */
 
#define REG_TEMP   "YMM2-YMM3" /* 2 regs for temps */
 
#define REG_V_TILE   "YMM12-YMM15" /* 4 regs for V tile */
 

Functions

static float ck_dot_f32 (const float *a, const float *b, int len)
 
void mega_fuse_flash_attention_avx (float *o_out, const float *q, const float *kv_cache_k, const float *kv_cache_v, int num_heads, int num_kv_heads, int seq_len, int cache_capacity, int head_dim, int aligned_head_dim)
 Flash attention with online softmax (AVX version) More...
 
static void mega_fuse_output_proj_residual (const float *attn_token, const float *wo, const float *bo, const float *residual, float *output, int embed_dim, int aligned_embed_dim, int num_heads, int head_dim, int aligned_head_dim)
 
void mega_fuse_rmsnorm_qkv_avx (float *q_out, float *k_out, float *v_out, const float *input, const float *gamma, const float *wq, const float *bq, const float *wk, const float *bk, const float *wv, const float *bv, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, float eps)
 Fused RMSNorm + QKV for decode (single token) More...
 
void mega_fuse_rope_inplace_avx (float *q, float *k, const float *rope_cos, const float *rope_sin, int pos, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim)
 Apply RoPE to Q and K (in-place, from L1) More...
 
void mega_fused_attention_decode (float *output, const float *input, const float *residual, const float *ln1_gamma, const float *wq, const float *bq, const float *wk, const float *bk, const float *wv, const float *bv, const float *wo, const float *bo, float *kv_cache_k, float *kv_cache_v, const float *rope_cos, const float *rope_sin, int pos, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, int cache_capacity, float eps)
 Full mega-fused attention for decode. More...
 

Detailed Description

Mega-Fused Attention for AVX (256-bit) and AVX-512 (512-bit)

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. NO memcpy for layout - use strided access, not copies
  4. API must define: inputs, outputs, workspace, and memory layouts
  5. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

VIOLATION: Uses malloc for intermediate buffers and memcpy for layout. TODO: Refactor to use bump allocator workspace and strided access.

Holy grail fusion: RMSNorm → QKV → RoPE → Flash Attention → OutProj + Residual

AVX approach: keep intermediates in L1 cache (not registers). AVX-512 approach: keep intermediates in registers.

Both achieve the same goal: Eliminate DRAM traffic for intermediates.

Definition in file mega_fused_attention_avx.c.

Macro Definition Documentation

◆ MEGA_KV_TILE

#define MEGA_KV_TILE   32

Definition at line 118 of file mega_fused_attention_avx.c.

◆ MEGA_Q_TILE

#define MEGA_Q_TILE   32

Definition at line 117 of file mega_fused_attention_avx.c.

◆ MEGA_REGS

#define MEGA_REGS   16 /* 16 YMM registers */

Definition at line 116 of file mega_fused_attention_avx.c.

◆ MEGA_STACK_MAX

#define MEGA_STACK_MAX   8192

Definition at line 119 of file mega_fused_attention_avx.c.

◆ MEGA_VLEN

#define MEGA_VLEN   8 /* 256 / 32 */

Definition at line 115 of file mega_fused_attention_avx.c.

◆ REG_K_TILE

#define REG_K_TILE   "YMM8-YMM11" /* 4 regs for K tile */

Definition at line 123 of file mega_fused_attention_avx.c.

◆ REG_O_ACCUM

#define REG_O_ACCUM   "Stack+L1" /* O in L1 cache */

Definition at line 125 of file mega_fused_attention_avx.c.

◆ REG_Q_ACCUM

#define REG_Q_ACCUM   "YMM0-YMM7" /* 8 regs for Q tile */

Definition at line 122 of file mega_fused_attention_avx.c.

◆ REG_SOFTMAX

#define REG_SOFTMAX   "YMM0-YMM1" /* 2 regs for m, l */

Definition at line 126 of file mega_fused_attention_avx.c.

◆ REG_TEMP

#define REG_TEMP   "YMM2-YMM3" /* 2 regs for temps */

Definition at line 127 of file mega_fused_attention_avx.c.

◆ REG_V_TILE

#define REG_V_TILE   "YMM12-YMM15" /* 4 regs for V tile */

Definition at line 124 of file mega_fused_attention_avx.c.

Function Documentation

◆ ck_dot_f32()

/**
 * Dot product of two float vectors, SIMD-accelerated when available.
 *
 * @param a   first vector
 * @param b   second vector
 * @param len number of elements to multiply-accumulate
 * @return    sum over i in [0, len) of a[i] * b[i]
 */
static inline float ck_dot_f32(const float *a, const float *b, int len)
{
#if defined(__AVX512F__)
    __m512 vacc = _mm512_setzero_ps();
    int idx = 0;
    for (; idx <= len - 16; idx += 16) {
        vacc = _mm512_fmadd_ps(_mm512_loadu_ps(a + idx),
                               _mm512_loadu_ps(b + idx), vacc);
    }
    float total = _mm512_reduce_add_ps(vacc);
    /* Scalar tail for the remaining (< 16) elements. */
    while (idx < len) {
        total += a[idx] * b[idx];
        ++idx;
    }
    return total;
#elif defined(__AVX__)
    __m256 vacc = _mm256_setzero_ps();
    int idx = 0;
    for (; idx <= len - 8; idx += 8) {
        __m256 prod = _mm256_mul_ps(_mm256_loadu_ps(a + idx),
                                    _mm256_loadu_ps(b + idx));
        vacc = _mm256_add_ps(vacc, prod);
    }
    float total = ck_hsum256_ps(vacc);
    /* Scalar tail for the remaining (< 8) elements. */
    while (idx < len) {
        total += a[idx] * b[idx];
        ++idx;
    }
    return total;
#else
    /* Portable scalar fallback. */
    float total = 0.0f;
    for (int idx = 0; idx < len; ++idx) {
        total += a[idx] * b[idx];
    }
    return total;
#endif
}

Referenced by mega_fuse_output_proj_residual(), and mega_fuse_rmsnorm_qkv_avx().

◆ mega_fuse_flash_attention_avx()

void mega_fuse_flash_attention_avx ( float *  o_out,
const float *  q,
const float *  kv_cache_k,
const float *  kv_cache_v,
int  num_heads,
int  num_kv_heads,
int  seq_len,
int  cache_capacity,
int  head_dim,
int  aligned_head_dim 
)

Flash attention with online softmax (AVX version)

Key insight: O, m, l stay in registers throughout! K/V tiles stream from L2 cache.

Parameters
o_outOutput [num_heads * aligned_head_dim] - in registers/L1
qQ tensor [num_heads * aligned_head_dim] - from L1
kv_cache_kKV cache K [num_kv_heads * cache_capacity * aligned_head_dim]
kv_cache_vKV cache V [num_kv_heads * cache_capacity * aligned_head_dim]
num_headsNumber of heads
num_kv_headsNumber of KV heads
seq_lenCurrent sequence length
cache_capacityKV cache capacity (head stride)
head_dimHead dimension
aligned_head_dimAligned head dimension

Definition at line 444 of file mega_fused_attention_avx.c.

455 {
456  const int hd = head_dim;
457  const float scale = 1.0f / sqrtf((float)hd);
458  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
459 
460  for (int h = 0; h < num_heads; h++) {
461  const float *q_h = q + (size_t)h * (size_t)aligned_head_dim;
462  const int kv_idx = h % num_kv_heads;
463  const float *k_cache = kv_cache_k + (size_t)kv_idx * head_stride;
464  const float *v_cache = kv_cache_v + (size_t)kv_idx * head_stride;
465 
466  /* O, m, l in registers for this head */
467  float o_h[aligned_head_dim]; /* in L1 */
468  float m = -INFINITY; /* running max */
469  float l = 0.0f; /* running sum */
470 
471  /* Initialize O to zeros */
472  memset(o_h, 0, (size_t)aligned_head_dim * sizeof(float));
473 
474  /* Iterate over KV cache tiles */
475  for (int t = 0; t < seq_len; t += MEGA_KV_TILE) {
476  int tile_end = t + MEGA_KV_TILE;
477  if (tile_end > seq_len) tile_end = seq_len;
478  int tile_size = tile_end - t;
479 
480  /* Load K tile from L2 cache */
481  float k_tile[MEGA_KV_TILE * hd];
482  for (int i = 0; i < tile_size; i++) {
483  memcpy(k_tile + (size_t)i * (size_t)hd,
484  k_cache + (size_t)(t + i) * (size_t)aligned_head_dim,
485  (size_t)hd * sizeof(float));
486  }
487 
488  /* S_ij = Q @ K_tile.T / sqrt(d) - in registers */
489  float s_row[MEGA_KV_TILE];
490  for (int j = 0; j < tile_size; j++) {
491  s_row[j] = 0.0f;
492  for (int i = 0; i < hd; i++) {
493  s_row[j] += q_h[i] * k_tile[j * hd + i];
494  }
495  s_row[j] *= scale;
496  }
497 
498  /* Online softmax update */
499  float m_new = m;
500  for (int j = 0; j < tile_size; j++) {
501  if (s_row[j] > m_new) m_new = s_row[j];
502  }
503 
504  float l_new = 0.0f;
505  for (int j = 0; j < tile_size; j++) {
506  float p = expf(s_row[j] - m_new);
507  s_row[j] = p;
508  l_new += p;
509  }
510 
511  /* Scale O by exp(m - m_new) and add P @ V */
512  float exp_m_diff = expf(m - m_new);
513  for (int i = 0; i < hd; i++) {
514  o_h[i] *= exp_m_diff;
515  }
516 
517  /* Load V tile and accumulate */
518  for (int j = 0; j < tile_size; j++) {
519  float p = s_row[j];
520  for (int i = 0; i < hd; i++) {
521  o_h[i] += p * v_cache[(size_t)(t + j) * (size_t)aligned_head_dim + (size_t)i];
522  }
523  }
524 
525  l = l * exp_m_diff + l_new;
526  m = m_new;
527  }
528 
529  /* Normalize by l */
530  for (int i = 0; i < hd; i++) {
531  o_h[i] /= l;
532  }
533  for (int i = hd; i < aligned_head_dim; ++i) {
534  o_h[i] = 0.0f;
535  }
536 
537  /* Store O - still in L1, goes to output projection */
538  memcpy(o_out + (size_t)h * (size_t)aligned_head_dim,
539  o_h,
540  (size_t)aligned_head_dim * sizeof(float));
541  }
542 }
#define MEGA_KV_TILE

References MEGA_KV_TILE.

Referenced by mega_fused_attention_decode().

◆ mega_fuse_output_proj_residual()

/*
 * Output projection over the per-head attention output, plus optional
 * bias and residual add. Writes embed_dim results to `output` and
 * zero-fills the alignment tail [embed_dim, aligned_embed_dim).
 *
 * Weight layout: wo is head-major, [num_heads][aligned_embed_dim][aligned_head_dim];
 * each output column j takes one dot of length head_dim per head.
 */
static void mega_fuse_output_proj_residual(const float *attn_token,
                                           const float *wo,
                                           const float *bo,
                                           const float *residual,
                                           float *output,
                                           int embed_dim,
                                           int aligned_embed_dim,
                                           int num_heads,
                                           int head_dim,
                                           int aligned_head_dim)
{
    if (attn_token == NULL || wo == NULL || output == NULL) {
        return;
    }

    const size_t w_per_head = (size_t)aligned_embed_dim * (size_t)aligned_head_dim;

    for (int col = 0; col < embed_dim; ++col) {
        float acc = (bo != NULL) ? bo[col] : 0.0f;
        for (int head = 0; head < num_heads; ++head) {
            const float *head_out =
                attn_token + (size_t)head * (size_t)aligned_head_dim;
            const float *w_row = wo + (size_t)head * w_per_head
                                    + (size_t)col * (size_t)aligned_head_dim;
            acc += ck_dot_f32(head_out, w_row, head_dim);
        }
        output[col] = acc + (residual != NULL ? residual[col] : 0.0f);
    }

    /* Keep the padded tail clean for downstream consumers. */
    for (int col = embed_dim; col < aligned_embed_dim; ++col) {
        output[col] = 0.0f;
    }
}
static float ck_dot_f32(const float *a, const float *b, int len)

References ck_dot_f32().

Referenced by mega_fused_attention_decode().

◆ mega_fuse_rmsnorm_qkv_avx()

/**
 * Fused RMSNorm + QKV projection for decode (single token).
 *
 * The normalized token row stays in a stack buffer (L1/L2); Q/K/V
 * outputs are written head-major with zero-filled alignment padding
 * [head_dim, aligned_head_dim) per head.
 *
 * Weight layout per head: [aligned_head_dim][aligned_embed_dim];
 * biases (bq/bk/bv) are optional and head-major when present, as is
 * gamma (RMSNorm scale).
 */
void mega_fuse_rmsnorm_qkv_avx(float *q_out, float *k_out, float *v_out,
                               const float *input, const float *gamma,
                               const float *wq, const float *bq,
                               const float *wk, const float *bk,
                               const float *wv, const float *bv,
                               int embed_dim, int aligned_embed_dim,
                               int num_heads, int num_kv_heads,
                               int head_dim, int aligned_head_dim,
                               float eps)
{
    if (q_out == NULL || k_out == NULL || v_out == NULL ||
        input == NULL || wq == NULL || wk == NULL || wv == NULL) {
        return;
    }
    if (embed_dim <= 0 || aligned_embed_dim <= 0 ||
        head_dim <= 0 || aligned_head_dim <= 0) {
        return;
    }

    float normed[aligned_embed_dim]; /* normalized row, stays in L1 */
    float ssq = 0.0f;

    /* Sum of squares of the input row. */
#if defined(__AVX512F__)
    {
        __m512 acc = _mm512_setzero_ps();
        int idx = 0;
        for (; idx + 16 <= embed_dim; idx += 16) {
            __m512 x = _mm512_loadu_ps(input + idx);
            acc = _mm512_fmadd_ps(x, x, acc);
        }
        ssq = _mm512_reduce_add_ps(acc);
        for (; idx < embed_dim; ++idx) {
            ssq += input[idx] * input[idx];
        }
    }
#elif defined(__AVX__)
    {
        __m256 acc = _mm256_setzero_ps();
        int idx = 0;
        for (; idx + 8 <= embed_dim; idx += 8) {
            __m256 x = _mm256_loadu_ps(input + idx);
            acc = _mm256_add_ps(acc, _mm256_mul_ps(x, x));
        }
        ssq = ck_hsum256_ps(acc);
        for (; idx < embed_dim; ++idx) {
            ssq += input[idx] * input[idx];
        }
    }
#else
    for (int idx = 0; idx < embed_dim; ++idx) {
        ssq += input[idx] * input[idx];
    }
#endif

    const float inv_rms = 1.0f / sqrtf(ssq / (float)embed_dim + eps);

    /* Normalize (and scale by gamma when present) into normed[]. */
#if defined(__AVX512F__)
    {
        const __m512 vr = _mm512_set1_ps(inv_rms);
        int idx = 0;
        if (gamma != NULL) {
            for (; idx + 16 <= embed_dim; idx += 16) {
                __m512 x = _mm512_loadu_ps(input + idx);
                __m512 g = _mm512_loadu_ps(gamma + idx);
                _mm512_storeu_ps(normed + idx,
                                 _mm512_mul_ps(_mm512_mul_ps(x, vr), g));
            }
            for (; idx < embed_dim; ++idx) {
                normed[idx] = input[idx] * inv_rms * gamma[idx];
            }
        } else {
            for (; idx + 16 <= embed_dim; idx += 16) {
                __m512 x = _mm512_loadu_ps(input + idx);
                _mm512_storeu_ps(normed + idx, _mm512_mul_ps(x, vr));
            }
            for (; idx < embed_dim; ++idx) {
                normed[idx] = input[idx] * inv_rms;
            }
        }
    }
#elif defined(__AVX__)
    {
        const __m256 vr = _mm256_set1_ps(inv_rms);
        int idx = 0;
        if (gamma != NULL) {
            for (; idx + 8 <= embed_dim; idx += 8) {
                __m256 x = _mm256_loadu_ps(input + idx);
                __m256 g = _mm256_loadu_ps(gamma + idx);
                _mm256_storeu_ps(normed + idx,
                                 _mm256_mul_ps(_mm256_mul_ps(x, vr), g));
            }
            for (; idx < embed_dim; ++idx) {
                normed[idx] = input[idx] * inv_rms * gamma[idx];
            }
        } else {
            for (; idx + 8 <= embed_dim; idx += 8) {
                __m256 x = _mm256_loadu_ps(input + idx);
                _mm256_storeu_ps(normed + idx, _mm256_mul_ps(x, vr));
            }
            for (; idx < embed_dim; ++idx) {
                normed[idx] = input[idx] * inv_rms;
            }
        }
    }
#else
    for (int idx = 0; idx < embed_dim; ++idx) {
        normed[idx] = input[idx] * inv_rms * (gamma != NULL ? gamma[idx] : 1.0f);
    }
#endif

    /* Zero the padded tail so full-width dots below stay exact. */
    for (int idx = embed_dim; idx < aligned_embed_dim; ++idx) {
        normed[idx] = 0.0f;
    }

    const size_t w_stride = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;

    /* Q projection: head-major output, one dot per output element. */
    for (int h = 0; h < num_heads; ++h) {
        const float *w_head = wq + (size_t)h * w_stride;
        const float *b_head =
            (bq != NULL) ? bq + (size_t)h * (size_t)aligned_head_dim : NULL;
        float *dst = q_out + (size_t)h * (size_t)aligned_head_dim;
        for (int d = 0; d < head_dim; ++d) {
            const float *w_row = w_head + (size_t)d * (size_t)aligned_embed_dim;
            float acc = ck_dot_f32(normed, w_row, aligned_embed_dim);
            dst[d] = acc + (b_head != NULL ? b_head[d] : 0.0f);
        }
        for (int d = head_dim; d < aligned_head_dim; ++d) {
            dst[d] = 0.0f;
        }
    }

    /* K and V projections share the loop over KV heads. */
    for (int h = 0; h < num_kv_heads; ++h) {
        const float *wk_head = wk + (size_t)h * w_stride;
        const float *wv_head = wv + (size_t)h * w_stride;
        const float *bk_head =
            (bk != NULL) ? bk + (size_t)h * (size_t)aligned_head_dim : NULL;
        const float *bv_head =
            (bv != NULL) ? bv + (size_t)h * (size_t)aligned_head_dim : NULL;
        float *k_dst = k_out + (size_t)h * (size_t)aligned_head_dim;
        float *v_dst = v_out + (size_t)h * (size_t)aligned_head_dim;
        for (int d = 0; d < head_dim; ++d) {
            const float *wk_row = wk_head + (size_t)d * (size_t)aligned_embed_dim;
            const float *wv_row = wv_head + (size_t)d * (size_t)aligned_embed_dim;
            float k_acc = ck_dot_f32(normed, wk_row, aligned_embed_dim);
            float v_acc = ck_dot_f32(normed, wv_row, aligned_embed_dim);
            k_dst[d] = k_acc + (bk_head != NULL ? bk_head[d] : 0.0f);
            v_dst[d] = v_acc + (bv_head != NULL ? bv_head[d] : 0.0f);
        }
        for (int d = head_dim; d < aligned_head_dim; ++d) {
            k_dst[d] = 0.0f;
            v_dst[d] = 0.0f;
        }
    }
}

References ck_dot_f32().

Referenced by mega_fused_attention_decode().

◆ mega_fuse_rope_inplace_avx()

/*
 * Rotate one group of heads in place (rotate-half RoPE): for each head,
 * the pair (x[i], x[i + half]) is rotated by the angle whose cos/sin
 * are cos_ptr[i]/sin_ptr[i]; the alignment padding
 * [head_dim, aligned_head_dim) of every head is zeroed afterwards.
 * Shared by the Q and K paths below (previously two duplicated loops).
 */
static void ck_rope_rotate_heads(float *x, int n_heads, int half,
                                 int head_dim, int aligned_head_dim,
                                 const float *cos_ptr, const float *sin_ptr)
{
    for (int h = 0; h < n_heads; ++h) {
        float *xh = x + (size_t)h * (size_t)aligned_head_dim;
        int i = 0;
#if defined(__AVX512F__)
        for (; i + 16 <= half; i += 16) {
            __m512 x0 = _mm512_loadu_ps(xh + i);
            __m512 x1 = _mm512_loadu_ps(xh + i + half);
            __m512 c = _mm512_loadu_ps(cos_ptr + i);
            __m512 s = _mm512_loadu_ps(sin_ptr + i);

            __m512 rot0 = _mm512_sub_ps(_mm512_mul_ps(x0, c), _mm512_mul_ps(x1, s));
            __m512 rot1 = _mm512_add_ps(_mm512_mul_ps(x0, s), _mm512_mul_ps(x1, c));

            _mm512_storeu_ps(xh + i, rot0);
            _mm512_storeu_ps(xh + i + half, rot1);
        }
#elif defined(__AVX__)
        for (; i + 8 <= half; i += 8) {
            __m256 x0 = _mm256_loadu_ps(xh + i);
            __m256 x1 = _mm256_loadu_ps(xh + i + half);
            __m256 c = _mm256_loadu_ps(cos_ptr + i);
            __m256 s = _mm256_loadu_ps(sin_ptr + i);

            __m256 rot0 = _mm256_sub_ps(_mm256_mul_ps(x0, c), _mm256_mul_ps(x1, s));
            __m256 rot1 = _mm256_add_ps(_mm256_mul_ps(x0, s), _mm256_mul_ps(x1, c));

            _mm256_storeu_ps(xh + i, rot0);
            _mm256_storeu_ps(xh + i + half, rot1);
        }
#endif
        /* Scalar tail (also the whole loop when no AVX is compiled in). */
        for (; i < half; ++i) {
            float a = xh[i];
            float b = xh[i + half];
            float c = cos_ptr[i];
            float s = sin_ptr[i];
            xh[i] = a * c - b * s;
            xh[i + half] = a * s + b * c;
        }
        for (int d = head_dim; d < aligned_head_dim; ++d) {
            xh[d] = 0.0f;
        }
    }
}

/**
 * Apply RoPE to Q and K (in-place, from L1).
 *
 * Q and K are already in L1 from the QKV projection; only the rotation
 * is applied here. Tables are indexed by position: cos/sin rows of
 * length head_dim/2 at offset pos * (head_dim/2).
 *
 * @param q                Q heads [num_heads * aligned_head_dim], mutated
 * @param k                K heads [num_kv_heads * aligned_head_dim], mutated
 * @param rope_cos         cos table [.. positions .. * head_dim/2]
 * @param rope_sin         sin table [.. positions .. * head_dim/2]
 * @param pos              token position (table row)
 * @param num_heads        number of query heads
 * @param num_kv_heads     number of KV heads
 * @param head_dim         head dimension (must be even)
 * @param aligned_head_dim aligned head dimension (>= head_dim)
 */
void mega_fuse_rope_inplace_avx(float *q, float *k,
                                const float *rope_cos, const float *rope_sin,
                                int pos, int num_heads, int num_kv_heads,
                                int head_dim, int aligned_head_dim)
{
    if (!q || !k || !rope_cos || !rope_sin ||
        head_dim <= 0 || aligned_head_dim <= 0) {
        return;
    }
    if ((head_dim & 1) != 0) {
        return; /* rotate-half RoPE requires an even head_dim */
    }

    const int half = head_dim / 2;
    const float *cos_ptr = rope_cos + (size_t)pos * (size_t)half;
    const float *sin_ptr = rope_sin + (size_t)pos * (size_t)half;

    ck_rope_rotate_heads(q, num_heads, half, head_dim, aligned_head_dim,
                         cos_ptr, sin_ptr);
    ck_rope_rotate_heads(k, num_kv_heads, half, head_dim, aligned_head_dim,
                         cos_ptr, sin_ptr);
}

Referenced by mega_fused_attention_decode().

◆ mega_fused_attention_decode()

#ifndef MEGA_STACK_MAX
#define MEGA_STACK_MAX 8192
#endif

/**
 * Full mega-fused attention for decode mode (single token):
 * RMSNorm -> QKV -> RoPE -> KV-cache append -> flash attention ->
 * output projection + residual.
 *
 * KERNEL-RULE VIOLATION (pre-existing, documented in the file header):
 * falls back to malloc when a per-token buffer exceeds MEGA_STACK_MAX
 * floats.  TODO: take a bump-allocator workspace instead.  Error paths
 * now use goto-based cleanup; free(NULL) is a no-op so all heap
 * pointers are freed unconditionally.
 *
 * @param output  projected + residual-added token [aligned_embed_dim]
 * @param input   token embedding [aligned_embed_dim]
 * @param pos     position to write into the KV cache; must be in
 *                [0, cache_capacity)
 * Remaining params are weights/biases/tables and layout dims matching
 * the callee contracts (see the individual kernels).
 */
void mega_fused_attention_decode(float *output, const float *input,
                                 const float *residual,
                                 const float *ln1_gamma,
                                 const float *wq, const float *bq,
                                 const float *wk, const float *bk,
                                 const float *wv, const float *bv,
                                 const float *wo, const float *bo,
                                 float *kv_cache_k, float *kv_cache_v,
                                 const float *rope_cos, const float *rope_sin,
                                 int pos, int embed_dim, int aligned_embed_dim,
                                 int num_heads, int num_kv_heads,
                                 int head_dim, int aligned_head_dim,
                                 int cache_capacity, float eps)
{
    if (!output || !input || !ln1_gamma || !wq || !wk || !wv || !wo ||
        !kv_cache_k || !kv_cache_v) {
        return;
    }
    if (embed_dim <= 0 || aligned_embed_dim <= 0 || head_dim <= 0 ||
        aligned_head_dim <= 0 || num_heads <= 0 || num_kv_heads <= 0 ||
        cache_capacity <= 0) {
        return;
    }
    if (pos < 0 || pos >= cache_capacity) {
        return;
    }
    if (aligned_embed_dim < embed_dim || aligned_head_dim < head_dim) {
        return;
    }

    const size_t q_elems = (size_t)num_heads * (size_t)aligned_head_dim;
    const size_t kv_elems = (size_t)num_kv_heads * (size_t)aligned_head_dim;

    /* Stack buffers for the common case; heap fallback for large models. */
    float q_stack[MEGA_STACK_MAX];
    float k_stack[MEGA_STACK_MAX];
    float v_stack[MEGA_STACK_MAX];
    float o_stack[MEGA_STACK_MAX];

    float *q = q_stack;
    float *k = k_stack;
    float *v = v_stack;
    float *o = o_stack;

    /* Heap pointers stay NULL when the stack buffers suffice. */
    float *q_heap = NULL;
    float *k_heap = NULL;
    float *v_heap = NULL;
    float *o_heap = NULL;

    if (q_elems > MEGA_STACK_MAX) {
        q_heap = malloc(q_elems * sizeof *q_heap);
        o_heap = malloc(q_elems * sizeof *o_heap);
        if (!q_heap || !o_heap) {
            goto cleanup;
        }
        q = q_heap;
        o = o_heap;
    }
    if (kv_elems > MEGA_STACK_MAX) {
        k_heap = malloc(kv_elems * sizeof *k_heap);
        v_heap = malloc(kv_elems * sizeof *v_heap);
        if (!k_heap || !v_heap) {
            goto cleanup;
        }
        k = k_heap;
        v = v_heap;
    }

    mega_fuse_rmsnorm_qkv_avx(q, k, v, input, ln1_gamma,
                              wq, bq, wk, bk, wv, bv,
                              embed_dim, aligned_embed_dim,
                              num_heads, num_kv_heads,
                              head_dim, aligned_head_dim, eps);

    if (rope_cos && rope_sin) {
        mega_fuse_rope_inplace_avx(q, k, rope_cos, rope_sin, pos,
                                   num_heads, num_kv_heads,
                                   head_dim, aligned_head_dim);
    }

    /* Append this token's K/V to the cache (head-major layout).
     * NOTE(review): the first line of this call was lost in extraction;
     * arguments reconstructed from the kv_cache_write_head_major
     * prototype — confirm against the original source. */
    kv_cache_write_head_major(k, v,
                              kv_cache_k, kv_cache_v,
                              num_kv_heads, pos,
                              cache_capacity,
                              head_dim, aligned_head_dim);

    /* Attend over positions [0, pos] — hence seq_len = pos + 1. */
    mega_fuse_flash_attention_avx(o, q, kv_cache_k, kv_cache_v,
                                  num_heads, num_kv_heads,
                                  pos + 1, cache_capacity,
                                  head_dim, aligned_head_dim);

    mega_fuse_output_proj_residual(o, wo, bo, residual, output,
                                   embed_dim, aligned_embed_dim,
                                   num_heads, head_dim, aligned_head_dim);

cleanup:
    free(q_heap);
    free(k_heap);
    free(v_heap);
    free(o_heap);
}
void kv_cache_write_head_major(const float *__restrict k_token, const float *__restrict v_token, float *__restrict k_cache, float *__restrict v_cache, int num_kv_heads, int token_index, int cache_capacity, int head_dim, int aligned_head_dim)
static void mega_fuse_output_proj_residual(const float *attn_token, const float *wo, const float *bo, const float *residual, float *output, int embed_dim, int aligned_embed_dim, int num_heads, int head_dim, int aligned_head_dim)
void mega_fuse_flash_attention_avx(float *o_out, const float *q, const float *kv_cache_k, const float *kv_cache_v, int num_heads, int num_kv_heads, int seq_len, int cache_capacity, int head_dim, int aligned_head_dim)
Flash attention with online softmax (AVX version)
void mega_fuse_rmsnorm_qkv_avx(float *q_out, float *k_out, float *v_out, const float *input, const float *gamma, const float *wq, const float *bq, const float *wk, const float *bk, const float *wv, const float *bv, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, float eps)
Fused RMSNorm + QKV for decode (single token)
void mega_fuse_rope_inplace_avx(float *q, float *k, const float *rope_cos, const float *rope_sin, int pos, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim)
Apply RoPE to Q and K (in-place, from L1)
#define MEGA_STACK_MAX

References kv_cache_write_head_major(), mega_fuse_flash_attention_avx(), mega_fuse_output_proj_residual(), mega_fuse_rmsnorm_qkv_avx(), mega_fuse_rope_inplace_avx(), and MEGA_STACK_MAX.