← Back to C-Kernel-Engine Docs Doxygen Source Documentation
attention_flash_true.c File Reference

Flash-style attention (online softmax, causal, streaming) More...

#include <math.h>
#include <stddef.h>
#include <stdint.h>

Go to the source code of this file.

Macros

#define CK_FLASH_ATTN_FAST_EXP   0
 
#define CK_FLASH_ATTN_TILE_K   32
 

Functions

void attention_flash_cleanup (void)
 Clean up flash attention resources. More...
 
void attention_flash_decode (float *out, const float *q, const float *k, const float *v, int T_q, int T_k, int H, int D_h, float scale)
 Main flash attention function with SIMD dispatch. More...
 
static void attention_flash_decode_scalar (float *out, const float *q, const float *k, const float *v, int T_q, int T_k, int H, int D_h, float scale)
 Scalar flash-style attention (online softmax) More...
 
void attention_flash_init (int max_context, int max_heads, int max_head_dim)
 Initialize flash attention buffers. More...
 
static float ck_expf (float x)
 
static float ck_fast_expf (float x)
 
int ck_flash_attn_choose_tile_k (int D_h)
 
int ck_flash_attn_fast_exp_kind (void)
 
static int ck_flash_attn_tile_k (int D_h)
 
static int max_k_for_query (int t_q, int T_q, int T_k)
 

Detailed Description

Flash-style attention (online softmax, causal, streaming)

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Layout: Q/K/V/Out: [T, H, D_h] contiguous

Causal alignment: Queries are assumed to correspond to the last T_q positions in the KV cache. This makes T_q == T_k behave like standard causal prefill, and T_q == 1 behave like decode over a full KV cache.

Notes:

  • This is O(T_k) per query head; it avoids materializing the score matrix.
  • SIMD paths are provided for AVX-512 and AVX.

Definition in file attention_flash_true.c.

Macro Definition Documentation

◆ CK_FLASH_ATTN_FAST_EXP

#define CK_FLASH_ATTN_FAST_EXP   0

Definition at line 44 of file attention_flash_true.c.

◆ CK_FLASH_ATTN_TILE_K

#define CK_FLASH_ATTN_TILE_K   32

Definition at line 40 of file attention_flash_true.c.

Function Documentation

◆ attention_flash_cleanup()

/**
 * @brief Clean up flash attention resources.
 *
 * Currently a no-op: this module owns no heap buffers (kernel rule:
 * memory comes from the caller / bump allocator). The hook exists so
 * callers have a stable teardown point if scratch buffers are ever
 * pre-allocated in attention_flash_init().
 */
void attention_flash_cleanup(void)
{
    /* Nothing to release yet. */
}

◆ attention_flash_decode()

/**
 * @brief Main flash attention entry point with SIMD dispatch.
 *
 * Validates arguments, then routes to the widest SIMD implementation the
 * compile target supports (AVX-512 > AVX > scalar). Invalid input is a
 * silent no-op: @p out is left untouched.
 *
 * @param out   Output [T_q, H, D_h]
 * @param q     Query  [T_q, H, D_h]
 * @param k     Key    [T_k, H, D_h]
 * @param v     Value  [T_k, H, D_h]
 * @param T_q   Number of query tokens (1 for decode)
 * @param T_k   Number of key/value tokens (context length)
 * @param H     Number of heads
 * @param D_h   Head dimension
 * @param scale 1/sqrt(D_h)
 */
void attention_flash_decode(float *out, const float *q, const float *k,
                            const float *v, int T_q, int T_k, int H, int D_h,
                            float scale)
{
    /* Guard clauses: reject null buffers and non-positive dimensions. */
    if (out == NULL || q == NULL || k == NULL || v == NULL) {
        return;
    }
    if (T_q < 1 || T_k < 1 || H < 1 || D_h < 1) {
        return;
    }

    /* Compile-time dispatch on the available CPU feature set. */
#if defined(__AVX512F__)
    attention_flash_decode_avx512(out, q, k, v, T_q, T_k, H, D_h, scale);
#elif defined(__AVX__) && !defined(__AVX512F__)
    attention_flash_decode_avx(out, q, k, v, T_q, T_k, H, D_h, scale);
#else
    attention_flash_decode_scalar(out, q, k, v, T_q, T_k, H, D_h, scale);
#endif
}
static void attention_flash_decode_scalar(float *out, const float *q, const float *k, const float *v, int T_q, int T_k, int H, int D_h, float scale)
Scalar flash-style attention (online softmax)

References attention_flash_decode_scalar().

Referenced by attention_forward_decode_head_major_gqa_flash(), ck_attention_flash_decode_wrapper(), mega_fused_attention_prefill(), and mega_fused_attention_prefill_q8_0().

◆ attention_flash_decode_scalar()

static void attention_flash_decode_scalar ( float *  out,
const float *  q,
const float *  k,
const float *  v,
int  T_q,
int  T_k,
int  H,
int  D_h,
float  scale 
)
static

Scalar flash-style attention (online softmax)

Definition at line 142 of file attention_flash_true.c.

152 {
153  const int total = T_q * H;
154  const size_t stride = (size_t)H * (size_t)D_h;
155  const int tile_k = ck_flash_attn_tile_k(D_h);
156 
157  for (int idx = 0; idx < total; ++idx) {
158  const int t_q = idx / H;
159  const int h = idx - t_q * H;
160  const int max_k = max_k_for_query(t_q, T_q, T_k);
161 
162  const float *q_head = q + (size_t)t_q * stride + (size_t)h * (size_t)D_h;
163  float *out_head = out + (size_t)t_q * stride + (size_t)h * (size_t)D_h;
164  const float *k_base = k + (size_t)h * (size_t)D_h;
165  const float *v_base = v + (size_t)h * (size_t)D_h;
166 
167  for (int d = 0; d < D_h; ++d) {
168  out_head[d] = 0.0f;
169  }
170 
171  float m = -INFINITY;
172  float s = 0.0f;
173 
174  float scores[CK_FLASH_ATTN_TILE_K];
175 
176  for (int t_k0 = 0; t_k0 <= max_k; t_k0 += tile_k) {
177  int blk_len = max_k - t_k0 + 1;
178  if (blk_len > tile_k) {
179  blk_len = tile_k;
180  }
181 
182  float m_block = -INFINITY;
183  for (int bi = 0; bi < blk_len; ++bi) {
184  const int t_k = t_k0 + bi;
185  const float *k_head = k_base + (size_t)t_k * stride;
186 
187  float dot = 0.0f;
188  for (int d = 0; d < D_h; ++d) {
189  dot += q_head[d] * k_head[d];
190  }
191 
192  float score = dot * scale;
193  scores[bi] = score;
194  if (score > m_block) {
195  m_block = score;
196  }
197  }
198 
199  if (m_block > m) {
200  float scale_old = (m == -INFINITY) ? 0.0f : ck_expf(m - m_block);
201  s *= scale_old;
202  for (int d = 0; d < D_h; ++d) {
203  out_head[d] *= scale_old;
204  }
205  m = m_block;
206  }
207 
208  for (int bi = 0; bi < blk_len; ++bi) {
209  const int t_k = t_k0 + bi;
210  const float *v_head = v_base + (size_t)t_k * stride;
211  float w = ck_expf(scores[bi] - m);
212  s += w;
213  for (int d = 0; d < D_h; ++d) {
214  out_head[d] += w * v_head[d];
215  }
216  }
217  }
218 
219  if (s > 0.0f) {
220  float inv_s = 1.0f / s;
221  for (int d = 0; d < D_h; ++d) {
222  out_head[d] *= inv_s;
223  }
224  } else {
225  for (int d = 0; d < D_h; ++d) {
226  out_head[d] = 0.0f;
227  }
228  }
229  }
230 }
static int max_k_for_query(int t_q, int T_q, int T_k)
static int ck_flash_attn_tile_k(int D_h)
static float ck_expf(float x)
#define CK_FLASH_ATTN_TILE_K
int32_t float * score
Definition: tokenizer.h:327

References ck_expf(), CK_FLASH_ATTN_TILE_K, ck_flash_attn_tile_k(), max_k_for_query(), and score.

Referenced by attention_flash_decode().

◆ attention_flash_init()

/**
 * @brief Initialize flash attention buffers.
 *
 * Currently a no-op placeholder: per kernel rules memory is provided by
 * the caller, so there is nothing to pre-allocate here yet. Parameters
 * describe the worst-case workload a future implementation would size
 * scratch buffers for.
 *
 * @param max_context  Largest KV context length expected
 * @param max_heads    Largest head count expected
 * @param max_head_dim Largest head dimension expected
 */
void attention_flash_init(int max_context, int max_heads, int max_head_dim)
{
    /* Intentionally unused until scratch pre-allocation lands. */
    (void)max_context;
    (void)max_heads;
    (void)max_head_dim;
}

◆ ck_expf()

static float ck_expf ( float  x)
inlinestatic

Definition at line 80 of file attention_flash_true.c.

80  {
81 #if CK_FLASH_ATTN_FAST_EXP
82  return ck_fast_expf(x);
83 #else
84  return expf(x);
85 #endif
86 }
static float ck_fast_expf(float x)

References ck_fast_expf().

Referenced by attention_flash_decode_scalar().

◆ ck_fast_expf()

static float ck_fast_expf ( float  x)
inlinestatic

Definition at line 47 of file attention_flash_true.c.

47  {
48  const float max_val = 88.0f;
49  const float min_val = -88.0f;
50  if (x > max_val) {
51  x = max_val;
52  } else if (x < min_val) {
53  x = min_val;
54  }
55 
56  const float log2e = 1.4426950408889634f;
57  float z = x * log2e;
58  float zf = nearbyintf(z);
59  float f = z - zf;
60 
61  const float c0 = 1.0f;
62  const float c1 = 0.6931471805599453f;
63  const float c2 = 0.2402265069591007f;
64  const float c3 = 0.05550410866482158f;
65  const float c4 = 0.009618129107628478f;
66 
67  float poly = ((c4 * f + c3) * f + c2) * f + c1;
68  poly = poly * f + c0;
69 
70  int32_t zi = (int32_t)zf + 127;
71  uint32_t bits = (uint32_t)zi << 23;
72  union {
73  uint32_t i;
74  float f;
75  } u;
76  u.i = bits;
77  return poly * u.f;
78 }

Referenced by ck_expf().

◆ ck_flash_attn_choose_tile_k()

/**
 * @brief Public probe for the KV tile size the kernels will use.
 *
 * Thin wrapper exposing the internal heuristic so tests and the
 * orchestrator can see the tile choice for a given head dimension.
 *
 * @param D_h Head dimension
 * @return Tile length in KV positions, in [1, CK_FLASH_ATTN_TILE_K]
 */
int ck_flash_attn_choose_tile_k(int D_h)
{
    const int tile = ck_flash_attn_tile_k(D_h);
    return tile;
}

References ck_flash_attn_tile_k().

◆ ck_flash_attn_fast_exp_kind()

/**
 * @brief Report which fast-exp path this build uses.
 *
 * @return 512 when fast-exp is enabled on an AVX-512 build, 256 on an
 *         AVX build, and 0 when fast-exp is disabled or no SIMD fast
 *         path exists (libm expf is used instead).
 */
int ck_flash_attn_fast_exp_kind(void)
{
#if CK_FLASH_ATTN_FAST_EXP && defined(__AVX512F__)
    return 512; /* fast-exp, AVX-512 lanes */
#elif CK_FLASH_ATTN_FAST_EXP && defined(__AVX__)
    return 256; /* fast-exp, AVX lanes */
#else
    return 0;   /* libm expf */
#endif
}

◆ ck_flash_attn_tile_k()

static int ck_flash_attn_tile_k ( int  D_h)
inlinestatic

Definition at line 88 of file attention_flash_true.c.

88  {
89  int tile = CK_FLASH_ATTN_TILE_K;
90  if (D_h > 128) {
91  tile = CK_FLASH_ATTN_TILE_K / 4;
92  } else if (D_h > 64) {
93  tile = CK_FLASH_ATTN_TILE_K / 2;
94  }
95 
96  if (CK_FLASH_ATTN_TILE_K >= 8 && tile < 8) {
97  tile = 8;
98  }
99  if (tile > CK_FLASH_ATTN_TILE_K) {
100  tile = CK_FLASH_ATTN_TILE_K;
101  }
102  if (tile < 1) {
103  tile = 1;
104  }
105  return tile;
106 }

References CK_FLASH_ATTN_TILE_K.

Referenced by attention_flash_decode_scalar(), and ck_flash_attn_choose_tile_k().

◆ max_k_for_query()

/**
 * @brief Highest KV index query @p t_q may attend to (causal mask).
 *
 * Queries occupy the LAST T_q positions of the KV cache, so query t_q
 * sits at absolute position (T_k - T_q) + t_q when the cache is longer
 * than the query batch. The result is clamped to the valid KV range.
 *
 * @param t_q Query index in [0, T_q)
 * @param T_q Number of query tokens
 * @param T_k Number of KV tokens
 * @return Inclusive upper bound on attendable KV indices, < T_k
 */
static inline int max_k_for_query(int t_q, int T_q, int T_k)
{
    const int offset = (T_k > T_q) ? (T_k - T_q) : 0;
    const int candidate = offset + t_q;
    return (candidate < T_k) ? candidate : (T_k - 1);
}

Referenced by attention_flash_decode_scalar().