← Back to C-Kernel-Engine Docs Doxygen Source Documentation
attention_kernels.c File Reference

Attention score/softmax/output kernels with SIMD (SSE/AVX/AVX512) More...

#include "bf16_utils.h"
#include "ckernel_engine.h"
#include <math.h>
#include <stdlib.h>

Go to the source code of this file.

Macros

#define FLASH_QUERY_IMPL   attention_flash_query_causal
 
#define FLASH_QUERY_IMPL   attention_flash_query_causal
 
#define FLASH_QUERY_IMPL_DECODE   attention_flash_query_causal
 
#define SLIDING_DECODE_IMPL   attention_flash_query_sliding
 
#define SLIDING_FLASH_IMPL   attention_flash_query_sliding
 

Functions

void attention_backward_causal_head_major (const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_backward_causal_head_major_gqa (const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_backward_causal_head_major_gqa_bf16 (const uint16_t *d_output, float *d_x, const uint16_t *q, const uint16_t *k, const uint16_t *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window, float *scratch_d_output, float *scratch_q, float *scratch_k, float *scratch_v)
 
static void attention_flash_query_causal (const float *q_vec, const float *k_head, const float *v_head, int kv_tokens, int head_dim, int aligned_head_dim, float scale, float *out_vec)
 
static void attention_flash_query_sliding (const float *q_vec, const float *k_head, const float *v_head, int query_pos, int kv_tokens, int head_dim, int aligned_head_dim, float scale, float *out_vec, int sliding_window)
 
void attention_forward_causal_head_major (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_forward_causal_head_major_exact (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_forward_causal_head_major_gqa (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_forward_causal_head_major_gqa_bf16 (const uint16_t *q, const uint16_t *k, const uint16_t *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window, float *scratch_q, float *scratch_k, float *scratch_v)
 
void attention_forward_causal_head_major_gqa_exact (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_forward_causal_head_major_gqa_flash (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim)
 
void attention_forward_causal_head_major_gqa_flash_strided (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens)
 
void attention_forward_causal_head_major_gqa_flash_strided_sliding (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens, int sliding_window)
 
void attention_forward_decode_head_major_gqa_flash (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
 
void attention_forward_decode_head_major_gqa_flash_sliding (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim, int sliding_window)
 
void attention_forward_decode_head_major_gqa_regular (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
 WARNING: This is NOT true flash attention! More...
 
static void convert_bf16_tensor_to_buf (const uint16_t *src, float *dst, size_t count)
 
static size_t qkv_index (int h, int t, int d, int num_tokens, int aligned_head_dim)
 
static size_t score_index (int h, int i, int j, int aligned_context_window)
 

Detailed Description

Attention score/softmax/output kernels with SIMD (SSE/AVX/AVX512)

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Attention: softmax(Q @ K^T / sqrt(d)) @ V. Supports GQA (grouped-query attention) with head broadcasting.

Definition in file attention_kernels.c.

Macro Definition Documentation

◆ FLASH_QUERY_IMPL [1/2]

#define FLASH_QUERY_IMPL   attention_flash_query_causal

◆ FLASH_QUERY_IMPL [2/2]

#define FLASH_QUERY_IMPL   attention_flash_query_causal

◆ FLASH_QUERY_IMPL_DECODE

#define FLASH_QUERY_IMPL_DECODE   attention_flash_query_causal

◆ SLIDING_DECODE_IMPL

#define SLIDING_DECODE_IMPL   attention_flash_query_sliding

◆ SLIDING_FLASH_IMPL

#define SLIDING_FLASH_IMPL   attention_flash_query_sliding

Function Documentation

◆ attention_backward_causal_head_major()

void attention_backward_causal_head_major ( const float *  d_output,
const float *  q,
const float *  k,
const float *  v,
const float *  attn_weights,
float *  d_q,
float *  d_k,
float *  d_v,
float *  d_scores,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

Causal attention backward (non-GQA version)

Test:

test_attention_backward.py::TestAttentionBackward::test_backward

test_attention_backward.py::TestAttentionBackward::test_backward_vs_separate

test_parity.py::test_attention_backward_parity

Non-GQA version where num_heads == num_kv_heads. Simpler than GQA, no head broadcasting needed.

After changes: make test && make llamacpp-parity-full

Definition at line 1811 of file attention_kernels.c.

1826 {
1827  attention_backward_causal_head_major_gqa(
1828  d_output, q, k, v, attn_weights,
1829  d_q, d_k, d_v, d_scores,
1830  num_heads, num_heads, // num_kv_heads == num_heads
1831  num_tokens, head_dim, aligned_head_dim, aligned_context_window);
1832 }
void attention_backward_causal_head_major_gqa(const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)

References attention_backward_causal_head_major_gqa().

◆ attention_backward_causal_head_major_gqa()

void attention_backward_causal_head_major_gqa ( const float *  d_output,
const float *  q,
const float *  k,
const float *  v,
const float *  attn_weights,
float *  d_q,
float *  d_k,
float *  d_v,
float *  d_scores,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

GQA causal attention backward (score-matrix version)

Test:

test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_backward

test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_vs_separate

test_parity.py::test_attention_backward_parity

Computes dQ, dK, dV given dOutput and attention weights. Supports grouped-query attention with head broadcasting.

After changes: make test && make llamacpp-parity-full

Definition at line 1672 of file attention_kernels.c.

1688 {
1689  const float scale = 1.0f / sqrtf((float)head_dim);
1690  int T = num_tokens;
1691  int H = num_heads;
1692  int H_kv = num_kv_heads;
1693  int hd = head_dim;
1694  int ad = aligned_head_dim;
1695  int aw = aligned_context_window;
1696 
1697  const size_t d_q_elems = (size_t)H * (size_t)T * (size_t)ad;
1698  const size_t kv_elems = (size_t)H_kv * (size_t)T * (size_t)ad;
1699  /* Zero the aligned outputs so padded lanes never leak garbage to downstream GEMMs. */
1700  for (size_t idx = 0; idx < d_q_elems; ++idx) {
1701  d_q[idx] = 0.0f;
1702  }
1703  for (size_t idx = 0; idx < kv_elems; ++idx) {
1704  d_k[idx] = 0.0f;
1705  d_v[idx] = 0.0f;
1706  }
1707 
1708  // Process each query head
1709  for (int h = 0; h < H; ++h) {
1710  // Which KV head does this query head use?
1711  int kv_h = (int)((long long)h * (long long)H_kv / (long long)H);
1712 
1713  // ----------------------------------------------------------------
1714  // Step 1: d_weights = d_output @ V^T and d_v += weights^T @ d_output
1715  // ----------------------------------------------------------------
1716  // For each query position i, compute d_weights[i, j] for j <= i
1717  // and accumulate d_v[j] contributions
1718 
1719  for (int i = 0; i < T; ++i) {
1720  size_t d_out_base = qkv_index(h, i, 0, T, ad);
1721 
1722  for (int j = 0; j <= i; ++j) {
1723  size_t v_base = qkv_index(kv_h, j, 0, T, ad);
1724  size_t w_idx = score_index(h, i, j, aw);
1725  float w = attn_weights[w_idx];
1726 
1727  // d_weights[h, i, j] = d_output[h, i, :] @ v[kv_h, j, :]^T
1728  float dot = 0.0f;
1729  for (int dd = 0; dd < hd; ++dd) {
1730  dot += d_output[d_out_base + dd] * v[v_base + dd];
1731  }
1732  d_scores[w_idx] = dot;
1733 
1734  // d_v[kv_h, j, :] += weights[h, i, j] * d_output[h, i, :]
1735  for (int dd = 0; dd < hd; ++dd) {
1736  d_v[v_base + dd] += w * d_output[d_out_base + dd];
1737  }
1738  }
1739 
1740  // Zero out upper triangle of d_scores
1741  for (int j = i + 1; j < T; ++j) {
1742  d_scores[score_index(h, i, j, aw)] = 0.0f;
1743  }
1744  /* Scores scratch uses aligned_context_window, zero the padded columns. */
1745  for (int j = T; j < aw; ++j) {
1746  d_scores[score_index(h, i, j, aw)] = 0.0f;
1747  }
1748  }
1749 
1750  // ----------------------------------------------------------------
1751  // Step 2: Backward through softmax (in-place on d_scores for this head)
1752  // ----------------------------------------------------------------
1753  // d_scores = softmax_backward(d_scores, attn_weights)
1754  // Formula: d_score[i,j] = w[i,j] * (d_w[i,j] - sum_k(w[i,k] * d_w[i,k]))
1755 
1756  for (int i = 0; i < T; ++i) {
1757  int base = h * aw * aw + i * aw;
1758 
1759  // Compute dot product: sum_j w[i,j] * d_w[i,j]
1760  float dot_product = 0.0f;
1761  for (int j = 0; j <= i; ++j) {
1762  float wt = attn_weights[base + j];
1763  float dw = d_scores[base + j];
1764  dot_product += wt * dw;
1765  }
1766 
1767  // Apply softmax backward formula
1768  for (int j = 0; j <= i; ++j) {
1769  float wt = attn_weights[base + j];
1770  float dw = d_scores[base + j];
1771  d_scores[base + j] = wt * (dw - dot_product);
1772  }
1773  }
1774 
1775  // ----------------------------------------------------------------
1776  // Step 3: d_q = d_scores @ K * scale
1777  // d_k += d_scores^T @ Q * scale
1778  // ----------------------------------------------------------------
1779 
1780  for (int i = 0; i < T; ++i) {
1781  size_t d_q_base = qkv_index(h, i, 0, T, ad);
1782  size_t q_base = qkv_index(h, i, 0, T, ad);
1783 
1784  // d_q[h, i, :] = sum_j d_scores[h, i, j] * k[kv_h, j, :] * scale
1785  // d_k[kv_h, j, :] += d_scores[h, i, j] * q[h, i, :] * scale
1786  for (int j = 0; j <= i; ++j) {
1787  size_t k_base = qkv_index(kv_h, j, 0, T, ad);
1788  size_t d_k_base = qkv_index(kv_h, j, 0, T, ad);
1789  float ds = d_scores[score_index(h, i, j, aw)] * scale;
1790 
1791  for (int dd = 0; dd < hd; ++dd) {
1792  d_q[d_q_base + dd] += ds * k[k_base + dd];
1793  d_k[d_k_base + dd] += ds * q[q_base + dd];
1794  }
1795  }
1796  }
1797  }
1798 }
static size_t qkv_index(int h, int t, int d, int num_tokens, int aligned_head_dim)
static size_t score_index(int h, int i, int j, int aligned_context_window)

References qkv_index(), and score_index().

Referenced by attention_backward_causal_head_major(), attention_backward_causal_head_major_gqa_bf16(), and ck_layer_backward_rmsnorm_swiglu().

◆ attention_backward_causal_head_major_gqa_bf16()

void attention_backward_causal_head_major_gqa_bf16 ( const uint16_t *  d_output,
float *  d_x,
const uint16_t *  q,
const uint16_t *  k,
const uint16_t *  v,
const float *  attn_weights,
float *  d_q,
float *  d_k,
float *  d_v,
float *  d_scores,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window,
float *  scratch_d_output,
float *  scratch_q,
float *  scratch_k,
float *  scratch_v 
)

BF16 attention backward with caller-provided scratch buffers

Test:
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_backward

Accepts BF16 inputs, converts to FP32, runs FP32 backward. Caller provides scratch buffers (no per-call malloc).

After changes: make test

Definition at line 1619 of file attention_kernels.c.

1640 {
1641  (void)d_x;
1642  const size_t head_elems = (size_t)num_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
1643  const size_t kv_elems = (size_t)num_kv_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
1644 
1645  if (!scratch_d_output || !scratch_q || !scratch_k || !scratch_v) return;
1646 
1647  convert_bf16_tensor_to_buf(d_output, scratch_d_output, head_elems);
1648  convert_bf16_tensor_to_buf(q, scratch_q, head_elems);
1649  convert_bf16_tensor_to_buf(k, scratch_k, kv_elems);
1650  convert_bf16_tensor_to_buf(v, scratch_v, kv_elems);
1651 
1652  attention_backward_causal_head_major_gqa(scratch_d_output, scratch_q, scratch_k, scratch_v,
1653  attn_weights,
1654  d_q, d_k, d_v, d_scores,
1655  num_heads, num_kv_heads,
1656  num_tokens, head_dim,
1657  aligned_head_dim, aligned_context_window);
1658  /* No free - caller owns scratch buffers */
1659 }
static void convert_bf16_tensor_to_buf(const uint16_t *src, float *dst, size_t count)

References attention_backward_causal_head_major_gqa(), and convert_bf16_tensor_to_buf().

◆ attention_flash_query_causal()

static void attention_flash_query_causal ( const float *  q_vec,
const float *  k_head,
const float *  v_head,
int  kv_tokens,
int  head_dim,
int  aligned_head_dim,
float  scale,
float *  out_vec 
)
static

Definition at line 730 of file attention_kernels.c.

738 {
739  // Online softmax:
740  // m = running max, s = running sum(exp(score - m))
741  // out = sum(exp(score - m) * v)
742  float m = -INFINITY;
743  float s = 0.0f;
744 
745  for (int d = 0; d < head_dim; ++d) {
746  out_vec[d] = 0.0f;
747  }
748 
749  for (int j = 0; j < kv_tokens; ++j) {
750  const float *k_vec = k_head + (size_t)j * (size_t)aligned_head_dim;
751  const float *v_vec = v_head + (size_t)j * (size_t)aligned_head_dim;
752 
753  float dot = 0.0f;
754  for (int d = 0; d < head_dim; ++d) {
755  dot += q_vec[d] * k_vec[d];
756  }
757  float score = dot * scale;
758 
759  if (score > m) {
760  float exp_m = (m == -INFINITY) ? 0.0f : expf(m - score);
761  s *= exp_m;
762  for (int d = 0; d < head_dim; ++d) {
763  out_vec[d] *= exp_m;
764  }
765  s += 1.0f;
766  for (int d = 0; d < head_dim; ++d) {
767  out_vec[d] += v_vec[d];
768  }
769  m = score;
770  } else {
771  float e = expf(score - m);
772  s += e;
773  for (int d = 0; d < head_dim; ++d) {
774  out_vec[d] += e * v_vec[d];
775  }
776  }
777  }
778 
779  float inv_s = 1.0f / s;
780  for (int d = 0; d < head_dim; ++d) {
781  out_vec[d] *= inv_s;
782  }
783  for (int d = head_dim; d < aligned_head_dim; ++d) {
784  out_vec[d] = 0.0f;
785  }
786 }
float * score
Definition: tokenizer.h:327

References score.

◆ attention_flash_query_sliding()

static void attention_flash_query_sliding ( const float *  q_vec,
const float *  k_head,
const float *  v_head,
int  query_pos,
int  kv_tokens,
int  head_dim,
int  aligned_head_dim,
float  scale,
float *  out_vec,
int  sliding_window 
)
static

Definition at line 1243 of file attention_kernels.c.

1253 {
1254  float m = -INFINITY;
1255  float s = 0.0f;
1256 
1257  int window_start = 0;
1258  if (sliding_window > 0) {
1259  window_start = query_pos - sliding_window + 1;
1260  if (window_start < 0) window_start = 0;
1261  }
1262 
1263  for (int d = 0; d < head_dim; ++d) {
1264  out_vec[d] = 0.0f;
1265  }
1266 
1267  int effective_kv_end = query_pos < kv_tokens ? query_pos : kv_tokens - 1;
1268  for (int j = window_start; j <= effective_kv_end; ++j) {
1269  const float *k_vec = k_head + (size_t)j * (size_t)aligned_head_dim;
1270  const float *v_vec = v_head + (size_t)j * (size_t)aligned_head_dim;
1271 
1272  float dot = 0.0f;
1273  for (int d = 0; d < head_dim; ++d) {
1274  dot += q_vec[d] * k_vec[d];
1275  }
1276  float score = dot * scale;
1277 
1278  if (score > m) {
1279  float exp_m = (m == -INFINITY) ? 0.0f : expf(m - score);
1280  s *= exp_m;
1281  for (int d = 0; d < head_dim; ++d) {
1282  out_vec[d] *= exp_m;
1283  }
1284  s += 1.0f;
1285  for (int d = 0; d < head_dim; ++d) {
1286  out_vec[d] += v_vec[d];
1287  }
1288  m = score;
1289  } else {
1290  float e = expf(score - m);
1291  s += e;
1292  for (int d = 0; d < head_dim; ++d) {
1293  out_vec[d] += e * v_vec[d];
1294  }
1295  }
1296  }
1297 
1298  float inv_s = 1.0f / s;
1299  for (int d = 0; d < head_dim; ++d) {
1300  out_vec[d] *= inv_s;
1301  }
1302  for (int d = head_dim; d < aligned_head_dim; ++d) {
1303  out_vec[d] = 0.0f;
1304  }
1305 }

References score.

◆ attention_forward_causal_head_major()

void attention_forward_causal_head_major ( const float *  q,
const float *  k,
const float *  v,
float *  scores,
float *  output,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

Causal attention forward (score-matrix version)

Test:

test_attention.py::TestAttentionForward::test_causal_forward

test_attention.py::TestAttentionForward::test_gqa_broadcast

test_attention.py::TestAttentionForward::test_exact_vs_fast

test_parity.py::test_attention_parity

Computes softmax(Q @ K^T / sqrt(d)) @ V with causal masking. Uses O(N^2) memory for scores matrix.

After changes: make test && make llamacpp-parity-full

Definition at line 70 of file attention_kernels.c.

80 {
81  const float scale = 1.0f / sqrtf((float)head_dim);
82 
83  // Phase 1: compute scaled dot-product scores Q·K^T / sqrt(d_k),
84  // lower triangle only (j <= i).
85  for (int h = 0; h < num_heads; ++h) {
86  for (int i = 0; i < num_tokens; ++i) {
87  for (int j = 0; j <= i; ++j) {
88  float dot = 0.0f;
89  size_t base_q = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
90  size_t base_k = qkv_index(h, j, 0, num_tokens, aligned_head_dim);
91 
92  for (int d = 0; d < head_dim; ++d) {
93  dot += q[base_q + d] * k[base_k + d];
94  }
95 
96  scores[score_index(h, i, j, aligned_context_window)] = dot * scale;
97  }
98 
99  // Ensure upper triangle is zeroed so there are no stale values
100  // before the softmax kernel runs.
101  for (int j = i + 1; j < num_tokens; ++j) {
102  scores[score_index(h, i, j, aligned_context_window)] = 0.0f;
103  }
104  }
105  }
106 
107  // Phase 2: apply causal row-wise softmax in-place over j <= i.
108  causal_softmax_head_major(scores,
109  num_heads,
110  num_tokens,
111  aligned_context_window);
112 
113  // Phase 3: attention weights · V.
114  for (int h = 0; h < num_heads; ++h) {
115  for (int i = 0; i < num_tokens; ++i) {
116  size_t out_base = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
117 
118  // Zero the full aligned head slice so padded dims stay clean.
119  for (int d = 0; d < aligned_head_dim; ++d) {
120  output[out_base + d] = 0.0f;
121  }
122 
123  // Weighted sum over causal positions.
124  for (int j = 0; j <= i; ++j) {
125  float w = scores[score_index(h, i, j, aligned_context_window)];
126  size_t v_base = qkv_index(h, j, 0, num_tokens, aligned_head_dim);
127 
128  for (int d = 0; d < head_dim; ++d) {
129  output[out_base + d] += w * v[v_base + d];
130  }
131  }
132  }
133  }
134 }
void causal_softmax_head_major(float *scores, int num_heads, int num_tokens, int aligned_context_window)

References causal_softmax_head_major(), qkv_index(), and score_index().

◆ attention_forward_causal_head_major_exact()

void attention_forward_causal_head_major_exact ( const float *  q,
const float *  k,
const float *  v,
float *  scores,
float *  output,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

Causal attention forward (exact version using stdlib expf)

Test:

test_attention.py::TestAttentionForward::test_exact_single

test_attention.py::TestAttentionForward::test_exact_vs_fast

Uses standard library expf for numerical accuracy reference. Slower but provides maximum accuracy.

After changes: make test

Definition at line 146 of file attention_kernels.c.

156 {
157  const float scale = 1.0f / sqrtf((float)head_dim);
158 
159  // Phase 1: compute scaled dot-product scores Q·K^T / sqrt(d_k),
160  // lower triangle only (j <= i).
161  for (int h = 0; h < num_heads; ++h) {
162  for (int i = 0; i < num_tokens; ++i) {
163  for (int j = 0; j <= i; ++j) {
164  float dot = 0.0f;
165  size_t base_q = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
166  size_t base_k = qkv_index(h, j, 0, num_tokens, aligned_head_dim);
167 
168  for (int d = 0; d < head_dim; ++d) {
169  dot += q[base_q + d] * k[base_k + d];
170  }
171 
172  scores[score_index(h, i, j, aligned_context_window)] = dot * scale;
173  }
174 
175  // Ensure upper triangle is zeroed so there are no stale values
176  // before the softmax kernel runs.
177  for (int j = i + 1; j < num_tokens; ++j) {
178  scores[score_index(h, i, j, aligned_context_window)] = 0.0f;
179  }
180  }
181  }
182 
183  // Phase 2: apply causal row-wise softmax using exact expf.
184  causal_softmax_head_major_exact(scores,
185  num_heads,
186  num_tokens,
187  aligned_context_window);
188 
189  // Phase 3: attention weights · V.
190  for (int h = 0; h < num_heads; ++h) {
191  for (int i = 0; i < num_tokens; ++i) {
192  size_t out_base = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
193 
194  // Zero the full aligned head slice so padded dims stay clean.
195  for (int d = 0; d < aligned_head_dim; ++d) {
196  output[out_base + d] = 0.0f;
197  }
198 
199  // Weighted sum over causal positions.
200  for (int j = 0; j <= i; ++j) {
201  float w = scores[score_index(h, i, j, aligned_context_window)];
202  size_t v_base = qkv_index(h, j, 0, num_tokens, aligned_head_dim);
203 
204  for (int d = 0; d < head_dim; ++d) {
205  output[out_base + d] += w * v[v_base + d];
206  }
207  }
208  }
209  }
210 }
void causal_softmax_head_major_exact(float *scores, int num_heads, int num_tokens, int aligned_context_window)

References causal_softmax_head_major_exact(), qkv_index(), and score_index().

◆ attention_forward_causal_head_major_gqa()

void attention_forward_causal_head_major_gqa ( const float *  q,
const float *  k,
const float *  v,
float *  scores,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

GQA causal attention forward (score-matrix version)

Test:

test_attention.py::TestAttentionForward::test_gqa_forward

test_attention.py::TestAttentionForward::test_gqa_broadcast

test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_backward

test_parity.py::test_attention_gqa_parity

Grouped-query attention: Q has num_heads, K/V have num_kv_heads. Each query head maps to a KV head via ratio.

After changes: make test && make llamacpp-parity-full

Definition at line 224 of file attention_kernels.c.

235 {
236  const float scale = 1.0f / sqrtf((float)head_dim);
237 
238  for (int h = 0; h < num_heads; ++h) {
239  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
240  for (int i = 0; i < num_tokens; ++i) {
241  for (int j = 0; j <= i; ++j) {
242  float dot = 0.0f;
243  size_t base_q = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
244  size_t base_k = qkv_index(kv_head, j, 0, num_tokens, aligned_head_dim);
245 
246  for (int d = 0; d < head_dim; ++d) {
247  dot += q[base_q + d] * k[base_k + d];
248  }
249 
250  scores[score_index(h, i, j, aligned_context_window)] = dot * scale;
251  }
252 
253  for (int j = i + 1; j < num_tokens; ++j) {
254  scores[score_index(h, i, j, aligned_context_window)] = 0.0f;
255  }
256  }
257  }
258 
259  causal_softmax_head_major(scores,
260  num_heads,
261  num_tokens,
262  aligned_context_window);
263 
264  for (int h = 0; h < num_heads; ++h) {
265  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
266  for (int i = 0; i < num_tokens; ++i) {
267  size_t out_base = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
268  for (int d = 0; d < aligned_head_dim; ++d) {
269  output[out_base + d] = 0.0f;
270  }
271 
272  for (int j = 0; j <= i; ++j) {
273  float w = scores[score_index(h, i, j, aligned_context_window)];
274  size_t v_base = qkv_index(kv_head, j, 0, num_tokens, aligned_head_dim);
275 
276  for (int d = 0; d < head_dim; ++d) {
277  output[out_base + d] += w * v[v_base + d];
278  }
279  }
280  }
281  }
282 }

References causal_softmax_head_major(), qkv_index(), and score_index().

Referenced by ck_layer_forward_rmsnorm_swiglu(), ck_layer_forward_rmsnorm_swiglu_q4_k(), ck_layer_forward_rmsnorm_swiglu_quant(), and ck_layer_forward_rmsnorm_swiglu_ref().

◆ attention_forward_causal_head_major_gqa_bf16()

void attention_forward_causal_head_major_gqa_bf16 ( const uint16_t *  q,
const uint16_t *  k,
const uint16_t *  v,
float *  scores,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window,
float *  scratch_q,
float *  scratch_k,
float *  scratch_v 
)

BF16 GQA causal attention forward

Test:

bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_forward

bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_gqa

bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_flash

Accepts BF16 inputs, converts to FP32, uses exact softmax. Caller provides scratch buffers (no per-call malloc).

After changes: make test

Definition at line 366 of file attention_kernels.c.

380 {
381  const size_t q_elems = (size_t)num_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
382  const size_t kv_elems = (size_t)num_kv_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
383 
384  if (!scratch_q || !scratch_k || !scratch_v) return;
385 
386  convert_bf16_tensor_to_buf(q, scratch_q, q_elems);
387  convert_bf16_tensor_to_buf(k, scratch_k, kv_elems);
388  convert_bf16_tensor_to_buf(v, scratch_v, kv_elems);
389 
390  // Use exact version to avoid fast exp approximation error accumulating
391  // with BF16 precision loss.
392  attention_forward_causal_head_major_gqa_exact(scratch_q, scratch_k, scratch_v,
393  scores, output,
394  num_heads, num_kv_heads,
395  num_tokens, head_dim,
396  aligned_head_dim, aligned_context_window);
397  /* No free - caller owns scratch buffers */
398 }
void attention_forward_causal_head_major_gqa_exact(const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)

References attention_forward_causal_head_major_gqa_exact(), and convert_bf16_tensor_to_buf().

◆ attention_forward_causal_head_major_gqa_exact()

void attention_forward_causal_head_major_gqa_exact ( const float *  q,
const float *  k,
const float *  v,
float *  scores,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

GQA causal attention forward (exact version using stdlib expf)

Test:

test_attention.py::TestAttentionForward::test_gqa_exact

bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_gqa

Uses standard library expf for numerical accuracy reference. Used by BF16 wrapper to avoid approximation error accumulation.

After changes: make test

Definition at line 294 of file attention_kernels.c.

305 {
306  const float scale = 1.0f / sqrtf((float)head_dim);
307 
308  for (int h = 0; h < num_heads; ++h) {
309  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
310  for (int i = 0; i < num_tokens; ++i) {
311  for (int j = 0; j <= i; ++j) {
312  float dot = 0.0f;
313  size_t base_q = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
314  size_t base_k = qkv_index(kv_head, j, 0, num_tokens, aligned_head_dim);
315 
316  for (int d = 0; d < head_dim; ++d) {
317  dot += q[base_q + d] * k[base_k + d];
318  }
319 
320  scores[score_index(h, i, j, aligned_context_window)] = dot * scale;
321  }
322 
323  for (int j = i + 1; j < num_tokens; ++j) {
324  scores[score_index(h, i, j, aligned_context_window)] = 0.0f;
325  }
326  }
327  }
328 
329  // Use exact softmax with standard library expf
330  causal_softmax_head_major_exact(scores,
331  num_heads,
332  num_tokens,
333  aligned_context_window);
334 
335  for (int h = 0; h < num_heads; ++h) {
336  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
337  for (int i = 0; i < num_tokens; ++i) {
338  size_t out_base = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
339  for (int d = 0; d < aligned_head_dim; ++d) {
340  output[out_base + d] = 0.0f;
341  }
342 
343  for (int j = 0; j <= i; ++j) {
344  float w = scores[score_index(h, i, j, aligned_context_window)];
345  size_t v_base = qkv_index(kv_head, j, 0, num_tokens, aligned_head_dim);
346 
347  for (int d = 0; d < head_dim; ++d) {
348  output[out_base + d] += w * v[v_base + d];
349  }
350  }
351  }
352  }
353 }

References causal_softmax_head_major_exact(), qkv_index(), and score_index().

Referenced by attention_forward_causal_head_major_gqa_bf16().

◆ attention_forward_causal_head_major_gqa_flash()

void attention_forward_causal_head_major_gqa_flash ( const float *  q,
const float *  k,
const float *  v,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim 
)

Flash attention forward for GQA (prefill, no score materialization)

Test:

test_flash_attention.py::TestFlashAttention::test_flash_forward

test_flash_attention.py::TestFlashAttention::test_flash_vs_score_matrix

test_flash_attention.py::TestFlashAttention::test_flash_gqa

test_attention.py::TestAttentionForward::test_flash_forward

Online softmax with streaming KV. O(N) memory instead of O(N^2). For prefill: all tokens attend to previous tokens.

After changes: make test && make llamacpp-parity-full

Definition at line 800 of file attention_kernels.c.

809 {
810  if (!q || !k || !v || !output) {
811  return;
812  }
813  if (num_heads <= 0 || num_kv_heads <= 0 || num_tokens <= 0) {
814  return;
815  }
816 
817  const float scale = 1.0f / sqrtf((float)head_dim);
818  const int T = num_tokens;
819 
820  // Select SIMD implementation based on compile-time CPU features
821 #if defined(__AVX512F__)
822  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx512
823 #elif defined(__AVX2__)
824  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx2
825 #elif defined(__AVX__)
826  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx
827 #else
828  #define FLASH_QUERY_IMPL attention_flash_query_causal
829 #endif
830 
831  for (int h = 0; h < num_heads; ++h) {
832  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
833  const float *k_head = k + (size_t)kv_head * (size_t)T * (size_t)aligned_head_dim;
834  const float *v_head = v + (size_t)kv_head * (size_t)T * (size_t)aligned_head_dim;
835 
836  for (int i = 0; i < T; ++i) {
837  const float *q_vec = q + qkv_index(h, i, 0, T, aligned_head_dim);
838  float *out_vec = output + qkv_index(h, i, 0, T, aligned_head_dim);
839  FLASH_QUERY_IMPL(q_vec, k_head, v_head,
840  /*kv_tokens=*/i + 1,
841  head_dim, aligned_head_dim,
842  scale, out_vec);
843  }
844  }
845 
846 #undef FLASH_QUERY_IMPL
847 }
FLASH_QUERY_IMPL is a function-local dispatch macro: it is #defined inside the function body above to the best compile-time SIMD variant (AVX-512, AVX2, AVX, or scalar fallback) and #undef'd before the function returns.

References FLASH_QUERY_IMPL, and qkv_index().

Referenced by ck_layer_forward_rmsnorm_swiglu(), ck_layer_forward_rmsnorm_swiglu_q4_k(), ck_layer_forward_rmsnorm_swiglu_quant(), ck_layer_forward_rmsnorm_swiglu_ref(), qwen2_0_5b_decode_layer_0_prefill(), qwen2_0_5b_decode_layer_10_prefill(), qwen2_0_5b_decode_layer_11_prefill(), qwen2_0_5b_decode_layer_12_prefill(), qwen2_0_5b_decode_layer_13_prefill(), qwen2_0_5b_decode_layer_14_prefill(), qwen2_0_5b_decode_layer_15_prefill(), qwen2_0_5b_decode_layer_16_prefill(), qwen2_0_5b_decode_layer_17_prefill(), qwen2_0_5b_decode_layer_18_prefill(), qwen2_0_5b_decode_layer_19_prefill(), qwen2_0_5b_decode_layer_1_prefill(), qwen2_0_5b_decode_layer_20_prefill(), qwen2_0_5b_decode_layer_21_prefill(), qwen2_0_5b_decode_layer_22_prefill(), qwen2_0_5b_decode_layer_23_prefill(), qwen2_0_5b_decode_layer_2_prefill(), qwen2_0_5b_decode_layer_3_prefill(), qwen2_0_5b_decode_layer_4_prefill(), qwen2_0_5b_decode_layer_5_prefill(), qwen2_0_5b_decode_layer_6_prefill(), qwen2_0_5b_decode_layer_7_prefill(), qwen2_0_5b_decode_layer_8_prefill(), and qwen2_0_5b_decode_layer_9_prefill().

◆ attention_forward_causal_head_major_gqa_flash_strided()

void attention_forward_causal_head_major_gqa_flash_strided ( const float *  q,
const float *  k,
const float *  v,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  kv_stride_tokens 
)

Flash attention forward with custom KV stride (for KV cache)

Test:

test_flash_attention.py::TestFlashAttention::test_flash_strided

test_kv_cache_attention.py::TestKVCacheAttention::test_flash_attention

Variant with configurable kv_stride_tokens for KV cache layouts where K/V may not be contiguous in memory.

After changes: make test

Definition at line 859 of file attention_kernels.c.

869 {
870  if (!q || !k || !v || !output) {
871  return;
872  }
873  if (num_heads <= 0 || num_kv_heads <= 0 || num_tokens <= 0) {
874  return;
875  }
876  if (kv_stride_tokens < num_tokens) {
877  return;
878  }
879 
880  const float scale = 1.0f / sqrtf((float)head_dim);
881  const int T = num_tokens;
882  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
883 
884  // Select SIMD implementation based on compile-time CPU features
885 #if defined(__AVX512F__)
886  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx512
887 #elif defined(__AVX2__)
888  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx2
889 #elif defined(__AVX__)
890  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx
891 #else
892  #define FLASH_QUERY_IMPL attention_flash_query_causal
893 #endif
894 
895  for (int h = 0; h < num_heads; ++h) {
896  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
897  const float *k_head = k + (size_t)kv_head * kv_head_stride;
898  const float *v_head = v + (size_t)kv_head * kv_head_stride;
899 
900  for (int i = 0; i < T; ++i) {
901  const float *q_vec = q + qkv_index(h, i, 0, T, aligned_head_dim);
902  float *out_vec = output + qkv_index(h, i, 0, T, aligned_head_dim);
903  FLASH_QUERY_IMPL(q_vec, k_head, v_head,
904  /*kv_tokens=*/i + 1,
905  head_dim, aligned_head_dim,
906  scale, out_vec);
907  }
908  }
909 
910 #undef FLASH_QUERY_IMPL
911 }

References FLASH_QUERY_IMPL, and qkv_index().

Referenced by ck_test_attention_causal(), mega_fused_attention_prefill(), mega_fused_attention_prefill_q8_0(), model_layer_0_prefill(), model_layer_10_prefill(), model_layer_11_prefill(), model_layer_12_prefill(), model_layer_13_prefill(), model_layer_14_prefill(), model_layer_15_prefill(), model_layer_16_prefill(), model_layer_17_prefill(), model_layer_18_prefill(), model_layer_19_prefill(), model_layer_1_prefill(), model_layer_20_prefill(), model_layer_21_prefill(), model_layer_22_prefill(), model_layer_23_prefill(), model_layer_2_prefill(), model_layer_3_prefill(), model_layer_4_prefill(), model_layer_5_prefill(), model_layer_6_prefill(), model_layer_7_prefill(), model_layer_8_prefill(), model_layer_9_prefill(), qwen2_0_5b_decode_layer_0_prefill(), qwen2_0_5b_decode_layer_10_prefill(), qwen2_0_5b_decode_layer_11_prefill(), qwen2_0_5b_decode_layer_12_prefill(), qwen2_0_5b_decode_layer_13_prefill(), qwen2_0_5b_decode_layer_14_prefill(), qwen2_0_5b_decode_layer_15_prefill(), qwen2_0_5b_decode_layer_16_prefill(), qwen2_0_5b_decode_layer_17_prefill(), qwen2_0_5b_decode_layer_18_prefill(), qwen2_0_5b_decode_layer_19_prefill(), qwen2_0_5b_decode_layer_1_prefill(), qwen2_0_5b_decode_layer_20_prefill(), qwen2_0_5b_decode_layer_21_prefill(), qwen2_0_5b_decode_layer_22_prefill(), qwen2_0_5b_decode_layer_23_prefill(), qwen2_0_5b_decode_layer_2_prefill(), qwen2_0_5b_decode_layer_3_prefill(), qwen2_0_5b_decode_layer_4_prefill(), qwen2_0_5b_decode_layer_5_prefill(), qwen2_0_5b_decode_layer_6_prefill(), qwen2_0_5b_decode_layer_7_prefill(), qwen2_0_5b_decode_layer_8_prefill(), and qwen2_0_5b_decode_layer_9_prefill().

◆ attention_forward_causal_head_major_gqa_flash_strided_sliding()

void attention_forward_causal_head_major_gqa_flash_strided_sliding ( const float *  q,
const float *  k,
const float *  v,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  kv_stride_tokens,
int  sliding_window 
)

Flash attention forward with sliding window (prefill)

Test:
test_attention.py::TestAttentionForward::test_sliding_window_prefill

Sliding-window attention for prefill: each token attends to at most the last W tokens (W = sliding_window), including itself. When sliding_window <= 0, this behaves like regular causal attention.

After changes: make test

Definition at line 1316 of file attention_kernels.c.

1328 {
1329  if (!q || !k || !v || !output) {
1330  return;
1331  }
1332  if (num_heads <= 0 || num_kv_heads <= 0 || num_tokens <= 0) {
1333  return;
1334  }
1335  if (kv_stride_tokens < num_tokens) {
1336  return;
1337  }
1338 
1339  const float scale = 1.0f / sqrtf((float)head_dim);
1340  const int T = num_tokens;
1341  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
1342 
1343 #if defined(__AVX512F__)
1344  #define SLIDING_FLASH_IMPL attention_flash_query_sliding_avx512
1345 #elif defined(__AVX2__)
1346  #define SLIDING_FLASH_IMPL attention_flash_query_sliding_avx2
1347 #elif defined(__AVX__)
1348  #define SLIDING_FLASH_IMPL attention_flash_query_sliding_avx
1349 #else
1350  #define SLIDING_FLASH_IMPL attention_flash_query_sliding
1351 #endif
1352 
1353  for (int h = 0; h < num_heads; ++h) {
1354  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1355  const float *k_head = k + (size_t)kv_head * kv_head_stride;
1356  const float *v_head = v + (size_t)kv_head * kv_head_stride;
1357 
1358  for (int i = 0; i < T; ++i) {
1359  const float *q_vec = q + qkv_index(h, i, 0, T, aligned_head_dim);
1360  float *out_vec = output + qkv_index(h, i, 0, T, aligned_head_dim);
1361  SLIDING_FLASH_IMPL(q_vec, k_head, v_head,
1362  /*query_pos=*/i,
1363  /*kv_tokens=*/T,
1364  head_dim, aligned_head_dim,
1365  scale, out_vec,
1366  sliding_window);
1367  }
1368  }
1369 
1370 #undef SLIDING_FLASH_IMPL
1371 }
SLIDING_FLASH_IMPL is a function-local dispatch macro: it is #defined inside the function body above to the best compile-time SIMD variant (AVX-512, AVX2, AVX, or scalar fallback) and #undef'd before the function returns.

References qkv_index(), and SLIDING_FLASH_IMPL.

Referenced by ck_test_attention_sliding_window().

◆ attention_forward_decode_head_major_gqa_flash()

void attention_forward_decode_head_major_gqa_flash ( const float *  q_token,
const float *  k_cache,
const float *  v_cache,
float *  out_token,
int  num_heads,
int  num_kv_heads,
int  kv_tokens,
int  cache_capacity,
int  head_dim,
int  aligned_head_dim 
)

Flash attention decode (single token attends to KV cache)

Test:

test_flash_attention.py::TestFlashAttention::test_flash_decode

test_kv_cache_attention.py::TestKVCacheAttention::test_flash_decode

test_fused_attention_decode.py::TestFusedAttentionDecode::test_flash_decode

test_attention.py::TestAttentionForward::test_flash_decode

Single query token attends to kv_tokens in KV cache. Uses true flash attention from attention_flash_true.c.

After changes: make test && make llamacpp-parity-full

Definition at line 1467 of file attention_kernels.c.

1477 {
1478  if (!q_token || !k_cache || !v_cache || !out_token) {
1479  return;
1480  }
1481  if (num_heads <= 0 || num_kv_heads <= 0 || kv_tokens <= 0 || cache_capacity <= 0) {
1482  return;
1483  }
1484  if (kv_tokens > cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) {
1485  return;
1486  }
1487 
1488  const float scale = 1.0f / sqrtf((float)head_dim);
1489  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
1490 
1491  for (int h = 0; h < num_heads; ++h) {
1492  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1493  const float *q_head = q_token + (size_t)h * (size_t)aligned_head_dim;
1494  const float *k_head = k_cache + (size_t)kv_head * head_stride;
1495  const float *v_head = v_cache + (size_t)kv_head * head_stride;
1496  float *out_head = out_token + (size_t)h * (size_t)aligned_head_dim;
1497 
1498  attention_flash_decode(out_head,
1499  q_head,
1500  k_head,
1501  v_head,
1502  1,
1503  kv_tokens,
1504  1,
1505  aligned_head_dim,
1506  scale);
1507  }
1508 }
void attention_flash_decode(float *out, const float *q, const float *k, const float *v, int T_q, int T_k, int H, int D_h, float scale)
Main flash attention function with SIMD dispatch.

References attention_flash_decode().

Referenced by mega_fused_attention_decode_q5_0(), mega_fused_attention_decode_q5_0_parallel_simd(), model_layer_0_decode(), model_layer_10_decode(), model_layer_11_decode(), model_layer_12_decode(), model_layer_13_decode(), model_layer_14_decode(), model_layer_15_decode(), model_layer_16_decode(), model_layer_17_decode(), model_layer_18_decode(), model_layer_19_decode(), model_layer_1_decode(), model_layer_20_decode(), model_layer_21_decode(), model_layer_22_decode(), model_layer_23_decode(), model_layer_2_decode(), model_layer_3_decode(), model_layer_4_decode(), model_layer_5_decode(), model_layer_6_decode(), model_layer_7_decode(), model_layer_8_decode(), model_layer_9_decode(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ attention_forward_decode_head_major_gqa_flash_sliding()

void attention_forward_decode_head_major_gqa_flash_sliding ( const float *  q_token,
const float *  k_cache,
const float *  v_cache,
float *  out_token,
int  num_heads,
int  num_kv_heads,
int  kv_tokens,
int  cache_capacity,
int  head_dim,
int  aligned_head_dim,
int  sliding_window 
)

Flash attention decode with sliding window

Test:
test_attention.py::TestAttentionForward::test_sliding_window_decode

Single query token attends to the last W tokens in the KV cache. For decode: effective_kv_tokens = min(kv_tokens, sliding_window)

After changes: make test

Definition at line 1382 of file attention_kernels.c.

1394 {
1395  if (!q_token || !k_cache || !v_cache || !out_token) {
1396  return;
1397  }
1398  if (num_heads <= 0 || num_kv_heads <= 0 || cache_capacity <= 0) {
1399  return;
1400  }
1401  if (kv_tokens <= 0 || kv_tokens > cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) {
1402  return;
1403  }
1404 
1405  const float scale = 1.0f / sqrtf((float)head_dim);
1406  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
1407 
1408  // Compute effective KV tokens based on sliding window
1409  int effective_kv_tokens = kv_tokens;
1410  if (sliding_window > 0 && sliding_window < kv_tokens) {
1411  effective_kv_tokens = sliding_window;
1412  }
1413 
1414  // Guard against empty window (shouldn't happen with kv_tokens >= 1)
1415  if (effective_kv_tokens <= 0) {
1416  return;
1417  }
1418 
1419  // Offset to start reading from the last effective_kv_tokens entries
1420  int kv_start_offset = kv_tokens - effective_kv_tokens;
1421 
1422 #if defined(__AVX512F__)
1423  #define SLIDING_DECODE_IMPL attention_flash_query_sliding_avx512
1424 #elif defined(__AVX2__)
1425  #define SLIDING_DECODE_IMPL attention_flash_query_sliding_avx2
1426 #elif defined(__AVX__)
1427  #define SLIDING_DECODE_IMPL attention_flash_query_sliding_avx
1428 #else
1429  #define SLIDING_DECODE_IMPL attention_flash_query_sliding
1430 #endif
1431 
1432  for (int h = 0; h < num_heads; ++h) {
1433  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1434  const float *q_head = q_token + (size_t)h * (size_t)aligned_head_dim;
1435  // Offset K/V pointer to start from the first token in the sliding window
1436  const float *k_head = k_cache + (size_t)kv_head * head_stride
1437  + (size_t)kv_start_offset * (size_t)aligned_head_dim;
1438  const float *v_head = v_cache + (size_t)kv_head * head_stride
1439  + (size_t)kv_start_offset * (size_t)aligned_head_dim;
1440  float *out_head = out_token + (size_t)h * (size_t)aligned_head_dim;
1441 
1442  // Use query_pos relative to the windowed KV (last token = effective_kv_tokens - 1)
1443  // sliding_window = 0 since we've already windowed via K/V pointer offset
1444  SLIDING_DECODE_IMPL(q_head, k_head, v_head,
1445  /*query_pos=*/effective_kv_tokens - 1,
1446  /*kv_tokens=*/effective_kv_tokens,
1447  head_dim, aligned_head_dim,
1448  scale, out_head,
1449  /*sliding_window=*/0);
1450  }
1451 
1452 #undef SLIDING_DECODE_IMPL
1453 }
SLIDING_DECODE_IMPL is a function-local dispatch macro: it is #defined inside the function body above to the best compile-time SIMD variant (AVX-512, AVX2, AVX, or scalar fallback) and #undef'd before the function returns.

References SLIDING_DECODE_IMPL.

Referenced by ck_test_attention_decode_sliding().

◆ attention_forward_decode_head_major_gqa_regular()

void attention_forward_decode_head_major_gqa_regular ( const float *  q_token,
const float *  k_cache,
const float *  v_cache,
float *  out_token,
int  num_heads,
int  num_kv_heads,
int  kv_tokens,
int  cache_capacity,
int  head_dim,
int  aligned_head_dim 
)

WARNING: This is NOT true flash attention!

Despite dispatching through helpers named "flash" (attention_flash_query_causal and its SIMD variants), this function implements regular attention decode with O(n) complexity over the KV length. It is kept for reference and as a fallback.

TRUE flash attention is implemented in attention_flash_true.c

Test:

test_kv_cache_attention.py::TestKVCacheAttention::test_regular_decode

test_attention.py::TestAttentionForward::test_regular_decode

Regular attention decode (score-matrix version) for fallback.

After changes: make test

Definition at line 1524 of file attention_kernels.c.

1534 {
1535  if (!q_token || !k_cache || !v_cache || !out_token) {
1536  return;
1537  }
1538  if (num_heads <= 0 || num_kv_heads <= 0 || kv_tokens <= 0 || cache_capacity <= 0) {
1539  return;
1540  }
1541  if (kv_tokens > cache_capacity) {
1542  return;
1543  }
1544 
1545  const float scale = 1.0f / sqrtf((float)head_dim);
1546  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
1547 
1548  // Select SIMD implementation based on compile-time CPU features
1549 #if defined(__AVX512F__)
1550  #define FLASH_QUERY_IMPL_DECODE attention_flash_query_causal_avx512
1551 #elif defined(__AVX2__)
1552  #define FLASH_QUERY_IMPL_DECODE attention_flash_query_causal_avx2
1553 #elif defined(__AVX__)
1554  #define FLASH_QUERY_IMPL_DECODE attention_flash_query_causal_avx
1555 #else
1556  #define FLASH_QUERY_IMPL_DECODE attention_flash_query_causal
1557 #endif
1558 
1559 #pragma omp parallel for schedule(static) if(num_heads > 1)
1560  for (int h = 0; h < num_heads; ++h) {
1561  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1562  const float *q_vec = q_token + (size_t)h * (size_t)aligned_head_dim;
1563  const float *k_head = k_cache + (size_t)kv_head * head_stride;
1564  const float *v_head = v_cache + (size_t)kv_head * head_stride;
1565  float *out_vec = out_token + (size_t)h * (size_t)aligned_head_dim;
1566 
1567  FLASH_QUERY_IMPL_DECODE(q_vec, k_head, v_head,
1568  kv_tokens, head_dim, aligned_head_dim,
1569  scale, out_vec);
1570  }
1571 
1572 #undef FLASH_QUERY_IMPL_DECODE
1573 }
FLASH_QUERY_IMPL_DECODE is a function-local dispatch macro: it is #defined inside the function body above to the best compile-time SIMD variant (AVX-512, AVX2, AVX, or scalar fallback) and #undef'd before the function returns.

References FLASH_QUERY_IMPL_DECODE.

Referenced by ck_attention_flash_decode_wrapper(), ck_layer_forward_rmsnorm_swiglu_decode_fused_attn_impl(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ convert_bf16_tensor_to_buf()

static void convert_bf16_tensor_to_buf ( const uint16_t *  src,
float *  dst,
size_t  count 
)
static

Definition at line 28 of file attention_kernels.c.

29 {
30  if (!dst || !src) return;
31  bf16_tensor_to_float(src, dst, count);
32 }
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250

References bf16_tensor_to_float().

Referenced by attention_backward_causal_head_major_gqa_bf16(), and attention_forward_causal_head_major_gqa_bf16().

◆ qkv_index()

static size_t qkv_index ( int  h,
int  t,
int  d,
int  num_tokens,
int  aligned_head_dim 
)
inlinestatic

◆ score_index()

static size_t score_index ( int  h,
int  i,
int  j,
int  aligned_context_window 
)
inlinestatic

Definition at line 48 of file attention_kernels.c.

52 {
53  return ((size_t)h * (size_t)aligned_context_window * (size_t)aligned_context_window)
54  + (size_t)i * (size_t)aligned_context_window
55  + (size_t)j;
56 }

Referenced by attention_backward_causal_head_major_gqa(), attention_forward_causal_head_major(), attention_forward_causal_head_major_exact(), attention_forward_causal_head_major_gqa(), and attention_forward_causal_head_major_gqa_exact().