- Global add_forward_f32 (const float *a, const float *b, float *y, size_t n)
test_add.py::TestAddForward::test_add_forward_f32
test_add.py::TestAddForward::test_add_inplace_f32
test_multi_layer_parity.py::TestMultiLayerParity::test_residual_add
- Global attention_backward_causal_head_major (const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention_backward.py::TestAttentionBackward::test_backward
test_attention_backward.py::TestAttentionBackward::test_backward_vs_separate
test_parity.py::test_attention_backward_parity
- Global attention_backward_causal_head_major_gqa (const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_backward
test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_vs_separate
test_parity.py::test_attention_backward_parity
- Global attention_backward_causal_head_major_gqa_bf16 (const uint16_t *d_output, float *d_x, const uint16_t *q, const uint16_t *k, const uint16_t *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window, float *scratch_d_output, float *scratch_q, float *scratch_k, float *scratch_v)
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_backward
- Global attention_forward_causal_head_major (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention.py::TestAttentionForward::test_causal_forward
test_attention.py::TestAttentionForward::test_gqa_broadcast
test_attention.py::TestAttentionForward::test_exact_vs_fast
test_parity.py::test_attention_parity
- Global attention_forward_causal_head_major_exact (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention.py::TestAttentionForward::test_exact_single
test_attention.py::TestAttentionForward::test_exact_vs_fast
- Global attention_forward_causal_head_major_gqa (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention.py::TestAttentionForward::test_gqa_forward
test_attention.py::TestAttentionForward::test_gqa_broadcast
test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_backward
test_parity.py::test_attention_gqa_parity
- Global attention_forward_causal_head_major_gqa_bf16 (const uint16_t *q, const uint16_t *k, const uint16_t *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window, float *scratch_q, float *scratch_k, float *scratch_v)
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_forward
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_gqa
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_flash
- Global attention_forward_causal_head_major_gqa_exact (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention.py::TestAttentionForward::test_gqa_exact
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_gqa
- Global attention_forward_causal_head_major_gqa_flash (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim)
test_flash_attention.py::TestFlashAttention::test_flash_forward
test_flash_attention.py::TestFlashAttention::test_flash_vs_score_matrix
test_flash_attention.py::TestFlashAttention::test_flash_gqa
test_attention.py::TestAttentionForward::test_flash_forward
- Global attention_forward_causal_head_major_gqa_flash_strided (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens)
test_flash_attention.py::TestFlashAttention::test_flash_strided
test_kv_cache_attention.py::TestKVCacheAttention::test_flash_attention
- Global attention_forward_causal_head_major_gqa_flash_strided_sliding (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens, int sliding_window)
test_attention.py::TestAttentionForward::test_sliding_window_prefill
- Global attention_forward_decode_head_major_gqa_flash (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
test_flash_attention.py::TestFlashAttention::test_flash_decode
test_kv_cache_attention.py::TestKVCacheAttention::test_flash_decode
test_fused_attention_decode.py::TestFusedAttentionDecode::test_flash_decode
test_attention.py::TestAttentionForward::test_flash_decode
- Global attention_forward_decode_head_major_gqa_flash_sliding (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim, int sliding_window)
test_attention.py::TestAttentionForward::test_sliding_window_decode
- Global attention_forward_decode_head_major_gqa_regular (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
test_kv_cache_attention.py::TestKVCacheAttention::test_regular_decode
test_attention.py::TestAttentionForward::test_regular_decode
- Global axpy_f32 (float *y, const float *x, float alpha, int n)
test_axpy.py::TestAXPY::test_axpy_f32
test_axpy.py::TestAXPY::test_axpy_vs_naive
- Global causal_softmax_head_major (float *scores, int num_heads, int num_tokens, int aligned_context_window)
test_softmax.py::TestSoftmaxForward::test_causal_softmax
test_softmax.py::TestSoftmaxForward::test_causal_vs_softmax
test_attention.py::TestAttentionForward::test_softmax_correctness
- Global causal_softmax_head_major_exact (float *scores, int num_heads, int num_tokens, int aligned_context_window)
test_softmax.py::TestSoftmaxForward::test_causal_softmax_exact
test_softmax.py::TestSoftmaxForward::test_exact_vs_fast
- Global geglu_backward_fp32 (const float *x, const float *d_out, float *d_x, int tokens, int dim)
test_geglu.py::TestGeGLU::test_geglu_backward_fp32
- Global geglu_forward_bf16 (const uint16_t *x, uint16_t *out, int tokens, int dim, float *scratch)
test_geglu.py::TestGeGLU::test_geglu_forward_bf16
- Global geglu_forward_fp32 (const float *x, float *out, int tokens, int dim)
test_geglu.py::TestGeGLU::test_geglu_forward_fp32
- Global gelu_fast_inplace (float *data, size_t n)
test_gelu.py::TestGELUForward::test_gelu_fast_inplace
test_gelu.py::TestGELUForward::test_gelu_vs_exact
test_parity.py::test_gelu_parity
- Global qk_norm_forward (float *q, float *k, const float *q_gamma, const float *k_gamma, int num_heads, int num_kv_heads, int num_tokens, int head_dim, float eps)
unittest/test_qk_norm.py
- Global rmsnorm_backward (const float *d_output, const float *input, const float *gamma, const float *rstd_cache, float *d_input, float *d_gamma, int tokens, int d_model, int aligned_embed_dim)
test_rmsnorm.py::TestRMSNormBackward::test_backward_tokens
test_rmsnorm.py::TestRMSNormBackward::test_backward_single
test_parity.py::test_rmsnorm_backward_parity
- Global rmsnorm_forward (const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
test_rmsnorm.py::TestRMSNormForward::test_fp32_tokens
test_rmsnorm.py::TestRMSNormForward::test_fp32_single
test_rmsnorm.py::TestRMSNormForward::test_perf_rolled
test_layernorm.py::TestLayerNormForward::test_rmsnorm_compat
test_parity.py::test_rmsnorm_parity
- Global rope_backward (const float *d_out, float *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
test_rope.py::TestRoPEBackward::test_rope_backward
test_rope.py::TestRoPEBackward::test_rope_backward_vs_separate
- Global rope_backward_inplace (float *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
test_rope.py::TestRoPEBackward::test_rope_backward_inplace
- Global rope_backward_qk (const float *d_q_out, const float *d_k_out, float *d_q, float *d_k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
test_rope.py::TestRoPEBackward::test_rope_backward_qk
- Global rope_forward (float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
test_rope.py::TestRoPEForward::test_rope_forward
test_rope.py::TestRoPEForward::test_rope_vs_separate
test_parity.py::test_rope_parity
- Global rope_forward_qk (float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
test_rope.py::TestRoPEForward::test_rope_forward_qk
test_fused_attention_decode.py::TestFusedAttentionDecode::test_qk_rope
test_parity.py::test_rope_qk_parity
- Global rope_forward_qk_strided (float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, int q_stride_tokens, int k_stride_tokens)
test_rope.py::TestRoPEForward::test_rope_forward_qk_strided
test_kv_cache_attention.py::TestKVCacheAttention::test_qk_rope_strided
- Global rope_forward_strided (float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, int head_stride_tokens)
test_rope.py::TestRoPEForward::test_rope_strided
test_kv_cache_attention.py::TestKVCacheAttention::test_rope_decode
- Global rope_precompute_cache (float *cos_cache, float *sin_cache, int max_seq_len, int head_dim, float base)
test_rope.py::TestRoPECache::test_cache_computation
test_rope.py::TestRoPECache::test_cache_values
- Global swiglu_backward (const float *input, const float *d_output, float *d_input, int tokens, int dim)
test_swiglu.py::TestSwiGLUBackward::test_backward_tokens
test_swiglu.py::TestSwiGLUBackward::test_backward_single
test_parity.py::test_swiglu_backward_parity
- Global swiglu_backward_exact (const float *input, const float *d_output, float *d_input, int tokens, int dim)
test_swiglu.py::TestSwiGLUBackward::test_exact_vs_fast
test_swiglu.py::TestSwiGLUBackward::test_exact_single
- Global swiglu_forward (const float *input, float *output, int tokens, int dim)
test_swiglu.py::TestSwiGLUForward::test_forward_tokens
test_swiglu.py::TestSwiGLUForward::test_forward_single
test_mlp.py::TestMLPForward::test_swiglu_mlp
test_fused_swiglu_decode.py::TestFusedSwiGLUDecode::test_fused_swiglu_decode
test_parity.py::test_swiglu_parity
- Global swiglu_forward_exact (const float *input, float *output, int tokens, int dim)
test_swiglu.py::TestSwiGLUForward::test_exact_vs_fast
test_swiglu.py::TestSwiGLUForward::test_exact_single