- Global add_forward_f32 (const float *a, const float *b, float *y, size_t n)
test_add.py::TestAddForward::test_add_forward_f32
test_add.py::TestAddForward::test_add_inplace_f32
test_multi_layer_parity.py::TestMultiLayerParity::test_residual_add
- Global attention_backward_causal_head_major (const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention_backward.py::TestAttentionBackward::test_backward
test_attention_backward.py::TestAttentionBackward::test_backward_vs_separate
test_parity.py::test_attention_backward_parity
- Global attention_backward_causal_head_major_gqa (const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_backward
test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_vs_separate
test_parity.py::test_attention_backward_parity
- Global attention_backward_causal_head_major_gqa_bf16 (const uint16_t *d_output, float *d_x, const uint16_t *q, const uint16_t *k, const uint16_t *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window, float *scratch_d_output, float *scratch_q, float *scratch_k, float *scratch_v)
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_backward
- Global attention_forward_causal_head_major (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention.py::TestAttentionForward::test_causal_forward
test_attention.py::TestAttentionForward::test_gqa_broadcast
test_attention.py::TestAttentionForward::test_exact_vs_fast
test_parity.py::test_attention_parity
- Global attention_forward_causal_head_major_exact (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention.py::TestAttentionForward::test_exact_single
test_attention.py::TestAttentionForward::test_exact_vs_fast
- Global attention_forward_causal_head_major_gqa (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention.py::TestAttentionForward::test_gqa_forward
test_attention.py::TestAttentionForward::test_gqa_broadcast
test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_backward
test_parity.py::test_attention_gqa_parity
- Global attention_forward_causal_head_major_gqa_bf16 (const uint16_t *q, const uint16_t *k, const uint16_t *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window, float *scratch_q, float *scratch_k, float *scratch_v)
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_forward
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_gqa
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_flash
- Global attention_forward_causal_head_major_gqa_exact (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
test_attention.py::TestAttentionForward::test_gqa_exact
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_gqa
- Global attention_forward_causal_head_major_gqa_flash (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim)
test_flash_attention.py::TestFlashAttention::test_flash_forward
test_flash_attention.py::TestFlashAttention::test_flash_vs_score_matrix
test_flash_attention.py::TestFlashAttention::test_flash_gqa
test_attention.py::TestAttentionForward::test_flash_forward
- Global attention_forward_causal_head_major_gqa_flash_strided (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens)
test_flash_attention.py::TestFlashAttention::test_flash_strided
test_kv_cache_attention.py::TestKVCacheAttention::test_flash_attention
- Global attention_forward_causal_head_major_gqa_flash_strided_sliding (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens, int sliding_window)
test_attention.py::TestAttentionForward::test_sliding_window_prefill
- Global attention_forward_decode_head_major_gqa_flash (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
test_flash_attention.py::TestFlashAttention::test_flash_decode
test_kv_cache_attention.py::TestKVCacheAttention::test_flash_decode
test_fused_attention_decode.py::TestFusedAttentionDecode::test_flash_decode
test_attention.py::TestAttentionForward::test_flash_decode
- Global attention_forward_decode_head_major_gqa_flash_sliding (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim, int sliding_window)
test_attention.py::TestAttentionForward::test_sliding_window_decode
- Global attention_forward_decode_head_major_gqa_regular (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
test_kv_cache_attention.py::TestKVCacheAttention::test_regular_decode
test_attention.py::TestAttentionForward::test_regular_decode
- Global axpy_f32 (float *y, const float *x, float alpha, int n)
test_axpy.py::TestAXPY::test_axpy_f32
test_axpy.py::TestAXPY::test_axpy_vs_naive
- Global causal_softmax_head_major (float *scores, int num_heads, int num_tokens, int aligned_context_window)
test_softmax.py::TestSoftmaxForward::test_causal_softmax
test_softmax.py::TestSoftmaxForward::test_causal_vs_softmax
test_attention.py::TestAttentionForward::test_softmax_correctness
- Global causal_softmax_head_major_exact (float *scores, int num_heads, int num_tokens, int aligned_context_window)
test_softmax.py::TestSoftmaxForward::test_causal_softmax_exact
test_softmax.py::TestSoftmaxForward::test_exact_vs_fast
- Global geglu_backward_fp32 (const float *x, const float *d_out, float *d_x, int tokens, int dim)
test_geglu.py::TestGeGLU::test_geglu_backward_fp32
- Global geglu_forward_bf16 (const uint16_t *x, uint16_t *out, int tokens, int dim, float *scratch)
test_geglu.py::TestGeGLU::test_geglu_forward_bf16
- Global geglu_forward_fp32 (const float *x, float *out, int tokens, int dim)
test_geglu.py::TestGeGLU::test_geglu_forward_fp32
- Global gelu_fast_inplace (float *data, size_t n)
test_gelu.py::TestGELUForward::test_gelu_fast_inplace
test_gelu.py::TestGELUForward::test_gelu_vs_exact
test_parity.py::test_gelu_parity
- Global qk_norm_forward (float *q, float *k, const float *q_gamma, const float *k_gamma, int num_heads, int num_kv_heads, int num_tokens, int head_dim, float eps)
unittest/test_qk_norm.py
- Global rmsnorm_backward (const float *d_output, const float *input, const float *gamma, const float *rstd_cache, float *d_input, float *d_gamma, int tokens, int d_model, int aligned_embed_dim)
test_rmsnorm.py::TestRMSNormBackward::test_backward_tokens
test_rmsnorm.py::TestRMSNormBackward::test_backward_single
test_parity.py::test_rmsnorm_backward_parity
- Global rmsnorm_forward (const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
test_rmsnorm.py::TestRMSNormForward::test_fp32_tokens
test_rmsnorm.py::TestRMSNormForward::test_fp32_single
test_rmsnorm.py::TestRMSNormForward::test_perf_rolled
test_layernorm.py::TestLayerNormForward::test_rmsnorm_compat
test_parity.py::test_rmsnorm_parity
- Global rope_backward (const float *d_out, float *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
test_rope.py::TestRoPEBackward::test_rope_backward
test_rope.py::TestRoPEBackward::test_rope_backward_vs_separate
- Global rope_backward_inplace (float *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
test_rope.py::TestRoPEBackward::test_rope_backward_inplace
- Global rope_backward_qk (const float *d_q_out, const float *d_k_out, float *d_q, float *d_k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
test_rope.py::TestRoPEBackward::test_rope_backward_qk
- Global rope_forward (float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
test_rope.py::TestRoPEForward::test_rope_forward
test_rope.py::TestRoPEForward::test_rope_vs_separate
test_parity.py::test_rope_parity
- Global rope_forward_qk (float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
test_rope.py::TestRoPEForward::test_rope_forward_qk
test_fused_attention_decode.py::TestFusedAttentionDecode::test_qk_rope
test_parity.py::test_rope_qk_parity
- Global rope_forward_qk_strided (float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, int q_stride_tokens, int k_stride_tokens)
test_rope.py::TestRoPEForward::test_rope_forward_qk_strided
test_kv_cache_attention.py::TestKVCacheAttention::test_qk_rope_strided
- Global rope_forward_strided (float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, int head_stride_tokens)
test_rope.py::TestRoPEForward::test_rope_strided
test_kv_cache_attention.py::TestKVCacheAttention::test_rope_decode
- Global rope_precompute_cache (float *cos_cache, float *sin_cache, int max_seq_len, int head_dim, float base)
test_rope.py::TestRoPECache::test_cache_computation
test_rope.py::TestRoPECache::test_cache_values
- Global swiglu_backward (const float *input, const float *d_output, float *d_input, int tokens, int dim)
test_swiglu.py::TestSwiGLUBackward::test_backward_tokens
test_swiglu.py::TestSwiGLUBackward::test_backward_single
test_parity.py::test_swiglu_backward_parity
- Global swiglu_backward_exact (const float *input, const float *d_output, float *d_input, int tokens, int dim)
test_swiglu.py::TestSwiGLUBackward::test_exact_vs_fast
test_swiglu.py::TestSwiGLUBackward::test_exact_single
- Global swiglu_forward (const float *input, float *output, int tokens, int dim)
test_swiglu.py::TestSwiGLUForward::test_forward_tokens
test_swiglu.py::TestSwiGLUForward::test_forward_single
test_mlp.py::TestMLPForward::test_swiglu_mlp
test_fused_swiglu_decode.py::TestFusedSwiGLUDecode::test_fused_swiglu_decode
test_parity.py::test_swiglu_parity
- Global swiglu_forward_exact (const float *input, float *output, int tokens, int dim)
test_swiglu.py::TestSwiGLUForward::test_exact_vs_fast
test_swiglu.py::TestSwiGLUForward::test_exact_single