← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ckernel_engine.h File Reference
#include <stddef.h>
#include <stdint.h>
#include "cpu_features.h"
#include "ckernel_quant.h"
#include "mega_fused_attention.h"

Go to the source code of this file.

Data Structures

struct  CKMathBackend
 

Functions

void add_backward_bf16 (const uint16_t *d_y, uint16_t *d_a, uint16_t *d_b, size_t n)
 
void add_forward_2d_bf16 (const uint16_t *a, const uint16_t *b, uint16_t *y, int tokens, int dim, int aligned_dim)
 
void add_forward_bf16 (const uint16_t *a, const uint16_t *b, uint16_t *y, size_t n)
 
void add_forward_f32 (const float *a, const float *b, float *y, size_t n)
 
void add_inplace_bf16 (uint16_t *a, const uint16_t *b, size_t n)
 
void add_inplace_f32 (float *a, const float *b, size_t n)
 
void add_scaled_forward_bf16 (const uint16_t *a, const uint16_t *b, uint16_t *y, float alpha, size_t n)
 
void add_scaled_inplace_bf16 (uint16_t *a, const uint16_t *b, float alpha, size_t n)
 
int argmax_f32 (const float *scores, int n)
 Find index of maximum value. More...
 
void attention_backward_causal_head_major (const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_backward_causal_head_major_gqa (const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_backward_causal_head_major_gqa_bf16 (const uint16_t *d_output, float *d_x, const uint16_t *q, const uint16_t *k, const uint16_t *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window, float *scratch_d_output, float *scratch_q, float *scratch_k, float *scratch_v)
 
void attention_flash_decode (float *out, const float *q, const float *k, const float *v, int T_q, int T_k, int H, int D_h, float scale)
 Main flash attention function with SIMD dispatch. More...
 
void attention_forward_causal_head_major (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_forward_causal_head_major_exact (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_forward_causal_head_major_gqa (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_forward_causal_head_major_gqa_bf16 (const uint16_t *q, const uint16_t *k, const uint16_t *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window, float *scratch_q, float *scratch_k, float *scratch_v)
 
void attention_forward_causal_head_major_gqa_exact (const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
 
void attention_forward_causal_head_major_gqa_flash (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim)
 
void attention_forward_causal_head_major_gqa_flash_strided (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens)
 
void attention_forward_causal_head_major_gqa_flash_strided_sliding (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens, int sliding_window)
 
void attention_forward_decode_head_major_gqa_flash (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
 
void attention_forward_decode_head_major_gqa_flash_sliding (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim, int sliding_window)
 
void attention_forward_decode_head_major_gqa_regular (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
 WARNING: This is NOT true flash attention! More...
 
void axpy_2d_f32 (float *Y, const float *X, float alpha, int num_tokens, int dim, int y_stride, int x_stride)
 Batched AXPY for 2D tensors: Y[t,:] += alpha * X[t,:]. More...
 
void axpy_f32 (float *y, const float *x, float alpha, int n)
 In-place AXPY: y += alpha * x. More...
 
void axpy_zero_f32 (float *y, const float *x, float alpha, int n)
 Zero output then accumulate: y = 0; y += alpha * x. More...
 
void backward_causal_softmax_head_major (float *d_scores, const float *weights, int num_heads, int num_tokens, int aligned_context_window)
 
void backward_causal_softmax_head_major_bf16 (uint16_t *d_scores, const uint16_t *weights, int num_heads, int num_tokens, int aligned_context_window, float *scratch_d_scores, float *scratch_weights)
 
void causal_softmax_head_major (float *scores, int num_heads, int num_tokens, int aligned_context_window)
 
void causal_softmax_head_major_bf16 (uint16_t *scores, int num_heads, int num_tokens, int aligned_context_window, float *scratch)
 
void causal_softmax_head_major_exact (float *scores, int num_heads, int num_tokens, int aligned_context_window)
 
void ck_attention_flash_decode_wrapper (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
 Wrapper to call TRUE flash attention from orchestration layer. More...
 
int ck_flash_attn_choose_tile_k (int D_h)
 
int ck_flash_attn_fast_exp_kind (void)
 
void ck_gemm_nt_head_major_q5_0 (const float *attn_out, const void *wo, const float *bias, float *output, int tokens, int embed_dim, int num_heads, int head_dim)
 Output projection from head-major attention (auto-dispatch) More...
 
void ck_gemm_nt_head_major_q8_0 (const float *attn_out, const void *wo, const float *bias, float *output, int tokens, int embed_dim, int num_heads, int head_dim)
 Output projection from head-major attention (Q8_0 weights) More...
 
int ck_get_num_threads (void)
 
int ck_get_physical_cores (void)
 
void ck_set_num_threads (int num_threads)
 
void ck_set_strict_parity (int enabled)
 
int ck_strict_parity_enabled (void)
 
CKMathBackend ckernel_backend_native (void)
 
void dequant_q4_0_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q4_0 row (multiple blocks) More...
 
void dequant_q4_1_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q4_1 row (multiple blocks) More...
 
void dequant_q4_k_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q4_K row (multiple blocks) More...
 
void dequant_q5_0_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q5_0 row (multiple blocks) More...
 
void dequant_q5_1_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q5_1 row (multiple blocks) More...
 
void dequant_q6_k_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q6_K row (multiple blocks) More...
 
void dequant_q8_0_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q8_0 row (multiple blocks) More...
 
void embedding_backward (const int32_t *token_ids, int token_count, const float *d_output, float *d_token_embeddings, float *d_pos_embeddings, int vocab_size, int embed_dim, int aligned_embed_dim, int context_window, int add_pos)
 
void embedding_backward_bf16 (const int32_t *token_ids, int token_count, const uint16_t *d_output, uint16_t *d_token_embeddings, uint16_t *d_pos_embeddings, int vocab_size, int embed_dim, int aligned_embed_dim, int context_window, int add_pos)
 
void embedding_forward (const int32_t *token_ids, int token_count, int vocab_size, const float *token_embeddings, const float *pos_embeddings, float *output, int embed_dim, int aligned_embed_dim, int context_window, int add_pos)
 
void embedding_forward_bf16 (const int32_t *token_ids, int token_count, int vocab_size, const uint16_t *token_embeddings, const uint16_t *pos_embeddings, uint16_t *output, int embed_dim, int aligned_embed_dim, int context_window, int add_pos)
 
void embedding_forward_q4_k (const int32_t *token_ids, int token_count, int vocab_size, const void *token_embeddings, const float *pos_embeddings, float *output, int embed_dim, int aligned_embed_dim, int context_window, int add_pos)
 
void embedding_forward_q6_k (const int32_t *token_ids, int token_count, int vocab_size, const void *token_embeddings, const float *pos_embeddings, float *output, int embed_dim, int aligned_embed_dim, int context_window, int add_pos)
 
void embedding_forward_q8_0 (const int32_t *token_ids, int token_count, int vocab_size, const void *token_embeddings, const float *pos_embeddings, float *output, int embed_dim, int aligned_embed_dim, int context_window, int add_pos)
 
void fc1_backward_kernel (const float *d_output, const float *fc1_input, const float *W_fc1, float *d_input, float *d_W_fc1, float *d_b_fc1, int T, int aligned_in, int aligned_out, int num_threads)
 
void fc2_backward_kernel (const float *d_output, const float *fc2_input, const float *W_fc2, float *d_input, float *d_W_fc2, float *d_b_fc2, int T, int aligned_in, int aligned_out, int num_threads)
 
void fused_mlp_swiglu_decode (const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *b_gate, const float *b_up, const float *b_down, float *output, int D, int Hff)
 
void fused_mlp_swiglu_decode_tiled (const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *b_gate, const float *b_up, const float *b_down, float *output, int D, int Hff)
 
void fused_mlp_swiglu_decode_v2 (const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *b_gate, const float *b_up, const float *b_down, float *output, int D, int Hff)
 
void fused_mlp_swiglu_prefill (const float *x, const float *W_gate, const float *W_up, const float *W_down, float *output, int seq_len, int hidden, int intermediate, float *scratch)
 Fused MLP (Gate + Up + SwiGLU + Down) for prefill. More...
 
void fused_mlp_swiglu_prefill_bias (const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *B_gate, const float *B_up, const float *B_down, float *output, int seq_len, int hidden, int intermediate, float *scratch)
 Fused MLP (Gate + Up + SwiGLU + Down) for prefill with biases. More...
 
void fused_mlp_swiglu_prefill_w1w2_quant (const float *x, const void *W1, const float *B1, CKDataType w1_dt, const void *W2, const float *B2, CKDataType w2_dt, float *output, int seq_len, int embed_dim, int aligned_embed_dim, int intermediate_dim, int aligned_intermediate_dim, void *scratch)
 Quantized fused MLP for prefill (W1=gate+up, W2=down) More...
 
size_t fused_mlp_swiglu_prefill_w1w2_quant_scratch_size (int aligned_embed_dim, int aligned_intermediate_dim)
 Get scratch buffer size for fused_mlp_swiglu_prefill_w1w2_quant. More...
 
size_t fused_mlp_swiglu_scratch_size (int intermediate)
 Get scratch buffer size for fused_mlp_swiglu_prefill. More...
 
void fused_rmsnorm_qkv_prefill (const float *x, const float *gamma, const float *Wq, const float *Wk, const float *Wv, float *Q, float *K, float *V, int seq_len, int hidden, int q_dim, int kv_dim, float eps, float *scratch)
 Fused RMSNorm + QKV projection for prefill. More...
 
void fused_rmsnorm_qkv_prefill_head_major (const float *x, const float *gamma, const float *Wq, const float *Bq, const float *Wk, const float *Bk, const float *Wv, const float *Bv, float *Q, float *K, float *V, int seq_len, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, int kv_stride_tokens, float eps, float *scratch)
 Fused RMSNorm + QKV projection for prefill (head-major outputs) More...
 
void fused_rmsnorm_qkv_prefill_head_major_quant (const float *x, const float *gamma, const void *Wq, const float *Bq, CKDataType wq_dt, const void *Wk, const float *Bk, CKDataType wk_dt, const void *Wv, const float *Bv, CKDataType wv_dt, float *Q, float *K, float *V, int seq_len, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, int kv_stride_tokens, float eps, void *scratch)
 Fused RMSNorm + QKV projection for prefill (head-major, Q8 activations) More...
 
size_t fused_rmsnorm_qkv_prefill_head_major_quant_scratch_size (int aligned_embed_dim)
 Get scratch buffer size for fused_rmsnorm_qkv_prefill_head_major_quant. More...
 
size_t fused_rmsnorm_qkv_scratch_size (int hidden)
 Get scratch buffer size for fused_rmsnorm_qkv_prefill. More...
 
void geglu_backward_fp32 (const float *x, const float *d_out, float *d_x, int tokens, int dim)
 
void geglu_forward_bf16 (const uint16_t *x, uint16_t *out, int tokens, int dim, float *scratch)
 
void geglu_forward_fp32 (const float *x, float *out, int tokens, int dim)
 
void gelu_backward_exact (const float *input, const float *d_output, float *d_input, size_t n)
 
void gelu_backward_exact_bf16 (const uint16_t *input, const uint16_t *d_output, uint16_t *d_input, size_t n, float *scratch_input, float *scratch_d_output, float *scratch_d_input)
 
void gelu_backward_fast (const float *input, const float *d_output, float *d_input, size_t n)
 
void gelu_backward_fast_bf16 (const uint16_t *input, const uint16_t *d_output, uint16_t *d_input, size_t n, float *scratch_input, float *scratch_d_output, float *scratch_d_input)
 
void gelu_backward_scalar (const float *input, const float *d_output, float *d_input, size_t n)
 
void gelu_exact_inplace (float *data, size_t n)
 
void gelu_fast_inplace (float *data, size_t n)
 
void gelu_fast_inplace_bf16 (uint16_t *data, size_t n, float *scratch)
 
void gemm_avx512_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_bias_gelu_fused (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_bias_relu_fused (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_bias_silu_fused (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_blocked_serial (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_blocked_serial_bf16 (const uint16_t *A, const uint16_t *B, const uint16_t *bias, uint16_t *C, int M, int N, int K)
 
void gemm_fine_grained_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_microkernel (const float *A, const float *B, float *C, int M, int N, int K, int B_transposed)
 
void gemm_microkernel_blocked (const float *A, const float *B, float *C, int M, int N, int K)
 
void gemm_microkernel_blocked_bt (const float *A, const float *B, float *C, int M, int N, int K)
 
void gemm_microkernel_packed (const float *A, const float *B, float *C, int M, int N, int K)
 
void gemm_naive_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nn_avx512 (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nn_blocked (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nn_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q4_0 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 Matrix-matrix multiply: C[M,N] = A[M,K] @ B[N,K]^T + bias. More...
 
void gemm_nt_q4_1 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 GEMM with transposed Q4_1 weights: C = A @ B^T. More...
 
void gemm_nt_q4_k (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q4_k_q8_k (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q5_0 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q5_1 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 GEMM with transposed Q5_1 weights: C = A @ B^T. More...
 
void gemm_nt_q5_k (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q6_k (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q6_k_q8_k (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
 NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K. More...
 
void gemm_nt_q8_0 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 Matrix-matrix multiply: C[M,N] = A[M,K] @ B[N,K]^T + bias. More...
 
void gemm_nt_q8_0_q8_0 (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
 gemm_nt_q8_0_q8_0 with optional bias (matches header signature) More...
 
void gemm_q4_k (float *Y, const void *W, const float *X, int M, int N, int K)
 Auto-dispatch GEMM based on available SIMD. More...
 
void gemm_q4_k_q8_k (float *Y, const void *W, const void *X_q8, int M, int N, int K)
 
void gemm_q6_k (float *Y, const void *W, const float *X, int M, int N, int K)
 
void gemm_q6_k_q8_k (float *Y, const void *W, const void *X_q8, int M, int N, int K)
 GEMM: Y = W @ X^T where W is Q6_K and X is Q8_K. More...
 
void gemm_swiglu_fused (const float *x, const float *W_gate, const float *W_up, const float *b_gate, const float *b_up, float *output, int M, int N, int K)
 
void gemm_tn_avx512 (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_tn_blocked (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_tn_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void gemv_fused_q5_0_bias_dispatch (float *y, const void *W, const float *x, const float *bias, int M, int K)
 
void gemv_fused_q8_0_bias_dispatch (float *y, const void *W, const float *x, const float *bias, int M, int K)
 
void gemv_q4_0 (float *y, const void *W, const float *x, int M, int K)
 Auto-dispatch GEMV. More...
 
void gemv_q4_k (float *y, const void *W, const float *x, int M, int K)
 Auto-dispatch GEMV based on available SIMD. More...
 
void gemv_q4_k_q8_k (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q4_k_q8_k_parallel (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth)
 
void gemv_q4_k_q8_k_parallel_simd (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth)
 
void gemv_q4_k_q8_k_ref (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q5_0 (float *y, const void *W, const float *x, int M, int K)
 Auto-dispatch GEMV for Q5_0 weights based on CPU features. More...
 
void gemv_q5_0_parallel (float *y, const void *W, const float *x, int M, int K, int ith, int nth)
 Parallel reference GEMV for Q5_0 × FP32. More...
 
void gemv_q5_0_parallel_simd (float *y, const void *W, const float *x, int M, int K, int ith, int nth)
 Parallel SIMD GEMV for Q5_0 × FP32 with prefetching. More...
 
void gemv_q5_0_q8_0 (float *y, const void *W, const void *x_q8, int M, int K)
 Matrix-vector multiply with Q5_0 weights and Q8_0 input. More...
 
void gemv_q5_1 (float *y, const void *W, const float *x, int M, int K)
 Auto-dispatch GEMV. More...
 
void gemv_q5_k (float *y, const void *W, const float *x, int M, int K)
 
void gemv_q6_k (float *y, const void *W, const float *x, int M, int K)
 
void gemv_q6_k_q8_k (float *y, const void *W, const void *x_q8, int M, int K)
 GEMV: y = W @ x where W is Q6_K and x is Q8_K. More...
 
void gemv_q6_k_q8_k_parallel (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth)
 Parallel reference GEMV for Q6_K × Q8_K. More...
 
void gemv_q6_k_q8_k_parallel_simd (float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth)
 Parallel SIMD GEMV for Q6_K × Q8_K. More...
 
void gemv_q8_0 (float *y, const void *W, const float *x, int M, int K)
 Auto-dispatch GEMV for Q8_0 weights based on CPU features. More...
 
void gemv_q8_0_q8_0 (float *y, const void *W, const void *x_q8, int M, int K)
 Matrix-vector multiply with Q8_0 weights and Q8_0 input. More...
 
void im2patch (const float *image, float *patches, int C, int H, int W, int P)
 
void im2patch_bf16 (const uint16_t *image, uint16_t *patches, int C, int H, int W, int P)
 
void kv_cache_repack_head_major_inplace (float *buf, int num_heads, int tokens, int cache_capacity, int aligned_head_dim)
 
void kv_cache_store (float *__restrict kv_cache_k, float *__restrict kv_cache_v, const float *__restrict k, const float *__restrict v, int layer, int pos, int num_kv_heads, int head_dim, int max_seq_len)
 
void kv_cache_write_head_major (const float *__restrict k_token, const float *__restrict v_token, float *__restrict k_cache, float *__restrict v_cache, int num_kv_heads, int token_index, int cache_capacity, int head_dim, int aligned_head_dim)
 
void layernorm_backward_kernel (const float *d_output, const float *input, const float *gamma, const float *mean, const float *rstd, float *d_input, float *d_gamma, float *d_beta, int tokens, int d_model, int aligned_embed_dim)
 
void layernorm_backward_kernel_bf16 (const uint16_t *d_output, const uint16_t *input, const float *gamma, const float *mean, const float *rstd, uint16_t *d_input, float *d_gamma, float *d_beta, int tokens, int d_model, int aligned_embed_dim, float *scratch_d_output, float *scratch_input, float *scratch_d_input)
 
void layernorm_forward_rolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_forward_rolled_slice_bf16 (const uint16_t *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, uint16_t *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps, float *scratch_input, float *scratch_output)
 
void layernorm_forward_unrolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)
 
void layernorm_forward_unrolled_slice_bf16 (const uint16_t *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, uint16_t *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps, float *scratch_input, float *scratch_output)
 
void layernorm_naive_serial (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_naive_serial_matched_precision (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, float eps)
 
void mlp_token_parallel (const float *input, const float *W_fc1, const float *b_fc1, const float *W_fc2, const float *b_fc2, float *fc1_output, float *output, int T, int aligned_dim, int num_threads)
 
void mlp_token_parallel_bf16 (const uint16_t *input, const uint16_t *W_fc1, const uint16_t *b_fc1, const uint16_t *W_fc2, const uint16_t *b_fc2, float *fc1_output, float *output, int T, int aligned_dim, int num_threads, float *scratch_bias1_f, float *scratch_bias2_f, uint16_t *scratch_fc1_bf16)
 
void mlp_token_parallel_bf16_fp32act (const uint16_t *input, const uint16_t *W_fc1, const uint16_t *b_fc1, const uint16_t *W_fc2, const uint16_t *b_fc2, float *fc1_output, float *output, int T, int aligned_dim, int num_threads, float *scratch_input_f, float *scratch_bias1_f, float *scratch_bias2_f, uint16_t *scratch_fc1_bf16)
 
void mlp_token_parallel_exact (const float *input, const float *W_fc1, const float *b_fc1, const float *W_fc2, const float *b_fc2, float *fc1_output, float *output, int T, int aligned_dim, int num_threads)
 
void moe_accumulate_expert_f32 (float *output, const float *expert_output, float routing_weight, int hidden_dim)
 Accumulate expert output: output += routing_weight * expert_output. More...
 
void patch2im (const float *d_patches, float *d_image, int C, int H, int W, int P)
 
void patch2im_bf16 (const uint16_t *d_patches, uint16_t *d_image, int C, int H, int W, int P)
 
void quantize_batch_q8_0 (const float *x, void *y, int num_rows, int k)
 Batch quantize FP32 to Q8_0 format (row-major output) More...
 
void quantize_batch_q8_k (const float *x, void *y, int num_rows, int k)
 Batch quantize FP32 to Q8_K format (row-major output) More...
 
void quantize_row_q8_0 (const float *x, void *y, int k)
 Quantize FP32 to Q8_0 format (scalar reference) More...
 
void quantize_row_q8_k (const float *x, void *y, int k)
 
void relu_backward (const float *input, const float *d_output, float *d_input, size_t n)
 
void relu_backward_bf16 (const uint16_t *input, const uint16_t *d_output, uint16_t *d_input, size_t n)
 
void relu_forward (const float *input, float *output, size_t n)
 
void relu_forward_bf16 (const uint16_t *input, uint16_t *output, size_t n)
 
void relu_forward_inplace (float *data, size_t n)
 
void relu_forward_inplace_bf16 (uint16_t *data, size_t n)
 
void rmsnorm_backward (const float *d_output, const float *input, const float *gamma, const float *rstd_cache, float *d_input, float *d_gamma, int tokens, int d_model, int aligned_embed_dim)
 
void rmsnorm_backward_bf16 (const uint16_t *d_output, const uint16_t *input, const float *gamma, const float *rstd_cache, uint16_t *d_input, float *d_gamma, int tokens, int d_model, int aligned_embed_dim)
 
void rmsnorm_backward_int4 (const uint8_t *d_output, const uint8_t *input, const float *gamma, const float *rstd_cache, uint8_t *d_input, float *d_gamma, int tokens, int d_model, int aligned_embed_dim, float *scratch_d_output, float *scratch_input, float *scratch_d_input)
 
void rmsnorm_backward_int8 (const int8_t *d_output, const int8_t *input, const float *gamma, const float *rstd_cache, int8_t *d_input, float *d_gamma, int tokens, int d_model, int aligned_embed_dim, float *scratch_d_output, float *scratch_input, float *scratch_d_input)
 
void rmsnorm_forward (const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
 
void rmsnorm_forward_bf16 (const uint16_t *input, const float *gamma, uint16_t *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
 
void rmsnorm_forward_int4 (const uint8_t *input, const float *gamma, uint8_t *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps, float *scratch_input, float *scratch_output)
 
void rmsnorm_forward_int8 (const int8_t *input, const float *gamma, int8_t *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps, float *scratch_input, float *scratch_output)
 
void rope_backward (const float *d_out, float *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
 
void rope_backward_bf16 (const uint16_t *d_out, uint16_t *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_d_out, float *scratch_d_x)
 
void rope_backward_inplace (float *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
 
void rope_backward_qk (const float *d_q_out, const float *d_k_out, float *d_q, float *d_k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
 
void rope_backward_qk_bf16 (const uint16_t *d_q_out, const uint16_t *d_k_out, uint16_t *d_q, uint16_t *d_k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_dq_out, float *scratch_dq, float *scratch_dk_out, float *scratch_dk)
 
void rope_forward (float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
 
void rope_forward_bf16 (uint16_t *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch)
 
void rope_forward_qk (float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
 
void rope_forward_qk_bf16 (uint16_t *q, uint16_t *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_q, float *scratch_k)
 
void rope_forward_qk_strided (float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, int q_stride_tokens, int k_stride_tokens)
 
void rope_forward_strided (float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, int head_stride_tokens)
 
void rope_precompute_cache (float *cos_cache, float *sin_cache, int max_seq_len, int head_dim, float base)
 
void scal_copy_f32 (float *y, const float *x, float alpha, int n)
 Scaled copy: y = alpha * x. More...
 
void sigmoid_backward (const float *input, const float *d_output, float *d_input, size_t n)
 
void sigmoid_backward_bf16 (const uint16_t *input, const uint16_t *d_output, uint16_t *d_input, size_t n, float *scratch_input, float *scratch_d_output, float *scratch_d_input)
 
void sigmoid_forward (const float *input, float *output, size_t n)
 
void sigmoid_forward_bf16 (const uint16_t *input, uint16_t *output, size_t n, float *scratch_input, float *scratch_output)
 
float sigmoid_scalar (float x)
 
void softmax_cross_entropy_loss (const float *logits, const int32_t *targets, int tokens, int vocab_size, float *d_logits, float *loss_out)
 
void softmax_cross_entropy_loss_bf16 (const uint16_t *logits, const int32_t *targets, int tokens, int vocab_size, uint16_t *d_logits, float *loss_out, float *scratch_logits, float *scratch_d_logits)
 
void swiglu_backward (const float *input, const float *d_output, float *d_input, int tokens, int dim)
 
void swiglu_backward_bf16 (const uint16_t *input, const uint16_t *d_output, uint16_t *d_input, int tokens, int dim)
 
void swiglu_backward_exact (const float *input, const float *d_output, float *d_input, int tokens, int dim)
 
void swiglu_forward (const float *input, float *output, int tokens, int dim)
 
void swiglu_forward_bf16 (const uint16_t *input, uint16_t *output, int tokens, int dim)
 
void swiglu_forward_exact (const float *input, float *output, int tokens, int dim)
 
void topk_batched_f32 (const float *scores, int num_tokens, int n_experts, int k, int *indices, float *weights)
 Batched top-K selection for multiple tokens. More...
 
void topk_f32 (const float *scores, int n, int k, int *indices, float *values)
 Find top-K indices and values from a score vector. More...
 
void topk_softmax_f32 (const float *scores, int n, int k, int *indices, float *weights)
 Find top-K indices with softmax-normalized weights. More...
 
void unfused_rmsnorm_qkv_prefill (const float *x, const float *gamma, const float *Wq, const float *Wk, const float *Wv, float *x_norm, float *Q, float *K, float *V, int seq_len, int hidden, int q_dim, int kv_dim, float eps)
 Unfused version for benchmarking comparison. More...
 
void vec_dot_q6_k_q8_k (int n, float *s, const void *vx, const void *vy)
 Q6_K x Q8_K dot product (single row) More...
 
void weighted_sum_f32 (float *y, const float **vectors, const float *weights, int k, int n)
 Weighted sum of k vectors: y = sum_i(weights[i] * vectors[i]) More...
 

Function Documentation

◆ add_backward_bf16()

void add_backward_bf16 ( const uint16_t *  d_y,
uint16_t *  d_a,
uint16_t *  d_b,
size_t  n 
)

Definition at line 173 of file add_kernels_bf16.c.

177 {
178  if (!d_y || n == 0) {
179  return;
180  }
181 
182  size_t i = 0;
183 
184  /* Copy to d_a if not in-place */
185  if (d_a && d_a != d_y) {
186 #if defined(__AVX512F__)
187  for (; i + 32 <= n; i += 32) {
188  __m512i v0 = _mm512_loadu_si512((const __m512i*)&d_y[i]);
189  __m512i v1 = _mm512_loadu_si512((const __m512i*)&d_y[i + 32]);
190  _mm512_storeu_si512((__m512i*)&d_a[i], v0);
191  _mm512_storeu_si512((__m512i*)&d_a[i + 32], v1);
192  }
193 #endif
194  for (; i < n; ++i) {
195  d_a[i] = d_y[i];
196  }
197  }
198 
199  /* Copy to d_b if not in-place */
200  i = 0;
201  if (d_b && d_b != d_y) {
202 #if defined(__AVX512F__)
203  for (; i + 32 <= n; i += 32) {
204  __m512i v0 = _mm512_loadu_si512((const __m512i*)&d_y[i]);
205  __m512i v1 = _mm512_loadu_si512((const __m512i*)&d_y[i + 32]);
206  _mm512_storeu_si512((__m512i*)&d_b[i], v0);
207  _mm512_storeu_si512((__m512i*)&d_b[i + 32], v1);
208  }
209 #endif
210  for (; i < n; ++i) {
211  d_b[i] = d_y[i];
212  }
213  }
214 }

◆ add_forward_2d_bf16()

void add_forward_2d_bf16 ( const uint16_t *  a,
const uint16_t *  b,
uint16_t *  y,
int  tokens,
int  dim,
int  aligned_dim 
)

Definition at line 221 of file add_kernels_bf16.c.

227 {
228  if (!a || !b || !y || tokens <= 0 || dim <= 0) {
229  return;
230  }
231 
232  for (int t = 0; t < tokens; ++t) {
233  const uint16_t *a_row = a + (size_t)t * aligned_dim;
234  const uint16_t *b_row = b + (size_t)t * aligned_dim;
235  uint16_t *y_row = y + (size_t)t * aligned_dim;
236 
237  int d = 0;
238 
239 #if defined(__AVX512F__)
240  for (; d + 16 <= dim; d += 16) {
241  __m512 av = bf16_loadu_cvt_fp32(&a_row[d]);
242  __m512 bv = bf16_loadu_cvt_fp32(&b_row[d]);
243  __m512 yv = _mm512_add_ps(av, bv);
244  fp32_cvt_storeu_bf16(&y_row[d], yv);
245  }
246 #endif
247 
248  for (; d < dim; ++d) {
249  float af = bf16_to_float(a_row[d]);
250  float bf = bf16_to_float(b_row[d]);
251  y_row[d] = float_to_bf16(af + bf);
252  }
253  }
254 }
static uint16_t float_to_bf16(float f)
Definition: bf16_utils.h:90
static float bf16_to_float(uint16_t v)
Definition: bf16_utils.h:38

References bf16_to_float(), and float_to_bf16().

◆ add_forward_bf16()

void add_forward_bf16 ( const uint16_t *  a,
const uint16_t *  b,
uint16_t *  y,
size_t  n 
)

Definition at line 38 of file add_kernels_bf16.c.

42 {
43  if (!a || !b || !y || n == 0) {
44  return;
45  }
46 
47  size_t i = 0;
48 
49 #if defined(__AVX512F__)
50  /* AVX-512: Process 16 bf16 elements at a time */
51  for (; i + 16 <= n; i += 16) {
52  __m512 av = bf16_loadu_cvt_fp32(&a[i]);
53  __m512 bv = bf16_loadu_cvt_fp32(&b[i]);
54  __m512 yv = _mm512_add_ps(av, bv);
55  fp32_cvt_storeu_bf16(&y[i], yv);
56  }
57 #endif
58 
59  /* Scalar fallback */
60  for (; i < n; ++i) {
61  float af = bf16_to_float(a[i]);
62  float bf = bf16_to_float(b[i]);
63  y[i] = float_to_bf16(af + bf);
64  }
65 }

References bf16_to_float(), and float_to_bf16().

◆ add_forward_f32()

void add_forward_f32 ( const float *  a,
const float *  b,
float *  y,
size_t  n 
)

Element-wise add: y = a + b

Test:

test_add.py::TestAddForward::test_add_forward_f32

test_add.py::TestAddForward::test_add_inplace_f32

test_multi_layer_parity.py::TestMultiLayerParity::test_residual_add

Element-wise addition of two vectors.

After changes: make test

Definition at line 270 of file add_kernels_bf16.c.

274 {
275  if (!a || !b || !y || n == 0) {
276  return;
277  }
278 
279  size_t i = 0;
280 
281 #if defined(__AVX512F__)
282  for (; i + 16 <= n; i += 16) {
283  __m512 av = _mm512_loadu_ps(&a[i]);
284  __m512 bv = _mm512_loadu_ps(&b[i]);
285  __m512 yv = _mm512_add_ps(av, bv);
286  _mm512_storeu_ps(&y[i], yv);
287  }
288 #endif
289 
290 #if defined(__AVX2__)
291  for (; i + 8 <= n; i += 8) {
292  __m256 av = _mm256_loadu_ps(&a[i]);
293  __m256 bv = _mm256_loadu_ps(&b[i]);
294  __m256 yv = _mm256_add_ps(av, bv);
295  _mm256_storeu_ps(&y[i], yv);
296  }
297 #endif
298 
299  for (; i < n; ++i) {
300  y[i] = a[i] + b[i];
301  }
302 }

◆ add_inplace_bf16()

void add_inplace_bf16 ( uint16_t *  a,
const uint16_t *  b,
size_t  n 
)

Definition at line 105 of file add_kernels_bf16.c.

108 {
109  if (!a || !b || n == 0) {
110  return;
111  }
112 
113  size_t i = 0;
114 
115 #if defined(__AVX512F__)
116  for (; i + 16 <= n; i += 16) {
117  __m512 av = bf16_loadu_cvt_fp32(&a[i]);
118  __m512 bv = bf16_loadu_cvt_fp32(&b[i]);
119  __m512 yv = _mm512_add_ps(av, bv);
120  fp32_cvt_storeu_bf16(&a[i], yv);
121  }
122 #endif
123 
124  for (; i < n; ++i) {
125  float af = bf16_to_float(a[i]);
126  float bf = bf16_to_float(b[i]);
127  a[i] = float_to_bf16(af + bf);
128  }
129 }

References bf16_to_float(), and float_to_bf16().

◆ add_inplace_f32()

void add_inplace_f32 ( float *  a,
const float *  b,
size_t  n 
)

Definition at line 304 of file add_kernels_bf16.c.

307 {
308  if (!a || !b || n == 0) {
309  return;
310  }
311 
312  size_t i = 0;
313 
314 #if defined(__AVX512F__)
315  for (; i + 16 <= n; i += 16) {
316  __m512 av = _mm512_loadu_ps(&a[i]);
317  __m512 bv = _mm512_loadu_ps(&b[i]);
318  __m512 yv = _mm512_add_ps(av, bv);
319  _mm512_storeu_ps(&a[i], yv);
320  }
321 #endif
322 
323 #if defined(__AVX2__)
324  for (; i + 8 <= n; i += 8) {
325  __m256 av = _mm256_loadu_ps(&a[i]);
326  __m256 bv = _mm256_loadu_ps(&b[i]);
327  __m256 yv = _mm256_add_ps(av, bv);
328  _mm256_storeu_ps(&a[i], yv);
329  }
330 #endif
331 
332  for (; i < n; ++i) {
333  a[i] = a[i] + b[i];
334  }
335 }

Referenced by mega_fused_outproj_mlp_prefill().

◆ add_scaled_forward_bf16()

void add_scaled_forward_bf16 ( const uint16_t *  a,
const uint16_t *  b,
uint16_t *  y,
float  alpha,
size_t  n 
)

Definition at line 72 of file add_kernels_bf16.c.

77 {
78  if (!a || !b || !y || n == 0) {
79  return;
80  }
81 
82  size_t i = 0;
83 
84 #if defined(__AVX512F__)
85  __m512 alpha_v = _mm512_set1_ps(alpha);
86  for (; i + 16 <= n; i += 16) {
87  __m512 av = bf16_loadu_cvt_fp32(&a[i]);
88  __m512 bv = bf16_loadu_cvt_fp32(&b[i]);
89  __m512 yv = _mm512_fmadd_ps(bv, alpha_v, av); /* a + alpha * b */
90  fp32_cvt_storeu_bf16(&y[i], yv);
91  }
92 #endif
93 
94  for (; i < n; ++i) {
95  float af = bf16_to_float(a[i]);
96  float bf = bf16_to_float(b[i]);
97  y[i] = float_to_bf16(af + alpha * bf);
98  }
99 }

References bf16_to_float(), and float_to_bf16().

◆ add_scaled_inplace_bf16()

void add_scaled_inplace_bf16 ( uint16_t *  a,
const uint16_t *  b,
float  alpha,
size_t  n 
)

Definition at line 135 of file add_kernels_bf16.c.

139 {
140  if (!a || !b || n == 0) {
141  return;
142  }
143 
144  size_t i = 0;
145 
146 #if defined(__AVX512F__)
147  __m512 alpha_v = _mm512_set1_ps(alpha);
148  for (; i + 16 <= n; i += 16) {
149  __m512 av = bf16_loadu_cvt_fp32(&a[i]);
150  __m512 bv = bf16_loadu_cvt_fp32(&b[i]);
151  __m512 yv = _mm512_fmadd_ps(bv, alpha_v, av);
152  fp32_cvt_storeu_bf16(&a[i], yv);
153  }
154 #endif
155 
156  for (; i < n; ++i) {
157  float af = bf16_to_float(a[i]);
158  float bf = bf16_to_float(b[i]);
159  a[i] = float_to_bf16(af + alpha * bf);
160  }
161 }

References bf16_to_float(), and float_to_bf16().

◆ argmax_f32()

int argmax_f32 ( const float *  scores,
int  n 
)

Find index of maximum value.

Parameters
scoresInput scores [n]
nNumber of scores
Returns
Index of maximum value

Definition at line 226 of file topk_kernels.c.

227 {
228  if (!scores || n <= 0) {
229  return -1;
230  }
231 
232  int max_idx = 0;
233  float max_val = scores[0];
234 
235 #ifdef __AVX512F__
236  /* AVX-512 vectorized argmax for large arrays */
237  if (n >= 16) {
238  __m512 vmax = _mm512_set1_ps(-FLT_MAX);
239  __m512i vidx = _mm512_setzero_si512();
240  __m512i vcur_max_idx = _mm512_setzero_si512();
241 
242  int i = 0;
243  for (; i + 16 <= n; i += 16) {
244  __m512 v = _mm512_loadu_ps(&scores[i]);
245  __m512i cur_idx = _mm512_add_epi32(
246  _mm512_set1_epi32(i),
247  _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
248  );
249 
250  __mmask16 gt_mask = _mm512_cmp_ps_mask(v, vmax, _CMP_GT_OQ);
251  vmax = _mm512_mask_blend_ps(gt_mask, vmax, v);
252  vcur_max_idx = _mm512_mask_blend_epi32(gt_mask, vcur_max_idx, cur_idx);
253  }
254 
255  /* Horizontal reduction */
256  float vals[16];
257  int idxs[16];
258  _mm512_storeu_ps(vals, vmax);
259  _mm512_storeu_si512(idxs, vcur_max_idx);
260 
261  max_val = vals[0];
262  max_idx = idxs[0];
263  for (int j = 1; j < 16; j++) {
264  if (vals[j] > max_val) {
265  max_val = vals[j];
266  max_idx = idxs[j];
267  }
268  }
269 
270  /* Handle remainder */
271  for (; i < n; i++) {
272  if (scores[i] > max_val) {
273  max_val = scores[i];
274  max_idx = i;
275  }
276  }
277 
278  return max_idx;
279  }
280 #endif
281 
282  /* Scalar fallback */
283  for (int i = 1; i < n; i++) {
284  if (scores[i] > max_val) {
285  max_val = scores[i];
286  max_idx = i;
287  }
288  }
289 
290  return max_idx;
291 }

◆ attention_backward_causal_head_major()

void attention_backward_causal_head_major ( const float *  d_output,
const float *  q,
const float *  k,
const float *  v,
const float *  attn_weights,
float *  d_q,
float *  d_k,
float *  d_v,
float *  d_scores,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

Causal attention backward (non-GQA version)

Test:

test_attention_backward.py::TestAttentionBackward::test_backward

test_attention_backward.py::TestAttentionBackward::test_backward_vs_separate

test_parity.py::test_attention_backward_parity

Non-GQA version where num_heads == num_kv_heads. Simpler than GQA, no head broadcasting needed.

After changes: make test && make llamacpp-parity-full

Definition at line 1811 of file attention_kernels.c.

1826 {
1827  attention_backward_causal_head_major_gqa(
1828  d_output, q, k, v, attn_weights,
1829  d_q, d_k, d_v, d_scores,
1830  num_heads, num_heads, // num_kv_heads == num_heads
1831  num_tokens, head_dim, aligned_head_dim, aligned_context_window);
1832 }
void attention_backward_causal_head_major_gqa(const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)

References attention_backward_causal_head_major_gqa().

◆ attention_backward_causal_head_major_gqa()

void attention_backward_causal_head_major_gqa ( const float *  d_output,
const float *  q,
const float *  k,
const float *  v,
const float *  attn_weights,
float *  d_q,
float *  d_k,
float *  d_v,
float *  d_scores,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

GQA causal attention backward (score-matrix version)

Test:

test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_backward

test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_vs_separate

test_parity.py::test_attention_backward_parity

Computes dQ, dK, dV given dOutput and attention weights. Supports grouped-query attention with head broadcasting.

After changes: make test && make llamacpp-parity-full

Definition at line 1672 of file attention_kernels.c.

1688 {
1689  const float scale = 1.0f / sqrtf((float)head_dim);
1690  int T = num_tokens;
1691  int H = num_heads;
1692  int H_kv = num_kv_heads;
1693  int hd = head_dim;
1694  int ad = aligned_head_dim;
1695  int aw = aligned_context_window;
1696 
1697  const size_t d_q_elems = (size_t)H * (size_t)T * (size_t)ad;
1698  const size_t kv_elems = (size_t)H_kv * (size_t)T * (size_t)ad;
1699  /* Zero the aligned outputs so padded lanes never leak garbage to downstream GEMMs. */
1700  for (size_t idx = 0; idx < d_q_elems; ++idx) {
1701  d_q[idx] = 0.0f;
1702  }
1703  for (size_t idx = 0; idx < kv_elems; ++idx) {
1704  d_k[idx] = 0.0f;
1705  d_v[idx] = 0.0f;
1706  }
1707 
1708  // Process each query head
1709  for (int h = 0; h < H; ++h) {
1710  // Which KV head does this query head use?
1711  int kv_h = (int)((long long)h * (long long)H_kv / (long long)H);
1712 
1713  // ----------------------------------------------------------------
1714  // Step 1: d_weights = d_output @ V^T and d_v += weights^T @ d_output
1715  // ----------------------------------------------------------------
1716  // For each query position i, compute d_weights[i, j] for j <= i
1717  // and accumulate d_v[j] contributions
1718 
1719  for (int i = 0; i < T; ++i) {
1720  size_t d_out_base = qkv_index(h, i, 0, T, ad);
1721 
1722  for (int j = 0; j <= i; ++j) {
1723  size_t v_base = qkv_index(kv_h, j, 0, T, ad);
1724  size_t w_idx = score_index(h, i, j, aw);
1725  float w = attn_weights[w_idx];
1726 
1727  // d_weights[h, i, j] = d_output[h, i, :] @ v[kv_h, j, :]^T
1728  float dot = 0.0f;
1729  for (int dd = 0; dd < hd; ++dd) {
1730  dot += d_output[d_out_base + dd] * v[v_base + dd];
1731  }
1732  d_scores[w_idx] = dot;
1733 
1734  // d_v[kv_h, j, :] += weights[h, i, j] * d_output[h, i, :]
1735  for (int dd = 0; dd < hd; ++dd) {
1736  d_v[v_base + dd] += w * d_output[d_out_base + dd];
1737  }
1738  }
1739 
1740  // Zero out upper triangle of d_scores
1741  for (int j = i + 1; j < T; ++j) {
1742  d_scores[score_index(h, i, j, aw)] = 0.0f;
1743  }
1744  /* Scores scratch uses aligned_context_window, zero the padded columns. */
1745  for (int j = T; j < aw; ++j) {
1746  d_scores[score_index(h, i, j, aw)] = 0.0f;
1747  }
1748  }
1749 
1750  // ----------------------------------------------------------------
1751  // Step 2: Backward through softmax (in-place on d_scores for this head)
1752  // ----------------------------------------------------------------
1753  // d_scores = softmax_backward(d_scores, attn_weights)
1754  // Formula: d_score[i,j] = w[i,j] * (d_w[i,j] - sum_k(w[i,k] * d_w[i,k]))
1755 
1756  for (int i = 0; i < T; ++i) {
1757  int base = h * aw * aw + i * aw;
1758 
1759  // Compute dot product: sum_j w[i,j] * d_w[i,j]
1760  float dot_product = 0.0f;
1761  for (int j = 0; j <= i; ++j) {
1762  float wt = attn_weights[base + j];
1763  float dw = d_scores[base + j];
1764  dot_product += wt * dw;
1765  }
1766 
1767  // Apply softmax backward formula
1768  for (int j = 0; j <= i; ++j) {
1769  float wt = attn_weights[base + j];
1770  float dw = d_scores[base + j];
1771  d_scores[base + j] = wt * (dw - dot_product);
1772  }
1773  }
1774 
1775  // ----------------------------------------------------------------
1776  // Step 3: d_q = d_scores @ K * scale
1777  // d_k += d_scores^T @ Q * scale
1778  // ----------------------------------------------------------------
1779 
1780  for (int i = 0; i < T; ++i) {
1781  size_t d_q_base = qkv_index(h, i, 0, T, ad);
1782  size_t q_base = qkv_index(h, i, 0, T, ad);
1783 
1784  // d_q[h, i, :] = sum_j d_scores[h, i, j] * k[kv_h, j, :] * scale
1785  // d_k[kv_h, j, :] += d_scores[h, i, j] * q[h, i, :] * scale
1786  for (int j = 0; j <= i; ++j) {
1787  size_t k_base = qkv_index(kv_h, j, 0, T, ad);
1788  size_t d_k_base = qkv_index(kv_h, j, 0, T, ad);
1789  float ds = d_scores[score_index(h, i, j, aw)] * scale;
1790 
1791  for (int dd = 0; dd < hd; ++dd) {
1792  d_q[d_q_base + dd] += ds * k[k_base + dd];
1793  d_k[d_k_base + dd] += ds * q[q_base + dd];
1794  }
1795  }
1796  }
1797  }
1798 }
static size_t qkv_index(int h, int t, int d, int num_tokens, int aligned_head_dim)
static size_t score_index(int h, int i, int j, int aligned_context_window)

References qkv_index(), and score_index().

Referenced by attention_backward_causal_head_major(), attention_backward_causal_head_major_gqa_bf16(), and ck_layer_backward_rmsnorm_swiglu().

◆ attention_backward_causal_head_major_gqa_bf16()

void attention_backward_causal_head_major_gqa_bf16 ( const uint16_t *  d_output,
float *  d_x,
const uint16_t *  q,
const uint16_t *  k,
const uint16_t *  v,
const float *  attn_weights,
float *  d_q,
float *  d_k,
float *  d_v,
float *  d_scores,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window,
float *  scratch_d_output,
float *  scratch_q,
float *  scratch_k,
float *  scratch_v 
)

BF16 attention backward with caller-provided scratch buffers

Test:
bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_backward

Accepts BF16 inputs, converts to FP32, runs FP32 backward. Caller provides scratch buffers (no per-call malloc).

After changes: make test

Definition at line 1619 of file attention_kernels.c.

1640 {
1641  (void)d_x;
1642  const size_t head_elems = (size_t)num_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
1643  const size_t kv_elems = (size_t)num_kv_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
1644 
1645  if (!scratch_d_output || !scratch_q || !scratch_k || !scratch_v) return;
1646 
1647  convert_bf16_tensor_to_buf(d_output, scratch_d_output, head_elems);
1648  convert_bf16_tensor_to_buf(q, scratch_q, head_elems);
1649  convert_bf16_tensor_to_buf(k, scratch_k, kv_elems);
1650  convert_bf16_tensor_to_buf(v, scratch_v, kv_elems);
1651 
1652  attention_backward_causal_head_major_gqa(scratch_d_output, scratch_q, scratch_k, scratch_v,
1653  attn_weights,
1654  d_q, d_k, d_v, d_scores,
1655  num_heads, num_kv_heads,
1656  num_tokens, head_dim,
1657  aligned_head_dim, aligned_context_window);
1658  /* No free - caller owns scratch buffers */
1659 }
static void convert_bf16_tensor_to_buf(const uint16_t *src, float *dst, size_t count)

References attention_backward_causal_head_major_gqa(), and convert_bf16_tensor_to_buf().

◆ attention_flash_decode()

void attention_flash_decode ( float *  out,
const float *  q,
const float *  k,
const float *  v,
int  T_q,
int  T_k,
int  H,
int  D_h,
float  scale 
)

Main flash attention function with SIMD dispatch.

Parameters
outOutput [T_q, H, D_h]
qQuery [T_q, H, D_h]
kKey [T_k, H, D_h]
vValue [T_k, H, D_h]
T_qNumber of query tokens (1 for decode)
T_kNumber of key/value tokens (context length)
HNumber of heads
D_hHead dimension
scale1/sqrt(D_h)

Definition at line 696 of file attention_flash_true.c.

706 {
707  if (!out || !q || !k || !v) {
708  return;
709  }
710  if (T_q <= 0 || T_k <= 0 || H <= 0 || D_h <= 0) {
711  return;
712  }
713 
714  // Dispatch based on CPU features
715 #if defined(__AVX512F__)
716  attention_flash_decode_avx512(out, q, k, v, T_q, T_k, H, D_h, scale);
717 #elif defined(__AVX__) && !defined(__AVX512F__)
718  attention_flash_decode_avx(out, q, k, v, T_q, T_k, H, D_h, scale);
719 #else
720  attention_flash_decode_scalar(out, q, k, v, T_q, T_k, H, D_h, scale);
721 #endif
722 }
static void attention_flash_decode_scalar(float *out, const float *q, const float *k, const float *v, int T_q, int T_k, int H, int D_h, float scale)
Scalar flash-style attention (online softmax)

References attention_flash_decode_scalar().

Referenced by attention_forward_decode_head_major_gqa_flash(), ck_attention_flash_decode_wrapper(), mega_fused_attention_prefill(), and mega_fused_attention_prefill_q8_0().

◆ attention_forward_causal_head_major()

void attention_forward_causal_head_major ( const float *  q,
const float *  k,
const float *  v,
float *  scores,
float *  output,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

Causal attention forward (score-matrix version)

Test:

test_attention.py::TestAttentionForward::test_causal_forward

test_attention.py::TestAttentionForward::test_gqa_broadcast

test_attention.py::TestAttentionForward::test_exact_vs_fast

test_parity.py::test_attention_parity

Computes softmax(Q @ K^T / sqrt(d)) @ V with causal masking. Uses O(N^2) memory for scores matrix.

After changes: make test && make llamacpp-parity-full

Definition at line 70 of file attention_kernels.c.

80 {
81  const float scale = 1.0f / sqrtf((float)head_dim);
82 
83  // Phase 1: compute scaled dot-product scores Q·K^T / sqrt(d_k),
84  // lower triangle only (j <= i).
85  for (int h = 0; h < num_heads; ++h) {
86  for (int i = 0; i < num_tokens; ++i) {
87  for (int j = 0; j <= i; ++j) {
88  float dot = 0.0f;
89  size_t base_q = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
90  size_t base_k = qkv_index(h, j, 0, num_tokens, aligned_head_dim);
91 
92  for (int d = 0; d < head_dim; ++d) {
93  dot += q[base_q + d] * k[base_k + d];
94  }
95 
96  scores[score_index(h, i, j, aligned_context_window)] = dot * scale;
97  }
98 
99  // Ensure upper triangle is zeroed so there are no stale values
100  // before the softmax kernel runs.
101  for (int j = i + 1; j < num_tokens; ++j) {
102  scores[score_index(h, i, j, aligned_context_window)] = 0.0f;
103  }
104  }
105  }
106 
107  // Phase 2: apply causal row-wise softmax in-place over j <= i.
108  causal_softmax_head_major(scores,
109  num_heads,
110  num_tokens,
111  aligned_context_window);
112 
113  // Phase 3: attention weights · V.
114  for (int h = 0; h < num_heads; ++h) {
115  for (int i = 0; i < num_tokens; ++i) {
116  size_t out_base = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
117 
118  // Zero the full aligned head slice so padded dims stay clean.
119  for (int d = 0; d < aligned_head_dim; ++d) {
120  output[out_base + d] = 0.0f;
121  }
122 
123  // Weighted sum over causal positions.
124  for (int j = 0; j <= i; ++j) {
125  float w = scores[score_index(h, i, j, aligned_context_window)];
126  size_t v_base = qkv_index(h, j, 0, num_tokens, aligned_head_dim);
127 
128  for (int d = 0; d < head_dim; ++d) {
129  output[out_base + d] += w * v[v_base + d];
130  }
131  }
132  }
133  }
134 }
void causal_softmax_head_major(float *scores, int num_heads, int num_tokens, int aligned_context_window)

References causal_softmax_head_major(), qkv_index(), and score_index().

◆ attention_forward_causal_head_major_exact()

void attention_forward_causal_head_major_exact ( const float *  q,
const float *  k,
const float *  v,
float *  scores,
float *  output,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

Causal attention forward (exact version using stdlib expf)

Test:

test_attention.py::TestAttentionForward::test_exact_single

test_attention.py::TestAttentionForward::test_exact_vs_fast

Uses standard library expf for numerical accuracy reference. Slower but provides maximum accuracy.

After changes: make test

Definition at line 146 of file attention_kernels.c.

156 {
157  const float scale = 1.0f / sqrtf((float)head_dim);
158 
159  // Phase 1: compute scaled dot-product scores Q·K^T / sqrt(d_k),
160  // lower triangle only (j <= i).
161  for (int h = 0; h < num_heads; ++h) {
162  for (int i = 0; i < num_tokens; ++i) {
163  for (int j = 0; j <= i; ++j) {
164  float dot = 0.0f;
165  size_t base_q = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
166  size_t base_k = qkv_index(h, j, 0, num_tokens, aligned_head_dim);
167 
168  for (int d = 0; d < head_dim; ++d) {
169  dot += q[base_q + d] * k[base_k + d];
170  }
171 
172  scores[score_index(h, i, j, aligned_context_window)] = dot * scale;
173  }
174 
175  // Ensure upper triangle is zeroed so there are no stale values
176  // before the softmax kernel runs.
177  for (int j = i + 1; j < num_tokens; ++j) {
178  scores[score_index(h, i, j, aligned_context_window)] = 0.0f;
179  }
180  }
181  }
182 
183  // Phase 2: apply causal row-wise softmax using exact expf.
184  causal_softmax_head_major_exact(scores,
185  num_heads,
186  num_tokens,
187  aligned_context_window);
188 
189  // Phase 3: attention weights · V.
190  for (int h = 0; h < num_heads; ++h) {
191  for (int i = 0; i < num_tokens; ++i) {
192  size_t out_base = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
193 
194  // Zero the full aligned head slice so padded dims stay clean.
195  for (int d = 0; d < aligned_head_dim; ++d) {
196  output[out_base + d] = 0.0f;
197  }
198 
199  // Weighted sum over causal positions.
200  for (int j = 0; j <= i; ++j) {
201  float w = scores[score_index(h, i, j, aligned_context_window)];
202  size_t v_base = qkv_index(h, j, 0, num_tokens, aligned_head_dim);
203 
204  for (int d = 0; d < head_dim; ++d) {
205  output[out_base + d] += w * v[v_base + d];
206  }
207  }
208  }
209  }
210 }
void causal_softmax_head_major_exact(float *scores, int num_heads, int num_tokens, int aligned_context_window)

References causal_softmax_head_major_exact(), qkv_index(), and score_index().

◆ attention_forward_causal_head_major_gqa()

void attention_forward_causal_head_major_gqa ( const float *  q,
const float *  k,
const float *  v,
float *  scores,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

GQA causal attention forward (score-matrix version)

Test:

test_attention.py::TestAttentionForward::test_gqa_forward

test_attention.py::TestAttentionForward::test_gqa_broadcast

test_attention_backward.py::TestAttentionBackwardGQA::test_gqa_backward

test_parity.py::test_attention_gqa_parity

Grouped-query attention: Q has num_heads, K/V have num_kv_heads. Each query head maps to a KV head via ratio.

After changes: make test && make llamacpp-parity-full

Definition at line 224 of file attention_kernels.c.

235 {
236  const float scale = 1.0f / sqrtf((float)head_dim);
237 
238  for (int h = 0; h < num_heads; ++h) {
239  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
240  for (int i = 0; i < num_tokens; ++i) {
241  for (int j = 0; j <= i; ++j) {
242  float dot = 0.0f;
243  size_t base_q = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
244  size_t base_k = qkv_index(kv_head, j, 0, num_tokens, aligned_head_dim);
245 
246  for (int d = 0; d < head_dim; ++d) {
247  dot += q[base_q + d] * k[base_k + d];
248  }
249 
250  scores[score_index(h, i, j, aligned_context_window)] = dot * scale;
251  }
252 
253  for (int j = i + 1; j < num_tokens; ++j) {
254  scores[score_index(h, i, j, aligned_context_window)] = 0.0f;
255  }
256  }
257  }
258 
259  causal_softmax_head_major(scores,
260  num_heads,
261  num_tokens,
262  aligned_context_window);
263 
264  for (int h = 0; h < num_heads; ++h) {
265  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
266  for (int i = 0; i < num_tokens; ++i) {
267  size_t out_base = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
268  for (int d = 0; d < aligned_head_dim; ++d) {
269  output[out_base + d] = 0.0f;
270  }
271 
272  for (int j = 0; j <= i; ++j) {
273  float w = scores[score_index(h, i, j, aligned_context_window)];
274  size_t v_base = qkv_index(kv_head, j, 0, num_tokens, aligned_head_dim);
275 
276  for (int d = 0; d < head_dim; ++d) {
277  output[out_base + d] += w * v[v_base + d];
278  }
279  }
280  }
281  }
282 }

References causal_softmax_head_major(), qkv_index(), and score_index().

Referenced by ck_layer_forward_rmsnorm_swiglu(), ck_layer_forward_rmsnorm_swiglu_q4_k(), ck_layer_forward_rmsnorm_swiglu_quant(), and ck_layer_forward_rmsnorm_swiglu_ref().

◆ attention_forward_causal_head_major_gqa_bf16()

void attention_forward_causal_head_major_gqa_bf16 ( const uint16_t *  q,
const uint16_t *  k,
const uint16_t *  v,
float *  scores,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window,
float *  scratch_q,
float *  scratch_k,
float *  scratch_v 
)

BF16 GQA causal attention forward

Test:

bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_forward

bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_gqa

bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_flash

Accepts BF16 inputs, converts to FP32, uses exact softmax. Caller provides scratch buffers (no per-call malloc).

After changes: make test

Definition at line 366 of file attention_kernels.c.

380 {
381  const size_t q_elems = (size_t)num_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
382  const size_t kv_elems = (size_t)num_kv_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
383 
384  if (!scratch_q || !scratch_k || !scratch_v) return;
385 
386  convert_bf16_tensor_to_buf(q, scratch_q, q_elems);
387  convert_bf16_tensor_to_buf(k, scratch_k, kv_elems);
388  convert_bf16_tensor_to_buf(v, scratch_v, kv_elems);
389 
390  // Use exact version to avoid fast exp approximation error accumulating
391  // with BF16 precision loss.
392  attention_forward_causal_head_major_gqa_exact(scratch_q, scratch_k, scratch_v,
393  scores, output,
394  num_heads, num_kv_heads,
395  num_tokens, head_dim,
396  aligned_head_dim, aligned_context_window);
397  /* No free - caller owns scratch buffers */
398 }
void attention_forward_causal_head_major_gqa_exact(const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)

References attention_forward_causal_head_major_gqa_exact(), and convert_bf16_tensor_to_buf().

◆ attention_forward_causal_head_major_gqa_exact()

void attention_forward_causal_head_major_gqa_exact ( const float *  q,
const float *  k,
const float *  v,
float *  scores,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  aligned_context_window 
)

GQA causal attention forward (exact version using stdlib expf)

Test:

test_attention.py::TestAttentionForward::test_gqa_exact

bf16/test_attention_bf16.py::TestAttentionBF16::test_bf16_gqa

Uses standard library expf for numerical accuracy reference. Used by BF16 wrapper to avoid approximation error accumulation.

After changes: make test

Definition at line 294 of file attention_kernels.c.

305 {
306  const float scale = 1.0f / sqrtf((float)head_dim);
307 
308  for (int h = 0; h < num_heads; ++h) {
309  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
310  for (int i = 0; i < num_tokens; ++i) {
311  for (int j = 0; j <= i; ++j) {
312  float dot = 0.0f;
313  size_t base_q = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
314  size_t base_k = qkv_index(kv_head, j, 0, num_tokens, aligned_head_dim);
315 
316  for (int d = 0; d < head_dim; ++d) {
317  dot += q[base_q + d] * k[base_k + d];
318  }
319 
320  scores[score_index(h, i, j, aligned_context_window)] = dot * scale;
321  }
322 
323  for (int j = i + 1; j < num_tokens; ++j) {
324  scores[score_index(h, i, j, aligned_context_window)] = 0.0f;
325  }
326  }
327  }
328 
329  // Use exact softmax with standard library expf
330  causal_softmax_head_major_exact(scores,
331  num_heads,
332  num_tokens,
333  aligned_context_window);
334 
335  for (int h = 0; h < num_heads; ++h) {
336  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
337  for (int i = 0; i < num_tokens; ++i) {
338  size_t out_base = qkv_index(h, i, 0, num_tokens, aligned_head_dim);
339  for (int d = 0; d < aligned_head_dim; ++d) {
340  output[out_base + d] = 0.0f;
341  }
342 
343  for (int j = 0; j <= i; ++j) {
344  float w = scores[score_index(h, i, j, aligned_context_window)];
345  size_t v_base = qkv_index(kv_head, j, 0, num_tokens, aligned_head_dim);
346 
347  for (int d = 0; d < head_dim; ++d) {
348  output[out_base + d] += w * v[v_base + d];
349  }
350  }
351  }
352  }
353 }

References causal_softmax_head_major_exact(), qkv_index(), and score_index().

Referenced by attention_forward_causal_head_major_gqa_bf16().

◆ attention_forward_causal_head_major_gqa_flash()

void attention_forward_causal_head_major_gqa_flash ( const float *  q,
const float *  k,
const float *  v,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim 
)

Flash attention forward for GQA (prefill, no score materialization)

Test:

test_flash_attention.py::TestFlashAttention::test_flash_forward

test_flash_attention.py::TestFlashAttention::test_flash_vs_score_matrix

test_flash_attention.py::TestFlashAttention::test_flash_gqa

test_attention.py::TestAttentionForward::test_flash_forward

Online softmax with streaming KV. O(N) memory instead of O(N^2). For prefill: all tokens attend to previous tokens.

After changes: make test && make llamacpp-parity-full

Definition at line 800 of file attention_kernels.c.

809 {
810  if (!q || !k || !v || !output) {
811  return;
812  }
813  if (num_heads <= 0 || num_kv_heads <= 0 || num_tokens <= 0) {
814  return;
815  }
816 
817  const float scale = 1.0f / sqrtf((float)head_dim);
818  const int T = num_tokens;
819 
820  // Select SIMD implementation based on compile-time CPU features
821 #if defined(__AVX512F__)
822  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx512
823 #elif defined(__AVX2__)
824  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx2
825 #elif defined(__AVX__)
826  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx
827 #else
828  #define FLASH_QUERY_IMPL attention_flash_query_causal
829 #endif
830 
831  for (int h = 0; h < num_heads; ++h) {
832  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
833  const float *k_head = k + (size_t)kv_head * (size_t)T * (size_t)aligned_head_dim;
834  const float *v_head = v + (size_t)kv_head * (size_t)T * (size_t)aligned_head_dim;
835 
836  for (int i = 0; i < T; ++i) {
837  const float *q_vec = q + qkv_index(h, i, 0, T, aligned_head_dim);
838  float *out_vec = output + qkv_index(h, i, 0, T, aligned_head_dim);
839  FLASH_QUERY_IMPL(q_vec, k_head, v_head,
840  /*kv_tokens=*/i + 1,
841  head_dim, aligned_head_dim,
842  scale, out_vec);
843  }
844  }
845 
846 #undef FLASH_QUERY_IMPL
847 }
#define FLASH_QUERY_IMPL

References FLASH_QUERY_IMPL, and qkv_index().

Referenced by ck_layer_forward_rmsnorm_swiglu(), ck_layer_forward_rmsnorm_swiglu_q4_k(), ck_layer_forward_rmsnorm_swiglu_quant(), ck_layer_forward_rmsnorm_swiglu_ref(), qwen2_0_5b_decode_layer_0_prefill(), qwen2_0_5b_decode_layer_10_prefill(), qwen2_0_5b_decode_layer_11_prefill(), qwen2_0_5b_decode_layer_12_prefill(), qwen2_0_5b_decode_layer_13_prefill(), qwen2_0_5b_decode_layer_14_prefill(), qwen2_0_5b_decode_layer_15_prefill(), qwen2_0_5b_decode_layer_16_prefill(), qwen2_0_5b_decode_layer_17_prefill(), qwen2_0_5b_decode_layer_18_prefill(), qwen2_0_5b_decode_layer_19_prefill(), qwen2_0_5b_decode_layer_1_prefill(), qwen2_0_5b_decode_layer_20_prefill(), qwen2_0_5b_decode_layer_21_prefill(), qwen2_0_5b_decode_layer_22_prefill(), qwen2_0_5b_decode_layer_23_prefill(), qwen2_0_5b_decode_layer_2_prefill(), qwen2_0_5b_decode_layer_3_prefill(), qwen2_0_5b_decode_layer_4_prefill(), qwen2_0_5b_decode_layer_5_prefill(), qwen2_0_5b_decode_layer_6_prefill(), qwen2_0_5b_decode_layer_7_prefill(), qwen2_0_5b_decode_layer_8_prefill(), and qwen2_0_5b_decode_layer_9_prefill().

◆ attention_forward_causal_head_major_gqa_flash_strided()

void attention_forward_causal_head_major_gqa_flash_strided ( const float *  q,
const float *  k,
const float *  v,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  kv_stride_tokens 
)

Flash attention forward with custom KV stride (for KV cache)

Test:

test_flash_attention.py::TestFlashAttention::test_flash_strided

test_kv_cache_attention.py::TestKVCacheAttention::test_flash_attention

Variant with configurable kv_stride_tokens for KV cache layouts where K/V may not be contiguous in memory.

After changes: make test

Definition at line 859 of file attention_kernels.c.

869 {
870  if (!q || !k || !v || !output) {
871  return;
872  }
873  if (num_heads <= 0 || num_kv_heads <= 0 || num_tokens <= 0) {
874  return;
875  }
876  if (kv_stride_tokens < num_tokens) {
877  return;
878  }
879 
880  const float scale = 1.0f / sqrtf((float)head_dim);
881  const int T = num_tokens;
882  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
883 
884  // Select SIMD implementation based on compile-time CPU features
885 #if defined(__AVX512F__)
886  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx512
887 #elif defined(__AVX2__)
888  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx2
889 #elif defined(__AVX__)
890  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx
891 #else
892  #define FLASH_QUERY_IMPL attention_flash_query_causal
893 #endif
894 
895  for (int h = 0; h < num_heads; ++h) {
896  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
897  const float *k_head = k + (size_t)kv_head * kv_head_stride;
898  const float *v_head = v + (size_t)kv_head * kv_head_stride;
899 
900  for (int i = 0; i < T; ++i) {
901  const float *q_vec = q + qkv_index(h, i, 0, T, aligned_head_dim);
902  float *out_vec = output + qkv_index(h, i, 0, T, aligned_head_dim);
903  FLASH_QUERY_IMPL(q_vec, k_head, v_head,
904  /*kv_tokens=*/i + 1,
905  head_dim, aligned_head_dim,
906  scale, out_vec);
907  }
908  }
909 
910 #undef FLASH_QUERY_IMPL
911 }

References FLASH_QUERY_IMPL, and qkv_index().

Referenced by mega_fused_attention_prefill(), mega_fused_attention_prefill_q8_0(), model_layer_0_prefill(), model_layer_10_prefill(), model_layer_11_prefill(), model_layer_12_prefill(), model_layer_13_prefill(), model_layer_14_prefill(), model_layer_15_prefill(), model_layer_16_prefill(), model_layer_17_prefill(), model_layer_18_prefill(), model_layer_19_prefill(), model_layer_1_prefill(), model_layer_20_prefill(), model_layer_21_prefill(), model_layer_22_prefill(), model_layer_23_prefill(), model_layer_2_prefill(), model_layer_3_prefill(), model_layer_4_prefill(), model_layer_5_prefill(), model_layer_6_prefill(), model_layer_7_prefill(), model_layer_8_prefill(), model_layer_9_prefill(), qwen2_0_5b_decode_layer_0_prefill(), qwen2_0_5b_decode_layer_10_prefill(), qwen2_0_5b_decode_layer_11_prefill(), qwen2_0_5b_decode_layer_12_prefill(), qwen2_0_5b_decode_layer_13_prefill(), qwen2_0_5b_decode_layer_14_prefill(), qwen2_0_5b_decode_layer_15_prefill(), qwen2_0_5b_decode_layer_16_prefill(), qwen2_0_5b_decode_layer_17_prefill(), qwen2_0_5b_decode_layer_18_prefill(), qwen2_0_5b_decode_layer_19_prefill(), qwen2_0_5b_decode_layer_1_prefill(), qwen2_0_5b_decode_layer_20_prefill(), qwen2_0_5b_decode_layer_21_prefill(), qwen2_0_5b_decode_layer_22_prefill(), qwen2_0_5b_decode_layer_23_prefill(), qwen2_0_5b_decode_layer_2_prefill(), qwen2_0_5b_decode_layer_3_prefill(), qwen2_0_5b_decode_layer_4_prefill(), qwen2_0_5b_decode_layer_5_prefill(), qwen2_0_5b_decode_layer_6_prefill(), qwen2_0_5b_decode_layer_7_prefill(), qwen2_0_5b_decode_layer_8_prefill(), and qwen2_0_5b_decode_layer_9_prefill().

◆ attention_forward_causal_head_major_gqa_flash_strided_sliding()

void attention_forward_causal_head_major_gqa_flash_strided_sliding ( const float *  q,
const float *  k,
const float *  v,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  kv_stride_tokens,
int  sliding_window 
)

Flash attention forward with sliding window (prefill)

Test:
test_attention.py::TestAttentionForward::test_sliding_window_prefill

Sliding-window attention for prefill: each token attends to the last W tokens. When sliding_window <= 0, behaves like regular causal attention.

After changes: make test

Definition at line 1316 of file attention_kernels.c.

1328 {
1329  if (!q || !k || !v || !output) {
1330  return;
1331  }
1332  if (num_heads <= 0 || num_kv_heads <= 0 || num_tokens <= 0) {
1333  return;
1334  }
1335  if (kv_stride_tokens < num_tokens) {
1336  return;
1337  }
1338 
1339  const float scale = 1.0f / sqrtf((float)head_dim);
1340  const int T = num_tokens;
1341  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
1342 
1343 #if defined(__AVX512F__)
1344  #define SLIDING_FLASH_IMPL attention_flash_query_sliding_avx512
1345 #elif defined(__AVX2__)
1346  #define SLIDING_FLASH_IMPL attention_flash_query_sliding_avx2
1347 #elif defined(__AVX__)
1348  #define SLIDING_FLASH_IMPL attention_flash_query_sliding_avx
1349 #else
1350  #define SLIDING_FLASH_IMPL attention_flash_query_sliding
1351 #endif
1352 
1353  for (int h = 0; h < num_heads; ++h) {
1354  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1355  const float *k_head = k + (size_t)kv_head * kv_head_stride;
1356  const float *v_head = v + (size_t)kv_head * kv_head_stride;
1357 
1358  for (int i = 0; i < T; ++i) {
1359  const float *q_vec = q + qkv_index(h, i, 0, T, aligned_head_dim);
1360  float *out_vec = output + qkv_index(h, i, 0, T, aligned_head_dim);
1361  SLIDING_FLASH_IMPL(q_vec, k_head, v_head,
1362  /*query_pos=*/i,
1363  /*kv_tokens=*/T,
1364  head_dim, aligned_head_dim,
1365  scale, out_vec,
1366  sliding_window);
1367  }
1368  }
1369 
1370 #undef SLIDING_FLASH_IMPL
1371 }
#define SLIDING_FLASH_IMPL

References qkv_index(), and SLIDING_FLASH_IMPL.

◆ attention_forward_decode_head_major_gqa_flash()

void attention_forward_decode_head_major_gqa_flash ( const float *  q_token,
const float *  k_cache,
const float *  v_cache,
float *  out_token,
int  num_heads,
int  num_kv_heads,
int  kv_tokens,
int  cache_capacity,
int  head_dim,
int  aligned_head_dim 
)

Flash attention decode (single token attends to KV cache)

Test:

test_flash_attention.py::TestFlashAttention::test_flash_decode

test_kv_cache_attention.py::TestKVCacheAttention::test_flash_decode

test_fused_attention_decode.py::TestFusedAttentionDecode::test_flash_decode

test_attention.py::TestAttentionForward::test_flash_decode

Single query token attends to kv_tokens in KV cache. Uses true flash attention from attention_flash_true.c.

After changes: make test && make llamacpp-parity-full

Definition at line 1467 of file attention_kernels.c.

1477 {
1478  if (!q_token || !k_cache || !v_cache || !out_token) {
1479  return;
1480  }
1481  if (num_heads <= 0 || num_kv_heads <= 0 || kv_tokens <= 0 || cache_capacity <= 0) {
1482  return;
1483  }
1484  if (kv_tokens > cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) {
1485  return;
1486  }
1487 
1488  const float scale = 1.0f / sqrtf((float)head_dim);
1489  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
1490 
1491  for (int h = 0; h < num_heads; ++h) {
1492  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1493  const float *q_head = q_token + (size_t)h * (size_t)aligned_head_dim;
1494  const float *k_head = k_cache + (size_t)kv_head * head_stride;
1495  const float *v_head = v_cache + (size_t)kv_head * head_stride;
1496  float *out_head = out_token + (size_t)h * (size_t)aligned_head_dim;
1497 
1498  attention_flash_decode(out_head,
1499  q_head,
1500  k_head,
1501  v_head,
1502  1,
1503  kv_tokens,
1504  1,
1505  aligned_head_dim,
1506  scale);
1507  }
1508 }
void attention_flash_decode(float *out, const float *q, const float *k, const float *v, int T_q, int T_k, int H, int D_h, float scale)
Main flash attention function with SIMD dispatch.

References attention_flash_decode().

Referenced by model_layer_0_decode(), model_layer_10_decode(), model_layer_11_decode(), model_layer_12_decode(), model_layer_13_decode(), model_layer_14_decode(), model_layer_15_decode(), model_layer_16_decode(), model_layer_17_decode(), model_layer_18_decode(), model_layer_19_decode(), model_layer_1_decode(), model_layer_20_decode(), model_layer_21_decode(), model_layer_22_decode(), model_layer_23_decode(), model_layer_2_decode(), model_layer_3_decode(), model_layer_4_decode(), model_layer_5_decode(), model_layer_6_decode(), model_layer_7_decode(), model_layer_8_decode(), model_layer_9_decode(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ attention_forward_decode_head_major_gqa_flash_sliding()

void attention_forward_decode_head_major_gqa_flash_sliding ( const float *  q_token,
const float *  k_cache,
const float *  v_cache,
float *  out_token,
int  num_heads,
int  num_kv_heads,
int  kv_tokens,
int  cache_capacity,
int  head_dim,
int  aligned_head_dim,
int  sliding_window 
)

Flash attention decode with sliding window

Test:
test_attention.py::TestAttentionForward::test_sliding_window_decode

Single query token attends to the last W tokens in the KV cache. For decode: effective_kv_tokens = min(kv_tokens, sliding_window)

After changes: make test

Definition at line 1382 of file attention_kernels.c.

1394 {
1395  if (!q_token || !k_cache || !v_cache || !out_token) {
1396  return;
1397  }
1398  if (num_heads <= 0 || num_kv_heads <= 0 || cache_capacity <= 0) {
1399  return;
1400  }
1401  if (kv_tokens <= 0 || kv_tokens > cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) {
1402  return;
1403  }
1404 
1405  const float scale = 1.0f / sqrtf((float)head_dim);
1406  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
1407 
1408  // Compute effective KV tokens based on sliding window
1409  int effective_kv_tokens = kv_tokens;
1410  if (sliding_window > 0 && sliding_window < kv_tokens) {
1411  effective_kv_tokens = sliding_window;
1412  }
1413 
1414  // Guard against empty window (shouldn't happen with kv_tokens >= 1)
1415  if (effective_kv_tokens <= 0) {
1416  return;
1417  }
1418 
1419  // Offset to start reading from the last effective_kv_tokens entries
1420  int kv_start_offset = kv_tokens - effective_kv_tokens;
1421 
1422 #if defined(__AVX512F__)
1423  #define SLIDING_DECODE_IMPL attention_flash_query_sliding_avx512
1424 #elif defined(__AVX2__)
1425  #define SLIDING_DECODE_IMPL attention_flash_query_sliding_avx2
1426 #elif defined(__AVX__)
1427  #define SLIDING_DECODE_IMPL attention_flash_query_sliding_avx
1428 #else
1429  #define SLIDING_DECODE_IMPL attention_flash_query_sliding
1430 #endif
1431 
1432  for (int h = 0; h < num_heads; ++h) {
1433  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1434  const float *q_head = q_token + (size_t)h * (size_t)aligned_head_dim;
1435  // Offset K/V pointer to start from the first token in the sliding window
1436  const float *k_head = k_cache + (size_t)kv_head * head_stride
1437  + (size_t)kv_start_offset * (size_t)aligned_head_dim;
1438  const float *v_head = v_cache + (size_t)kv_head * head_stride
1439  + (size_t)kv_start_offset * (size_t)aligned_head_dim;
1440  float *out_head = out_token + (size_t)h * (size_t)aligned_head_dim;
1441 
1442  // Use query_pos relative to the windowed KV (last token = effective_kv_tokens - 1)
1443  // sliding_window = 0 since we've already windowed via K/V pointer offset
1444  SLIDING_DECODE_IMPL(q_head, k_head, v_head,
1445  /*query_pos=*/effective_kv_tokens - 1,
1446  /*kv_tokens=*/effective_kv_tokens,
1447  head_dim, aligned_head_dim,
1448  scale, out_head,
1449  /*sliding_window=*/0);
1450  }
1451 
1452 #undef SLIDING_DECODE_IMPL
1453 }
#define SLIDING_DECODE_IMPL

References SLIDING_DECODE_IMPL.

◆ attention_forward_decode_head_major_gqa_regular()

void attention_forward_decode_head_major_gqa_regular ( const float *  q_token,
const float *  k_cache,
const float *  v_cache,
float *  out_token,
int  num_heads,
int  num_kv_heads,
int  kv_tokens,
int  cache_capacity,
int  head_dim,
int  aligned_head_dim 
)

WARNING: This is NOT true flash attention!

Despite dispatching to helpers named "flash" (attention_flash_query_causal*), this function implements the regular streaming attention decode path. It's kept for reference and as a fallback.

TRUE flash attention is implemented in attention_flash_true.c

Test:

test_kv_cache_attention.py::TestKVCacheAttention::test_regular_decode

test_attention.py::TestAttentionForward::test_regular_decode

Regular attention decode (non-flash reference path) kept as a fallback; it does not materialize a score matrix.

After changes: make test

Definition at line 1524 of file attention_kernels.c.

1534 {
1535  if (!q_token || !k_cache || !v_cache || !out_token) {
1536  return;
1537  }
1538  if (num_heads <= 0 || num_kv_heads <= 0 || kv_tokens <= 0 || cache_capacity <= 0) {
1539  return;
1540  }
1541  if (kv_tokens > cache_capacity) {
1542  return;
1543  }
1544 
1545  const float scale = 1.0f / sqrtf((float)head_dim);
1546  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
1547 
1548  // Select SIMD implementation based on compile-time CPU features
1549 #if defined(__AVX512F__)
1550  #define FLASH_QUERY_IMPL_DECODE attention_flash_query_causal_avx512
1551 #elif defined(__AVX2__)
1552  #define FLASH_QUERY_IMPL_DECODE attention_flash_query_causal_avx2
1553 #elif defined(__AVX__)
1554  #define FLASH_QUERY_IMPL_DECODE attention_flash_query_causal_avx
1555 #else
1556  #define FLASH_QUERY_IMPL_DECODE attention_flash_query_causal
1557 #endif
1558 
1559 #pragma omp parallel for schedule(static) if(num_heads > 1)
1560  for (int h = 0; h < num_heads; ++h) {
1561  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1562  const float *q_vec = q_token + (size_t)h * (size_t)aligned_head_dim;
1563  const float *k_head = k_cache + (size_t)kv_head * head_stride;
1564  const float *v_head = v_cache + (size_t)kv_head * head_stride;
1565  float *out_vec = out_token + (size_t)h * (size_t)aligned_head_dim;
1566 
1567  FLASH_QUERY_IMPL_DECODE(q_vec, k_head, v_head,
1568  kv_tokens, head_dim, aligned_head_dim,
1569  scale, out_vec);
1570  }
1571 
1572 #undef FLASH_QUERY_IMPL_DECODE
1573 }
#define FLASH_QUERY_IMPL_DECODE

References FLASH_QUERY_IMPL_DECODE.

Referenced by ck_attention_flash_decode_wrapper(), ck_layer_forward_rmsnorm_swiglu_decode_fused_attn_impl(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ axpy_2d_f32()

void axpy_2d_f32 ( float *  Y,
const float *  X,
float  alpha,
int  num_tokens,
int  dim,
int  y_stride,
int  x_stride 
)

Batched AXPY for 2D tensors: Y[t,:] += alpha * X[t,:].

Parameters
YOutput tensor [num_tokens, dim]
XInput tensor [num_tokens, dim]
alphaScalar multiplier
num_tokensNumber of tokens
dimHidden dimension
y_strideStride between Y rows (for alignment)
x_strideStride between X rows

Definition at line 221 of file axpy_kernels.c.

228 {
229  if (!Y || !X || num_tokens <= 0 || dim <= 0) {
230  return;
231  }
232 
233  /* Default strides if not specified */
234  if (y_stride <= 0) y_stride = dim;
235  if (x_stride <= 0) x_stride = dim;
236 
237  for (int t = 0; t < num_tokens; t++) {
238  axpy_f32(Y + t * y_stride, X + t * x_stride, alpha, dim);
239  }
240 }
void axpy_f32(float *y, const float *x, float alpha, int n)
In-place AXPY: y += alpha * x.
Definition: axpy_kernels.c:54

References axpy_f32().

◆ axpy_f32()

void axpy_f32 ( float *  y,
const float *  x,
float  alpha,
int  n 
)

In-place AXPY: y += alpha * x.

Test:

test_axpy.py::TestAXPY::test_axpy_f32

test_axpy.py::TestAXPY::test_axpy_vs_naive

In-place scaled vector addition: y += alpha * x BLAS-like axpy operation.

After changes: make test

Definition at line 54 of file axpy_kernels.c.

58 {
59  if (!y || !x || n <= 0) {
60  return;
61  }
62 
63  int i = 0;
64 
65 #ifdef __AVX512F__
66  __m512 valpha = _mm512_set1_ps(alpha);
67  for (; i + 16 <= n; i += 16) {
68  __m512 vy = _mm512_loadu_ps(&y[i]);
69  __m512 vx = _mm512_loadu_ps(&x[i]);
70  vy = _mm512_fmadd_ps(vx, valpha, vy); /* y = y + alpha * x */
71  _mm512_storeu_ps(&y[i], vy);
72  }
73 #endif
74 
75 #ifdef __AVX2__
76  __m256 valpha256 = _mm256_set1_ps(alpha);
77  for (; i + 8 <= n; i += 8) {
78  __m256 vy = _mm256_loadu_ps(&y[i]);
79  __m256 vx = _mm256_loadu_ps(&x[i]);
80  vy = _mm256_fmadd_ps(vx, valpha256, vy);
81  _mm256_storeu_ps(&y[i], vy);
82  }
83 #endif
84 
85  /* Scalar remainder */
86  for (; i < n; i++) {
87  y[i] += alpha * x[i];
88  }
89 }

Referenced by axpy_2d_f32(), axpy_zero_f32(), moe_accumulate_expert_f32(), and weighted_sum_f32().

◆ axpy_zero_f32()

void axpy_zero_f32 ( float *  y,
const float *  x,
float  alpha,
int  n 
)

Zero output then accumulate: y = 0; y += alpha * x.

Parameters
yOutput vector [n], zeroed then accumulated
xInput vector [n]
alphaScalar multiplier
nVector length

Definition at line 188 of file axpy_kernels.c.

192 {
193  if (!y || n <= 0) {
194  return;
195  }
196 
197  memset(y, 0, n * sizeof(float));
198 
199  if (x) {
200  axpy_f32(y, x, alpha, n);
201  }
202 }

References axpy_f32().

◆ backward_causal_softmax_head_major()

void backward_causal_softmax_head_major ( float *  d_scores,
const float *  weights,
int  num_heads,
int  num_tokens,
int  aligned_context_window 
)

Definition at line 382 of file softmax_kernels.c.

387 {
388  int H = num_heads;
389  int T = num_tokens;
390 
391  for (int h = 0; h < H; ++h) {
392  for (int i = 0; i < T; ++i) {
393  int base = h * aligned_context_window * aligned_context_window
394  + i * aligned_context_window;
395  float *drow = &d_scores[base];
396  const float *wrow = &weights[base];
397  int len = i + 1;
398 
399 #if defined(__AVX512F__)
400  // Compute dot product (vectorized)
401  __m512 dot_vec = _mm512_setzero_ps();
402  int j = 0;
403  for (; j + 16 <= len; j += 16) {
404  __m512 w = _mm512_loadu_ps(&wrow[j]);
405  __m512 dw = _mm512_loadu_ps(&drow[j]);
406  dot_vec = _mm512_fmadd_ps(w, dw, dot_vec);
407  }
408  float dot_product = _mm512_reduce_add_ps(dot_vec);
409  for (; j < len; ++j) {
410  dot_product += wrow[j] * drow[j];
411  }
412 
413  // Compute gradient: d_scores = w * (dw - dot_product)
414  __m512 dot_broadcast = _mm512_set1_ps(dot_product);
415  j = 0;
416  for (; j + 16 <= len; j += 16) {
417  __m512 w = _mm512_loadu_ps(&wrow[j]);
418  __m512 dw = _mm512_loadu_ps(&drow[j]);
419  __m512 diff = _mm512_sub_ps(dw, dot_broadcast);
420  __m512 result = _mm512_mul_ps(w, diff);
421  _mm512_storeu_ps(&drow[j], result);
422  }
423  for (; j < len; ++j) {
424  drow[j] = wrow[j] * (drow[j] - dot_product);
425  }
426 
427  // Zero out future tokens
428  __m512 zero = _mm512_setzero_ps();
429  for (; j + 16 <= T; j += 16) {
430  _mm512_storeu_ps(&drow[j], zero);
431  }
432  for (; j < T; ++j) {
433  drow[j] = 0.0f;
434  }
435 
436 #elif defined(__AVX__)
437  // Compute dot product (vectorized)
438  __m256 dot_vec = _mm256_setzero_ps();
439  int j = 0;
440  for (; j + 8 <= len; j += 8) {
441  __m256 w = _mm256_loadu_ps(&wrow[j]);
442  __m256 dw = _mm256_loadu_ps(&drow[j]);
443  // No FMA in AVX1: use mul + add
444  __m256 prod = _mm256_mul_ps(w, dw);
445  dot_vec = _mm256_add_ps(dot_vec, prod);
446  }
447  float dot_product = hsum256_ps_softmax(dot_vec);
448  for (; j < len; ++j) {
449  dot_product += wrow[j] * drow[j];
450  }
451 
452  // Compute gradient: d_scores = w * (dw - dot_product)
453  __m256 dot_broadcast = _mm256_set1_ps(dot_product);
454  j = 0;
455  for (; j + 8 <= len; j += 8) {
456  __m256 w = _mm256_loadu_ps(&wrow[j]);
457  __m256 dw = _mm256_loadu_ps(&drow[j]);
458  __m256 diff = _mm256_sub_ps(dw, dot_broadcast);
459  __m256 result = _mm256_mul_ps(w, diff);
460  _mm256_storeu_ps(&drow[j], result);
461  }
462  for (; j < len; ++j) {
463  drow[j] = wrow[j] * (drow[j] - dot_product);
464  }
465 
466  // Zero out future tokens
467  __m256 zero = _mm256_setzero_ps();
468  for (; j + 8 <= T; j += 8) {
469  _mm256_storeu_ps(&drow[j], zero);
470  }
471  for (; j < T; ++j) {
472  drow[j] = 0.0f;
473  }
474 
475 #else
476  // Scalar fallback
477  float dot_product = 0.0f;
478  for (int j = 0; j < len; ++j) {
479  dot_product += wrow[j] * drow[j];
480  }
481 
482  for (int j = 0; j < len; ++j) {
483  drow[j] = wrow[j] * (drow[j] - dot_product);
484  }
485 
486  for (int j = len; j < T; ++j) {
487  drow[j] = 0.0f;
488  }
489 #endif
490  }
491  }
492 }

Referenced by backward_causal_softmax_head_major_bf16().

◆ backward_causal_softmax_head_major_bf16()

void backward_causal_softmax_head_major_bf16 ( uint16_t *  d_scores,
const uint16_t *  weights,
int  num_heads,
int  num_tokens,
int  aligned_context_window,
float *  scratch_d_scores,
float *  scratch_weights 
)

Definition at line 53 of file softmax_kernels_bf16.c.

60 {
61  if (!d_scores || !weights || num_heads <= 0 || num_tokens <= 0 || aligned_context_window <= 0) return;
62  if (!scratch_d_scores || !scratch_weights) return;
63 
64  const size_t total = (size_t)num_heads *
65  (size_t)aligned_context_window *
66  (size_t)aligned_context_window;
67 
68  bf16_tensor_to_float(d_scores, scratch_d_scores, total);
69  bf16_tensor_to_float(weights, scratch_weights, total);
70  backward_causal_softmax_head_major(scratch_d_scores, scratch_weights, num_heads, num_tokens, aligned_context_window);
71  float_tensor_to_bf16(scratch_d_scores, d_scores, total);
72 }
static void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
Definition: bf16_utils.h:271
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250
void backward_causal_softmax_head_major(float *d_scores, const float *weights, int num_heads, int num_tokens, int aligned_context_window)

References backward_causal_softmax_head_major(), bf16_tensor_to_float(), and float_tensor_to_bf16().

◆ causal_softmax_head_major()

void causal_softmax_head_major ( float *  scores,
int  num_heads,
int  num_tokens,
int  aligned_context_window 
)

Causal softmax (in-place, row-wise)

Test:

test_softmax.py::TestSoftmaxForward::test_causal_softmax

test_softmax.py::TestSoftmaxForward::test_causal_vs_softmax

test_attention.py::TestAttentionForward::test_softmax_correctness

Applies causal mask (j > i => 0) and softmax to scores matrix. In-place on [num_heads, T, T] scores matrix.

After changes: make test && make llamacpp-parity-full

Definition at line 144 of file softmax_kernels.c.

148 {
149  for (int h = 0; h < num_heads; ++h) {
150  for (int i = 0; i < num_tokens; ++i) {
151  int base = h * aligned_context_window * aligned_context_window
152  + i * aligned_context_window;
153  float *row = &scores[base];
154  int len = i + 1; // Number of valid elements (0..i inclusive)
155 
156 #if defined(__AVX512F__)
157  // Find max (vectorized)
158  __m512 max_vec = _mm512_set1_ps(-INFINITY);
159  int j = 0;
160  for (; j + 16 <= len; j += 16) {
161  __m512 v = _mm512_loadu_ps(&row[j]);
162  max_vec = _mm512_max_ps(max_vec, v);
163  }
164  float max_val = _mm512_reduce_max_ps(max_vec);
165  for (; j < len; ++j) {
166  if (row[j] > max_val) max_val = row[j];
167  }
168 
169  // Compute exp and sum (vectorized)
170  __m512 max_broadcast = _mm512_set1_ps(max_val);
171  __m512 sum_vec = _mm512_setzero_ps();
172  j = 0;
173  for (; j + 16 <= len; j += 16) {
174  __m512 v = _mm512_loadu_ps(&row[j]);
175  __m512 e = exp512_approx(_mm512_sub_ps(v, max_broadcast));
176  _mm512_storeu_ps(&row[j], e);
177  sum_vec = _mm512_add_ps(sum_vec, e);
178  }
179  float sum = _mm512_reduce_add_ps(sum_vec);
180  for (; j < len; ++j) {
181  float e = expf(row[j] - max_val);
182  row[j] = e;
183  sum += e;
184  }
185 
186  // Normalize (vectorized)
187  float inv_sum = 1.0f / sum;
188  __m512 inv_sum_vec = _mm512_set1_ps(inv_sum);
189  j = 0;
190  for (; j + 16 <= len; j += 16) {
191  __m512 v = _mm512_loadu_ps(&row[j]);
192  _mm512_storeu_ps(&row[j], _mm512_mul_ps(v, inv_sum_vec));
193  }
194  for (; j < len; ++j) {
195  row[j] *= inv_sum;
196  }
197 
198  // Zero out future tokens (vectorized)
199  __m512 zero = _mm512_setzero_ps();
200  for (; j + 16 <= num_tokens; j += 16) {
201  _mm512_storeu_ps(&row[j], zero);
202  }
203  for (; j < num_tokens; ++j) {
204  row[j] = 0.0f;
205  }
206 
207 #elif defined(__AVX2__)
208  // AVX2: Find max (vectorized)
209  __m256 max_vec = _mm256_set1_ps(-INFINITY);
210  int j = 0;
211  for (; j + 8 <= len; j += 8) {
212  __m256 v = _mm256_loadu_ps(&row[j]);
213  max_vec = _mm256_max_ps(max_vec, v);
214  }
215  float max_val = hmax256_ps(max_vec);
216  for (; j < len; ++j) {
217  if (row[j] > max_val) max_val = row[j];
218  }
219 
220  // Compute exp and sum (vectorized with fast exp)
221  __m256 max_broadcast = _mm256_set1_ps(max_val);
222  __m256 sum_vec = _mm256_setzero_ps();
223  j = 0;
224  for (; j + 8 <= len; j += 8) {
225  __m256 v = _mm256_loadu_ps(&row[j]);
226  __m256 e = exp256_approx(_mm256_sub_ps(v, max_broadcast));
227  _mm256_storeu_ps(&row[j], e);
228  sum_vec = _mm256_add_ps(sum_vec, e);
229  }
230  float sum = hsum256_ps_softmax(sum_vec);
231  for (; j < len; ++j) {
232  float e = expf(row[j] - max_val);
233  row[j] = e;
234  sum += e;
235  }
236 
237  // Normalize (vectorized)
238  float inv_sum = 1.0f / sum;
239  __m256 inv_sum_vec = _mm256_set1_ps(inv_sum);
240  j = 0;
241  for (; j + 8 <= len; j += 8) {
242  __m256 v = _mm256_loadu_ps(&row[j]);
243  _mm256_storeu_ps(&row[j], _mm256_mul_ps(v, inv_sum_vec));
244  }
245  for (; j < len; ++j) {
246  row[j] *= inv_sum;
247  }
248 
249  // Zero out future tokens (vectorized)
250  __m256 zero = _mm256_setzero_ps();
251  for (; j + 8 <= num_tokens; j += 8) {
252  _mm256_storeu_ps(&row[j], zero);
253  }
254  for (; j < num_tokens; ++j) {
255  row[j] = 0.0f;
256  }
257 
258 #elif defined(__AVX__)
259  // AVX1: vectorized max/sum/normalize, scalar exp
260  __m256 max_vec = _mm256_set1_ps(-INFINITY);
261  int j = 0;
262  for (; j + 8 <= len; j += 8) {
263  __m256 v = _mm256_loadu_ps(&row[j]);
264  max_vec = _mm256_max_ps(max_vec, v);
265  }
266  float max_val = hmax256_ps(max_vec);
267  for (; j < len; ++j) {
268  if (row[j] > max_val) max_val = row[j];
269  }
270 
271  // Compute exp and sum (scalar exp, no fast approx for AVX1)
272  float sum = 0.0f;
273  for (j = 0; j < len; ++j) {
274  float e = expf(row[j] - max_val);
275  row[j] = e;
276  sum += e;
277  }
278 
279  // Normalize (vectorized)
280  float inv_sum = 1.0f / sum;
281  __m256 inv_sum_vec = _mm256_set1_ps(inv_sum);
282  j = 0;
283  for (; j + 8 <= len; j += 8) {
284  __m256 v = _mm256_loadu_ps(&row[j]);
285  _mm256_storeu_ps(&row[j], _mm256_mul_ps(v, inv_sum_vec));
286  }
287  for (; j < len; ++j) {
288  row[j] *= inv_sum;
289  }
290 
291  // Zero out future tokens (vectorized)
292  __m256 zero = _mm256_setzero_ps();
293  for (; j + 8 <= num_tokens; j += 8) {
294  _mm256_storeu_ps(&row[j], zero);
295  }
296  for (; j < num_tokens; ++j) {
297  row[j] = 0.0f;
298  }
299 
300 #else
301  // Scalar fallback
302  float max_val = row[0];
303  for (int j = 1; j < len; ++j) {
304  if (row[j] > max_val) max_val = row[j];
305  }
306 
307  float sum = 0.0f;
308  for (int j = 0; j < len; ++j) {
309  float e = expf(row[j] - max_val);
310  row[j] = e;
311  sum += e;
312  }
313 
314  float inv_sum = 1.0f / sum;
315  for (int j = 0; j < len; ++j) {
316  row[j] *= inv_sum;
317  }
318 
319  for (int j = len; j < num_tokens; ++j) {
320  row[j] = 0.0f;
321  }
322 #endif
323  }
324  }
325 }

Referenced by attention_forward_causal_head_major(), attention_forward_causal_head_major_gqa(), and causal_softmax_head_major_bf16().

◆ causal_softmax_head_major_bf16()

/**
 * Causal softmax over bf16 attention scores, head-major layout.
 *
 * Round-trips through fp32: widens `scores` into `scratch`, runs the fp32
 * causal softmax, then narrows the result back into `scores`.
 * `scratch` must hold num_heads * aligned_context_window^2 floats.
 */
void causal_softmax_head_major_bf16(uint16_t *scores,
                                    int num_heads,
                                    int num_tokens,
                                    int aligned_context_window,
                                    float *scratch)
{
    /* Reject null buffers and degenerate shapes up front. */
    if (!scores || !scratch) return;
    if (num_heads <= 0 || num_tokens <= 0 || aligned_context_window <= 0) return;

    const size_t acw = (size_t)aligned_context_window;
    const size_t n_elems = (size_t)num_heads * acw * acw;

    bf16_tensor_to_float(scores, scratch, n_elems);
    causal_softmax_head_major(scratch, num_heads, num_tokens, aligned_context_window);
    float_tensor_to_bf16(scratch, scores, n_elems);
}

References bf16_tensor_to_float(), causal_softmax_head_major(), and float_tensor_to_bf16().

◆ causal_softmax_head_major_exact()

void causal_softmax_head_major_exact ( float *  scores,
int  num_heads,
int  num_tokens,
int  aligned_context_window 
)

Causal softmax (exact version using stdlib expf)

Test:

test_softmax.py::TestSoftmaxForward::test_causal_softmax_exact

test_softmax.py::TestSoftmaxForward::test_exact_vs_fast

Exact causal softmax using standard library expf for numerical accuracy reference.

After changes: make test

Definition at line 339 of file softmax_kernels.c.

343 {
344  for (int h = 0; h < num_heads; ++h) {
345  for (int i = 0; i < num_tokens; ++i) {
346  int base = h * aligned_context_window * aligned_context_window
347  + i * aligned_context_window;
348  float *row = &scores[base];
349  int len = i + 1;
350 
351  // Find max
352  float max_val = -INFINITY;
353  for (int j = 0; j < len; ++j) {
354  if (row[j] > max_val) max_val = row[j];
355  }
356 
357  // Compute exp and sum using standard library expf
358  float sum = 0.0f;
359  for (int j = 0; j < len; ++j) {
360  float e = expf(row[j] - max_val);
361  row[j] = e;
362  sum += e;
363  }
364 
365  // Normalize
366  float inv_sum = 1.0f / sum;
367  for (int j = 0; j < len; ++j) {
368  row[j] *= inv_sum;
369  }
370 
371  // Zero out future tokens
372  for (int j = len; j < num_tokens; ++j) {
373  row[j] = 0.0f;
374  }
375  }
376  }
377 }

Referenced by attention_forward_causal_head_major_exact(), and attention_forward_causal_head_major_gqa_exact().

◆ ck_attention_flash_decode_wrapper()

/**
 * Wrapper to call TRUE flash attention from the orchestration layer.
 *
 * @param q_token          Query token [H, D_h]
 * @param k_cache          Cached keys [T_k, H_kv, D_h] (per-head stride = cache_capacity * aligned_head_dim)
 * @param v_cache          Cached values, same layout as k_cache
 * @param out_token        Output [H, D_h]
 * @param num_heads        Number of query heads
 * @param num_kv_heads     Number of KV heads (GQA)
 * @param kv_tokens        Tokens currently in the KV cache
 * @param cache_capacity   Cache capacity in tokens
 * @param head_dim         Head dimension
 * @param aligned_head_dim Padded head dimension used for strides
 *
 * CK_FLASH_ATTN_STRICT=1 routes to the regular (non-flash) decode kernel.
 */
void ck_attention_flash_decode_wrapper(const float *q_token,
                                       const float *k_cache,
                                       const float *v_cache,
                                       float *out_token,
                                       int num_heads,
                                       int num_kv_heads,
                                       int kv_tokens,
                                       int cache_capacity,
                                       int head_dim,
                                       int aligned_head_dim)
{
    /* Guard clauses: null buffers or nonsensical geometry are a no-op. */
    if (!q_token || !k_cache || !v_cache || !out_token) return;
    if (num_heads <= 0 || num_kv_heads <= 0 || kv_tokens <= 0 || cache_capacity <= 0) return;
    if (kv_tokens > cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) return;

    /* Environment probe is cached across calls. */
    static int use_strict = -1;
    if (use_strict < 0) {
        const char *strict_env = getenv("CK_FLASH_ATTN_STRICT");
        use_strict = (strict_env && strict_env[0] && strict_env[0] != '0') ? 1 : 0;
    }

    if (use_strict) {
        /* NOTE(review): the rendered source dropped the callee name on this
           line; restored from the cross-reference list, which names
           attention_forward_decode_head_major_gqa_regular — confirm against
           ckernel_orchestration.c. */
        attention_forward_decode_head_major_gqa_regular(q_token,
                                                        k_cache,
                                                        v_cache,
                                                        out_token,
                                                        num_heads,
                                                        num_kv_heads,
                                                        kv_tokens,
                                                        cache_capacity,
                                                        head_dim,
                                                        aligned_head_dim);
        return;
    }

    const float scale = 1.0f / sqrtf((float)head_dim); /* 1/sqrt(D_h) */
    const size_t kv_head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;

#pragma omp parallel for schedule(static) if(num_heads > 1)
    for (int h = 0; h < num_heads; ++h) {
        /* GQA mapping: query head h reads KV head floor(h * H_kv / H). */
        const int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);

        const float *q_head = q_token + (size_t)h * (size_t)aligned_head_dim;
        const float *k_head = k_cache + (size_t)kv_head * kv_head_stride;
        const float *v_head = v_cache + (size_t)kv_head * kv_head_stride;
        float *out_head = out_token + (size_t)h * (size_t)aligned_head_dim;

        /* Pass aligned_head_dim as D_h so the per-token stride matches the
           cache layout. */
        attention_flash_decode(out_head, q_head, k_head, v_head,
                               1, kv_tokens, 1, aligned_head_dim, scale);
    }
}
void attention_forward_decode_head_major_gqa_regular(const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
WARNING: This is NOT true flash attention!

References attention_flash_decode(), and attention_forward_decode_head_major_gqa_regular().

Referenced by ck_layer_forward_rmsnorm_swiglu_decode(), ck_layer_forward_rmsnorm_swiglu_decode_fused(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), and ck_layer_forward_rmsnorm_swiglu_decode_quant().

◆ ck_flash_attn_choose_tile_k()

/** Public shim over the internal per-head-dim tile-size heuristic. */
int ck_flash_attn_choose_tile_k(int D_h)
{
    return ck_flash_attn_tile_k(D_h);
}
static int ck_flash_attn_tile_k(int D_h)

References ck_flash_attn_tile_k().

◆ ck_flash_attn_fast_exp_kind()

/**
 * Report which fast-exp path this build compiled in:
 *   512 = AVX-512 fast exp, 256 = AVX fast exp, 0 = disabled/scalar.
 */
int ck_flash_attn_fast_exp_kind(void)
{
#if CK_FLASH_ATTN_FAST_EXP && defined(__AVX512F__)
    return 512;
#elif CK_FLASH_ATTN_FAST_EXP && defined(__AVX__)
    return 256;
#else
    return 0;
#endif
}

◆ ck_gemm_nt_head_major_q5_0()

/**
 * Output projection from head-major attention output (Q5_0 weights,
 * auto-dispatch). Replaces flatten_head_major() + ck_gemm_nt_quant() with
 * a single strided-access kernel reading the head-major buffer directly.
 */
void ck_gemm_nt_head_major_q5_0(const float *attn_out,
                                const void *wo,
                                const float *bias,
                                float *output,
                                int tokens,
                                int embed_dim,
                                int num_heads,
                                int head_dim)
{
    /* The AVX path additionally requires F16C for the fp16 block scales. */
#if defined(__AVX__) && defined(__F16C__)
    gemv_nt_q5_0_head_major_output_avx(output, attn_out, wo, bias,
                                       tokens, embed_dim, num_heads, head_dim);
#else
    gemv_nt_q5_0_head_major_output(output, attn_out, wo, bias,
                                   tokens, embed_dim, num_heads, head_dim);
#endif
}
void gemv_nt_q5_0_head_major_output(float *output, const float *attn_out, const void *wo, const float *bias, int tokens, int embed_dim, int num_heads, int head_dim)
Output projection reading head-major attention output (Q5_0 weights)

References gemv_nt_q5_0_head_major_output().

Referenced by mega_fused_attention_prefill().

◆ ck_gemm_nt_head_major_q8_0()

void ck_gemm_nt_head_major_q8_0 ( const float *  attn_out,
const void *  wo,
const float *  bias,
float *  output,
int  tokens,
int  embed_dim,
int  num_heads,
int  head_dim 
)

Output projection from head-major attention (Q8_0 weights)

Definition at line 353 of file gemm_head_major_output.c.

361 {
362  if (!output || !attn_out || !wo) return;
363  if (tokens <= 0 || embed_dim <= 0 || num_heads <= 0 || head_dim <= 0) return;
364 
365  const int blocks_per_head = head_dim / QK8_0;
366  const int blocks_per_row = embed_dim / QK8_0;
367  const block_q8_0 *weights = (const block_q8_0 *)wo;
368 
369  const size_t token_stride = head_dim;
370  const size_t head_stride = (size_t)tokens * token_stride;
371 
372  /* Initialize output */
373  if (bias) {
374  for (int t = 0; t < tokens; t++) {
375  float *out_row = output + (size_t)t * embed_dim;
376  for (int n = 0; n < embed_dim; n++) {
377  out_row[n] = bias[n];
378  }
379  }
380  } else {
381  memset(output, 0, (size_t)tokens * embed_dim * sizeof(float));
382  }
383 
384  /* Accumulate from each head */
385  for (int h = 0; h < num_heads; h++) {
386  const float *head_data = attn_out + (size_t)h * head_stride;
387  const int head_offset = h * blocks_per_head;
388 
389  for (int n_block = 0; n_block < blocks_per_head; n_block++) {
390  for (int n = 0; n < embed_dim; n++) {
391  const block_q8_0 *w_row = weights + (size_t)n * blocks_per_row + head_offset + n_block;
392  const float d = CK_FP16_TO_FP32(w_row->d);
393 
394  for (int t = 0; t < tokens; t++) {
395  const float *token_vec = head_data + (size_t)t * token_stride + (size_t)n_block * QK8_0;
396  float sum = 0.0f;
397 
398  for (int j = 0; j < QK8_0; j++) {
399  sum += d * (float)w_row->qs[j] * token_vec[j];
400  }
401 
402  output[(size_t)t * embed_dim + n] += sum;
403  }
404  }
405  }
406  }
407 }
#define CK_FP16_TO_FP32(x)
#define QK8_0
int8_t qs[32]

References CK_FP16_TO_FP32, block_q8_0::d, QK8_0, and block_q8_0::qs.

Referenced by mega_fused_attention_prefill().

◆ ck_get_num_threads()

int ck_get_num_threads ( void  )

Definition at line 178 of file ckernel_strict.c.

179 {
180  // Auto-initialize if not set
181  if (!g_threads_initialized) {
182  ck_set_num_threads(0); // Auto-detect
183  }
184  return g_num_threads;
185 }
void ck_set_num_threads(int num_threads)
static int g_num_threads
static int g_threads_initialized

References ck_set_num_threads(), g_num_threads, and g_threads_initialized.

Referenced by gemm_blocked_serial().

◆ ck_get_physical_cores()

/**
 * Best-effort physical core count (Linux).
 *
 * Counts unique (physical id, core id) pairs in /proc/cpuinfo; if that
 * yields <= 1 while sysconf reports more logical CPUs (common inside
 * containers or on kernels that omit those fields), falls back to the
 * logical CPU count rather than forcing single-threaded execution.
 *
 * @return >= 1; never fails (worst case returns 1).
 */
int ck_get_physical_cores(void)
{
    int physical_cores = 0;
    /* Logical CPU count as the baseline / fallback. */
    int logical_cores = (int)sysconf(_SC_NPROCESSORS_ONLN);
    if (logical_cores <= 0) {
        logical_cores = 1;
    }

    // Read from /proc/cpuinfo (Linux) and count unique (physical id, core id) pairs.
    FILE *f = fopen("/proc/cpuinfo", "r");
    if (f) {
        char line[256];
        int physical_id = -1;
        int core_id = -1;

        /* Fixed-capacity set of pairs on the stack (8192 * 8 B = 64 KiB);
           pairs beyond the cap are silently dropped by CK_ADD_PAIR. */
        struct {
            int physical_id;
            int core_id;
        } seen[8192];
        int seen_count = 0;

        const int seen_cap = (int)(sizeof(seen) / sizeof(seen[0]));

        // Helper: add (pid,cid) to set if not present.
        #define CK_ADD_PAIR(pid, cid) \
        do { \
            if ((pid) >= 0 && (cid) >= 0) { \
                int exists = 0; \
                for (int ii = 0; ii < seen_count; ++ii) { \
                    if (seen[ii].physical_id == (pid) && \
                        seen[ii].core_id == (cid)) { \
                        exists = 1; \
                        break; \
                    } \
                } \
                if (!exists && seen_count < seen_cap) { \
                    seen[seen_count].physical_id = (pid); \
                    seen[seen_count].core_id = (cid); \
                    ++seen_count; \
                } \
            } \
        } while (0)

        while (fgets(line, sizeof(line), f)) {
            int val;

            // Blank line separates processor blocks.
            if (line[0] == '\n' || line[0] == '\0') {
                /* Flush the pair collected for the block that just ended. */
                CK_ADD_PAIR(physical_id, core_id);
                physical_id = -1;
                core_id = -1;
                continue;
            }

            /* Whitespace in a scanf format matches any whitespace run,
               so these also match the "physical id\t: N" tab layout. */
            if (sscanf(line, "physical id : %d", &val) == 1) {
                physical_id = val;
                continue;
            }
            if (sscanf(line, "core id : %d", &val) == 1) {
                core_id = val;
                continue;
            }
        }
        fclose(f);

        // Handle file without trailing blank line.
        CK_ADD_PAIR(physical_id, core_id);

        #undef CK_ADD_PAIR

        physical_cores = seen_count;
    }

    // If we couldn't reliably detect physical cores (common in containers),
    // fall back to logical CPUs instead of incorrectly forcing single-thread execution.
    if (physical_cores <= 1 && logical_cores > 1) {
        return logical_cores;
    }

    if (physical_cores > 1) {
        return physical_cores;
    }

    return logical_cores;
}
#define CK_ADD_PAIR(pid, cid)

References CK_ADD_PAIR.

◆ ck_set_num_threads()

void ck_set_num_threads ( int  num_threads)

Definition at line 148 of file ckernel_strict.c.

149 {
150  // 0 = auto-detect
151  if (num_threads <= 0) {
152  // Prefer explicit env controls when present:
153  // - CK_NUM_THREADS: engine-level override
154  // - OMP_NUM_THREADS: standard OpenMP control (set by `ck run --threads`)
155  int env_threads = ck_parse_env_int("CK_NUM_THREADS");
156  if (env_threads <= 0) {
157  env_threads = ck_parse_env_int("OMP_NUM_THREADS");
158  }
159  num_threads = env_threads > 0 ? env_threads : ck_get_physical_cores();
160  }
161 
162  g_num_threads = num_threads;
164 
165 #ifdef _OPENMP
166  omp_set_dynamic(0); // Disable dynamic adjustment
167  omp_set_num_threads(num_threads);
168 #endif
169 
170 #if defined(USE_MKL)
171  mkl_set_num_threads(num_threads);
172 #endif
173 
174  fprintf(stderr, "[CK] Set %d threads (auto=%d)\n",
175  num_threads, ck_get_physical_cores());
176 }
static int ck_parse_env_int(const char *name)
int ck_get_physical_cores(void)

References ck_get_physical_cores(), ck_parse_env_int(), g_num_threads, and g_threads_initialized.

Referenced by ck_get_num_threads().

◆ ck_set_strict_parity()

void ck_set_strict_parity ( int  enabled)

Definition at line 22 of file ckernel_strict.c.

23 {
24  ck_strict_parity = enabled ? 1 : 0;
25 #ifdef _OPENMP
26  if (ck_strict_parity) {
27  omp_set_dynamic(0);
28  omp_set_num_threads(1);
29  }
30 #endif
31 }
static int ck_strict_parity

References ck_strict_parity.

◆ ck_strict_parity_enabled()

◆ ckernel_backend_native()

CKMathBackend ckernel_backend_native ( void  )

Obtain the built-in native backend (single-node CPU, C + intrinsics).

Definition at line 39 of file backend_native.c.

40 {
41  CKMathBackend b;
43  return b;
44 }
static void ckernel_sgemm_native(int M, int N, int K, const float *A, int lda, const float *B, int ldb, const float *bias, float *C, int ldc)
void(* sgemm)(int M, int N, int K, const float *A, int lda, const float *B, int ldb, const float *bias, float *C, int ldc)

References ckernel_sgemm_native(), and CKMathBackend::sgemm.

◆ dequant_q4_0_row()

void dequant_q4_0_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q4_0 row (multiple blocks)

Parameters
srcQ4_0 data
dstFP32 output
n_elementsNumber of elements to dequantize

Definition at line 61 of file dequant_kernels.c.

62 {
63  const block_q4_0 *blocks = (const block_q4_0 *)src;
64  const size_t n_blocks = n_elements / QK4_0;
65 
66  for (size_t b = 0; b < n_blocks; b++) {
67  dequant_q4_0_block(&blocks[b], &dst[b * QK4_0]);
68  }
69 }
#define QK4_0
Definition: ckernel_quant.h:35
void dequant_q4_0_block(const block_q4_0 *block, float *output)
Dequantize a single Q4_0 block to FP32.

References dequant_q4_0_block(), and QK4_0.

◆ dequant_q4_1_row()

void dequant_q4_1_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q4_1 row (multiple blocks)

Definition at line 139 of file dequant_kernels.c.

140 {
141  const block_q4_1 *blocks = (const block_q4_1 *)src;
142  const size_t n_blocks = n_elements / QK4_1;
143 
144  for (size_t b = 0; b < n_blocks; b++) {
145  dequant_q4_1_block(&blocks[b], &dst[b * QK4_1]);
146  }
147 }
#define QK4_1
Definition: ckernel_quant.h:50
void dequant_q4_1_block(const block_q4_1 *block, float *output)
Dequantize a single Q4_1 block to FP32.

References dequant_q4_1_block(), and QK4_1.

Referenced by dequant_row().

◆ dequant_q4_k_row()

void dequant_q4_k_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q4_K row (multiple blocks)

Definition at line 370 of file dequant_kernels.c.

371 {
372  const block_q4_K *blocks = (const block_q4_K *)src;
373  const size_t n_blocks = n_elements / QK_K;
374 
375  for (size_t b = 0; b < n_blocks; b++) {
376  dequant_q4_k_block(&blocks[b], &dst[b * QK_K]);
377  }
378 }
#define QK_K
void dequant_q4_k_block(const block_q4_K *block, float *output)
Dequantize a single Q4_K block to FP32.

References dequant_q4_k_block(), and QK_K.

Referenced by embedding_forward_q4_k().

◆ dequant_q5_0_row()

void dequant_q5_0_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q5_0 row (multiple blocks)

Definition at line 196 of file dequant_kernels.c.

197 {
198  const block_q5_0 *blocks = (const block_q5_0 *)src;
199  const size_t n_blocks = n_elements / QK5_0;
200 
201  for (size_t b = 0; b < n_blocks; b++) {
202  dequant_q5_0_block(&blocks[b], &dst[b * QK5_0]);
203  }
204 }
#define QK5_0
Definition: ckernel_quant.h:67
void dequant_q5_0_block(const block_q5_0 *block, float *output)
Dequantize a single Q5_0 block to FP32.

References dequant_q5_0_block(), and QK5_0.

◆ dequant_q5_1_row()

void dequant_q5_1_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q5_1 row (multiple blocks)

Definition at line 255 of file dequant_kernels.c.

256 {
257  const block_q5_1 *blocks = (const block_q5_1 *)src;
258  const size_t n_blocks = n_elements / QK5_1;
259 
260  for (size_t b = 0; b < n_blocks; b++) {
261  dequant_q5_1_block(&blocks[b], &dst[b * QK5_1]);
262  }
263 }
#define QK5_1
Definition: ckernel_quant.h:84
void dequant_q5_1_block(const block_q5_1 *block, float *output)
Dequantize a single Q5_1 block to FP32.

References dequant_q5_1_block(), and QK5_1.

◆ dequant_q6_k_row()

void dequant_q6_k_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q6_K row (multiple blocks)

Definition at line 420 of file dequant_kernels.c.

421 {
422  const block_q6_K *blocks = (const block_q6_K *)src;
423  const size_t n_blocks = n_elements / QK_K;
424 
425  for (size_t b = 0; b < n_blocks; b++) {
426  dequant_q6_k_block(&blocks[b], &dst[b * QK_K]);
427  }
428 }
void dequant_q6_k_block(const block_q6_K *block, float *output)
Dequantize a single Q6_K block to FP32.

References dequant_q6_k_block(), and QK_K.

Referenced by embedding_forward_q6_k().

◆ dequant_q8_0_row()

void dequant_q8_0_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q8_0 row (multiple blocks)

Definition at line 286 of file dequant_kernels.c.

287 {
288  const block_q8_0 *blocks = (const block_q8_0 *)src;
289  const size_t n_blocks = n_elements / QK8_0;
290 
291  for (size_t b = 0; b < n_blocks; b++) {
292  dequant_q8_0_block(&blocks[b], &dst[b * QK8_0]);
293  }
294 }
void dequant_q8_0_block(const block_q8_0 *block, float *output)
Dequantize a single Q8_0 block to FP32.

References dequant_q8_0_block(), and QK8_0.

Referenced by dequant_row(), and embedding_forward_q8_0().

◆ embedding_backward()

/**
 * Accumulate embedding gradients for a token sequence.
 *
 * For each position t (count clamped to [0, context_window]), adds
 * d_output row t into the token-embedding gradient row of token_ids[t]
 * (out-of-range ids are folded to row 0) and, when add_pos is set and
 * d_pos_embeddings is non-NULL, into positional gradient row t. Rows are
 * aligned_embed_dim floats apart; only the first embed_dim entries are
 * touched.
 */
void embedding_backward(const int32_t *token_ids,
                        int token_count,
                        const float *d_output,
                        float *d_token_embeddings,
                        float *d_pos_embeddings,
                        int vocab_size,
                        int embed_dim,
                        int aligned_embed_dim,
                        int context_window,
                        int add_pos)
{
    if (!token_ids || !d_output || !d_token_embeddings) {
        return;
    }

    /* Clamp the token count into [0, context_window]. */
    int n = token_count;
    if (n < 0) n = 0;
    if (n > context_window) n = context_window;

    const size_t stride = (size_t)aligned_embed_dim;

    for (int t = 0; t < n; ++t) {
        int id = token_ids[t];
        if (id < 0 || id >= vocab_size) {
            id = 0; /* fold invalid ids to row 0 */
        }

        const float *grad_row = d_output + (size_t)t * stride;
        float *tok_grad = d_token_embeddings + (size_t)id * stride;
        float *pos_grad =
            (add_pos && d_pos_embeddings) ? d_pos_embeddings + (size_t)t * stride : NULL;

        for (int d = 0; d < embed_dim; ++d) {
            tok_grad[d] += grad_row[d];
            if (pos_grad) {
                pos_grad[d] += grad_row[d];
            }
        }
    }
}
int vocab_size
Definition: true_bpe.h:185

References vocab_size.

◆ embedding_backward_bf16()

/**
 * bf16 variant of embedding_backward: each gradient element is widened to
 * fp32, added to the widened accumulator value, and stored back as bf16
 * (read-modify-write per element).
 */
void embedding_backward_bf16(const int32_t *token_ids,
                             int token_count,
                             const uint16_t *d_output,
                             uint16_t *d_token_embeddings,
                             uint16_t *d_pos_embeddings,
                             int vocab_size,
                             int embed_dim,
                             int aligned_embed_dim,
                             int context_window,
                             int add_pos)
{
    if (!token_ids || !d_output || !d_token_embeddings) {
        return;
    }

    /* Clamp the token count into [0, context_window]. */
    int n = token_count;
    if (n < 0) n = 0;
    if (n > context_window) n = context_window;

    const size_t stride = (size_t)aligned_embed_dim;

    for (int t = 0; t < n; ++t) {
        int id = token_ids[t];
        if (id < 0 || id >= vocab_size) {
            id = 0; /* fold invalid ids to row 0 */
        }

        const uint16_t *grad_row = d_output + (size_t)t * stride;
        uint16_t *tok_grad = d_token_embeddings + (size_t)id * stride;
        uint16_t *pos_grad =
            d_pos_embeddings ? d_pos_embeddings + (size_t)t * stride : NULL;

        for (int d = 0; d < embed_dim; ++d) {
            const float g = bf16_to_float(grad_row[d]);

            tok_grad[d] = float_to_bf16(bf16_to_float(tok_grad[d]) + g);

            if (add_pos && pos_grad) {
                pos_grad[d] = float_to_bf16(bf16_to_float(pos_grad[d]) + g);
            }
        }
    }
}

References bf16_to_float(), float_to_bf16(), and vocab_size.

◆ embedding_forward()

/**
 * Gather token embeddings (optionally plus positional embeddings) into a
 * [context_window, aligned_embed_dim] fp32 buffer.
 *
 * Invalid token ids are folded to row 0; alignment padding and rows past
 * the last real token are zeroed.
 */
void embedding_forward(const int32_t *token_ids,
                       int token_count,
                       int vocab_size,
                       const float *token_embeddings,
                       const float *pos_embeddings,
                       float *output,
                       int embed_dim,
                       int aligned_embed_dim,
                       int context_window,
                       int add_pos)
{
    if (!token_ids || !token_embeddings || !output) {
        return;
    }

    /* Clamp the token count into [0, context_window]. */
    int n = token_count;
    if (n < 0) n = 0;
    if (n > context_window) n = context_window;

    const size_t stride = (size_t)aligned_embed_dim;

    for (int t = 0; t < n; ++t) {
        int id = token_ids[t];
        if (id < 0 || id >= vocab_size) {
            id = 0; /* fold invalid ids to row 0 */
        }

        const float *tok_row = token_embeddings + (size_t)id * stride;
        float *dst = output + (size_t)t * stride;

        if (add_pos && pos_embeddings) {
            const float *pos_row = pos_embeddings + (size_t)t * stride;
            for (int d = 0; d < embed_dim; ++d) {
                dst[d] = tok_row[d] + pos_row[d];
            }
        } else {
            for (int d = 0; d < embed_dim; ++d) {
                dst[d] = tok_row[d];
            }
        }

        /* Zero alignment padding. */
        for (int d = embed_dim; d < aligned_embed_dim; ++d) {
            dst[d] = 0.0f;
        }
    }

    /* Zero rows past the last real token. */
    for (int t = n; t < context_window; ++t) {
        memset(output + (size_t)t * stride, 0, stride * sizeof(float));
    }
}

References vocab_size.

◆ embedding_forward_bf16()

/**
 * bf16 variant of embedding_forward. The positional add is performed in
 * fp32 and rounded back to bf16; a plain lookup copies bits unchanged.
 * Padding and unused rows are zeroed.
 */
void embedding_forward_bf16(const int32_t *token_ids,
                            int token_count,
                            int vocab_size,
                            const uint16_t *token_embeddings,
                            const uint16_t *pos_embeddings,
                            uint16_t *output,
                            int embed_dim,
                            int aligned_embed_dim,
                            int context_window,
                            int add_pos)
{
    if (!token_ids || !token_embeddings || !output) {
        return;
    }

    /* Clamp the token count into [0, context_window]. */
    int n = token_count;
    if (n < 0) n = 0;
    if (n > context_window) n = context_window;

    const size_t stride = (size_t)aligned_embed_dim;

    for (int t = 0; t < n; ++t) {
        int id = token_ids[t];
        if (id < 0 || id >= vocab_size) {
            id = 0; /* fold invalid ids to row 0 */
        }

        const uint16_t *tok_row = token_embeddings + (size_t)id * stride;
        uint16_t *dst = output + (size_t)t * stride;

        if (add_pos && pos_embeddings) {
            const uint16_t *pos_row = pos_embeddings + (size_t)t * stride;
            for (int d = 0; d < embed_dim; ++d) {
                dst[d] = float_to_bf16(bf16_to_float(tok_row[d]) + bf16_to_float(pos_row[d]));
            }
        } else {
            for (int d = 0; d < embed_dim; ++d) {
                dst[d] = tok_row[d];
            }
        }

        /* Zero alignment padding. */
        for (int d = embed_dim; d < aligned_embed_dim; ++d) {
            dst[d] = 0;
        }
    }

    /* Zero rows past the last real token. */
    for (int t = n; t < context_window; ++t) {
        memset(output + (size_t)t * stride, 0, stride * sizeof(uint16_t));
    }
}

References bf16_to_float(), float_to_bf16(), and vocab_size.

◆ embedding_forward_q4_k()

void embedding_forward_q4_k ( const int32_t *  token_ids,
int  token_count,
int  vocab_size,
const void *  token_embeddings,
const float *  pos_embeddings,
float *  output,
int  embed_dim,
int  aligned_embed_dim,
int  context_window,
int  add_pos 
)

Definition at line 76 of file embedding_kernels.c.

86 {
87  if (!token_ids || !token_embeddings || !output) {
88  return;
89  }
90 
91  int tokens = token_count;
92  if (tokens < 0) {
93  tokens = 0;
94  }
95  if (tokens > context_window) {
96  tokens = context_window;
97  }
98 
99  const size_t row_bytes = ck_dtype_row_bytes(CK_DT_Q4_K, (size_t)aligned_embed_dim);
100  const uint8_t *base = (const uint8_t *)token_embeddings;
101 
102  for (int t = 0; t < tokens; ++t) {
103  int id = token_ids[t];
104  if (id < 0 || id >= vocab_size) {
105  id = 0;
106  }
107 
108  const void *tok = base + (size_t)id * row_bytes;
109  const float *pos = pos_embeddings ? (pos_embeddings + (size_t)t * (size_t)aligned_embed_dim) : NULL;
110  float *out = output + (size_t)t * (size_t)aligned_embed_dim;
111 
112  dequant_q4_k_row(tok, out, (size_t)aligned_embed_dim);
113 
114  if (add_pos && pos) {
115  for (int d = 0; d < embed_dim; ++d) {
116  out[d] += pos[d];
117  }
118  }
119 
120  for (int d = embed_dim; d < aligned_embed_dim; ++d) {
121  out[d] = 0.0f;
122  }
123  }
124 
125  for (int t = tokens; t < context_window; ++t) {
126  float *out = output + (size_t)t * (size_t)aligned_embed_dim;
127  memset(out, 0, (size_t)aligned_embed_dim * sizeof(float));
128  }
129 }
@ CK_DT_Q4_K
Definition: ckernel_dtype.h:40
static size_t ck_dtype_row_bytes(CKDataType dt, size_t n_elements)
Calculate total bytes for n_elements of given dtype.
void dequant_q4_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_K row (multiple blocks)

References CK_DT_Q4_K, ck_dtype_row_bytes(), dequant_q4_k_row(), and vocab_size.

Referenced by model_decode_token(), model_forward_prefill_impl(), qwen2_0_5b_decode_decode_token(), and qwen2_0_5b_decode_forward_prefill_impl().

◆ embedding_forward_q6_k()

void embedding_forward_q6_k ( const int32_t *  token_ids,
int  token_count,
int  vocab_size,
const void *  token_embeddings,
const float *  pos_embeddings,
float *  output,
int  embed_dim,
int  aligned_embed_dim,
int  context_window,
int  add_pos 
)

Definition at line 186 of file embedding_kernels.c.

196 {
197  if (!token_ids || !token_embeddings || !output) {
198  return;
199  }
200 
201  int tokens = token_count;
202  if (tokens < 0) {
203  tokens = 0;
204  }
205  if (tokens > context_window) {
206  tokens = context_window;
207  }
208 
209  const size_t row_bytes = ck_dtype_row_bytes(CK_DT_Q6_K, (size_t)aligned_embed_dim);
210  const uint8_t *base = (const uint8_t *)token_embeddings;
211 
212  for (int t = 0; t < tokens; ++t) {
213  int id = token_ids[t];
214  if (id < 0 || id >= vocab_size) {
215  id = 0;
216  }
217 
218  const void *tok = base + (size_t)id * row_bytes;
219  const float *pos = pos_embeddings ? (pos_embeddings + (size_t)t * (size_t)aligned_embed_dim) : NULL;
220  float *out = output + (size_t)t * (size_t)aligned_embed_dim;
221 
222  dequant_q6_k_row(tok, out, (size_t)aligned_embed_dim);
223 
224  if (add_pos && pos) {
225  for (int d = 0; d < embed_dim; ++d) {
226  out[d] += pos[d];
227  }
228  }
229 
230  for (int d = embed_dim; d < aligned_embed_dim; ++d) {
231  out[d] = 0.0f;
232  }
233  }
234 
235  for (int t = tokens; t < context_window; ++t) {
236  float *out = output + (size_t)t * (size_t)aligned_embed_dim;
237  memset(out, 0, (size_t)aligned_embed_dim * sizeof(float));
238  }
239 }
@ CK_DT_Q6_K
Definition: ckernel_dtype.h:41
void dequant_q6_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q6_K row (multiple blocks)

References CK_DT_Q6_K, ck_dtype_row_bytes(), dequant_q6_k_row(), and vocab_size.

◆ embedding_forward_q8_0()

void embedding_forward_q8_0 ( const int32_t *  token_ids,
int  token_count,
int  vocab_size,
const void *  token_embeddings,
const float *  pos_embeddings,
float *  output,
int  embed_dim,
int  aligned_embed_dim,
int  context_window,
int  add_pos 
)

Definition at line 131 of file embedding_kernels.c.

141 {
142  if (!token_ids || !token_embeddings || !output) {
143  return;
144  }
145 
146  int tokens = token_count;
147  if (tokens < 0) {
148  tokens = 0;
149  }
150  if (tokens > context_window) {
151  tokens = context_window;
152  }
153 
154  const size_t row_bytes = ck_dtype_row_bytes(CK_DT_Q8_0, (size_t)aligned_embed_dim);
155  const uint8_t *base = (const uint8_t *)token_embeddings;
156 
157  for (int t = 0; t < tokens; ++t) {
158  int id = token_ids[t];
159  if (id < 0 || id >= vocab_size) {
160  id = 0;
161  }
162 
163  const void *tok = base + (size_t)id * row_bytes;
164  const float *pos = pos_embeddings ? (pos_embeddings + (size_t)t * (size_t)aligned_embed_dim) : NULL;
165  float *out = output + (size_t)t * (size_t)aligned_embed_dim;
166 
167  dequant_q8_0_row(tok, out, (size_t)aligned_embed_dim);
168 
169  if (add_pos && pos) {
170  for (int d = 0; d < embed_dim; ++d) {
171  out[d] += pos[d];
172  }
173  }
174 
175  for (int d = embed_dim; d < aligned_embed_dim; ++d) {
176  out[d] = 0.0f;
177  }
178  }
179 
180  for (int t = tokens; t < context_window; ++t) {
181  float *out = output + (size_t)t * (size_t)aligned_embed_dim;
182  memset(out, 0, (size_t)aligned_embed_dim * sizeof(float));
183  }
184 }
@ CK_DT_Q8_0
Definition: ckernel_dtype.h:42
void dequant_q8_0_row(const void *src, float *dst, size_t n_elements)
Dequantize Q8_0 row (multiple blocks)

References CK_DT_Q8_0, ck_dtype_row_bytes(), dequant_q8_0_row(), and vocab_size.

Referenced by qwen2_0_5b_decode_decode_token(), and qwen2_0_5b_decode_forward_prefill_impl().

◆ fc1_backward_kernel()

/**
 * Backward pass for FC1.
 *
 *   d_input[T, in]  = d_output[T, out] @ W[out, in]
 *   d_W[out, in]    = d_output[T, out]^T @ fc1_input[T, in]   (overwrites)
 *   d_b[out]       += column sums of d_output
 *
 * Threading is handled inside the GEMM kernels; num_threads is unused.
 */
void fc1_backward_kernel(const float *d_output,
                         const float *fc1_input,
                         const float *W_fc1,
                         float *d_input,
                         float *d_W_fc1,
                         float *d_b_fc1,
                         int T,
                         int aligned_in,
                         int aligned_out,
                         int num_threads)
{
    (void)num_threads; /* GEMM kernels manage their own threading */

    /* gemm_nn: C[M,N] = A[M,K] @ B[K,N] with
       A = d_output [T, out], B = W [out, in], C = d_input [T, in]. */
    gemm_nn_avx512(d_output, W_fc1, NULL, d_input,
                   T, aligned_in, aligned_out);

    /* gemm_tn: C[M,N] = A[K,M]^T @ B[K,N] with
       A = d_output (stored [K=T, M=out]), B = fc1_input [T, in],
       C = d_W [out, in]. */
    gemm_tn_avx512(d_output, fc1_input, NULL, d_W_fc1,
                   aligned_out, aligned_in, T);

    /* Bias gradient: accumulate each output column of d_output over time. */
#pragma omp parallel for schedule(static)
    for (int o = 0; o < aligned_out; ++o) {
        float acc = 0.0f;
        for (int t = 0; t < T; ++t) {
            acc += d_output[(size_t)t * aligned_out + o];
        }
        d_b_fc1[o] += acc;
    }
}
void gemm_nn_avx512(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:339
void gemm_tn_avx512(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:521

References gemm_nn_avx512(), and gemm_tn_avx512().

Referenced by ck_layer_backward_rmsnorm_swiglu().

◆ fc2_backward_kernel()

/**
 * Backward pass for FC2; mirrors fc1_backward_kernel.
 *
 *   d_input[T, in]  = d_output[T, out] @ W[out, in]
 *   d_W[out, in]    = d_output[T, out]^T @ fc2_input[T, in]
 *   d_b[out]       += column sums of d_output
 *
 * gemm_tn overwrites d_W, so gradient accumulation (if any) is the
 * caller's responsibility. Threading lives in the GEMM kernels.
 */
void fc2_backward_kernel(const float *d_output,
                         const float *fc2_input,
                         const float *W_fc2,
                         float *d_input,
                         float *d_W_fc2,
                         float *d_b_fc2,
                         int T,
                         int aligned_in,
                         int aligned_out,
                         int num_threads)
{
    (void)num_threads; /* GEMM kernels manage their own threading */

    /* gemm_nn: C[M,N] = A[M,K] @ B[K,N] with
       A = d_output [T, out], B = W [out, in], C = d_input [T, in]. */
    gemm_nn_avx512(d_output, W_fc2, NULL, d_input,
                   T, aligned_in, aligned_out);

    /* gemm_tn: C[M,N] = A[K,M]^T @ B[K,N] with
       A = d_output (stored [K=T, M=out]), B = fc2_input [T, in],
       C = d_W [out, in]. Overwrites d_W; assumed pre-zeroed by caller. */
    gemm_tn_avx512(d_output, fc2_input, NULL, d_W_fc2,
                   aligned_out, aligned_in, T);

    /* Bias gradient: accumulate each output column of d_output over time. */
#pragma omp parallel for schedule(static)
    for (int o = 0; o < aligned_out; ++o) {
        float acc = 0.0f;
        for (int t = 0; t < T; ++t) {
            acc += d_output[(size_t)t * aligned_out + o];
        }
        d_b_fc2[o] += acc;
    }
}

References gemm_nn_avx512(), and gemm_tn_avx512().

Referenced by ck_attention_project_head_major_backward(), ck_layer_backward_rmsnorm_swiglu(), and ck_qkv_project_head_major_backward().

◆ fused_mlp_swiglu_decode()

void fused_mlp_swiglu_decode ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  W_down,
const float *  b_gate,
const float *  b_up,
const float *  b_down,
float *  output,
int  D,
int  Hff 
)

Definition at line 154 of file mlp_fused_decode.c.

165 {
166 #if defined(__AVX512F__)
167  // Initialize output with bias or zero
168  if (b_down) {
169  memcpy(output, b_down, D * sizeof(float));
170  } else {
171  memset(output, 0, D * sizeof(float));
172  }
173 
174  // Process intermediate dimension in tiles
175  // Each tile computes MLP_TILE_SIZE swiglu values and immediately
176  // accumulates them into the output
177 
178  /* Bounds check for stack allocation */
179  if (D > 4096) return;
180 
181  #pragma omp parallel
182  {
183  /* Thread-local accumulator on stack (no malloc!) */
184  float local_output[4096] __attribute__((aligned(64)));
185  memset(local_output, 0, D * sizeof(float));
186 
187  #pragma omp for schedule(static)
188  for (int t = 0; t < Hff; t += MLP_TILE_SIZE) {
189  int tile_end = (t + MLP_TILE_SIZE < Hff) ? t + MLP_TILE_SIZE : Hff;
190  int tile_size = tile_end - t;
191 
192  // Compute SwiGLU for this tile (stays in L1 cache)
193  float swiglu_tile[MLP_TILE_SIZE] __attribute__((aligned(64)));
194 
195  for (int j = t; j < tile_end; j++) {
196  const float *wg_row = &W_gate[j * D];
197  const float *wu_row = &W_up[j * D];
198 
199  // Compute gate = x @ W_gate[j] using AVX-512
200  __m512 gate_acc = _mm512_setzero_ps();
201  __m512 up_acc = _mm512_setzero_ps();
202 
203  int k = 0;
204  for (; k <= D - 16; k += 16) {
205  __m512 x_vec = _mm512_loadu_ps(&x[k]);
206  __m512 wg_vec = _mm512_loadu_ps(&wg_row[k]);
207  __m512 wu_vec = _mm512_loadu_ps(&wu_row[k]);
208 
209  gate_acc = _mm512_fmadd_ps(x_vec, wg_vec, gate_acc);
210  up_acc = _mm512_fmadd_ps(x_vec, wu_vec, up_acc);
211  }
212 
213  float gate = hsum512_ps(gate_acc);
214  float up = hsum512_ps(up_acc);
215 
216  // Scalar remainder
217  for (; k < D; k++) {
218  gate += x[k] * wg_row[k];
219  up += x[k] * wu_row[k];
220  }
221 
222  // Add biases
223  if (b_gate) gate += b_gate[j];
224  if (b_up) up += b_up[j];
225 
226  // SwiGLU: SiLU(gate) * up
227  swiglu_tile[j - t] = silu_scalar(gate) * up;
228  }
229 
230  // Accumulate into output via W_down
231  // output[i] += sum_j(swiglu_tile[j] * W_down[i, t+j])
232  for (int i = 0; i < D; i++) {
233  const float *wd_row = &W_down[i * Hff + t];
234 
235  __m512 acc = _mm512_setzero_ps();
236  int j = 0;
237  for (; j <= tile_size - 16; j += 16) {
238  __m512 sw_vec = _mm512_loadu_ps(&swiglu_tile[j]);
239  __m512 wd_vec = _mm512_loadu_ps(&wd_row[j]);
240  acc = _mm512_fmadd_ps(sw_vec, wd_vec, acc);
241  }
242 
243  float sum = hsum512_ps(acc);
244  for (; j < tile_size; j++) {
245  sum += swiglu_tile[j] * wd_row[j];
246  }
247 
248  local_output[i] += sum;
249  }
250  }
251 
252  // Reduce thread-local outputs
253  #pragma omp critical
254  {
255  for (int i = 0; i < D; i++) {
256  output[i] += local_output[i];
257  }
258  }
259  /* No free - stack buffer auto-deallocates */
260  }
261 
262 #else
263  // Scalar fallback (same algorithm, no SIMD)
264  if (b_down) {
265  memcpy(output, b_down, D * sizeof(float));
266  } else {
267  memset(output, 0, D * sizeof(float));
268  }
269 
270  for (int t = 0; t < Hff; t += MLP_TILE_SIZE) {
271  int tile_end = (t + MLP_TILE_SIZE < Hff) ? t + MLP_TILE_SIZE : Hff;
272  int tile_size = tile_end - t;
273 
274  float swiglu_tile[MLP_TILE_SIZE];
275 
276  for (int j = t; j < tile_end; j++) {
277  float gate = 0.0f;
278  float up = 0.0f;
279 
280  for (int k = 0; k < D; k++) {
281  gate += x[k] * W_gate[j * D + k];
282  up += x[k] * W_up[j * D + k];
283  }
284 
285  if (b_gate) gate += b_gate[j];
286  if (b_up) up += b_up[j];
287 
288  swiglu_tile[j - t] = silu_scalar(gate) * up;
289  }
290 
291  for (int i = 0; i < D; i++) {
292  for (int j = 0; j < tile_size; j++) {
293  output[i] += swiglu_tile[j] * W_down[i * Hff + t + j];
294  }
295  }
296  }
297 #endif
298 }
#define MLP_TILE_SIZE
static float silu_scalar(float x)
__attribute__((visibility("default"))) CKTokenizer *ck_tokenizer_create(CKTokenizerType type)

References __attribute__(), MLP_TILE_SIZE, and silu_scalar().

◆ fused_mlp_swiglu_decode_tiled()

void fused_mlp_swiglu_decode_tiled ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  W_down,
const float *  b_gate,
const float *  b_up,
const float *  b_down,
float *  output,
int  D,
int  Hff 
)

Definition at line 429 of file mlp_fused_decode.c.

440 {
441  // Tile size chosen to fit in L2 with W_down tile
442  // Tile of swiglu: 256 floats = 1KB
443  // Tile of W_down: 256 * D floats = 256 * 896 * 4 = 896KB
444  // Fits in 2MB L2 with room for x and prefetch
445  const int TILE = 256;
446 
447 #if defined(__AVX512F__)
448  // Initialize output
449  #pragma omp parallel for schedule(static)
450  for (int i = 0; i < D; i++) {
451  output[i] = b_down ? b_down[i] : 0.0f;
452  }
453 
454  // Process tiles of intermediate dimension
455  for (int t = 0; t < Hff; t += TILE) {
456  int tile_end = (t + TILE < Hff) ? t + TILE : Hff;
457  int tile_size = tile_end - t;
458 
459  // Compute swiglu tile
460  float swiglu_tile[256] __attribute__((aligned(64)));
461 
462  #pragma omp parallel for schedule(static)
463  for (int jj = 0; jj < tile_size; jj++) {
464  int j = t + jj;
465  const float *wg_row = &W_gate[j * D];
466  const float *wu_row = &W_up[j * D];
467 
468  __m512 gate_acc = _mm512_setzero_ps();
469  __m512 up_acc = _mm512_setzero_ps();
470 
471  int k = 0;
472  for (; k <= D - 16; k += 16) {
473  __m512 x_vec = _mm512_loadu_ps(&x[k]);
474  __m512 wg_vec = _mm512_loadu_ps(&wg_row[k]);
475  __m512 wu_vec = _mm512_loadu_ps(&wu_row[k]);
476 
477  gate_acc = _mm512_fmadd_ps(x_vec, wg_vec, gate_acc);
478  up_acc = _mm512_fmadd_ps(x_vec, wu_vec, up_acc);
479  }
480 
481  float gate = hsum512_ps(gate_acc);
482  float up = hsum512_ps(up_acc);
483 
484  for (; k < D; k++) {
485  gate += x[k] * wg_row[k];
486  up += x[k] * wu_row[k];
487  }
488 
489  if (b_gate) gate += b_gate[j];
490  if (b_up) up += b_up[j];
491 
492  swiglu_tile[jj] = silu_scalar(gate) * up;
493  }
494 
495  // Accumulate into output (parallelize over D)
496  #pragma omp parallel for schedule(static)
497  for (int i = 0; i < D; i++) {
498  const float *wd_row = &W_down[i * Hff + t];
499 
500  __m512 acc = _mm512_setzero_ps();
501  int j = 0;
502  for (; j <= tile_size - 16; j += 16) {
503  __m512 sw_vec = _mm512_loadu_ps(&swiglu_tile[j]);
504  __m512 wd_vec = _mm512_loadu_ps(&wd_row[j]);
505  acc = _mm512_fmadd_ps(sw_vec, wd_vec, acc);
506  }
507 
508  float sum = hsum512_ps(acc);
509  for (; j < tile_size; j++) {
510  sum += swiglu_tile[j] * wd_row[j];
511  }
512 
513  // Atomic add (or use thread-local buffers for better perf)
514  #pragma omp atomic
515  output[i] += sum;
516  }
517  }
518 
519 #else
520  // Scalar fallback
521  for (int i = 0; i < D; i++) {
522  output[i] = b_down ? b_down[i] : 0.0f;
523  }
524 
525  for (int t = 0; t < Hff; t += TILE) {
526  int tile_end = (t + TILE < Hff) ? t + TILE : Hff;
527 
528  float swiglu_tile[256];
529 
530  for (int j = t; j < tile_end; j++) {
531  float gate = 0.0f, up = 0.0f;
532  for (int k = 0; k < D; k++) {
533  gate += x[k] * W_gate[j * D + k];
534  up += x[k] * W_up[j * D + k];
535  }
536  if (b_gate) gate += b_gate[j];
537  if (b_up) up += b_up[j];
538  swiglu_tile[j - t] = silu_scalar(gate) * up;
539  }
540 
541  for (int i = 0; i < D; i++) {
542  for (int j = t; j < tile_end; j++) {
543  output[i] += swiglu_tile[j - t] * W_down[i * Hff + j];
544  }
545  }
546  }
547 #endif
548 }

References __attribute__(), and silu_scalar().

Referenced by fused_mlp_swiglu_decode_v2().

◆ fused_mlp_swiglu_decode_v2()

void fused_mlp_swiglu_decode_v2 ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  W_down,
const float *  b_gate,
const float *  b_up,
const float *  b_down,
float *  output,
int  D,
int  Hff 
)

Definition at line 318 of file mlp_fused_decode.c.

329 {
330  // For large Hff, use tiled version to avoid stack overflow
331  if (Hff > MAX_SWIGLU_STACK) {
332  fused_mlp_swiglu_decode_tiled(x, W_gate, W_up, W_down,
333  b_gate, b_up, b_down, output, D, Hff);
334  return;
335  }
336 
337 #if defined(__AVX512F__)
338  // Stack-allocated swiglu buffer (max 32KB)
339  float swiglu[MAX_SWIGLU_STACK] __attribute__((aligned(64)));
340 
341  // Phase 1: Compute all swiglu values (parallelize over Hff)
342  #pragma omp parallel for schedule(static)
343  for (int j = 0; j < Hff; j++) {
344  const float *wg_row = &W_gate[j * D];
345  const float *wu_row = &W_up[j * D];
346 
347  __m512 gate_acc = _mm512_setzero_ps();
348  __m512 up_acc = _mm512_setzero_ps();
349 
350  int k = 0;
351  for (; k <= D - 16; k += 16) {
352  __m512 x_vec = _mm512_loadu_ps(&x[k]);
353  __m512 wg_vec = _mm512_loadu_ps(&wg_row[k]);
354  __m512 wu_vec = _mm512_loadu_ps(&wu_row[k]);
355 
356  gate_acc = _mm512_fmadd_ps(x_vec, wg_vec, gate_acc);
357  up_acc = _mm512_fmadd_ps(x_vec, wu_vec, up_acc);
358  }
359 
360  float gate = hsum512_ps(gate_acc);
361  float up = hsum512_ps(up_acc);
362 
363  for (; k < D; k++) {
364  gate += x[k] * wg_row[k];
365  up += x[k] * wu_row[k];
366  }
367 
368  if (b_gate) gate += b_gate[j];
369  if (b_up) up += b_up[j];
370 
371  swiglu[j] = silu_scalar(gate) * up;
372  }
373 
374  // Phase 2: Down projection (parallelize over D)
375  #pragma omp parallel for schedule(static)
376  for (int i = 0; i < D; i++) {
377  const float *wd_row = &W_down[i * Hff];
378 
379  __m512 acc = _mm512_setzero_ps();
380  int j = 0;
381  for (; j <= Hff - 16; j += 16) {
382  __m512 sw_vec = _mm512_loadu_ps(&swiglu[j]);
383  __m512 wd_vec = _mm512_loadu_ps(&wd_row[j]);
384  acc = _mm512_fmadd_ps(sw_vec, wd_vec, acc);
385  }
386 
387  float sum = hsum512_ps(acc);
388  for (; j < Hff; j++) {
389  sum += swiglu[j] * wd_row[j];
390  }
391 
392  output[i] = sum + (b_down ? b_down[i] : 0.0f);
393  }
394 
395 #else
396  // Scalar fallback with stack buffer
397  float swiglu[MAX_SWIGLU_STACK];
398 
399  for (int j = 0; j < Hff; j++) {
400  float gate = 0.0f, up = 0.0f;
401  for (int k = 0; k < D; k++) {
402  gate += x[k] * W_gate[j * D + k];
403  up += x[k] * W_up[j * D + k];
404  }
405  if (b_gate) gate += b_gate[j];
406  if (b_up) up += b_up[j];
407  swiglu[j] = silu_scalar(gate) * up;
408  }
409 
410  for (int i = 0; i < D; i++) {
411  float sum = 0.0f;
412  for (int j = 0; j < Hff; j++) {
413  sum += swiglu[j] * W_down[i * Hff + j];
414  }
415  output[i] = sum + (b_down ? b_down[i] : 0.0f);
416  }
417 #endif
418 }
void fused_mlp_swiglu_decode_tiled(const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *b_gate, const float *b_up, const float *b_down, float *output, int D, int Hff)
#define MAX_SWIGLU_STACK

References __attribute__(), fused_mlp_swiglu_decode_tiled(), MAX_SWIGLU_STACK, and silu_scalar().

Referenced by ck_mlp_swiglu_forward_fully_fused_token().

◆ fused_mlp_swiglu_prefill()

void fused_mlp_swiglu_prefill ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  W_down,
float *  output,
int  seq_len,
int  hidden,
int  intermediate,
float *  scratch 
)

Fused MLP (Gate + Up + SwiGLU + Down) for prefill.

Tiles along token dimension to keep gate/up/hidden in L3 cache.

Parameters
scratch — Temporary buffer sized by fused_mlp_swiglu_scratch_size()

Definition at line 879 of file prefill_fused_gemm.c.

889 {
890  fused_mlp_swiglu_prefill_bias(x, W_gate, W_up, W_down,
891  NULL, NULL, NULL,
892  output, seq_len, hidden, intermediate,
893  scratch);
894 }
void fused_mlp_swiglu_prefill_bias(const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *B_gate, const float *B_up, const float *B_down, float *output, int seq_len, int hidden, int intermediate, float *scratch)
Fused MLP for prefill with proper tiling.

References fused_mlp_swiglu_prefill_bias().

◆ fused_mlp_swiglu_prefill_bias()

void fused_mlp_swiglu_prefill_bias ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  W_down,
const float *  B_gate,
const float *  B_up,
const float *  B_down,
float *  output,
int  seq_len,
int  hidden,
int  intermediate,
float *  scratch 
)

Fused MLP (Gate + Up + SwiGLU + Down) for prefill with biases.

Computes gate = x @ W_gate and up = x @ W_up, applies SwiGLU (hidden = silu(gate) * up), then the down projection — tiling along the intermediate dimension so the full seq_len × intermediate activation buffer is never materialized.

Definition at line 746 of file prefill_fused_gemm.c.

759 {
760  /* MLP is more complex because we have:
761  * gate = x @ W_gate
762  * up = x @ W_up
763  * hidden = silu(gate) * up
764  * out = hidden @ W_down
765  *
766  * The intermediate (gate, up, hidden) is large: seq_len × intermediate
767  * For Qwen2-0.5B: 1024 × 4864 × 4 = 19.4MB (way bigger than L3!)
768  *
769  * Strategy: Tile along intermediate dimension for gate/up,
770  * then fuse SwiGLU, then tile down projection.
771  */
772 
773  /* scratch layout:
774  * [gate_tile: TILE_M × TILE_N_INTER]
775  * [up_tile: TILE_M × TILE_N_INTER]
776  */
777  const int TILE_N_INTER = 512; /* Intermediate tile size */
778  float *gate_tile = scratch;
779  float *up_tile = scratch + (size_t)PREFILL_TILE_M * TILE_N_INTER;
780  float *hidden_tile = gate_tile; /* Reuse gate_tile for hidden after SwiGLU */
781 
782  /* For each chunk of intermediate dimension */
783  for (int inter_start = 0; inter_start < intermediate; inter_start += TILE_N_INTER) {
784  int tile_inter = (inter_start + TILE_N_INTER <= intermediate)
785  ? TILE_N_INTER : (intermediate - inter_start);
786 
787  const float *W_gate_tile = W_gate + (size_t)inter_start * hidden;
788  const float *W_up_tile = W_up + (size_t)inter_start * hidden;
789 
790  /* For each chunk of tokens */
791  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
792  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
793  ? PREFILL_TILE_M : (seq_len - m_start);
794 
795  const float *x_tile = x + (size_t)m_start * hidden;
796 
797  /* Compute gate and up projections for this tile */
798  gemm_tile_nt_strided(x_tile, W_gate_tile, gate_tile,
799  tile_m, tile_inter, hidden, tile_inter);
800  gemm_tile_nt_strided(x_tile, W_up_tile, up_tile,
801  tile_m, tile_inter, hidden, tile_inter);
802  if (B_gate) {
803  add_bias_tile(gate_tile, B_gate + inter_start, tile_m, tile_inter);
804  }
805  if (B_up) {
806  add_bias_tile(up_tile, B_up + inter_start, tile_m, tile_inter);
807  }
808 
809  /* Fused SwiGLU: hidden = silu(gate) * up */
810  for (int i = 0; i < tile_m; ++i) {
811  float *g = gate_tile + (size_t)i * tile_inter;
812  float *u = up_tile + (size_t)i * tile_inter;
813  for (int j = 0; j < tile_inter; ++j) {
814  float gv = g[j];
815  float silu = gv / (1.0f + expf(-gv));
816  g[j] = silu * u[j]; /* hidden_tile = gate_tile */
817  }
818  }
819 
820  /* Down projection: accumulate into output
821  * out[m_start:, :] += hidden_tile @ W_down[inter_start:, :]^T
822  */
823  const float *W_down_slice = W_down + (size_t)inter_start; /* Column slice */
824  float *out_tile = output + (size_t)m_start * hidden;
825 
826  /* This is trickier - W_down is [hidden × intermediate]
827  * We have hidden_tile[tile_m × tile_inter]
828  * We want out[tile_m × hidden] += hidden_tile × W_down[:, inter_start:inter_start+tile_inter]^T
829  *
830  * For proper accumulation, need to handle this carefully.
831  * For now, use a simpler approach: accumulate partial results.
832  */
833  for (int i = 0; i < tile_m; ++i) {
834  float *h = hidden_tile + (size_t)i * tile_inter;
835  float *o = out_tile + (size_t)i * hidden;
836 
837  for (int d = 0; d < hidden; ++d) {
838  const float *w_row = W_down + (size_t)d * intermediate + inter_start;
839  float sum = (inter_start == 0)
840  ? (B_down ? B_down[d] : 0.0f)
841  : o[d];
842 
843 #if defined(__AVX512F__)
844  __m512 acc = _mm512_setzero_ps();
845  int j = 0;
846  for (; j + 16 <= tile_inter; j += 16) {
847  __m512 hv = _mm512_loadu_ps(h + j);
848  __m512 wv = _mm512_loadu_ps(w_row + j);
849  acc = _mm512_fmadd_ps(hv, wv, acc);
850  }
851  sum += _mm512_reduce_add_ps(acc);
852  for (; j < tile_inter; ++j) {
853  sum += h[j] * w_row[j];
854  }
855 #elif defined(__AVX__)
856  __m256 acc = _mm256_setzero_ps();
857  int j = 0;
858  for (; j + 8 <= tile_inter; j += 8) {
859  __m256 hv = _mm256_loadu_ps(h + j);
860  __m256 wv = _mm256_loadu_ps(w_row + j);
861  acc = _mm256_add_ps(acc, _mm256_mul_ps(hv, wv));
862  }
863  sum += hsum256_prefill(acc);
864  for (; j < tile_inter; ++j) {
865  sum += h[j] * w_row[j];
866  }
867 #else
868  for (int j = 0; j < tile_inter; ++j) {
869  sum += h[j] * w_row[j];
870  }
871 #endif
872  o[d] = sum;
873  }
874  }
875  }
876  }
877 }
#define PREFILL_TILE_M
static void add_bias_tile(float *out, const float *bias, int tile_m, int out_dim)
static void gemm_tile_nt_strided(const float *A, const float *B_tile, float *C, int tile_m, int tile_n, int K, int C_stride)
GEMM tile with N-dimension tiling (weight reuse)
static void silu(float *x, int n)
Definition: v6_simple.c:159

References add_bias_tile(), gemm_tile_nt_strided(), PREFILL_TILE_M, and silu().

Referenced by fused_mlp_swiglu_prefill().

◆ fused_mlp_swiglu_prefill_w1w2_quant()

void fused_mlp_swiglu_prefill_w1w2_quant ( const float *  x,
const void *  W1,
const float *  B1,
CKDataType  w1_dt,
const void *  W2,
const float *  B2,
CKDataType  w2_dt,
float *  output,
int  seq_len,
int  embed_dim,
int  aligned_embed_dim,
int  intermediate_dim,
int  aligned_intermediate_dim,
void *  scratch 
)

Quantized fused MLP for prefill (W1=gate+up, W2=down)

W1 uses Q8_0 activations (Q5_0/Q8_0 weights), W2 uses Q8_K activations (Q4_K/Q6_K weights).

Uses Q8_0 activations for W1 (Q5_0/Q8_0 weights) and Q8_K activations for W2 (Q4_K/Q6_K weights).

Definition at line 965 of file prefill_fused_gemm.c.

980 {
981  if (!x || !W1 || !W2 || !output || !scratch) {
982  return;
983  }
984  if (seq_len <= 0 || embed_dim <= 0 || aligned_embed_dim <= 0 ||
985  intermediate_dim <= 0 || aligned_intermediate_dim <= 0) {
986  return;
987  }
988  if (aligned_embed_dim < embed_dim || aligned_intermediate_dim < intermediate_dim) {
989  return;
990  }
991  if ((aligned_embed_dim % 32) != 0 || (aligned_intermediate_dim % 256) != 0) {
992  return;
993  }
994  if (!mlp_q8_0_dtype_supported(w1_dt) || !mlp_q8_k_dtype_supported(w2_dt)) {
995  return;
996  }
997 
998  const int tile_m_max = PREFILL_TILE_M;
999  const int inter = aligned_intermediate_dim;
1000  const size_t q8_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_0, (size_t)aligned_embed_dim);
1001  const size_t q8k_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_K, (size_t)aligned_intermediate_dim);
1002  const size_t w1_row_bytes = ck_dtype_row_bytes(w1_dt, (size_t)aligned_embed_dim);
1003 
1004  uint8_t *scratch_bytes = (uint8_t *)scratch;
1005  size_t q8_bytes = (size_t)tile_m_max * q8_row_bytes;
1006  size_t gate_bytes = (size_t)tile_m_max * (size_t)inter * sizeof(float);
1007  size_t up_bytes = gate_bytes;
1008  size_t gate_offset = align_up_size(q8_bytes, 64);
1009  size_t up_offset = gate_offset + align_up_size(gate_bytes, 64);
1010  size_t q8k_offset = up_offset + align_up_size(up_bytes, 64);
1011 
1012  uint8_t *q8_tile = scratch_bytes;
1013  float *gate_tile = (float *)(scratch_bytes + gate_offset);
1014  float *up_tile = (float *)(scratch_bytes + up_offset);
1015  uint8_t *q8k_tile = scratch_bytes + q8k_offset;
1016 
1017  const uint8_t *w1_base = (const uint8_t *)W1;
1018  const uint8_t *w_gate = w1_base;
1019  const uint8_t *w_up = w1_base + (size_t)inter * w1_row_bytes;
1020 
1021  const float *b_gate = B1;
1022  const float *b_up = B1 ? (B1 + (size_t)inter) : NULL;
1023 
1024  for (int m_start = 0; m_start < seq_len; m_start += tile_m_max) {
1025  int tile_m = (m_start + tile_m_max <= seq_len)
1026  ? tile_m_max : (seq_len - m_start);
1027 
1028  const float *x_tile = x + (size_t)m_start * (size_t)aligned_embed_dim;
1029  float *out_tile = output + (size_t)m_start * (size_t)aligned_embed_dim;
1030 
1031  for (int t = 0; t < tile_m; ++t) {
1032  const float *row = x_tile + (size_t)t * (size_t)aligned_embed_dim;
1033  quantize_row_q8_0(row,
1034  q8_tile + (size_t)t * q8_row_bytes,
1035  aligned_embed_dim);
1036  }
1037 
1038  gemm_nt_q8_0_mlp_dispatch(q8_tile, w_gate, b_gate, gate_tile,
1039  tile_m, inter, aligned_embed_dim, w1_dt);
1040  gemm_nt_q8_0_mlp_dispatch(q8_tile, w_up, b_up, up_tile,
1041  tile_m, inter, aligned_embed_dim, w1_dt);
1042 
1043  for (int i = 0; i < tile_m; ++i) {
1044  float *g = gate_tile + (size_t)i * (size_t)inter;
1045  float *u = up_tile + (size_t)i * (size_t)inter;
1046  for (int j = 0; j < inter; ++j) {
1047  g[j] = silu_prefill(g[j]) * u[j];
1048  }
1049  }
1050 
1051  for (int i = 0; i < tile_m; ++i) {
1052  const float *row = gate_tile + (size_t)i * (size_t)inter;
1053  quantize_row_q8_k(row,
1054  q8k_tile + (size_t)i * q8k_row_bytes,
1055  aligned_intermediate_dim);
1056  }
1057 
1058  gemm_nt_q8_k_mlp_dispatch(q8k_tile, W2, B2, out_tile,
1059  tile_m, aligned_embed_dim, aligned_intermediate_dim, w2_dt);
1060  }
1061 }
@ CK_DT_Q8_K
Definition: ckernel_dtype.h:43
void quantize_row_q8_k(const float *x, void *y, int k)
void quantize_row_q8_0(const float *x, void *y, int k)
Quantize FP32 to Q8_0 format (scalar reference)
static void gemm_nt_q8_0_mlp_dispatch(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)
static size_t align_up_size(size_t value, size_t align)
static int mlp_q8_k_dtype_supported(CKDataType dt)
static float silu_prefill(float x)
static int mlp_q8_0_dtype_supported(CKDataType dt)
static void gemm_nt_q8_k_mlp_dispatch(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)

References align_up_size(), CK_DT_Q8_0, CK_DT_Q8_K, ck_dtype_row_bytes(), gemm_nt_q8_0_mlp_dispatch(), gemm_nt_q8_k_mlp_dispatch(), mlp_q8_0_dtype_supported(), mlp_q8_k_dtype_supported(), PREFILL_TILE_M, quantize_row_q8_0(), quantize_row_q8_k(), and silu_prefill().

Referenced by mega_fused_outproj_mlp_prefill().

◆ fused_mlp_swiglu_prefill_w1w2_quant_scratch_size()

size_t fused_mlp_swiglu_prefill_w1w2_quant_scratch_size ( int  aligned_embed_dim,
int  aligned_intermediate_dim 
)

Get scratch buffer size for fused_mlp_swiglu_prefill_w1w2_quant.

Definition at line 1063 of file prefill_fused_gemm.c.

1065 {
1066  if (aligned_embed_dim <= 0 || aligned_intermediate_dim <= 0) {
1067  return 0;
1068  }
1069  const size_t q8_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_0, (size_t)aligned_embed_dim);
1070  const size_t q8k_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_K, (size_t)aligned_intermediate_dim);
1071  const size_t q8_bytes = (size_t)PREFILL_TILE_M * q8_row_bytes;
1072  const size_t gate_bytes = (size_t)PREFILL_TILE_M * (size_t)aligned_intermediate_dim * sizeof(float);
1073  const size_t up_bytes = gate_bytes;
1074  const size_t q8k_bytes = (size_t)PREFILL_TILE_M * q8k_row_bytes;
1075 
1076  return align_up_size(q8_bytes, 64) +
1077  align_up_size(gate_bytes, 64) +
1078  align_up_size(up_bytes, 64) +
1079  align_up_size(q8k_bytes, 64);
1080 }

References align_up_size(), CK_DT_Q8_0, CK_DT_Q8_K, ck_dtype_row_bytes(), and PREFILL_TILE_M.

Referenced by mega_fused_outproj_mlp_prefill_scratch_size().

◆ fused_mlp_swiglu_scratch_size()

size_t fused_mlp_swiglu_scratch_size ( int  intermediate)

Get scratch buffer size for fused_mlp_swiglu_prefill.

Returns the required scratch size in bytes: two PREFILL_TILE_M × TILE_N_INTER float tiles (gate_tile and up_tile).

Definition at line 899 of file prefill_fused_gemm.c.

899  {
900  const int TILE_N_INTER = 512;
901  /* gate_tile + up_tile */
902  return 2 * (size_t)PREFILL_TILE_M * TILE_N_INTER * sizeof(float);
903 }

References PREFILL_TILE_M.

◆ fused_rmsnorm_qkv_prefill()

void fused_rmsnorm_qkv_prefill ( const float *  x,
const float *  gamma,
const float *  Wq,
const float *  Wk,
const float *  Wv,
float *  Q,
float *  K,
float *  V,
int  seq_len,
int  hidden,
int  q_dim,
int  kv_dim,
float  eps,
float *  scratch 
)

Fused RMSNorm + QKV projection for prefill.

Tiles along token dimension to keep intermediate x_norm in L2 cache. Avoids ~7MB DRAM traffic per layer for seq_len=1024, hidden=896.

Parameters
scratch — Temporary buffer sized by fused_rmsnorm_qkv_scratch_size()

Fused RMSNorm + QKV projection for prefill.

KEY INSIGHT: For Qwen2-0.5B, all QKV weights fit in L3: Wq (896×896) + Wk (128×896) + Wv (128×896) = 4.1MB < 6MB L3

So we use M-tiling (tokens) only:

  1. For each token tile: a. Compute RMSNorm ONCE into scratch (x_norm stays in L2) b. Do all three GEMMs (Q, K, V) against cached x_norm c. Weights stay hot in L3 across all token tiles

This avoids both:

  • Large x_norm intermediate buffer (only TILE_M × hidden in L2)
  • RMSNorm recomputation (done once per token tile, used 3×)

Definition at line 393 of file prefill_fused_gemm.c.

408 {
409  /* scratch is x_norm tile: [TILE_M × hidden] fits in L2 */
410 
411  /* Process token tiles - weights stay in L3 across all tiles */
412  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
413  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
414  ? PREFILL_TILE_M : (seq_len - m_start);
415 
416  const float *x_tile = x + (size_t)m_start * hidden;
417 
418  /* Step 1: RMSNorm for this token tile (computed ONCE, used 3×) */
419  rmsnorm_tile(x_tile, gamma, scratch, tile_m, hidden, hidden, eps);
420 
421  /* Step 2: Q projection - x_norm is hot in L2, Wq hot in L3 */
422  float *Q_tile = Q + (size_t)m_start * q_dim;
423  gemm_tile_nt_strided(scratch, Wq, Q_tile, tile_m, q_dim, hidden, q_dim);
424 
425  /* Step 3: K projection - x_norm still hot, Wk displaces some Wq */
426  float *K_tile = K + (size_t)m_start * kv_dim;
427  gemm_tile_nt_strided(scratch, Wk, K_tile, tile_m, kv_dim, hidden, kv_dim);
428 
429  /* Step 4: V projection - x_norm still hot, Wv displaces Wk */
430  float *V_tile = V + (size_t)m_start * kv_dim;
431  gemm_tile_nt_strided(scratch, Wv, V_tile, tile_m, kv_dim, hidden, kv_dim);
432  }
433 }
static void rmsnorm_tile(const float *input, const float *gamma, float *output, int tile_m, int embed_dim, int aligned_embed_dim, float eps)
Compute RMSNorm for a tile of tokens.

References gemm_tile_nt_strided(), PREFILL_TILE_M, and rmsnorm_tile().

◆ fused_rmsnorm_qkv_prefill_head_major()

void fused_rmsnorm_qkv_prefill_head_major ( const float *  x,
const float *  gamma,
const float *  Wq,
const float *  Bq,
const float *  Wk,
const float *  Bk,
const float *  Wv,
const float *  Bv,
float *  Q,
float *  K,
float *  V,
int  seq_len,
int  embed_dim,
int  aligned_embed_dim,
int  num_heads,
int  num_kv_heads,
int  head_dim,
int  aligned_head_dim,
int  kv_stride_tokens,
float  eps,
float *  scratch 
)

Fused RMSNorm + QKV projection for prefill (head-major outputs)

Writes Q as [num_heads, seq_len, aligned_head_dim] and K/V with stride kv_stride_tokens for KV-cache compatibility.

Q is written as [num_heads, seq_len, aligned_head_dim]. K/V are written with kv_stride_tokens for KV-cache compatibility.

Definition at line 441 of file prefill_fused_gemm.c.

460 {
461  if (!x || !gamma || !Wq || !Wk || !Wv || !Q || !K || !V || !scratch) {
462  return;
463  }
464  if (seq_len <= 0 || embed_dim <= 0 || aligned_embed_dim <= 0 ||
465  head_dim <= 0 || aligned_head_dim <= 0 ||
466  num_heads <= 0 || num_kv_heads <= 0) {
467  return;
468  }
469  if (kv_stride_tokens < seq_len) {
470  return;
471  }
472 
473  const size_t q_head_stride = (size_t)seq_len * (size_t)aligned_head_dim;
474  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
475  const size_t head_w_stride = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
476 
477  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
478  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
479  ? PREFILL_TILE_M : (seq_len - m_start);
480 
481  const float *x_tile = x + (size_t)m_start * (size_t)aligned_embed_dim;
482  rmsnorm_tile(x_tile, gamma, scratch, tile_m, embed_dim, aligned_embed_dim, eps);
483 
484  for (int h = 0; h < num_heads; ++h) {
485  const float *wq_h = Wq + (size_t)h * head_w_stride;
486  const float *bq_h = Bq ? (Bq + (size_t)h * (size_t)aligned_head_dim) : NULL;
487  float *q_h = Q + (size_t)h * q_head_stride + (size_t)m_start * (size_t)aligned_head_dim;
488 
489  gemm_tile_nt_strided(scratch, wq_h, q_h,
490  tile_m, aligned_head_dim, aligned_embed_dim, aligned_head_dim);
491  add_bias_tile(q_h, bq_h, tile_m, aligned_head_dim);
492  }
493 
494  for (int h = 0; h < num_kv_heads; ++h) {
495  const float *wk_h = Wk + (size_t)h * head_w_stride;
496  const float *wv_h = Wv + (size_t)h * head_w_stride;
497  const float *bk_h = Bk ? (Bk + (size_t)h * (size_t)aligned_head_dim) : NULL;
498  const float *bv_h = Bv ? (Bv + (size_t)h * (size_t)aligned_head_dim) : NULL;
499  float *k_h = K + (size_t)h * kv_head_stride + (size_t)m_start * (size_t)aligned_head_dim;
500  float *v_h = V + (size_t)h * kv_head_stride + (size_t)m_start * (size_t)aligned_head_dim;
501 
502  gemm_tile_nt_strided(scratch, wk_h, k_h,
503  tile_m, aligned_head_dim, aligned_embed_dim, aligned_head_dim);
504  add_bias_tile(k_h, bk_h, tile_m, aligned_head_dim);
505 
506  gemm_tile_nt_strided(scratch, wv_h, v_h,
507  tile_m, aligned_head_dim, aligned_embed_dim, aligned_head_dim);
508  add_bias_tile(v_h, bv_h, tile_m, aligned_head_dim);
509  }
510  }
511 }

References add_bias_tile(), gemm_tile_nt_strided(), PREFILL_TILE_M, and rmsnorm_tile().

Referenced by mega_fused_attention_prefill(), and mega_fused_attention_prefill_q8_0().

◆ fused_rmsnorm_qkv_prefill_head_major_quant()

/**
 * Fused RMSNorm + QKV projection for prefill (head-major, quantized weights,
 * Q8 activations).
 *
 * Per tile of up to PREFILL_TILE_M tokens: RMSNorm the input, quantize the
 * normalized rows (Q8_0 activations for Q5_0/Q8_0 weights, Q8_K activations
 * for Q4_K/Q6_K weights), then run one quantized GEMM per head for Q, K and V.
 * K and V rows are placed at stride kv_stride_tokens so they land directly in
 * the KV-cache layout.
 *
 * All three weight tensors must belong to the same quantization family
 * (all Q8_0-family or all Q8_K-family); mixed families are rejected.
 * On any invalid argument the function silently returns without writing.
 *
 * @param x        fp32 input, [seq_len, aligned_embed_dim]
 * @param gamma    RMSNorm scale, embed_dim entries
 * @param Wq,Bq,wq_dt  per-head query weight slabs / optional bias / dtype
 * @param Wk,Bk,wk_dt  per-head key weight slabs / optional bias / dtype
 * @param Wv,Bv,wv_dt  per-head value weight slabs / optional bias / dtype
 * @param Q        output, [num_heads, seq_len, aligned_head_dim]
 * @param K,V      output, [num_kv_heads, kv_stride_tokens, aligned_head_dim]
 * @param scratch  caller buffer sized by
 *                 fused_rmsnorm_qkv_prefill_head_major_quant_scratch_size()
 */
void fused_rmsnorm_qkv_prefill_head_major_quant(
    const float *x, const float *gamma,
    const void *Wq, const float *Bq, CKDataType wq_dt,
    const void *Wk, const float *Bk, CKDataType wk_dt,
    const void *Wv, const float *Bv, CKDataType wv_dt,
    float *Q, float *K, float *V,
    int seq_len, int embed_dim, int aligned_embed_dim,
    int num_heads, int num_kv_heads,
    int head_dim, int aligned_head_dim,
    int kv_stride_tokens, float eps, void *scratch)
{
    if (!x || !gamma || !Wq || !Wk || !Wv || !Q || !K || !V || !scratch) {
        return;
    }
    if (seq_len <= 0 || embed_dim <= 0 || aligned_embed_dim <= 0 ||
        head_dim <= 0 || aligned_head_dim <= 0 ||
        num_heads <= 0 || num_kv_heads <= 0) {
        return;
    }
    /* Q8 row quantization works on 32-element blocks. */
    if (aligned_embed_dim % 32 != 0) {
        return;
    }
    /* Every token's K/V row must fit inside the cache stride. */
    if (kv_stride_tokens < seq_len) {
        return;
    }
    /* Determine quantization path: Q8_0 activations for Q5_0/Q8_0 weights,
     * Q8_K activations for Q4_K/Q6_K weights. All QKV weights must use
     * the same quantization family. */
    int use_q8_k_path = qkv_q8_k_dtype_supported(wq_dt);
    int use_q8_0_path = qkv_q8_0_dtype_supported(wq_dt);

    if (!use_q8_k_path && !use_q8_0_path) {
        /* Unsupported dtype for wq */
        return;
    }

    /* Verify all dtypes are from the same family */
    if (use_q8_k_path) {
        if (!qkv_q8_k_dtype_supported(wk_dt) || !qkv_q8_k_dtype_supported(wv_dt)) {
            return; /* Mixed Q8_K and Q8_0 paths not supported */
        }
    } else {
        if (!qkv_q8_0_dtype_supported(wk_dt) || !qkv_q8_0_dtype_supported(wv_dt)) {
            return;
        }
    }

    /* Scratch layout: fp32 normalized tile first, then (64-byte aligned)
     * the quantized activation tile. */
    const size_t float_bytes = (size_t)PREFILL_TILE_M * (size_t)aligned_embed_dim * sizeof(float);
    /* Q8_K has larger blocks (256) than Q8_0 (32), so use appropriate size */
    const CKDataType act_quant_type = use_q8_k_path ? CK_DT_Q8_K : CK_DT_Q8_0;
    const size_t q8_row_bytes = ck_dtype_row_bytes(act_quant_type, (size_t)aligned_embed_dim);
    const size_t q8_bytes = (size_t)PREFILL_TILE_M * q8_row_bytes;
    const size_t q8_offset = align_up_size(float_bytes, 64);

    float *normed = (float *)scratch;
    uint8_t *q8_tile = (uint8_t *)scratch + q8_offset;
    /* Size computed only for documentation; the scratch_size helper
     * guarantees the buffer is large enough. */
    (void)q8_bytes;

    /* Per-head strides: Q is packed by seq_len, K/V by the cache stride;
     * weight slabs are byte-strided because rows are quantized. */
    const size_t q_head_stride = (size_t)seq_len * (size_t)aligned_head_dim;
    const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
    const size_t head_w_elems = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
    const size_t wq_head_bytes = ck_dtype_row_bytes(wq_dt, head_w_elems);
    const size_t wk_head_bytes = ck_dtype_row_bytes(wk_dt, head_w_elems);
    const size_t wv_head_bytes = ck_dtype_row_bytes(wv_dt, head_w_elems);

    for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
        int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
                         ? PREFILL_TILE_M : (seq_len - m_start);

        const float *x_tile = x + (size_t)m_start * (size_t)aligned_embed_dim;
        rmsnorm_tile(x_tile, gamma, normed, tile_m, embed_dim, aligned_embed_dim, eps);

        /* Quantize activations to appropriate format */
        for (int t = 0; t < tile_m; ++t) {
            const float *row = normed + (size_t)t * (size_t)aligned_embed_dim;
            if (use_q8_k_path) {
                quantize_row_q8_k(row,
                                  q8_tile + (size_t)t * q8_row_bytes,
                                  aligned_embed_dim);
            } else {
                quantize_row_q8_0(row,
                                  q8_tile + (size_t)t * q8_row_bytes,
                                  aligned_embed_dim);
            }
        }

        /* Query projection: one GEMM per attention head. */
        for (int h = 0; h < num_heads; ++h) {
            const uint8_t *wq_h = (const uint8_t *)Wq + (size_t)h * wq_head_bytes;
            const float *bq_h = Bq ? (Bq + (size_t)h * (size_t)aligned_head_dim) : NULL;
            float *q_h = Q + (size_t)h * q_head_stride + (size_t)m_start * (size_t)aligned_head_dim;

            if (use_q8_k_path) {
                gemm_nt_q8_k_qkv_dispatch(q8_tile, wq_h, bq_h, q_h,
                                          tile_m, aligned_head_dim, aligned_embed_dim, wq_dt);
            } else {
                gemm_nt_q8_0_dispatch(q8_tile, wq_h, bq_h, q_h,
                                      tile_m, aligned_head_dim, aligned_embed_dim, wq_dt);
            }
        }

        /* Key/value projections: one GEMM pair per KV head (GQA aware). */
        for (int h = 0; h < num_kv_heads; ++h) {
            const uint8_t *wk_h = (const uint8_t *)Wk + (size_t)h * wk_head_bytes;
            const uint8_t *wv_h = (const uint8_t *)Wv + (size_t)h * wv_head_bytes;
            const float *bk_h = Bk ? (Bk + (size_t)h * (size_t)aligned_head_dim) : NULL;
            const float *bv_h = Bv ? (Bv + (size_t)h * (size_t)aligned_head_dim) : NULL;
            float *k_h = K + (size_t)h * kv_head_stride + (size_t)m_start * (size_t)aligned_head_dim;
            float *v_h = V + (size_t)h * kv_head_stride + (size_t)m_start * (size_t)aligned_head_dim;

            if (use_q8_k_path) {
                gemm_nt_q8_k_qkv_dispatch(q8_tile, wk_h, bk_h, k_h,
                                          tile_m, aligned_head_dim, aligned_embed_dim, wk_dt);
                gemm_nt_q8_k_qkv_dispatch(q8_tile, wv_h, bv_h, v_h,
                                          tile_m, aligned_head_dim, aligned_embed_dim, wv_dt);
            } else {
                gemm_nt_q8_0_dispatch(q8_tile, wk_h, bk_h, k_h,
                                      tile_m, aligned_head_dim, aligned_embed_dim, wk_dt);
                gemm_nt_q8_0_dispatch(q8_tile, wv_h, bv_h, v_h,
                                      tile_m, aligned_head_dim, aligned_embed_dim, wv_dt);
            }
        }
    }
}
CKDataType
Supported data types in C-Kernel-Engine.
Definition: ckernel_dtype.h:27
static void gemm_nt_q8_0_dispatch(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)
static int qkv_q8_k_dtype_supported(CKDataType dt)
static void gemm_nt_q8_k_qkv_dispatch(const void *A_q8k, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)
static int qkv_q8_0_dtype_supported(CKDataType dt)

References align_up_size(), CK_DT_Q8_0, CK_DT_Q8_K, ck_dtype_row_bytes(), gemm_nt_q8_0_dispatch(), gemm_nt_q8_k_qkv_dispatch(), PREFILL_TILE_M, qkv_q8_0_dtype_supported(), qkv_q8_k_dtype_supported(), quantize_row_q8_0(), quantize_row_q8_k(), and rmsnorm_tile().

Referenced by mega_fused_attention_prefill(), and mega_fused_attention_prefill_q8_0().

◆ fused_rmsnorm_qkv_prefill_head_major_quant_scratch_size()

size_t fused_rmsnorm_qkv_prefill_head_major_quant_scratch_size ( int  aligned_embed_dim)

Get scratch buffer size for fused_rmsnorm_qkv_prefill_head_major_quant.

Definition at line 651 of file prefill_fused_gemm.c.

651  {
652  if (aligned_embed_dim <= 0) {
653  return 0;
654  }
655  const size_t float_bytes = (size_t)PREFILL_TILE_M * (size_t)aligned_embed_dim * sizeof(float);
656  /* Use max of Q8_0 and Q8_K sizes to support both paths */
657  const size_t q8_0_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_0, (size_t)aligned_embed_dim);
658  const size_t q8_k_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_K, (size_t)aligned_embed_dim);
659  const size_t q8_row_bytes = (q8_k_row_bytes > q8_0_row_bytes) ? q8_k_row_bytes : q8_0_row_bytes;
660  const size_t q8_bytes = (size_t)PREFILL_TILE_M * q8_row_bytes;
661  return align_up_size(float_bytes, 64) + q8_bytes;
662 }

References align_up_size(), CK_DT_Q8_0, CK_DT_Q8_K, ck_dtype_row_bytes(), and PREFILL_TILE_M.

Referenced by mega_fused_attention_prefill(), mega_fused_attention_prefill_q8_0(), mega_fused_attention_prefill_q8_0_scratch_size(), and mega_fused_attention_prefill_scratch_size().

◆ fused_rmsnorm_qkv_scratch_size()

size_t fused_rmsnorm_qkv_scratch_size ( int  hidden)

Get scratch buffer size for fused_rmsnorm_qkv_prefill.

Get scratch buffer size for fused_rmsnorm_qkv_prefill.

Definition at line 739 of file prefill_fused_gemm.c.

739  {
740  return (size_t)PREFILL_TILE_M * hidden * sizeof(float);
741 }

References PREFILL_TILE_M.

◆ geglu_backward_fp32()

void geglu_backward_fp32 ( const float *  x,
const float *  d_out,
float *  d_x,
int  tokens,
int  dim 
)

GeGLU backward pass (fp32)

Test:
test_geglu.py::TestGeGLU::test_geglu_backward_fp32

dL/dx given dL/d(out) where out = GELU(a) * b Chain rule: dL/da = dL/dout * d(GELU)/da * b dL/db = dL/dout * GELU(a)

After changes: make test

Definition at line 843 of file gelu_kernels.c.

848 {
849  const float sqrt_2_over_pi = 0.7978845608f;
850  const float coeff = 0.044715f;
851 
852  const int inner_dim = dim * 2;
853 
854  for (int t = 0; t < tokens; ++t) {
855  const float *x_ptr = x + (size_t)t * inner_dim;
856  const float *d_out_ptr = d_out + (size_t)t * dim;
857  float *d_x_ptr = d_x + (size_t)t * inner_dim;
858 
859  for (int d = 0; d < dim; ++d) {
860  float a = x_ptr[d];
861  float b = x_ptr[dim + d];
862  float dout = d_out_ptr[d];
863 
864  // GELU(a) derivative components
865  float a2 = a * a;
866  float a3 = a2 * a;
867  float g = sqrt_2_over_pi * (a + coeff * a3);
868  float tanh_g = tanhf(g);
869  float sech2_g = 1.0f - tanh_g * tanh_g;
870  float g_prime = sqrt_2_over_pi * (1.0f + 3.0f * coeff * a2);
871 
872  // d(GELU)/da = 0.5 * (1 + tanh(g)) + 0.5 * a * sech^2(g) * g'
873  float d_gelu = 0.5f * (1.0f + tanh_g) + 0.5f * a * sech2_g * g_prime;
874 
875  // dL/da = dL/dout * d(GELU)/da * b
876  d_x_ptr[d] = dout * d_gelu * b;
877 
878  // dL/db = dL/dout * GELU(a)
879  float gelu_a = 0.5f * a * (1.0f + tanh_g);
880  d_x_ptr[dim + d] = dout * gelu_a;
881  }
882  }
883 }

◆ geglu_forward_bf16()

/**
 * GeGLU forward pass (bf16 wrapper).
 *
 * Widens the bf16 input to fp32, runs geglu_forward_fp32, and narrows the
 * result back to bf16.  `scratch` must hold 3 * tokens * dim floats:
 * the first 2*tokens*dim are the widened [a, b] input, the remaining
 * tokens*dim hold the fp32 output.  Input and output live in separate
 * regions because the input row (2*dim) is wider than the output row (dim),
 * so in-place computation would overlap when tokens > 1.
 */
void geglu_forward_bf16(const uint16_t *x, uint16_t *out,
                        int tokens, int dim, float *scratch)
{
    if (!x || !out || !scratch) {
        return;
    }

    const size_t out_elems = (size_t)tokens * (size_t)dim;
    const size_t in_elems = out_elems * 2; /* [a, b] = 2*dim per token */
    float *wide_in = scratch;
    float *wide_out = scratch + in_elems;

    bf16_tensor_to_float(x, wide_in, in_elems);
    geglu_forward_fp32(wide_in, wide_out, tokens, dim);
    float_tensor_to_bf16(wide_out, out, out_elems);
}
void geglu_forward_fp32(const float *x, float *out, int tokens, int dim)
Definition: gelu_kernels.c:623

References bf16_tensor_to_float(), float_tensor_to_bf16(), and geglu_forward_fp32().

◆ geglu_forward_fp32()

void geglu_forward_fp32 ( const float *  x,
float *  out,
int  tokens,
int  dim 
)

GeGLU forward pass (fp32)

Test:
test_geglu.py::TestGeGLU::test_geglu_forward_fp32

Computes out = GELU(a) * b where x = [a, b] along last dimension. Input shape: [tokens, 2 * dim], Output shape: [tokens, dim]

After changes: make test

Definition at line 623 of file gelu_kernels.c.

624 {
625  const float sqrt_2_over_pi = 0.7978845608f;
626  const float coeff = 0.044715f;
627 
628  const int inner_dim = dim * 2;
629 
630 #if defined(__AVX512F__)
631  const __m512 sqrt_2_pi_vec = _mm512_set1_ps(sqrt_2_over_pi);
632  const __m512 coeff_vec = _mm512_set1_ps(coeff);
633  const __m512 half_vec = _mm512_set1_ps(0.5f);
634  const __m512 one_vec = _mm512_set1_ps(1.0f);
635 
636  for (int t = 0; t < tokens; ++t) {
637  const float *x_ptr = x + (size_t)t * inner_dim;
638  float *out_ptr = out + (size_t)t * dim;
639 
640  int d = 0;
641  // Process first half (a) with GELU, second half (b) directly
642  for (; d + 32 <= dim; d += 32) {
643  // Load a (first half of inner_dim)
644  __m512 a0 = _mm512_loadu_ps(&x_ptr[d]);
645  __m512 a1 = _mm512_loadu_ps(&x_ptr[d + 16]);
646 
647  // Compute GELU(a)
648  __m512 a0_sq = _mm512_mul_ps(a0, a0);
649  __m512 a0_cu = _mm512_mul_ps(a0_sq, a0);
650  __m512 a1_sq = _mm512_mul_ps(a1, a1);
651  __m512 a1_cu = _mm512_mul_ps(a1_sq, a1);
652 
653  // inner = sqrt(2/pi) * (a + 0.044715 * a^3)
654  __m512 inner0 = _mm512_fmadd_ps(coeff_vec, a0_cu, a0);
655  __m512 inner1 = _mm512_fmadd_ps(coeff_vec, a1_cu, a1);
656  inner0 = _mm512_mul_ps(sqrt_2_pi_vec, inner0);
657  inner1 = _mm512_mul_ps(sqrt_2_pi_vec, inner1);
658 
659  // tanh(inner)
660  __m512 tanh0 = tanh512_fast(inner0);
661  __m512 tanh1 = tanh512_fast(inner1);
662 
663  // GELU = 0.5 * a * (1 + tanh)
664  __m512 gelu0 = _mm512_mul_ps(half_vec, _mm512_mul_ps(a0, _mm512_add_ps(one_vec, tanh0)));
665  __m512 gelu1 = _mm512_mul_ps(half_vec, _mm512_mul_ps(a1, _mm512_add_ps(one_vec, tanh1)));
666 
667  // Load b (second half of inner_dim)
668  __m512 b0 = _mm512_loadu_ps(&x_ptr[dim + d]);
669  __m512 b1 = _mm512_loadu_ps(&x_ptr[dim + d + 16]);
670 
671  // out = GELU(a) * b
672  _mm512_storeu_ps(&out_ptr[d], _mm512_mul_ps(gelu0, b0));
673  _mm512_storeu_ps(&out_ptr[d + 16], _mm512_mul_ps(gelu1, b1));
674  }
675  // Handle remaining
676  for (; d < dim; ++d) {
677  float a = x_ptr[d];
678  float b = x_ptr[dim + d];
679  float a3 = a * a * a;
680  float inner = sqrt_2_over_pi * (a + coeff * a3);
681  float gelu_a = 0.5f * a * (1.0f + tanhf(inner));
682  out_ptr[d] = gelu_a * b;
683  }
684  }
685 
686 #elif defined(__AVX2__)
687  const __m256 sqrt_2_pi_vec = _mm256_set1_ps(sqrt_2_over_pi);
688  const __m256 coeff_vec = _mm256_set1_ps(coeff);
689  const __m256 half_vec = _mm256_set1_ps(0.5f);
690  const __m256 one_vec = _mm256_set1_ps(1.0f);
691 
692  for (int t = 0; t < tokens; ++t) {
693  const float *x_ptr = x + (size_t)t * inner_dim;
694  float *out_ptr = out + (size_t)t * dim;
695 
696  int d = 0;
697  for (; d + 16 <= dim; d += 16) {
698  // Load a
699  __m256 a0 = _mm256_loadu_ps(&x_ptr[d]);
700  __m256 a1 = _mm256_loadu_ps(&x_ptr[d + 8]);
701 
702  // GELU(a)
703  __m256 a0_sq = _mm256_mul_ps(a0, a0);
704  __m256 a0_cu = _mm256_mul_ps(a0_sq, a0);
705  __m256 a1_sq = _mm256_mul_ps(a1, a1);
706  __m256 a1_cu = _mm256_mul_ps(a1_sq, a1);
707 
708  __m256 inner0 = _mm256_fmadd_ps(coeff_vec, a0_cu, a0);
709  __m256 inner1 = _mm256_fmadd_ps(coeff_vec, a1_cu, a1);
710  inner0 = _mm256_mul_ps(sqrt_2_pi_vec, inner0);
711  inner1 = _mm256_mul_ps(sqrt_2_pi_vec, inner1);
712 
713  __m256 tanh0 = tanh256_fast(inner0);
714  __m256 tanh1 = tanh256_fast(inner1);
715 
716  __m256 gelu0 = _mm256_mul_ps(half_vec, _mm256_mul_ps(a0, _mm256_add_ps(one_vec, tanh0)));
717  __m256 gelu1 = _mm256_mul_ps(half_vec, _mm256_mul_ps(a1, _mm256_add_ps(one_vec, tanh1)));
718 
719  // b
720  __m256 b0 = _mm256_loadu_ps(&x_ptr[dim + d]);
721  __m256 b1 = _mm256_loadu_ps(&x_ptr[dim + d + 8]);
722 
723  _mm256_storeu_ps(&out_ptr[d], _mm256_mul_ps(gelu0, b0));
724  _mm256_storeu_ps(&out_ptr[d + 8], _mm256_mul_ps(gelu1, b1));
725  }
726  for (; d < dim; ++d) {
727  float a = x_ptr[d];
728  float b = x_ptr[dim + d];
729  float a3 = a * a * a;
730  float inner = sqrt_2_over_pi * (a + coeff * a3);
731  float gelu_a = 0.5f * a * (1.0f + tanhf(inner));
732  out_ptr[d] = gelu_a * b;
733  }
734  }
735 
736 #elif defined(__AVX__)
737  const __m256 sqrt_2_pi_vec = _mm256_set1_ps(sqrt_2_over_pi);
738  const __m256 coeff_vec = _mm256_set1_ps(coeff);
739  const __m256 half_vec = _mm256_set1_ps(0.5f);
740  const __m256 one_vec = _mm256_set1_ps(1.0f);
741 
742  float inner_arr[8] __attribute__((aligned(32)));
743  float tanh_arr[8] __attribute__((aligned(32)));
744 
745  for (int t = 0; t < tokens; ++t) {
746  const float *x_ptr = x + (size_t)t * inner_dim;
747  float *out_ptr = out + (size_t)t * dim;
748 
749  int d = 0;
750  for (; d + 8 <= dim; d += 8) {
751  __m256 a = _mm256_loadu_ps(&x_ptr[d]);
752  __m256 a_sq = _mm256_mul_ps(a, a);
753  __m256 a_cu = _mm256_mul_ps(a_sq, a);
754 
755  __m256 coeff_a_cu = _mm256_mul_ps(coeff_vec, a_cu);
756  __m256 inner = _mm256_mul_ps(sqrt_2_pi_vec, _mm256_add_ps(a, coeff_a_cu));
757 
758  _mm256_store_ps(inner_arr, inner);
759  for (int j = 0; j < 8; ++j) {
760  tanh_arr[j] = tanhf(inner_arr[j]);
761  }
762  __m256 tanh_val = _mm256_load_ps(tanh_arr);
763 
764  __m256 gelu = _mm256_mul_ps(half_vec, _mm256_mul_ps(a, _mm256_add_ps(one_vec, tanh_val)));
765  __m256 b = _mm256_loadu_ps(&x_ptr[dim + d]);
766 
767  _mm256_storeu_ps(&out_ptr[d], _mm256_mul_ps(gelu, b));
768  }
769  for (; d < dim; ++d) {
770  float a = x_ptr[d];
771  float b = x_ptr[dim + d];
772  float a3 = a * a * a;
773  float inner = sqrt_2_over_pi * (a + coeff * a3);
774  float gelu_a = 0.5f * a * (1.0f + tanhf(inner));
775  out_ptr[d] = gelu_a * b;
776  }
777  }
778 
779 #else
780  // Scalar fallback
781  for (int t = 0; t < tokens; ++t) {
782  const float *x_ptr = x + (size_t)t * inner_dim;
783  float *out_ptr = out + (size_t)t * dim;
784 
785  for (int d = 0; d < dim; ++d) {
786  float a = x_ptr[d];
787  float b = x_ptr[dim + d];
788  float a3 = a * a * a;
789  float inner = sqrt_2_over_pi * (a + coeff * a3);
790  float gelu_a = 0.5f * a * (1.0f + tanhf(inner));
791  out_ptr[d] = gelu_a * b;
792  }
793  }
794 #endif
795 }

References __attribute__().

◆ gelu_backward_exact()

void gelu_backward_exact ( const float *  input,
const float *  d_output,
float *  d_input,
size_t  n 
)

Definition at line 257 of file gelu_kernels.c.

261 {
262  const float sqrt_2_over_pi = 0.7978845608f;
263  const float coeff = 0.044715f;
264 
265 #if defined(__AVX512F__)
266  const __m512 sqrt_2_pi_vec = _mm512_set1_ps(sqrt_2_over_pi);
267  const __m512 coeff_vec = _mm512_set1_ps(coeff);
268  const __m512 coeff3_vec = _mm512_set1_ps(3.0f * coeff);
269  const __m512 half_vec = _mm512_set1_ps(0.5f);
270  const __m512 one_vec = _mm512_set1_ps(1.0f);
271 
272  size_t i = 0;
273  for (; i + 16 <= n; i += 16) {
274  __m512 x = _mm512_loadu_ps(&input[i]);
275  __m512 dy = _mm512_loadu_ps(&d_output[i]);
276 
277  __m512 x2 = _mm512_mul_ps(x, x);
278  __m512 x3 = _mm512_mul_ps(x2, x);
279 
280  // g = sqrt(2/pi) * (x + 0.044715 * x^3)
281  __m512 g = _mm512_fmadd_ps(coeff_vec, x3, x);
282  g = _mm512_mul_ps(sqrt_2_pi_vec, g);
283 
284  __m512 tanh_g = tanh512_fast(g);
285 
286  // g' = sqrt(2/pi) * (1 + 3 * 0.044715 * x^2)
287  __m512 g_prime = _mm512_fmadd_ps(coeff3_vec, x2, one_vec);
288  g_prime = _mm512_mul_ps(sqrt_2_pi_vec, g_prime);
289 
290  // sech^2(g) = 1 - tanh^2(g)
291  __m512 sech2_g = _mm512_fnmadd_ps(tanh_g, tanh_g, one_vec);
292 
293  // gelu_derivative = 0.5 * (1 + tanh_g) + 0.5 * x * sech2_g * g_prime
294  __m512 term1 = _mm512_mul_ps(half_vec, _mm512_add_ps(one_vec, tanh_g));
295  __m512 term2 = _mm512_mul_ps(half_vec, _mm512_mul_ps(x, _mm512_mul_ps(sech2_g, g_prime)));
296  __m512 gelu_deriv = _mm512_add_ps(term1, term2);
297 
298  __m512 result = _mm512_mul_ps(dy, gelu_deriv);
299  _mm512_storeu_ps(&d_input[i], result);
300  }
301  // Handle remaining elements
302  for (; i < n; ++i) {
303  float x = input[i];
304  float x3 = x * x * x;
305  float g = sqrt_2_over_pi * (x + coeff * x3);
306  float tanh_g = tanhf(g);
307  float x2 = x * x;
308  float g_prime = sqrt_2_over_pi * (1.0f + 3.0f * coeff * x2);
309  float sech2_g = 1.0f - tanh_g * tanh_g;
310  float gelu_derivative = 0.5f * (1.0f + tanh_g) + 0.5f * x * sech2_g * g_prime;
311  d_input[i] = d_output[i] * gelu_derivative;
312  }
313 
314 #elif defined(__AVX2__)
315  const __m256 sqrt_2_pi_vec = _mm256_set1_ps(sqrt_2_over_pi);
316  const __m256 coeff_vec = _mm256_set1_ps(coeff);
317  const __m256 coeff3_vec = _mm256_set1_ps(3.0f * coeff);
318  const __m256 half_vec = _mm256_set1_ps(0.5f);
319  const __m256 one_vec = _mm256_set1_ps(1.0f);
320 
321  size_t i = 0;
322  for (; i + 8 <= n; i += 8) {
323  __m256 x = _mm256_loadu_ps(&input[i]);
324  __m256 dy = _mm256_loadu_ps(&d_output[i]);
325 
326  __m256 x2 = _mm256_mul_ps(x, x);
327  __m256 x3 = _mm256_mul_ps(x2, x);
328 
329  // g = sqrt(2/pi) * (x + 0.044715 * x^3)
330  __m256 g = _mm256_fmadd_ps(coeff_vec, x3, x);
331  g = _mm256_mul_ps(sqrt_2_pi_vec, g);
332 
333  __m256 tanh_g = tanh256_fast(g);
334 
335  // g' = sqrt(2/pi) * (1 + 3 * 0.044715 * x^2)
336  __m256 g_prime = _mm256_fmadd_ps(coeff3_vec, x2, one_vec);
337  g_prime = _mm256_mul_ps(sqrt_2_pi_vec, g_prime);
338 
339  // sech^2(g) = 1 - tanh^2(g)
340  __m256 sech2_g = _mm256_fnmadd_ps(tanh_g, tanh_g, one_vec);
341 
342  // gelu_derivative = 0.5 * (1 + tanh_g) + 0.5 * x * sech2_g * g_prime
343  __m256 term1 = _mm256_mul_ps(half_vec, _mm256_add_ps(one_vec, tanh_g));
344  __m256 term2 = _mm256_mul_ps(half_vec, _mm256_mul_ps(x, _mm256_mul_ps(sech2_g, g_prime)));
345  __m256 gelu_deriv = _mm256_add_ps(term1, term2);
346 
347  __m256 result = _mm256_mul_ps(dy, gelu_deriv);
348  _mm256_storeu_ps(&d_input[i], result);
349  }
350  // Handle remaining elements
351  for (; i < n; ++i) {
352  float x = input[i];
353  float x3 = x * x * x;
354  float g = sqrt_2_over_pi * (x + coeff * x3);
355  float tanh_g = tanhf(g);
356  float x2 = x * x;
357  float g_prime = sqrt_2_over_pi * (1.0f + 3.0f * coeff * x2);
358  float sech2_g = 1.0f - tanh_g * tanh_g;
359  float gelu_derivative = 0.5f * (1.0f + tanh_g) + 0.5f * x * sech2_g * g_prime;
360  d_input[i] = d_output[i] * gelu_derivative;
361  }
362 
363 #elif defined(__AVX__)
364  // AVX1: Vectorize arithmetic, use scalar tanh
365  const __m256 sqrt_2_pi_vec = _mm256_set1_ps(sqrt_2_over_pi);
366  const __m256 coeff_vec = _mm256_set1_ps(coeff);
367  const __m256 coeff3_vec = _mm256_set1_ps(3.0f * coeff);
368  const __m256 half_vec = _mm256_set1_ps(0.5f);
369  const __m256 one_vec = _mm256_set1_ps(1.0f);
370 
371  size_t i = 0;
372  float g_arr[8] __attribute__((aligned(32)));
373  float tanh_arr[8] __attribute__((aligned(32)));
374 
375  for (; i + 8 <= n; i += 8) {
376  __m256 x = _mm256_loadu_ps(&input[i]);
377  __m256 dy = _mm256_loadu_ps(&d_output[i]);
378 
379  __m256 x2 = _mm256_mul_ps(x, x);
380  __m256 x3 = _mm256_mul_ps(x2, x);
381 
382  // g = sqrt(2/pi) * (x + 0.044715 * x^3)
383  __m256 coeff_x3 = _mm256_mul_ps(coeff_vec, x3);
384  __m256 g = _mm256_mul_ps(sqrt_2_pi_vec, _mm256_add_ps(x, coeff_x3));
385 
386  // Compute tanh scalarly
387  _mm256_store_ps(g_arr, g);
388  for (int j = 0; j < 8; ++j) {
389  tanh_arr[j] = tanhf(g_arr[j]);
390  }
391  __m256 tanh_g = _mm256_load_ps(tanh_arr);
392 
393  // g' = sqrt(2/pi) * (1 + 3 * 0.044715 * x^2)
394  __m256 coeff3_x2 = _mm256_mul_ps(coeff3_vec, x2);
395  __m256 g_prime = _mm256_mul_ps(sqrt_2_pi_vec, _mm256_add_ps(one_vec, coeff3_x2));
396 
397  // sech^2(g) = 1 - tanh^2(g)
398  __m256 tanh_g_sq = _mm256_mul_ps(tanh_g, tanh_g);
399  __m256 sech2_g = _mm256_sub_ps(one_vec, tanh_g_sq);
400 
401  // gelu_derivative = 0.5 * (1 + tanh_g) + 0.5 * x * sech2_g * g_prime
402  __m256 term1 = _mm256_mul_ps(half_vec, _mm256_add_ps(one_vec, tanh_g));
403  __m256 term2 = _mm256_mul_ps(half_vec, _mm256_mul_ps(x, _mm256_mul_ps(sech2_g, g_prime)));
404  __m256 gelu_deriv = _mm256_add_ps(term1, term2);
405 
406  __m256 result = _mm256_mul_ps(dy, gelu_deriv);
407  _mm256_storeu_ps(&d_input[i], result);
408  }
409  // Handle remaining elements
410  for (; i < n; ++i) {
411  float x = input[i];
412  float x3 = x * x * x;
413  float g = sqrt_2_over_pi * (x + coeff * x3);
414  float tanh_g = tanhf(g);
415  float x2 = x * x;
416  float g_prime = sqrt_2_over_pi * (1.0f + 3.0f * coeff * x2);
417  float sech2_g = 1.0f - tanh_g * tanh_g;
418  float gelu_derivative = 0.5f * (1.0f + tanh_g) + 0.5f * x * sech2_g * g_prime;
419  d_input[i] = d_output[i] * gelu_derivative;
420  }
421 
422 #else
423  // Scalar fallback
424  for (size_t i = 0; i < n; ++i) {
425  float x = input[i];
426 
427  float x3 = x * x * x;
428  float g = sqrt_2_over_pi * (x + coeff * x3);
429  float tanh_g = tanhf(g);
430 
431  float x2 = x * x;
432  float g_prime = sqrt_2_over_pi * (1.0f + 3.0f * coeff * x2);
433 
434  float sech2_g = 1.0f - tanh_g * tanh_g;
435  float gelu_derivative =
436  0.5f * (1.0f + tanh_g) + 0.5f * x * sech2_g * g_prime;
437 
438  d_input[i] = d_output[i] * gelu_derivative;
439  }
440 #endif
441 }

References __attribute__().

◆ gelu_backward_exact_bf16()

/**
 * GELU backward, exact tanh formula, bf16 tensors.
 *
 * Widens input and upstream gradient to fp32 in the caller-provided scratch
 * buffers (each must hold n floats), runs the scalar exact backward, and
 * narrows the gradient back to bf16.  The scalar path is used deliberately:
 * the fast vector tanh approximation would stack its error on top of BF16
 * rounding loss.  No-op if any scratch pointer is NULL.
 */
void gelu_backward_exact_bf16(const uint16_t *input, const uint16_t *d_output,
                              uint16_t *d_input, size_t n,
                              float *scratch_input, float *scratch_d_output,
                              float *scratch_d_input)
{
    if (!scratch_input || !scratch_d_output || !scratch_d_input) {
        return;
    }

    bf16_tensor_to_float(input, scratch_input, n);
    bf16_tensor_to_float(d_output, scratch_d_output, n);
    gelu_backward_scalar(scratch_input, scratch_d_output, scratch_d_input, n);
    float_tensor_to_bf16(scratch_d_input, d_input, n);
}
void gelu_backward_scalar(const float *input, const float *d_output, float *d_input, size_t n)
Definition: gelu_kernels.c:462

References bf16_tensor_to_float(), float_tensor_to_bf16(), and gelu_backward_scalar().

◆ gelu_backward_fast()

void gelu_backward_fast ( const float *  input,
const float *  d_output,
float *  d_input,
size_t  n 
)

Definition at line 486 of file gelu_kernels.c.

490 {
491  const float beta = 1.702f;
492 
493 #if defined(__AVX512F__)
494  const __m512 beta_vec = _mm512_set1_ps(beta);
495  const __m512 one_vec = _mm512_set1_ps(1.0f);
496  const __m512 neg_beta_vec = _mm512_set1_ps(-beta);
497 
498  size_t i = 0;
499  for (; i + 16 <= n; i += 16) {
500  __m512 x = _mm512_loadu_ps(&input[i]);
501  __m512 dy = _mm512_loadu_ps(&d_output[i]);
502 
503  // s = sigmoid(beta * x) = 1 / (1 + exp(-beta * x))
504  __m512 neg_beta_x = _mm512_mul_ps(neg_beta_vec, x);
505  __m512 exp_neg = exp512_fast(neg_beta_x);
506  __m512 s = _mm512_div_ps(one_vec, _mm512_add_ps(one_vec, exp_neg));
507 
508  // gelu_derivative = s * (1 + x * (1 - s) * beta)
509  __m512 one_minus_s = _mm512_sub_ps(one_vec, s);
510  __m512 inner = _mm512_fmadd_ps(_mm512_mul_ps(x, one_minus_s), beta_vec, one_vec);
511  __m512 gelu_deriv = _mm512_mul_ps(s, inner);
512 
513  __m512 result = _mm512_mul_ps(dy, gelu_deriv);
514  _mm512_storeu_ps(&d_input[i], result);
515  }
516  // Handle remaining elements
517  for (; i < n; ++i) {
518  float x = input[i];
519  float s = 1.0f / (1.0f + expf(-beta * x));
520  float gelu_derivative = s * (1.0f + x * (1.0f - s) * beta);
521  d_input[i] = d_output[i] * gelu_derivative;
522  }
523 
524 #elif defined(__AVX2__)
525  const __m256 beta_vec = _mm256_set1_ps(beta);
526  const __m256 one_vec = _mm256_set1_ps(1.0f);
527  const __m256 neg_beta_vec = _mm256_set1_ps(-beta);
528 
529  size_t i = 0;
530  for (; i + 8 <= n; i += 8) {
531  __m256 x = _mm256_loadu_ps(&input[i]);
532  __m256 dy = _mm256_loadu_ps(&d_output[i]);
533 
534  // s = sigmoid(beta * x) = 1 / (1 + exp(-beta * x))
535  __m256 neg_beta_x = _mm256_mul_ps(neg_beta_vec, x);
536  __m256 exp_neg = exp256_fast(neg_beta_x);
537  __m256 s = _mm256_div_ps(one_vec, _mm256_add_ps(one_vec, exp_neg));
538 
539  // gelu_derivative = s * (1 + x * (1 - s) * beta)
540  __m256 one_minus_s = _mm256_sub_ps(one_vec, s);
541  __m256 inner = _mm256_fmadd_ps(_mm256_mul_ps(x, one_minus_s), beta_vec, one_vec);
542  __m256 gelu_deriv = _mm256_mul_ps(s, inner);
543 
544  __m256 result = _mm256_mul_ps(dy, gelu_deriv);
545  _mm256_storeu_ps(&d_input[i], result);
546  }
547  // Handle remaining elements
548  for (; i < n; ++i) {
549  float x = input[i];
550  float s = 1.0f / (1.0f + expf(-beta * x));
551  float gelu_derivative = s * (1.0f + x * (1.0f - s) * beta);
552  d_input[i] = d_output[i] * gelu_derivative;
553  }
554 
555 #elif defined(__AVX__)
556  // AVX1: Vectorize arithmetic, use scalar exp
557  const __m256 beta_vec = _mm256_set1_ps(beta);
558  const __m256 one_vec = _mm256_set1_ps(1.0f);
559  const __m256 neg_beta_vec = _mm256_set1_ps(-beta);
560 
561  size_t i = 0;
562  float neg_beta_x_arr[8] __attribute__((aligned(32)));
563  float exp_arr[8] __attribute__((aligned(32)));
564 
565  for (; i + 8 <= n; i += 8) {
566  __m256 x = _mm256_loadu_ps(&input[i]);
567  __m256 dy = _mm256_loadu_ps(&d_output[i]);
568 
569  // s = sigmoid(beta * x) = 1 / (1 + exp(-beta * x))
570  __m256 neg_beta_x = _mm256_mul_ps(neg_beta_vec, x);
571 
572  // Compute exp scalarly
573  _mm256_store_ps(neg_beta_x_arr, neg_beta_x);
574  for (int j = 0; j < 8; ++j) {
575  exp_arr[j] = expf(neg_beta_x_arr[j]);
576  }
577  __m256 exp_neg = _mm256_load_ps(exp_arr);
578 
579  __m256 s = _mm256_div_ps(one_vec, _mm256_add_ps(one_vec, exp_neg));
580 
581  // gelu_derivative = s * (1 + x * (1 - s) * beta)
582  __m256 one_minus_s = _mm256_sub_ps(one_vec, s);
583  __m256 x_one_minus_s = _mm256_mul_ps(x, one_minus_s);
584  __m256 x_one_minus_s_beta = _mm256_mul_ps(x_one_minus_s, beta_vec);
585  __m256 inner = _mm256_add_ps(one_vec, x_one_minus_s_beta);
586  __m256 gelu_deriv = _mm256_mul_ps(s, inner);
587 
588  __m256 result = _mm256_mul_ps(dy, gelu_deriv);
589  _mm256_storeu_ps(&d_input[i], result);
590  }
591  // Handle remaining elements
592  for (; i < n; ++i) {
593  float x = input[i];
594  float s = 1.0f / (1.0f + expf(-beta * x));
595  float gelu_derivative = s * (1.0f + x * (1.0f - s) * beta);
596  d_input[i] = d_output[i] * gelu_derivative;
597  }
598 #endif
599 }

References __attribute__().

Referenced by gelu_backward_fast_bf16().

◆ gelu_backward_fast_bf16()

/**
 * GELU backward, fast sigmoid approximation, bf16 tensors.
 *
 * Widens input and upstream gradient to fp32 in the caller-provided scratch
 * buffers (each must hold n floats), runs gelu_backward_fast, and narrows
 * the gradient back to bf16.  No-op if any scratch pointer is NULL.
 */
void gelu_backward_fast_bf16(const uint16_t *input, const uint16_t *d_output,
                             uint16_t *d_input, size_t n,
                             float *scratch_input, float *scratch_d_output,
                             float *scratch_d_input)
{
    if (!scratch_input || !scratch_d_output || !scratch_d_input) {
        return;
    }

    bf16_tensor_to_float(input, scratch_input, n);
    bf16_tensor_to_float(d_output, scratch_d_output, n);
    gelu_backward_fast(scratch_input, scratch_d_output, scratch_d_input, n);
    float_tensor_to_bf16(scratch_d_input, d_input, n);
}
void gelu_backward_fast(const float *input, const float *d_output, float *d_input, size_t n)
Definition: gelu_kernels.c:486

References bf16_tensor_to_float(), float_tensor_to_bf16(), and gelu_backward_fast().

◆ gelu_backward_scalar()

void gelu_backward_scalar ( const float *  input,
const float *  d_output,
float *  d_input,
size_t  n 
)

Definition at line 462 of file gelu_kernels.c.

466 {
467  const float sqrt_2_over_pi = 0.7978845608f;
468  const float coeff = 0.044715f;
469 
470  for (size_t i = 0; i < n; ++i) {
471  float x = input[i];
472  float x3 = x * x * x;
473  float g = sqrt_2_over_pi * (x + coeff * x3);
474  float tanh_g = tanhf(g);
475  float x2 = x * x;
476  float g_prime = sqrt_2_over_pi * (1.0f + 3.0f * coeff * x2);
477  float sech2_g = 1.0f - tanh_g * tanh_g;
478  float gelu_derivative = 0.5f * (1.0f + tanh_g) + 0.5f * x * sech2_g * g_prime;
479  d_input[i] = d_output[i] * gelu_derivative;
480  }
481 }

Referenced by gelu_backward_exact_bf16().

◆ gelu_exact_inplace()

void gelu_exact_inplace ( float *  data,
size_t  n 
)

Definition at line 446 of file gelu_kernels.c.

447 {
448  const float sqrt_2_over_pi = 0.7978845608f;
449  const float coeff = 0.044715f;
450 
451  for (size_t i = 0; i < n; ++i) {
452  float x = data[i];
453  float x3 = x * x * x;
454  float inner = sqrt_2_over_pi * (x + coeff * x3);
455  data[i] = 0.5f * x * (1.0f + tanhf(inner));
456  }
457 }

Referenced by gelu_fast_inplace_bf16(), and mlp_token_parallel_exact().

◆ gelu_fast_inplace()

void gelu_fast_inplace ( float *  data,
size_t  n 
)

GELU activation forward (fast approximation, in-place)

Test:

test_gelu.py::TestGELUForward::test_gelu_fast_inplace

test_gelu.py::TestGELUForward::test_gelu_vs_exact

test_parity.py::test_gelu_parity

Fast GELU approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) In-place on contiguous buffer.

After changes: make test && make llamacpp-parity-full

Definition at line 132 of file gelu_kernels.c.

133 {
134  const float sqrt_2_over_pi = 0.7978845608f;
135  const float coeff = 0.044715f;
136 
137 #if defined(__AVX512F__)
138  const __m512 sqrt_2_pi_vec = _mm512_set1_ps(sqrt_2_over_pi);
139  const __m512 coeff_vec = _mm512_set1_ps(coeff);
140  const __m512 half_vec = _mm512_set1_ps(0.5f);
141  const __m512 one_vec = _mm512_set1_ps(1.0f);
142 
143  size_t i = 0;
144  for (; i + 16 <= n; i += 16) {
145  __m512 x = _mm512_loadu_ps(&data[i]);
146  __m512 x2 = _mm512_mul_ps(x, x);
147  __m512 x3 = _mm512_mul_ps(x2, x);
148 
149  // inner = sqrt(2/pi) * (x + 0.044715 * x^3)
150  __m512 inner = _mm512_fmadd_ps(coeff_vec, x3, x);
151  inner = _mm512_mul_ps(sqrt_2_pi_vec, inner);
152 
153  // result = 0.5 * x * (1 + tanh(inner))
154  __m512 tanh_val = tanh512_fast(inner);
155  __m512 one_plus_tanh = _mm512_add_ps(one_vec, tanh_val);
156  __m512 result = _mm512_mul_ps(half_vec, _mm512_mul_ps(x, one_plus_tanh));
157 
158  _mm512_storeu_ps(&data[i], result);
159  }
160  // Handle remaining elements
161  for (; i < n; ++i) {
162  float x = data[i];
163  float x3 = x * x * x;
164  float inner = sqrt_2_over_pi * (x + coeff * x3);
165  data[i] = 0.5f * x * (1.0f + tanhf(inner));
166  }
167 
168 #elif defined(__AVX2__)
169  const __m256 sqrt_2_pi_vec = _mm256_set1_ps(sqrt_2_over_pi);
170  const __m256 coeff_vec = _mm256_set1_ps(coeff);
171  const __m256 half_vec = _mm256_set1_ps(0.5f);
172  const __m256 one_vec = _mm256_set1_ps(1.0f);
173 
174  size_t i = 0;
175  for (; i + 8 <= n; i += 8) {
176  __m256 x = _mm256_loadu_ps(&data[i]);
177  __m256 x2 = _mm256_mul_ps(x, x);
178  __m256 x3 = _mm256_mul_ps(x2, x);
179 
180  // inner = sqrt(2/pi) * (x + 0.044715 * x^3)
181  __m256 inner = _mm256_fmadd_ps(coeff_vec, x3, x);
182  inner = _mm256_mul_ps(sqrt_2_pi_vec, inner);
183 
184  // result = 0.5 * x * (1 + tanh(inner))
185  __m256 tanh_val = tanh256_fast(inner);
186  __m256 one_plus_tanh = _mm256_add_ps(one_vec, tanh_val);
187  __m256 result = _mm256_mul_ps(half_vec, _mm256_mul_ps(x, one_plus_tanh));
188 
189  _mm256_storeu_ps(&data[i], result);
190  }
191  // Handle remaining elements
192  for (; i < n; ++i) {
193  float x = data[i];
194  float x3 = x * x * x;
195  float inner = sqrt_2_over_pi * (x + coeff * x3);
196  data[i] = 0.5f * x * (1.0f + tanhf(inner));
197  }
198 
199 #elif defined(__AVX__)
200  // AVX1: Vectorize arithmetic, use scalar tanh
201  const __m256 sqrt_2_pi_vec = _mm256_set1_ps(sqrt_2_over_pi);
202  const __m256 coeff_vec = _mm256_set1_ps(coeff);
203  const __m256 half_vec = _mm256_set1_ps(0.5f);
204  const __m256 one_vec = _mm256_set1_ps(1.0f);
205 
206  size_t i = 0;
207  float inner_arr[8] __attribute__((aligned(32)));
208  float tanh_arr[8] __attribute__((aligned(32)));
209 
210  for (; i + 8 <= n; i += 8) {
211  __m256 x = _mm256_loadu_ps(&data[i]);
212  __m256 x2 = _mm256_mul_ps(x, x);
213  __m256 x3 = _mm256_mul_ps(x2, x);
214 
215  // inner = sqrt(2/pi) * (x + 0.044715 * x^3)
216  __m256 coeff_x3 = _mm256_mul_ps(coeff_vec, x3);
217  __m256 inner = _mm256_mul_ps(sqrt_2_pi_vec, _mm256_add_ps(x, coeff_x3));
218 
219  // Compute tanh scalarly
220  _mm256_store_ps(inner_arr, inner);
221  for (int j = 0; j < 8; ++j) {
222  tanh_arr[j] = tanhf(inner_arr[j]);
223  }
224  __m256 tanh_val = _mm256_load_ps(tanh_arr);
225 
226  // result = 0.5 * x * (1 + tanh(inner))
227  __m256 one_plus_tanh = _mm256_add_ps(one_vec, tanh_val);
228  __m256 result = _mm256_mul_ps(half_vec, _mm256_mul_ps(x, one_plus_tanh));
229 
230  _mm256_storeu_ps(&data[i], result);
231  }
232  // Handle remaining elements
233  for (; i < n; ++i) {
234  float x = data[i];
235  float x3 = x * x * x;
236  float inner = sqrt_2_over_pi * (x + coeff * x3);
237  data[i] = 0.5f * x * (1.0f + tanhf(inner));
238  }
239 
240 #else
241  // Scalar fallback
242  for (size_t i = 0; i < n; ++i) {
243  float x = data[i];
244  float x3 = x * x * x;
245  float inner = sqrt_2_over_pi * (x + coeff * x3);
246  data[i] = 0.5f * x * (1.0f + tanhf(inner));
247  }
248 #endif
249 }

References __attribute__().

Referenced by mlp_token_parallel().

◆ gelu_fast_inplace_bf16()

void gelu_fast_inplace_bf16 ( uint16_t *  data,
size_t  n,
float *  scratch 
)

Definition at line 31 of file gelu_kernels_bf16.c.

32 {
33  if (!scratch) return;
34 
35  bf16_tensor_to_float(data, scratch, n);
36  // Use exact version to avoid fast tanh approximation error accumulating
37  // with BF16 precision loss. Conversion overhead dominates anyway.
38  gelu_exact_inplace(scratch, n);
39  float_tensor_to_bf16(scratch, data, n);
40 }
void gelu_exact_inplace(float *data, size_t n)
Definition: gelu_kernels.c:446

References bf16_tensor_to_float(), float_tensor_to_bf16(), and gelu_exact_inplace().

◆ gemm_avx512_parallel()

void gemm_avx512_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 149 of file gemm_kernels.c.

154 {
155  if (ck_strict_parity_enabled()) {
156  gemm_naive_serial_double(A, B, bias, C, M, N, K);
157  return;
158  }
159 #if defined(__AVX512F__)
160 #pragma omp parallel for
161  for (int i = 0; i < M; i++) {
162  for (int j = 0; j < N; j++) {
163  __m512 sum_vec = _mm512_setzero_ps();
164  int k;
165  for (k = 0; k <= K - 16; k += 16) {
166  __m512 a_vec = _mm512_loadu_ps(&A[i * K + k]);
167  __m512 b_vec = _mm512_loadu_ps(&B[j * K + k]);
168  sum_vec = _mm512_fmadd_ps(a_vec, b_vec, sum_vec);
169  }
170  float sum = _mm512_reduce_add_ps(sum_vec);
171  for (; k < K; k++) {
172  sum += A[i * K + k] * B[j * K + k];
173  }
174  float bias_val = bias ? bias[j] : 0.0f;
175  C[i * N + j] = sum + bias_val;
176  }
177  }
178 #elif defined(__AVX__)
179  // AVX1 path: 256-bit vectors, no FMA (use mul + add)
180 #pragma omp parallel for
181  for (int i = 0; i < M; i++) {
182  for (int j = 0; j < N; j++) {
183  __m256 sum_vec = _mm256_setzero_ps();
184  int k;
185  for (k = 0; k <= K - 8; k += 8) {
186  __m256 a_vec = _mm256_loadu_ps(&A[i * K + k]);
187  __m256 b_vec = _mm256_loadu_ps(&B[j * K + k]);
188  __m256 prod = _mm256_mul_ps(a_vec, b_vec);
189  sum_vec = _mm256_add_ps(sum_vec, prod);
190  }
191  float sum = hsum256_ps(sum_vec);
192  for (; k < K; k++) {
193  sum += A[i * K + k] * B[j * K + k];
194  }
195  float bias_val = bias ? bias[j] : 0.0f;
196  C[i * N + j] = sum + bias_val;
197  }
198  }
199 #else
200  gemm_naive_parallel(A, B, bias, C, M, N, K);
201 #endif
202 }
int ck_strict_parity_enabled(void)
static void gemm_naive_serial_double(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:107
void gemm_naive_parallel(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:125
#define C(color)
Definition: show_config.c:39

References C, ck_strict_parity_enabled(), gemm_naive_parallel(), and gemm_naive_serial_double().

◆ gemm_bias_gelu_fused()

void gemm_bias_gelu_fused ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 131 of file gemm_fused_kernels.c.

136 {
137 #if defined(__AVX__)
138 #pragma omp parallel for
139  for (int i = 0; i < M; i++) {
140  for (int j = 0; j < N; j++) {
141  __m256 sum_vec = _mm256_setzero_ps();
142  int k;
143  for (k = 0; k <= K - 8; k += 8) {
144  __m256 a_vec = _mm256_loadu_ps(&A[i * K + k]);
145  __m256 b_vec = _mm256_loadu_ps(&B[j * K + k]);
146  __m256 prod = _mm256_mul_ps(a_vec, b_vec);
147  sum_vec = _mm256_add_ps(sum_vec, prod);
148  }
149  float sum = hsum256_ps_fused(sum_vec);
150  for (; k < K; k++) {
151  sum += A[i * K + k] * B[j * K + k];
152  }
153  sum += bias[j];
154  C[i * N + j] = fast_gelu_scalar(sum);
155  }
156  }
157 #else
158 #pragma omp parallel for
159  for (int i = 0; i < M; i++) {
160  for (int j = 0; j < N; j++) {
161  float sum = 0.0f;
162  for (int k = 0; k < K; k++) {
163  sum += A[i * K + k] * B[j * K + k];
164  }
165  sum += bias[j];
166  C[i * N + j] = fast_gelu_scalar(sum);
167  }
168  }
169 #endif
170 }
static float fast_gelu_scalar(float x)
static float hsum256_ps_fused(__m256 v)

References C, fast_gelu_scalar(), and hsum256_ps_fused().

◆ gemm_bias_relu_fused()

void gemm_bias_relu_fused ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 84 of file gemm_fused_kernels.c.

89 {
90 #if defined(__AVX__)
91 #pragma omp parallel for
92  for (int i = 0; i < M; i++) {
93  for (int j = 0; j < N; j++) {
94  __m256 sum_vec = _mm256_setzero_ps();
95  int k;
96  for (k = 0; k <= K - 8; k += 8) {
97  __m256 a_vec = _mm256_loadu_ps(&A[i * K + k]);
98  __m256 b_vec = _mm256_loadu_ps(&B[j * K + k]);
99  __m256 prod = _mm256_mul_ps(a_vec, b_vec);
100  sum_vec = _mm256_add_ps(sum_vec, prod);
101  }
102  float sum = hsum256_ps_fused(sum_vec);
103  for (; k < K; k++) {
104  sum += A[i * K + k] * B[j * K + k];
105  }
106  // Fused: add bias and ReLU while still in register
107  sum += bias[j];
108  C[i * N + j] = sum > 0.0f ? sum : 0.0f;
109  }
110  }
111 #else
112 #pragma omp parallel for
113  for (int i = 0; i < M; i++) {
114  for (int j = 0; j < N; j++) {
115  float sum = 0.0f;
116  for (int k = 0; k < K; k++) {
117  sum += A[i * K + k] * B[j * K + k];
118  }
119  sum += bias[j];
120  C[i * N + j] = sum > 0.0f ? sum : 0.0f;
121  }
122  }
123 #endif
124 }

References C, and hsum256_ps_fused().

◆ gemm_bias_silu_fused()

void gemm_bias_silu_fused ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 177 of file gemm_fused_kernels.c.

182 {
183 #if defined(__AVX__)
184 #pragma omp parallel for
185  for (int i = 0; i < M; i++) {
186  for (int j = 0; j < N; j++) {
187  __m256 sum_vec = _mm256_setzero_ps();
188  int k;
189  for (k = 0; k <= K - 8; k += 8) {
190  __m256 a_vec = _mm256_loadu_ps(&A[i * K + k]);
191  __m256 b_vec = _mm256_loadu_ps(&B[j * K + k]);
192  __m256 prod = _mm256_mul_ps(a_vec, b_vec);
193  sum_vec = _mm256_add_ps(sum_vec, prod);
194  }
195  float sum = hsum256_ps_fused(sum_vec);
196  for (; k < K; k++) {
197  sum += A[i * K + k] * B[j * K + k];
198  }
199  sum += bias[j];
200  // SiLU: x * sigmoid(x)
201  float sig = 1.0f / (1.0f + expf(-sum));
202  C[i * N + j] = sum * sig;
203  }
204  }
205 #else
206 #pragma omp parallel for
207  for (int i = 0; i < M; i++) {
208  for (int j = 0; j < N; j++) {
209  float sum = 0.0f;
210  for (int k = 0; k < K; k++) {
211  sum += A[i * K + k] * B[j * K + k];
212  }
213  sum += bias[j];
214  float sig = 1.0f / (1.0f + expf(-sum));
215  C[i * N + j] = sum * sig;
216  }
217  }
218 #endif
219 }

References C, and hsum256_ps_fused().

◆ gemm_blocked_serial()

void gemm_blocked_serial ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 661 of file gemm_kernels.c.

666 {
667  // Ensure threads are initialized (auto-detects on first call)
668  (void)ck_get_num_threads();
669 
670  if (ck_strict_parity_enabled()) {
671  gemm_naive_serial_double(A, B, bias, C, M, N, K);
672  return;
673  }
674 
675  // Decode-time matvec (M=1) is extremely common and benefits from parallelism over N.
676  // Lower threshold to parallelize more ops; OpenMP overhead is ~1-2μs per barrier.
677  // For N*K >= 64K elements, parallel is worthwhile.
678  if (M == 1 && (size_t)N * (size_t)K >= 65536) {
679  gemm_nt_matvec_parallel(A, B, bias, C, N, K);
680  return;
681  }
682 
683  /*
684  * Use gemm_microkernel for large matrices - it uses MKL/oneDNN when available,
685  * which is substantially faster than our hand-written SIMD kernels.
686  * B is stored as [N x K] (transposed), so we pass B_transposed=1.
687  * Note: Use threshold of 32 to avoid numerical precision issues with small matrices.
688  */
689  if (M >= 32 && N >= 32 && K >= 32) {
690  gemm_microkernel(A, B, C, M, N, K, 1); // B_transposed=1
691  ck_gemm_add_bias(C, bias, M, N);
692  return;
693  }
694 #if defined(__AVX512F__)
695  const int block_size = 64;
696 #elif defined(__AVX__)
697  const int block_size = 32;
698 #else
699  const int block_size = 32;
700 #endif
701  for (int i = 0; i < M; i++) {
702  for (int j = 0; j < N; j++) {
703  C[i * N + j] = bias ? bias[j] : 0.0f;
704  }
705  }
706  for (int ii = 0; ii < M; ii += block_size) {
707  for (int jj = 0; jj < N; jj += block_size) {
708  for (int kk = 0; kk < K; kk += block_size) {
709  int i_end = ck_min(ii + block_size, M);
710  int j_end = ck_min(jj + block_size, N);
711  int k_end = ck_min(kk + block_size, K);
712 
713  for (int i = ii; i < i_end; i++) {
714  for (int j = jj; j < j_end; j++) {
715 #if defined(__AVX512F__)
716  __m512 sum_vec = _mm512_setzero_ps();
717  int k;
718  for (k = kk; k <= k_end - 16; k += 16) {
719  __m512 a_vec = _mm512_loadu_ps(&A[i * K + k]);
720  __m512 b_vec = _mm512_loadu_ps(&B[j * K + k]);
721  sum_vec = _mm512_fmadd_ps(a_vec, b_vec, sum_vec);
722  }
723  float partial_sum = _mm512_reduce_add_ps(sum_vec);
724  for (; k < k_end; k++) {
725  partial_sum += A[i * K + k] * B[j * K + k];
726  }
727 #elif defined(__AVX__)
728  __m256 sum_vec = _mm256_setzero_ps();
729  int k;
730  for (k = kk; k <= k_end - 8; k += 8) {
731  __m256 a_vec = _mm256_loadu_ps(&A[i * K + k]);
732  __m256 b_vec = _mm256_loadu_ps(&B[j * K + k]);
733  __m256 prod = _mm256_mul_ps(a_vec, b_vec);
734  sum_vec = _mm256_add_ps(sum_vec, prod);
735  }
736  float partial_sum = hsum256_ps(sum_vec);
737  for (; k < k_end; k++) {
738  partial_sum += A[i * K + k] * B[j * K + k];
739  }
740 #else
741  float partial_sum = 0.0f;
742  for (int k = kk; k < k_end; k++) {
743  partial_sum += A[i * K + k] * B[j * K + k];
744  }
745 #endif
746  C[i * N + j] += partial_sum;
747  }
748  }
749  }
750  }
751  }
752 }
void gemm_microkernel(const float *A, const float *B, float *C, int M, int N, int K, int B_transposed)
int ck_get_num_threads(void)
static int ck_min(int a, int b)
Definition: gemm_kernels.c:26
static void gemm_nt_matvec_parallel(const float *A, const float *B, const float *bias, float *C, int N, int K)
Definition: gemm_kernels.c:61
static void ck_gemm_add_bias(float *C, const float *bias, int M, int N)
Definition: gemm_kernels.c:28

References C, ck_gemm_add_bias(), ck_get_num_threads(), ck_min(), ck_strict_parity_enabled(), gemm_microkernel(), gemm_naive_serial_double(), and gemm_nt_matvec_parallel().

Referenced by ck_attention_project_head_major(), ck_gemm_nt_quant(), ck_mlp_swiglu_forward(), ck_mlp_swiglu_forward_fused_token(), ck_qkv_project_head_major(), ck_qkv_project_head_major_token(), mlp_token_parallel(), and mlp_token_parallel_exact().

◆ gemm_blocked_serial_bf16()

void gemm_blocked_serial_bf16 ( const uint16_t *  A,
const uint16_t *  B,
const uint16_t *  bias,
uint16_t *  C,
int  M,
int  N,
int  K 
)

Definition at line 272 of file gemm_kernels_bf16.c.

277 {
278  if (!A || !B || !C || M <= 0 || N <= 0 || K <= 0) {
279  return;
280  }
281 
282 #if HAVE_NATIVE_BF16
283  /* Native BF16 instructions available (Ice Lake / Sapphire Rapids+) */
284  gemm_bf16_native(A, B, bias, C, M, N, K);
285 #elif defined(__AVX512F__)
286  /* Use AVX-512F with software BF16 conversion */
287  if (M * N > 4096) {
288  gemm_bf16_blocked_avx512(A, B, bias, C, M, N, K);
289  } else {
290  gemm_bf16_avx512(A, B, bias, C, M, N, K);
291  }
292 #else
293  /* Scalar fallback */
294  gemm_bf16_scalar(A, B, bias, C, M, N, K);
295 #endif
296 }

References C.

◆ gemm_fine_grained_parallel()

void gemm_fine_grained_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 205 of file gemm_kernels.c.

210 {
211  if (ck_strict_parity_enabled()) {
212  gemm_naive_serial_double(A, B, bias, C, M, N, K);
213  return;
214  }
215 #if defined(__AVX512F__)
216  const int block_size = 64;
217 #pragma omp parallel for
218  for (int i = 0; i < M; i++) {
219  for (int j = 0; j < N; j++) {
220  C[i * N + j] = bias ? bias[j] : 0.0f;
221  }
222  }
223 #pragma omp parallel for collapse(3)
224  for (int ii = 0; ii < M; ii += block_size) {
225  for (int jj = 0; jj < N; jj += block_size) {
226  for (int kk = 0; kk < K; kk += block_size) {
227  int i_end = ck_min(ii + block_size, M);
228  int j_end = ck_min(jj + block_size, N);
229  int k_end = ck_min(kk + block_size, K);
230 
231  for (int i = ii; i < i_end; i++) {
232  for (int j = jj; j < j_end; j++) {
233  __m512 sum_vec = _mm512_setzero_ps();
234  int k;
235  for (k = kk; k <= k_end - 16; k += 16) {
236  __m512 a_vec = _mm512_loadu_ps(&A[i * K + k]);
237  __m512 b_vec = _mm512_loadu_ps(&B[j * K + k]);
238  sum_vec = _mm512_fmadd_ps(a_vec, b_vec, sum_vec);
239  }
240  float partial_sum = _mm512_reduce_add_ps(sum_vec);
241  for (; k < k_end; k++) {
242  partial_sum += A[i * K + k] * B[j * K + k];
243  }
244 #pragma omp atomic
245  C[i * N + j] += partial_sum;
246  }
247  }
248  }
249  }
250  }
251 #elif defined(__AVX__)
252  // AVX1 cache-blocked version
253  const int block_size = 32; // Smaller block for L1 cache
254 #pragma omp parallel for
255  for (int i = 0; i < M; i++) {
256  for (int j = 0; j < N; j++) {
257  C[i * N + j] = bias ? bias[j] : 0.0f;
258  }
259  }
260 #pragma omp parallel for collapse(3)
261  for (int ii = 0; ii < M; ii += block_size) {
262  for (int jj = 0; jj < N; jj += block_size) {
263  for (int kk = 0; kk < K; kk += block_size) {
264  int i_end = ck_min(ii + block_size, M);
265  int j_end = ck_min(jj + block_size, N);
266  int k_end = ck_min(kk + block_size, K);
267 
268  for (int i = ii; i < i_end; i++) {
269  for (int j = jj; j < j_end; j++) {
270  __m256 sum_vec = _mm256_setzero_ps();
271  int k;
272  for (k = kk; k <= k_end - 8; k += 8) {
273  __m256 a_vec = _mm256_loadu_ps(&A[i * K + k]);
274  __m256 b_vec = _mm256_loadu_ps(&B[j * K + k]);
275  __m256 prod = _mm256_mul_ps(a_vec, b_vec);
276  sum_vec = _mm256_add_ps(sum_vec, prod);
277  }
278  float partial_sum = hsum256_ps(sum_vec);
279  for (; k < k_end; k++) {
280  partial_sum += A[i * K + k] * B[j * K + k];
281  }
282 #pragma omp atomic
283  C[i * N + j] += partial_sum;
284  }
285  }
286  }
287  }
288  }
289 #else
290  gemm_naive_parallel(A, B, bias, C, M, N, K);
291 #endif
292 }

References C, ck_min(), ck_strict_parity_enabled(), gemm_naive_parallel(), and gemm_naive_serial_double().

◆ gemm_microkernel()

void gemm_microkernel ( const float *  A,
const float *  B,
float *  C,
int  M,
int  N,
int  K,
int  B_transposed 
)

Definition at line 1134 of file gemm_microkernel.c.

1141 {
1142  if (B_transposed) {
1143  gemm_microkernel_blocked_bt(A, B, C, M, N, K);
1144  } else {
1145  // Use packed version for large matrices
1146  if (M >= PACK_THRESHOLD && N >= PACK_THRESHOLD && K >= PACK_THRESHOLD) {
1147  gemm_microkernel_packed(A, B, C, M, N, K);
1148  } else {
1149  gemm_microkernel_blocked(A, B, C, M, N, K);
1150  }
1151  }
1152 }
#define PACK_THRESHOLD
void gemm_microkernel_blocked(const float *A, const float *B, float *C, int M, int N, int K)
void gemm_microkernel_blocked_bt(const float *A, const float *B, float *C, int M, int N, int K)
void gemm_microkernel_packed(const float *A, const float *B, float *C, int M, int N, int K)

References C, gemm_microkernel_blocked(), gemm_microkernel_blocked_bt(), gemm_microkernel_packed(), and PACK_THRESHOLD.

Referenced by gemm_blocked_serial().

◆ gemm_microkernel_blocked()

void gemm_microkernel_blocked ( const float *  A,
const float *  B,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 934 of file gemm_microkernel.c.

940 {
941  const int mr = MR;
942  const int nr = NR;
943 
944  // Use sequential version for small matrices to avoid OpenMP overhead
945  // Threshold tuned for typical 4-8 core systems
946  if ((size_t)M * N * K <= 512ULL * 512 * 512) {
947  gemm_microkernel_sequential(A, B, C, M, N, K);
948  return;
949  }
950 
 951  // Initialize thread count to physical cores (once)
 952  gemm_init_threads();
 953 
954  // Zero output first
955  #pragma omp parallel for schedule(static)
956  for (int i = 0; i < M; i++) {
957  memset(&C[i * N], 0, N * sizeof(float));
958  }
959 
960  // Block over K (outermost - for accumulation across all threads)
961  for (int k0 = 0; k0 < K; k0 += KC) {
962  int kb = (k0 + KC <= K) ? KC : (K - k0);
963  int first_k = (k0 == 0);
964 
965  // Parallelize over M rows - each thread gets a chunk of M
966  // This gives better cache locality than tile-level parallelism
967  #pragma omp parallel for schedule(static)
968  for (int m0 = 0; m0 < M; m0 += mr) {
969  int mr_actual = (m0 + mr <= M) ? mr : (M - m0);
970 
971  // Each thread processes all N tiles for its M rows
972  for (int n0 = 0; n0 < N; n0 += nr) {
973  int nr_actual = (n0 + nr <= N) ? nr : (N - n0);
974 
975  const float *A_tile = &A[m0 * K + k0];
976  const float *B_tile = &B[k0 * N + n0];
977  float *C_tile = &C[m0 * N + n0];
978 
979  if (mr_actual == mr && nr_actual == nr) {
980 #if defined(__AVX512F__)
981  gemm_microkernel_6x32_avx512(kb, A_tile, K, B_tile, N, C_tile, N, first_k);
982 #elif defined(__FMA__)
983  gemm_microkernel_6x16_avx(kb, A_tile, K, B_tile, N, C_tile, N, first_k);
984 #elif defined(__AVX__)
985  gemm_microkernel_4x16_avx(kb, A_tile, K, B_tile, N, C_tile, N, first_k);
986 #else
987  gemm_microkernel_edge(mr_actual, nr_actual, kb, A_tile, K, B_tile, N, C_tile, N, first_k);
988 #endif
989  } else {
990  gemm_microkernel_edge(mr_actual, nr_actual, kb, A_tile, K, B_tile, N, C_tile, N, first_k);
991  }
992  }
993  }
994  }
995 }
static void gemm_microkernel_edge(int m, int n, int K, const float *A, int lda, const float *B, int ldb, float *C, int ldc, int first_k)
static void gemm_microkernel_sequential(const float *A, const float *B, float *C, int M, int N, int K)
static void gemm_init_threads(void)
#define KC
#define MR
#define NR

References C, gemm_init_threads(), gemm_microkernel_edge(), gemm_microkernel_sequential(), KC, MR, and NR.

Referenced by gemm_microkernel(), and gemm_microkernel_packed().

◆ gemm_microkernel_blocked_bt()

void gemm_microkernel_blocked_bt ( const float *  A,
const float *  B,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 1058 of file gemm_microkernel.c.

1064 {
1065  // Zero output first
1066  #pragma omp parallel for schedule(static)
1067  for (int i = 0; i < M; i++) {
1068  memset(&C[i * N], 0, N * sizeof(float));
1069  }
1070 
1071  const int mr = MR;
1072  const int nr = NR;
1073 
1074  #pragma omp parallel for schedule(dynamic) collapse(2)
1075  for (int m0 = 0; m0 < M; m0 += MC) {
1076  for (int n0 = 0; n0 < N; n0 += NC) {
1077  int mb = (m0 + MC <= M) ? MC : (M - m0);
1078  int nb = (n0 + NC <= N) ? NC : (N - n0);
1079 
1080  for (int k0 = 0; k0 < K; k0 += KC) {
1081  int kb = (k0 + KC <= K) ? KC : (K - k0);
1082  int first_k = (k0 == 0);
1083 
1084  for (int m1 = 0; m1 < mb; m1 += mr) {
1085  int mr_actual = (m1 + mr <= mb) ? mr : (mb - m1);
1086 
1087  for (int n1 = 0; n1 < nb; n1 += nr) {
1088  int nr_actual = (n1 + nr <= nb) ? nr : (nb - n1);
1089 
1090  const float *A_tile = &A[(m0 + m1) * K + k0];
1091  const float *B_tile = &B[(n0 + n1) * K + k0];
1092  float *C_tile = &C[(m0 + m1) * N + (n0 + n1)];
1093 
1094  if (mr_actual == mr && nr_actual == nr) {
1095 #if defined(__AVX512F__)
1096  gemm_microkernel_6x32_bt_avx512(kb, A_tile, K, B_tile, K, C_tile, N, first_k);
1097 #else
1098  // Scalar fallback for B-transposed
1099  for (int i = 0; i < mr; i++) {
1100  for (int j = 0; j < nr; j++) {
1101  float sum = first_k ? 0.0f : C_tile[i * N + j];
1102  for (int kk = 0; kk < kb; kk++) {
1103  sum += A_tile[i * K + kk] * B_tile[j * K + kk];
1104  }
1105  C_tile[i * N + j] = sum;
1106  }
1107  }
1108 #endif
1109  } else {
1110  // Edge case
1111  for (int i = 0; i < mr_actual; i++) {
1112  for (int j = 0; j < nr_actual; j++) {
1113  float sum = first_k ? 0.0f : C_tile[i * N + j];
1114  for (int kk = 0; kk < kb; kk++) {
1115  sum += A_tile[i * K + kk] * B_tile[j * K + kk];
1116  }
1117  C_tile[i * N + j] = sum;
1118  }
1119  }
1120  }
1121  }
1122  }
1123  }
1124  }
1125  }
1126 }
#define NC
#define MC

References C, KC, MC, MR, NC, and NR.

Referenced by gemm_microkernel().

◆ gemm_microkernel_packed()

void gemm_microkernel_packed ( const float *  A,
const float *  B,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 840 of file gemm_microkernel.c.

846 {
847  // Use tile-parallel blocked version - scales better on many-core systems
848  gemm_microkernel_blocked(A, B, C, M, N, K);
849 }

References C, and gemm_microkernel_blocked().

Referenced by gemm_microkernel().

◆ gemm_naive_parallel()

void gemm_naive_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 125 of file gemm_kernels.c.

130 {
131  if (ck_strict_parity_enabled()) {
132  gemm_naive_serial_double(A, B, bias, C, M, N, K);
133  return;
134  }
135 #pragma omp parallel for
136  for (int i = 0; i < M; i++) {
137  for (int j = 0; j < N; j++) {
138  float sum = 0.0f;
139  for (int k = 0; k < K; k++) {
140  sum += A[i * K + k] * B[j * K + k];
141  }
142  float bias_val = bias ? bias[j] : 0.0f;
143  C[i * N + j] = sum + bias_val;
144  }
145  }
146 }

References C, ck_strict_parity_enabled(), and gemm_naive_serial_double().

Referenced by ck_attention_project_head_major_ref(), ck_mlp_swiglu_forward_ref(), ck_qkv_project_head_major_ref(), gemm_avx512_parallel(), and gemm_fine_grained_parallel().

◆ gemm_nn_avx512()

void gemm_nn_avx512 ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 339 of file gemm_kernels.c.

344 {
345  if (ck_strict_parity_enabled()) {
346  gemm_nn_serial_double(A, B, bias, C, M, N, K);
347  return;
348  }
349 #if defined(__AVX512F__)
350  // For gemm_nn, we can't vectorize over K easily since B[k,j] has stride N.
351  // Instead, vectorize over N (output columns) when N >= 16.
352 #pragma omp parallel for
353  for (int i = 0; i < M; i++) {
354  int j = 0;
355  // Process 16 output columns at a time
356  for (; j <= N - 16; j += 16) {
357  __m512 sum_vec = bias ? _mm512_loadu_ps(&bias[j]) : _mm512_setzero_ps();
358  for (int k = 0; k < K; k++) {
359  __m512 a_broadcast = _mm512_set1_ps(A[i * K + k]);
360  __m512 b_vec = _mm512_loadu_ps(&B[k * N + j]);
361  sum_vec = _mm512_fmadd_ps(a_broadcast, b_vec, sum_vec);
362  }
363  _mm512_storeu_ps(&C[i * N + j], sum_vec);
364  }
365  // Handle remaining columns
366  for (; j < N; j++) {
367  float sum = bias ? bias[j] : 0.0f;
368  for (int k = 0; k < K; k++) {
369  sum += A[i * K + k] * B[k * N + j];
370  }
371  C[i * N + j] = sum;
372  }
373  }
374 #elif defined(__AVX__)
375  // AVX1: vectorize over N (8 columns at a time)
376 #pragma omp parallel for
377  for (int i = 0; i < M; i++) {
378  int j = 0;
379  for (; j <= N - 8; j += 8) {
380  __m256 sum_vec = bias ? _mm256_loadu_ps(&bias[j]) : _mm256_setzero_ps();
381  for (int k = 0; k < K; k++) {
382  __m256 a_broadcast = _mm256_set1_ps(A[i * K + k]);
383  __m256 b_vec = _mm256_loadu_ps(&B[k * N + j]);
384  __m256 prod = _mm256_mul_ps(a_broadcast, b_vec);
385  sum_vec = _mm256_add_ps(sum_vec, prod);
386  }
387  _mm256_storeu_ps(&C[i * N + j], sum_vec);
388  }
389  for (; j < N; j++) {
390  float sum = bias ? bias[j] : 0.0f;
391  for (int k = 0; k < K; k++) {
392  sum += A[i * K + k] * B[k * N + j];
393  }
394  C[i * N + j] = sum;
395  }
396  }
397 #else
398  gemm_nn_parallel(A, B, bias, C, M, N, K);
399 #endif
400 }
static void gemm_nn_serial_double(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:300
void gemm_nn_parallel(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:317

References C, ck_strict_parity_enabled(), gemm_nn_parallel(), and gemm_nn_serial_double().

Referenced by fc1_backward_kernel(), and fc2_backward_kernel().

◆ gemm_nn_blocked()

void gemm_nn_blocked ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 402 of file gemm_kernels.c.

407 {
408  if (ck_strict_parity_enabled()) {
409  gemm_nn_serial_double(A, B, bias, C, M, N, K);
410  return;
411  }
412 #if defined(__AVX512F__)
413  const int block_size = 64;
414 #elif defined(__AVX__)
415  const int block_size = 32;
416 #else
417  const int block_size = 32;
418 #endif
419  // Initialize C with bias (parallelized)
420 #pragma omp parallel for
421  for (int i = 0; i < M; i++) {
422  for (int j = 0; j < N; j++) {
423  C[i * N + j] = bias ? bias[j] : 0.0f;
424  }
425  }
426  // Blocked multiply-accumulate (parallelized over M blocks)
427 #pragma omp parallel for
428  for (int ii = 0; ii < M; ii += block_size) {
429  for (int kk = 0; kk < K; kk += block_size) {
430  for (int jj = 0; jj < N; jj += block_size) {
431  int i_end = ck_min(ii + block_size, M);
432  int k_end = ck_min(kk + block_size, K);
433  int j_end = ck_min(jj + block_size, N);
434 
435  for (int i = ii; i < i_end; i++) {
436  for (int k = kk; k < k_end; k++) {
437  float a_val = A[i * K + k];
438 #if defined(__AVX512F__)
439  __m512 a_broadcast = _mm512_set1_ps(a_val);
440  int j;
441  for (j = jj; j <= j_end - 16; j += 16) {
442  __m512 b_vec = _mm512_loadu_ps(&B[k * N + j]);
443  __m512 c_vec = _mm512_loadu_ps(&C[i * N + j]);
444  c_vec = _mm512_fmadd_ps(a_broadcast, b_vec, c_vec);
445  _mm512_storeu_ps(&C[i * N + j], c_vec);
446  }
447  for (; j < j_end; j++) {
448  C[i * N + j] += a_val * B[k * N + j];
449  }
450 #elif defined(__AVX__)
451  __m256 a_broadcast = _mm256_set1_ps(a_val);
452  int j;
453  for (j = jj; j <= j_end - 8; j += 8) {
454  __m256 b_vec = _mm256_loadu_ps(&B[k * N + j]);
455  __m256 c_vec = _mm256_loadu_ps(&C[i * N + j]);
456  __m256 prod = _mm256_mul_ps(a_broadcast, b_vec);
457  c_vec = _mm256_add_ps(c_vec, prod);
458  _mm256_storeu_ps(&C[i * N + j], c_vec);
459  }
460  for (; j < j_end; j++) {
461  C[i * N + j] += a_val * B[k * N + j];
462  }
463 #else
464  for (int j = jj; j < j_end; j++) {
465  C[i * N + j] += a_val * B[k * N + j];
466  }
467 #endif
468  }
469  }
470  }
471  }
472  }
473 }

References C, ck_min(), ck_strict_parity_enabled(), and gemm_nn_serial_double().

◆ gemm_nn_parallel()

void gemm_nn_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 317 of file gemm_kernels.c.

322 {
323  if (ck_strict_parity_enabled()) {
324  gemm_nn_serial_double(A, B, bias, C, M, N, K);
325  return;
326  }
327 #pragma omp parallel for
328  for (int i = 0; i < M; i++) {
329  for (int j = 0; j < N; j++) {
330  float sum = bias ? bias[j] : 0.0f;
331  for (int k = 0; k < K; k++) {
332  sum += A[i * K + k] * B[k * N + j];
333  }
334  C[i * N + j] = sum;
335  }
336  }
337 }

References C, ck_strict_parity_enabled(), and gemm_nn_serial_double().

Referenced by gemm_nn_avx512().

◆ gemm_nt_q4_0()

void gemm_nt_q4_0 ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Matrix-matrix multiply: C[M,N] = A[M,K] @ B[N,K]^T + bias.

Parameters
AInput matrix [M x K], row-major FP32
BWeight matrix in Q4_0 format, [N x K] stored row-major
biasOptional bias [N], NULL if not used
COutput [M x N], row-major FP32
MBatch size (number of tokens)
NOutput dimension (number of rows in B)
KInput dimension

Definition at line 176 of file gemm_kernels_q4_0.c.

181 {
182  const block_q4_0 *blocks = (const block_q4_0 *)B;
183  const int blocks_per_row = K / QK4_0;
184 
185  for (int m = 0; m < M; m++) {
186  const float *a_row = &A[m * K];
187 
188  for (int n = 0; n < N; n++) {
189  float sum = 0.0f;
190 
191  for (int b = 0; b < blocks_per_row; b++) {
192  const block_q4_0 *block = &blocks[n * blocks_per_row + b];
193  const float d = CK_FP16_TO_FP32(block->d);
194  const float *ap = &a_row[b * QK4_0];
195 
196  for (int i = 0; i < QK4_0 / 2; i++) {
197  const uint8_t packed = block->qs[i];
198  const int q0 = (packed & 0x0F) - 8;
199  const int q1 = (packed >> 4) - 8;
200 
201  sum += d * (float)q0 * ap[2 * i + 0];
202  sum += d * (float)q1 * ap[2 * i + 1];
203  }
204  }
205 
206  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
207  }
208  }
209 }
ck_half d
Definition: ckernel_quant.h:38
uint8_t qs[32/2]
Definition: ckernel_quant.h:39

References C, CK_FP16_TO_FP32, block_q4_0::d, QK4_0, and block_q4_0::qs.

Referenced by ck_gemm_nt_quant().

◆ gemm_nt_q4_1()

void gemm_nt_q4_1 ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

GEMM with transposed Q4_1 weights: C = A @ B^T.

Parameters
AInput activations [M x K], row-major FP32
BWeight matrix in Q4_1 format [N x K], row-major quantized
biasOptional bias [N], NULL if not used
COutput [M x N], row-major FP32
MBatch size (number of tokens)
NOutput dimension
KInput dimension

Definition at line 256 of file gemm_kernels_q4_1.c.

261 {
262  const block_q4_1 *blocks = (const block_q4_1 *)B;
263  const int blocks_per_row = K / QK4_1;
264 
265  for (int m = 0; m < M; m++) {
266  const float *a_row = &A[m * K];
267 
268  for (int n = 0; n < N; n++) {
269  float sum = 0.0f;
270 
271  for (int b = 0; b < blocks_per_row; b++) {
272  const block_q4_1 *block = &blocks[n * blocks_per_row + b];
273  const float d = CK_FP16_TO_FP32(block->d);
274  const float min = CK_FP16_TO_FP32(block->m);
275  const float *ap = &a_row[b * QK4_1];
276 
277  for (int i = 0; i < QK4_1 / 2; i++) {
278  const uint8_t packed = block->qs[i];
279  const int q0 = (packed & 0x0F);
280  const int q1 = (packed >> 4);
281 
282  const float w0 = d * (float)q0 + min;
283  const float w1 = d * (float)q1 + min;
284 
285  sum += w0 * ap[2 * i + 0];
286  sum += w1 * ap[2 * i + 1];
287  }
288  }
289 
290  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
291  }
292  }
293 }
ck_half m
Definition: ckernel_quant.h:54
ck_half d
Definition: ckernel_quant.h:53
uint8_t qs[32/2]
Definition: ckernel_quant.h:55

References C, CK_FP16_TO_FP32, block_q4_1::d, block_q4_1::m, QK4_1, and block_q4_1::qs.

Referenced by ck_gemm_nt_quant().

◆ gemm_nt_q4_k()

/**
 * NT GEMM with Q4_K weights: C[M,N] = A[M,K] @ B[N,K]^T + bias.
 *
 * @param A    Input activations [M x K], row-major FP32
 * @param B    Weights in Q4_K format [N x K]
 * @param bias Optional bias [N], NULL if not used
 * @param C    Output [M x N], row-major FP32
 */
void gemm_nt_q4_k(const float *A, const void *B, const float *bias,
                  float *C, int M, int N, int K)
{
    if (A == NULL || B == NULL || C == NULL) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    /* gemm_q4_k writes Y as [batch x M_out]; here batch = M tokens and
     * M_out = N output channels, so C already has the layout we need. */
    gemm_q4_k(C, B, A, /*M_out=*/N, /*N_batch=*/M, K);

    if (bias != NULL) {
        for (int t = 0; t < M; ++t) {
            float *out = C + (size_t)t * (size_t)N;
            for (int c = 0; c < N; ++c) {
                out[c] += bias[c];
            }
        }
    }
}
void gemm_q4_k(float *Y, const void *W, const float *X, int M, int N, int K)
Auto-dispatch GEMM based on available SIMD.

References C, and gemm_q4_k().

Referenced by ck_attention_project_head_major_q4_k(), ck_gemm_nt_quant(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), ck_mlp_swiglu_forward_q4_k(), ck_qkv_project_head_major_q4_k(), ck_qkv_project_head_major_token_q4_k(), model_decode_token(), model_layer_0_decode(), model_layer_0_prefill(), model_layer_10_decode(), model_layer_10_prefill(), model_layer_11_decode(), model_layer_11_prefill(), model_layer_12_decode(), model_layer_12_prefill(), model_layer_13_decode(), model_layer_13_prefill(), model_layer_14_decode(), model_layer_14_prefill(), model_layer_15_decode(), model_layer_15_prefill(), model_layer_16_decode(), model_layer_16_prefill(), model_layer_17_decode(), model_layer_17_prefill(), model_layer_18_decode(), model_layer_18_prefill(), model_layer_19_decode(), model_layer_19_prefill(), model_layer_1_decode(), model_layer_1_prefill(), model_layer_20_decode(), model_layer_20_prefill(), model_layer_21_decode(), model_layer_21_prefill(), model_layer_22_decode(), model_layer_22_prefill(), model_layer_23_decode(), model_layer_23_prefill(), model_layer_2_decode(), model_layer_2_prefill(), model_layer_3_decode(), model_layer_3_prefill(), model_layer_4_decode(), model_layer_4_prefill(), model_layer_5_decode(), model_layer_5_prefill(), model_layer_6_decode(), model_layer_6_prefill(), model_layer_7_decode(), model_layer_7_prefill(), model_layer_8_decode(), model_layer_8_prefill(), model_layer_9_decode(), model_layer_9_prefill(), qwen2_0_5b_decode_decode_token(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_0_prefill(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_10_prefill(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_11_prefill(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_12_prefill(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_13_prefill(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_14_prefill(), qwen2_0_5b_decode_layer_15_decode(), 
qwen2_0_5b_decode_layer_15_prefill(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_16_prefill(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_17_prefill(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_18_prefill(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_19_prefill(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_1_prefill(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_20_prefill(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_21_prefill(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_22_prefill(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_23_prefill(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_2_prefill(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_3_prefill(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_4_prefill(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_5_prefill(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_6_prefill(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_7_prefill(), qwen2_0_5b_decode_layer_8_decode(), qwen2_0_5b_decode_layer_8_prefill(), qwen2_0_5b_decode_layer_9_decode(), and qwen2_0_5b_decode_layer_9_prefill().

◆ gemm_nt_q4_k_q8_k()

/**
 * NT GEMM: C = A @ B^T where A is pre-quantized Q8_K and B is Q4_K.
 *
 * @param A_q8 Activations in Q8_K format [M x K]
 * @param B    Weights in Q4_K format [N x K]
 * @param bias Optional bias [N], NULL if not used
 * @param C    Output [M x N], row-major FP32
 */
void gemm_nt_q4_k_q8_k(const void *A_q8, const void *B, const float *bias,
                       float *C, int M, int N, int K)
{
    if (A_q8 == NULL || B == NULL || C == NULL) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    /* gemm_q4_k_q8_k writes [batch x M_out]; batch = M tokens, M_out = N. */
    gemm_q4_k_q8_k(C, B, A_q8, /*M_out=*/N, /*N_batch=*/M, K);

    if (bias != NULL) {
        for (int t = 0; t < M; ++t) {
            float *out = C + (size_t)t * (size_t)N;
            for (int c = 0; c < N; ++c) {
                out[c] += bias[c];
            }
        }
    }
}
void gemm_q4_k_q8_k(float *Y, const void *W, const void *X_q8, int M, int N, int K)

References C, and gemm_q4_k_q8_k().

Referenced by ck_attention_project_head_major_q4_k_q8_k(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), ck_mlp_swiglu_forward_q4_k_q8_k(), ck_mlp_swiglu_forward_q4_k_q8_k_prefill(), ck_qkv_project_head_major_token_q4_k_q8_k(), gemm_nt_q8_k_mlp_dispatch(), gemm_nt_q8_k_qkv_dispatch(), model_forward_prefill_impl(), and qwen2_0_5b_decode_forward_prefill_impl().

◆ gemm_nt_q5_0()

/**
 * NT GEMM with Q5_0 weights: C[M,N] = A[M,K] @ B[N,K]^T + bias.
 * Single-token calls (M == 1) take a direct GEMV fast path.
 *
 * @param A    Input activations [M x K], row-major FP32
 * @param B    Weights in Q5_0 format [N x K]
 * @param bias Optional bias [N], NULL if not used
 * @param C    Output [M x N], row-major FP32
 */
void gemm_nt_q5_0(const float *A, const void *B, const float *bias,
                  float *C, int M, int N, int K)
{
    if (M == 1) {
        /* Decode fast path: the AVX-optimized GEMV handles one token. */
        gemv_q5_0(C, B, A, N, K);
        if (bias != NULL) {
            for (int c = 0; c < N; c++) {
                C[c] += bias[c];
            }
        }
        return;
    }

    /* Prefill path (M > 1): gemm_q5_0 writes [batch x M_out] with
     * batch = M tokens and M_out = N output channels. */
    gemm_q5_0(C, B, A, /*M_out=*/N, /*N_batch=*/M, K);

    if (bias != NULL) {
        for (int t = 0; t < M; t++) {
            float *out = C + (size_t)t * (size_t)N;
            for (int c = 0; c < N; c++) {
                out[c] += bias[c];
            }
        }
    }
}
void gemv_q5_0(float *y, const void *W, const float *x, int M, int K)
Auto-dispatch GEMV for Q5_0 weights based on CPU features.
void gemm_q5_0(float *Y, const void *W, const float *X, int M, int N, int K)
Matrix-matrix multiply with Q5_0 weights.

References C, gemm_q5_0(), and gemv_q5_0().

Referenced by ck_gemm_nt_quant(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ gemm_nt_q5_1()

void gemm_nt_q5_1 ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

GEMM with transposed Q5_1 weights: C = A @ B^T.

Parameters
AInput activations [M x K], row-major FP32
BWeight matrix in Q5_1 format [N x K], row-major quantized
biasOptional bias [N], NULL if not used
COutput [M x N], row-major FP32
MBatch size (number of tokens)
NOutput dimension
KInput dimension

Definition at line 309 of file gemm_kernels_q5_1.c.

314 {
315  const block_q5_1 *blocks = (const block_q5_1 *)B;
316  const int blocks_per_row = K / QK5_1;
317 
318  for (int m = 0; m < M; m++) {
319  const float *a_row = &A[m * K];
320 
321  for (int n = 0; n < N; n++) {
322  float sum = 0.0f;
323 
324  for (int b = 0; b < blocks_per_row; b++) {
325  const block_q5_1 *block = &blocks[n * blocks_per_row + b];
326  const float d = CK_FP16_TO_FP32(block->d);
327  const float min = CK_FP16_TO_FP32(block->m);
328  const float *ap = &a_row[b * QK5_1];
329 
330  uint32_t qh;
331  memcpy(&qh, block->qh, sizeof(qh));
332 
333  /* First 16 weights: low nibbles, high bits from qh[0:15] */
334  for (int j = 0; j < QK5_1 / 2; j++) {
335  const int lo = (block->qs[j] & 0x0F);
336  const int hi = ((qh >> j) & 1) << 4;
337  sum += (d * (float)(lo | hi) + min) * ap[j];
338  }
339 
340  /* Second 16 weights: high nibbles, high bits from qh[16:31] */
341  for (int j = 0; j < QK5_1 / 2; j++) {
342  const int lo = (block->qs[j] >> 4);
343  const int hi = ((qh >> (j + 16)) & 1) << 4;
344  sum += (d * (float)(lo | hi) + min) * ap[j + QK5_1 / 2];
345  }
346  }
347 
348  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
349  }
350  }
351 }
uint8_t qs[32/2]
Definition: ckernel_quant.h:90
uint8_t qh[4]
Definition: ckernel_quant.h:89
ck_half m
Definition: ckernel_quant.h:88
ck_half d
Definition: ckernel_quant.h:87

References C, CK_FP16_TO_FP32, block_q5_1::d, block_q5_1::m, block_q5_1::qh, QK5_1, and block_q5_1::qs.

Referenced by ck_gemm_nt_quant().

◆ gemm_nt_q5_k()

/**
 * NT GEMM with Q5_K weights: C[M,N] = A[M,K] @ B[N,K]^T + bias.
 *
 * All SIMD paths are still TODO: the previous #if/#elif chain dispatched to
 * the same scalar reference kernel from every branch, so the chain was pure
 * noise — call the reference implementation directly instead.
 * TODO: add AVX-512 / AVX2 / AVX / SSE4.1 implementations and restore
 * compile-time dispatch here.
 */
void gemm_nt_q5_k(const float *A, const void *B, const float *bias,
                  float *C, int M, int N, int K)
{
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
}
void gemm_nt_q5_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

References C, and gemm_nt_q5_k_ref().

◆ gemm_nt_q6_k()

/**
 * NT GEMM with Q6_K weights: C[M,N] = A[M,K] @ B[N,K]^T + bias.
 *
 * @param A    Input activations [M x K], row-major FP32
 * @param B    Weights in Q6_K format [N x K]
 * @param bias Optional bias [N], NULL if not used
 * @param C    Output [M x N], row-major FP32
 */
void gemm_nt_q6_k(const float *A, const void *B, const float *bias,
                  float *C, int M, int N, int K)
{
    if (A == NULL || B == NULL || C == NULL) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    /* gemm_q6_k writes Y as [batch x M_out]; batch = M tokens and
     * M_out = N output channels, which matches C's row-major layout. */
    gemm_q6_k(C, B, A, /*M_out=*/N, /*N_batch=*/M, K);

    if (bias != NULL) {
        for (int t = 0; t < M; ++t) {
            float *out = C + (size_t)t * (size_t)N;
            for (int c = 0; c < N; ++c) {
                out[c] += bias[c];
            }
        }
    }
}
void gemm_q6_k(float *Y, const void *W, const float *X, int M, int N, int K)

References C, and gemm_q6_k().

Referenced by ck_gemm_nt_quant(), gemm_nt_q6_k_ref(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ gemm_nt_q6_k_q8_k()

/**
 * NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K.
 *
 * Typical inference pattern:
 *   - A: activations in Q8_K format [M x K]
 *   - B: weights in Q6_K format [N x K]
 *   - C: output [M x N]
 *
 * @param A_q8 Input activations in Q8_K format
 * @param B    Weight matrix in Q6_K format
 * @param bias Optional bias vector [N]
 * @param C    Output matrix
 * @param M    Batch size (number of tokens)
 * @param N    Output dimension
 * @param K    Input dimension
 */
void gemm_nt_q6_k_q8_k(const void *A_q8, const void *B, const float *bias,
                       float *C, int M, int N, int K)
{
    if (A_q8 == NULL || B == NULL || C == NULL) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    /* gemm_q6_k_q8_k writes [batch x M_out]; batch = M tokens, M_out = N. */
    gemm_q6_k_q8_k(C, B, A_q8, /*M_out=*/N, /*N_batch=*/M, K);

    if (bias != NULL) {
        for (int t = 0; t < M; ++t) {
            float *out = C + (size_t)t * (size_t)N;
            for (int c = 0; c < N; ++c) {
                out[c] += bias[c];
            }
        }
    }
}
void gemm_q6_k_q8_k(float *Y, const void *W, const void *X_q8, int M, int N, int K)
GEMM: Y = W @ X^T where W is Q6_K and X is Q8_K.

References C, and gemm_q6_k_q8_k().

Referenced by gemm_nt_q8_k_mlp_dispatch(), and gemm_nt_q8_k_qkv_dispatch().

◆ gemm_nt_q8_0()

/**
 * Matrix-matrix multiply: C[M,N] = A[M,K] @ B[N,K]^T + bias.
 *
 * Delegates each output row to gemv_q8_0, which dispatches to the best
 * available AVX/SSE/scalar kernel at run time.
 *
 * Note: the previous version carried a full scalar reference loop after an
 * unconditional `return;` — that code was unreachable and has been removed.
 *
 * @param A    Input matrix [M x K], row-major FP32
 * @param B    Weight matrix in Q8_0 format, [N x K] stored row-major
 * @param bias Optional bias [N], NULL if not used
 * @param C    Output [M x N], row-major FP32
 * @param M    Batch size (number of tokens)
 * @param N    Output dimension (number of rows in B)
 * @param K    Input dimension
 */
void gemm_nt_q8_0(const float *A, const void *B, const float *bias,
                  float *C, int M, int N, int K)
{
    for (int m = 0; m < M; m++) {
        /* size_t indexing avoids int overflow on large token*dim products. */
        gemv_q8_0(&C[(size_t)m * N], B, &A[(size_t)m * K], N, K);
        if (bias) {
            for (int n = 0; n < N; n++) {
                C[(size_t)m * N + n] += bias[n];
            }
        }
    }
}
void gemv_q8_0(float *y, const void *W, const float *x, int M, int K)
Auto-dispatch GEMV for Q8_0 weights based on CPU features.

References C, CK_FP16_TO_FP32, block_q8_0::d, gemv_q8_0(), QK8_0, and block_q8_0::qs.

Referenced by ck_gemm_nt_quant(), qwen2_0_5b_decode_decode_token(), qwen2_0_5b_decode_forward_prefill_impl(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ gemm_nt_q8_0_q8_0()

/**
 * gemm_nt_q8_0_q8_0 with optional bias (matches header signature).
 *
 * C[m,n] = A[m,K] @ B[n,K]^T + bias[n]; A and B are both Q8_0-quantized.
 * Dispatches to the best ISA variant selected at compile time.
 */
void gemm_nt_q8_0_q8_0(const void *A, const void *B, const float *bias,
                       float *C, int M, int N, int K)
{
    /* Step 1: plain GEMM into C via the widest compiled kernel. */
#if defined(__AVX512VNNI__)
    gemm_nt_q8_0_q8_0_vnni(A, B, C, M, N, K);
#elif defined(__AVX512F__)
    gemm_nt_q8_0_q8_0_avx512(A, B, C, M, N, K);
#elif defined(__AVX2__)
    gemm_nt_q8_0_q8_0_avx2(A, B, C, M, N, K);
#elif defined(__AVX__)
    gemm_nt_q8_0_q8_0_avx(A, B, C, M, N, K);
#else
    gemm_nt_q8_0_q8_0_ref(A, B, C, M, N, K);
#endif

    /* Step 2: fold in the bias row-by-row, if one was supplied. */
    if (bias != NULL) {
        for (int m = 0; m < M; m++) {
            float *row = &C[(size_t)m * N];
            for (int n = 0; n < N; n++) {
                row[n] += bias[n];
            }
        }
    }
}
void gemm_nt_q8_0_q8_0_ref(const void *A, const void *B, float *C, int M, int N, int K)
Scalar reference: gemm_nt_q8_0_q8_0.

References C, and gemm_nt_q8_0_q8_0_ref().

Referenced by gemm_nt_q8_0_dispatch(), and gemm_nt_q8_0_mlp_dispatch().

◆ gemm_q4_k()

/**
 * Auto-dispatch GEMM for Q4_K weights.
 *
 * Currently always routes to the scalar reference kernel for correctness.
 * TODO: fix the AVX-512 version to match the llama.cpp block layout, then
 * restore SIMD dispatch here.
 */
void gemm_q4_k(float *Y, const void *W, const float *X, int M, int N, int K)
{
    gemm_q4_k_ref(Y, W, X, M, N, K);
}
void gemm_q4_k_ref(float *Y, const void *W, const float *X, int M, int N, int K)
Matrix-matrix multiply with Q4_K weights (scalar reference)

References gemm_q4_k_ref().

Referenced by gemm_nt_q4_k().

◆ gemm_q4_k_q8_k()

void gemm_q4_k_q8_k ( float *  Y,
const void *  W,
const void *  X_q8,
int  M,
int  N,
int  K 
)

Definition at line 277 of file gemm_kernels_q4k_q8k.c.

281 {
282  if (!Y || !W || !X_q8 || M <= 0 || N <= 0 || K <= 0) {
283  return;
284  }
285 
286  const block_q8_K *X = (const block_q8_K *)X_q8;
287  const int blocks_per_vec = K / QK_K;
288 
289  for (int n = 0; n < N; ++n) {
290  const block_q8_K *x_row = X + (size_t)n * (size_t)blocks_per_vec;
291  gemv_q4_k_q8_k(&Y[n * M], W, x_row, M, K);
292  }
293 }
void gemv_q4_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q4_k_q8_k(), and QK_K.

Referenced by gemm_nt_q4_k_q8_k().

◆ gemm_q6_k()

/**
 * GEMM with Q6_K weights: one gemv_q6_k call per input vector.
 *
 * @param Y Output [N x M], row-major
 * @param W Weight matrix in Q6_K format [M x K]
 * @param X Input vectors [N x K], row-major FP32
 * @param M Output dimension
 * @param N Batch size (number of input vectors)
 * @param K Input dimension
 */
void gemm_q6_k(float *Y, const void *W, const float *X, int M, int N, int K)
{
    if (!Y || !W || !X) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    for (int n = 0; n < N; ++n) {
        /* size_t indices: n*M and n*K can overflow int for large
         * batch x dimension products. */
        gemv_q6_k(&Y[(size_t)n * (size_t)M], W, &X[(size_t)n * (size_t)K], M, K);
    }
}
void gemv_q6_k(float *y, const void *W, const float *x, int M, int K)

References gemv_q6_k().

Referenced by gemm_nt_q6_k().

◆ gemm_q6_k_q8_k()

void gemm_q6_k_q8_k ( float *  Y,
const void *  W,
const void *  X_q8,
int  M,
int  N,
int  K 
)

GEMM: Y = W @ X^T where W is Q6_K and X is Q8_K.

Parameters
YOutput matrix [N x M] in row-major
WWeight matrix in Q6_K format [M x K]
X_q8Input matrix in Q8_K format [N x K]
MNumber of output rows (output dim)
NNumber of input vectors (batch size)
KInput dimension

Definition at line 1110 of file gemm_kernels_q6k_q8k.c.

1114 {
1115  if (!Y || !W || !X_q8 || M <= 0 || N <= 0 || K <= 0) {
1116  return;
1117  }
1118 
1119  const block_q8_K *X = (const block_q8_K *)X_q8;
1120  const int blocks_per_vec = K / QK_K;
1121 
1122  for (int n = 0; n < N; ++n) {
1123  const block_q8_K *x_row = X + (size_t)n * (size_t)blocks_per_vec;
1124  gemv_q6_k_q8_k(&Y[n * M], W, x_row, M, K);
1125  }
1126 }
void gemv_q6_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)
GEMV: y = W @ x where W is Q6_K and x is Q8_K.

References gemv_q6_k_q8_k(), and QK_K.

Referenced by gemm_nt_q6_k_q8_k().

◆ gemm_swiglu_fused()

void gemm_swiglu_fused ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  b_gate,
const float *  b_up,
float *  output,
int  M,
int  N,
int  K 
)

Definition at line 241 of file gemm_fused_kernels.c.

248 {
249 #if defined(__AVX__)
250 #pragma omp parallel for
251  for (int i = 0; i < M; i++) {
252  const float *x_row = &x[i * K];
253  float *out_row = &output[i * N];
254 
255  for (int j = 0; j < N; j++) {
256  const float *w_gate_row = &W_gate[j * K];
257  const float *w_up_row = &W_up[j * K];
258 
259  // Compute both dot products in parallel using SIMD
260  __m256 gate_vec = _mm256_setzero_ps();
261  __m256 up_vec = _mm256_setzero_ps();
262 
263  int k;
264  for (k = 0; k <= K - 8; k += 8) {
265  __m256 x_vec = _mm256_loadu_ps(&x_row[k]);
266  __m256 wg_vec = _mm256_loadu_ps(&w_gate_row[k]);
267  __m256 wu_vec = _mm256_loadu_ps(&w_up_row[k]);
268 
269  // gate += x * W_gate
270  gate_vec = _mm256_add_ps(gate_vec, _mm256_mul_ps(x_vec, wg_vec));
271  // up += x * W_up
272  up_vec = _mm256_add_ps(up_vec, _mm256_mul_ps(x_vec, wu_vec));
273  }
274 
275  // Horizontal sum
276  float gate = hsum256_ps_fused(gate_vec);
277  float up = hsum256_ps_fused(up_vec);
278 
279  // Scalar remainder
280  for (; k < K; k++) {
281  gate += x_row[k] * w_gate_row[k];
282  up += x_row[k] * w_up_row[k];
283  }
284 
285  // Add biases
286  if (b_gate) gate += b_gate[j];
287  if (b_up) up += b_up[j];
288 
289  // SwiGLU: SiLU(gate) * up = gate * sigmoid(gate) * up
290  float sig = 1.0f / (1.0f + expf(-gate));
291  out_row[j] = gate * sig * up;
292  }
293  }
294 #else
295  // Scalar fallback
296 #pragma omp parallel for
297  for (int i = 0; i < M; i++) {
298  for (int j = 0; j < N; j++) {
299  float gate = 0.0f;
300  float up = 0.0f;
301 
302  for (int k = 0; k < K; k++) {
303  gate += x[i * K + k] * W_gate[j * K + k];
304  up += x[i * K + k] * W_up[j * K + k];
305  }
306 
307  if (b_gate) gate += b_gate[j];
308  if (b_up) up += b_up[j];
309 
310  // SwiGLU: SiLU(gate) * up
311  float sig = 1.0f / (1.0f + expf(-gate));
312  output[i * N + j] = gate * sig * up;
313  }
314  }
315 #endif
316 }

References hsum256_ps_fused().

Referenced by ck_mlp_swiglu_forward_fused_token().

◆ gemm_tn_avx512()

/**
 * TN GEMM: C[M,N] = A^T @ B + bias, where A is stored column-major as [K x M].
 * Vectorizes across output columns with AVX-512 or AVX when available;
 * otherwise defers to the generic parallel kernel.
 */
void gemm_tn_avx512(const float *A, const float *B, const float *bias,
                    float *C, int M, int N, int K)
{
    /* Strict-parity mode: defer to the double-precision serial kernel. */
    if (ck_strict_parity_enabled()) {
        gemm_tn_serial_double(A, B, bias, C, M, N, K);
        return;
    }
#if defined(__AVX512F__)
#pragma omp parallel for
    for (int row = 0; row < M; row++) {
        int col = 0;
        /* 16 output columns per iteration; seed accumulator with bias. */
        for (; col <= N - 16; col += 16) {
            __m512 acc = bias ? _mm512_loadu_ps(&bias[col]) : _mm512_setzero_ps();
            for (int kk = 0; kk < K; kk++) {
                __m512 a_bc = _mm512_set1_ps(A[kk * M + row]);
                acc = _mm512_fmadd_ps(a_bc, _mm512_loadu_ps(&B[kk * N + col]), acc);
            }
            _mm512_storeu_ps(&C[row * N + col], acc);
        }
        /* Scalar tail for the remaining columns. */
        for (; col < N; col++) {
            float acc = bias ? bias[col] : 0.0f;
            for (int kk = 0; kk < K; kk++) {
                acc += A[kk * M + row] * B[kk * N + col];
            }
            C[row * N + col] = acc;
        }
    }
#elif defined(__AVX__)
    /* AVX1: 8 output columns at a time. */
#pragma omp parallel for
    for (int row = 0; row < M; row++) {
        int col = 0;
        for (; col <= N - 8; col += 8) {
            __m256 acc = bias ? _mm256_loadu_ps(&bias[col]) : _mm256_setzero_ps();
            for (int kk = 0; kk < K; kk++) {
                __m256 a_bc = _mm256_set1_ps(A[kk * M + row]);
                acc = _mm256_add_ps(acc, _mm256_mul_ps(a_bc, _mm256_loadu_ps(&B[kk * N + col])));
            }
            _mm256_storeu_ps(&C[row * N + col], acc);
        }
        for (; col < N; col++) {
            float acc = bias ? bias[col] : 0.0f;
            for (int kk = 0; kk < K; kk++) {
                acc += A[kk * M + row] * B[kk * N + col];
            }
            C[row * N + col] = acc;
        }
    }
#else
    gemm_tn_parallel(A, B, bias, C, M, N, K);
#endif
}
void gemm_tn_parallel(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:499
static void gemm_tn_serial_double(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:481

References C, ck_strict_parity_enabled(), gemm_tn_parallel(), and gemm_tn_serial_double().

Referenced by fc1_backward_kernel(), and fc2_backward_kernel().

◆ gemm_tn_blocked()

/**
 * Cache-blocked TN GEMM: C[M,N] = A^T @ B + bias, with A stored [K x M].
 * C is first seeded with the bias, then accumulated block by block so each
 * tile of B and C stays resident in cache.
 */
void gemm_tn_blocked(const float *A, const float *B, const float *bias,
                     float *C, int M, int N, int K)
{
    /* Strict-parity mode: defer to the double-precision serial kernel. */
    if (ck_strict_parity_enabled()) {
        gemm_tn_serial_double(A, B, bias, C, M, N, K);
        return;
    }
#if defined(__AVX512F__)
    const int block_size = 64;
#elif defined(__AVX__)
    const int block_size = 32;
#else
    const int block_size = 32;
#endif
    /* Seed C with the bias (or zero) before accumulation. */
#pragma omp parallel for
    for (int row = 0; row < M; row++) {
        for (int col = 0; col < N; col++) {
            C[row * N + col] = bias ? bias[col] : 0.0f;
        }
    }
    /* Blocked multiply-accumulate, parallel over row blocks. */
#pragma omp parallel for
    for (int i0 = 0; i0 < M; i0 += block_size) {
        for (int k0 = 0; k0 < K; k0 += block_size) {
            for (int j0 = 0; j0 < N; j0 += block_size) {
                const int i_hi = ck_min(i0 + block_size, M);
                const int k_hi = ck_min(k0 + block_size, K);
                const int j_hi = ck_min(j0 + block_size, N);

                for (int k = k0; k < k_hi; k++) {
                    for (int i = i0; i < i_hi; i++) {
                        const float a_val = A[k * M + i]; /* A is [K x M] */
#if defined(__AVX512F__)
                        __m512 a_bc = _mm512_set1_ps(a_val);
                        int j = j0;
                        for (; j <= j_hi - 16; j += 16) {
                            __m512 c_vec = _mm512_loadu_ps(&C[i * N + j]);
                            c_vec = _mm512_fmadd_ps(a_bc, _mm512_loadu_ps(&B[k * N + j]), c_vec);
                            _mm512_storeu_ps(&C[i * N + j], c_vec);
                        }
                        for (; j < j_hi; j++) {
                            C[i * N + j] += a_val * B[k * N + j];
                        }
#elif defined(__AVX__)
                        __m256 a_bc = _mm256_set1_ps(a_val);
                        int j = j0;
                        for (; j <= j_hi - 8; j += 8) {
                            __m256 c_vec = _mm256_loadu_ps(&C[i * N + j]);
                            c_vec = _mm256_add_ps(c_vec, _mm256_mul_ps(a_bc, _mm256_loadu_ps(&B[k * N + j])));
                            _mm256_storeu_ps(&C[i * N + j], c_vec);
                        }
                        for (; j < j_hi; j++) {
                            C[i * N + j] += a_val * B[k * N + j];
                        }
#else
                        for (int j = j0; j < j_hi; j++) {
                            C[i * N + j] += a_val * B[k * N + j];
                        }
#endif
                    }
                }
            }
        }
    }
}

References C, ck_min(), ck_strict_parity_enabled(), and gemm_tn_serial_double().

◆ gemm_tn_parallel()

void gemm_tn_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 499 of file gemm_kernels.c.

504 {
505  if (ck_strict_parity_enabled()) {
506  gemm_tn_serial_double(A, B, bias, C, M, N, K);
507  return;
508  }
509 #pragma omp parallel for
510  for (int i = 0; i < M; i++) {
511  for (int j = 0; j < N; j++) {
512  float sum = bias ? bias[j] : 0.0f;
513  for (int k = 0; k < K; k++) {
514  sum += A[k * M + i] * B[k * N + j];
515  }
516  C[i * N + j] = sum;
517  }
518  }
519 }

References C, ck_strict_parity_enabled(), and gemm_tn_serial_double().

Referenced by gemm_tn_avx512().

◆ gemv_fused_q5_0_bias_dispatch()

/**
 * Compile-time dispatch for the fused Q5_0 GEMV-with-bias kernel:
 * AVX variant when compiled with AVX support, scalar otherwise.
 */
void gemv_fused_q5_0_bias_dispatch(float *y, const void *W, const float *x,
                                   const float *bias, int M, int K)
{
#if defined(__AVX__)
    gemv_fused_q5_0_bias_avx(y, W, x, bias, M, K);
#else
    gemv_fused_q5_0_bias(y, W, x, bias, M, K);
#endif
}
void gemv_fused_q5_0_bias(float *y, const void *W, const float *x, const float *bias, int M, int K)

References gemv_fused_q5_0_bias().

◆ gemv_fused_q8_0_bias_dispatch()

/**
 * Compile-time dispatch for the fused Q8_0 GEMV-with-bias kernel:
 * AVX variant when compiled with AVX support, scalar otherwise.
 */
void gemv_fused_q8_0_bias_dispatch(float *y, const void *W, const float *x,
                                   const float *bias, int M, int K)
{
#if defined(__AVX__)
    gemv_fused_q8_0_bias_avx(y, W, x, bias, M, K);
#else
    gemv_fused_q8_0_bias(y, W, x, bias, M, K);
#endif
}
void gemv_fused_q8_0_bias(float *y, const void *W, const float *x, const float *bias, int M, int K)

References gemv_fused_q8_0_bias().

◆ gemv_q4_0()

/**
 * Auto-dispatch GEMV for Q4_0 weights: AVX-512 kernel when compiled in,
 * scalar reference otherwise.
 */
void gemv_q4_0(float *y, const void *W, const float *x, int M, int K)
{
#if defined(__AVX512F__)
    gemv_q4_0_avx512(y, W, x, M, K);
#else
    gemv_q4_0_ref(y, W, x, M, K);
#endif
}
void gemv_q4_0_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q4_0 weights (scalar reference)

References gemv_q4_0_ref().

Referenced by dot_q4_0(), and gemm_q4_0().

◆ gemv_q4_k()

/**
 * Auto-dispatch GEMV for Q4_K weights, preferring the widest compiled ISA:
 * AVX-512 > AVX > scalar reference.
 */
void gemv_q4_k(float *y, const void *W, const float *x, int M, int K)
{
#if defined(__AVX512F__)
    gemv_q4_k_avx512(y, W, x, M, K);
#elif defined(__AVX__)
    gemv_q4_k_avx(y, W, x, M, K);
#else
    gemv_q4_k_ref(y, W, x, M, K);
#endif
}
void gemv_q4_k_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q4_K weights (scalar reference)

References gemv_q4_k_ref().

Referenced by attention_mlp_fused_q4k(), dot_q4_k(), gemm_q4_k_ref(), layer_fused_attn_mlp_qkv_q4k(), and rmsnorm_qkv_q4k_fused().

◆ gemv_q4_k_q8_k()

/**
 * Auto-dispatch GEMV for Q4_K weights against Q8_K-quantized activations.
 * Preference order (widest compiled ISA first):
 *   VNNI  - INT8 dot-product acceleration, best for single-token decode
 *   AVX2
 *   AVX   - uses maddubs_epi16 (more efficient than the SSE path)
 *   SSE4.1
 *   scalar reference
 */
void gemv_q4_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)
{
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
    gemv_q4_k_q8_k_vnni(y, W, x_q8, M, K);
#elif defined(__AVX2__)
    gemv_q4_k_q8_k_avx2(y, W, x_q8, M, K);
#elif defined(__AVX__)
    gemv_q4_k_q8_k_avx(y, W, x_q8, M, K);
#elif defined(__SSE4_1__)
    gemv_q4_k_q8_k_sse(y, W, x_q8, M, K);
#else
    gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
#endif
}
void gemv_q4_k_q8_k_avx2(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_vnni(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_avx(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_sse(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q4_k_q8_k_avx(), gemv_q4_k_q8_k_avx2(), gemv_q4_k_q8_k_ref(), gemv_q4_k_q8_k_sse(), and gemv_q4_k_q8_k_vnni().

Referenced by model_decode_token(), model_layer_0_decode(), model_layer_10_decode(), model_layer_11_decode(), model_layer_12_decode(), model_layer_13_decode(), model_layer_14_decode(), model_layer_15_decode(), model_layer_16_decode(), model_layer_17_decode(), model_layer_18_decode(), model_layer_19_decode(), model_layer_1_decode(), model_layer_20_decode(), model_layer_21_decode(), model_layer_22_decode(), model_layer_23_decode(), model_layer_2_decode(), model_layer_3_decode(), model_layer_4_decode(), model_layer_5_decode(), model_layer_6_decode(), model_layer_7_decode(), model_layer_8_decode(), model_layer_9_decode(), qwen2_0_5b_decode_decode_token(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ gemv_q4_k_q8_k_parallel()

void gemv_q4_k_q8_k_parallel ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K,
int  ith,
int  nth 
)

Definition at line 206 of file gemm_kernels_q4k_q8k.c.

211 {
212  if (!y || !W || !x_q8 || M <= 0 || K <= 0) {
213  return;
214  }
215  if (ith < 0 || nth <= 0 || ith >= nth) {
216  return;
217  }
218 
219  /* Compute row range for this thread */
220  const int dr = (M + nth - 1) / nth;
221  const int r0 = dr * ith;
222  const int r1 = (r0 + dr < M) ? (r0 + dr) : M;
223 
224  if (r0 >= M) {
225  return; /* This thread has no work */
226  }
227 
228  const block_q4_K *blocks = (const block_q4_K *)W;
229  const block_q8_K *x = (const block_q8_K *)x_q8;
230  const int blocks_per_row = K / QK_K;
231 
232  /* Only process rows [r0, r1) */
233  for (int row = r0; row < r1; ++row) {
234  const block_q4_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
235  y[row] = dot_q4_k_q8_k_ref(w_row, x, K);
236  }
237 }
static float dot_q4_k_q8_k_ref(const block_q4_K *w, const block_q8_K *x, int k)

References dot_q4_k_q8_k_ref(), and QK_K.

◆ gemv_q4_k_q8_k_parallel_simd()

void gemv_q4_k_q8_k_parallel_simd ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K,
int  ith,
int  nth 
)

Definition at line 263 of file gemm_kernels_q4k_avx.c.

268 {
269  /* Fall back to reference parallel version */
270  gemv_q4_k_q8_k_parallel(y, W, x_q8, M, K, ith, nth);
271 }
void gemv_q4_k_q8_k_parallel(float *y, const void *W, const void *x_q8, int M, int K, int ith, int nth)

References gemv_q4_k_q8_k_parallel().

Referenced by decode_layer_parallel(), mlp_parallel(), and qkv_projection_parallel().

◆ gemv_q4_k_q8_k_ref()

void gemv_q4_k_q8_k_ref ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Definition at line 177 of file gemm_kernels_q4k_q8k.c.

181 {
182  if (!y || !W || !x_q8 || M <= 0 || K <= 0) {
183  return;
184  }
185 
186  const block_q4_K *blocks = (const block_q4_K *)W;
187  const block_q8_K *x = (const block_q8_K *)x_q8;
188  const int blocks_per_row = K / QK_K;
189 
190  for (int row = 0; row < M; ++row) {
191  const block_q4_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
192  y[row] = dot_q4_k_q8_k_ref(w_row, x, K);
193  }
194 }

References dot_q4_k_q8_k_ref(), and QK_K.

◆ gemv_q5_0()

void gemv_q5_0 ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Auto-dispatch GEMV for Q5_0 weights based on CPU features.

Dispatch priority (best available):

  1. AVX-512 (512-bit vectors) - Intel Skylake-X+
  2. AVX2+FMA (256-bit vectors) - Intel Haswell+
  3. AVX (256-bit vectors) - Intel Sandy Bridge+
  4. SSE4.1 (128-bit vectors) - Intel Nehalem+
  5. Reference (scalar) - Fallback

Uses ck_features.h for standardized feature detection.

Parameters
yOutput vector [M]
WWeight matrix in Q5_0 format [M x K]
xInput vector [K]
MNumber of output rows
KNumber of input columns (hidden dimension)

Definition at line 547 of file gemm_kernels_q5_0.c.

551 {
552 // Dispatch order: AVX512 > AVX2 > AVX > SSE > ref
553 #if defined(__AVX512F__)
554  gemv_q5_0_avx512(y, W, x, M, K);
555 #elif defined(__AVX2__)
556  gemv_q5_0_avx2(y, W, x, M, K);
557 #elif defined(__AVX__)
558  gemv_q5_0_avx(y, W, x, M, K);
559 #elif defined(__SSE4_1__)
560  gemv_q5_0_sse_v2(y, W, x, M, K);
561 #else
562  gemv_q5_0_ref(y, W, x, M, K);
563 #endif
564 }
void gemv_q5_0_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q5_0 weights (scalar reference)

References gemv_q5_0_ref().

◆ gemv_q5_0_parallel()

void gemv_q5_0_parallel ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K,
int  ith,
int  nth 
)

Parallel reference GEMV for Q5_0 × FP32.

Definition at line 576 of file gemm_kernels_q5_0.c.

581 {
582  if (!y || !W || !x || M <= 0 || K <= 0) return;
583  if (ith < 0 || nth <= 0 || ith >= nth) return;
584 
585  const int dr = (M + nth - 1) / nth;
586  const int r0 = dr * ith;
587  const int r1 = (r0 + dr < M) ? (r0 + dr) : M;
588 
589  if (r0 >= M) return;
590 
591  const block_q5_0 *blocks = (const block_q5_0 *)W;
592  const int blocks_per_row = K / QK5_0;
593 
594  for (int row = r0; row < r1; row++) {
595  float sum = 0.0f;
596  for (int b = 0; b < blocks_per_row; b++) {
597  const block_q5_0 *block = &blocks[row * blocks_per_row + b];
598  const float d = CK_FP16_TO_FP32(block->d);
599  const float *xp = &x[b * QK5_0];
600 
601  uint32_t qh;
602  memcpy(&qh, block->qh, sizeof(qh));
603 
604  for (int j = 0; j < QK5_0 / 2; j++) {
605  const uint8_t packed = block->qs[j];
606  const int lo = (packed & 0x0F);
607  const int hi = (packed >> 4);
608  const int xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
609  const int xh_1 = ((qh >> (j + 12))) & 0x10;
610  const int w0 = (lo | xh_0) - 16;
611  const int w1 = (hi | xh_1) - 16;
612  sum += d * (w0 * xp[j] + w1 * xp[j + QK5_0/2]);
613  }
614  }
615  y[row] = sum;
616  }
617 }
ck_half d
Definition: ckernel_quant.h:70
uint8_t qh[4]
Definition: ckernel_quant.h:71
uint8_t qs[32/2]
Definition: ckernel_quant.h:72

References CK_FP16_TO_FP32, block_q5_0::d, block_q5_0::qh, QK5_0, and block_q5_0::qs.

Referenced by gemv_q5_0_parallel_simd().

◆ gemv_q5_0_parallel_simd()

void gemv_q5_0_parallel_simd ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K,
int  ith,
int  nth 
)

Parallel SIMD GEMV for Q5_0 × FP32 with prefetching.

Definition at line 622 of file gemm_kernels_q5_0.c.

627 {
628  if (!y || !W || !x || M <= 0 || K <= 0) return;
629  if (ith < 0 || nth <= 0 || ith >= nth) return;
630 
631  const int dr = (M + nth - 1) / nth;
632  const int r0 = dr * ith;
633  const int r1 = (r0 + dr < M) ? (r0 + dr) : M;
634 
635  if (r0 >= M) return;
636 
637  const block_q5_0 *blocks = (const block_q5_0 *)W;
638  const int blocks_per_row = K / QK5_0;
639 
640 #if defined(__AVX__) || defined(__SSE4_1__)
641  /* Prefetch first few rows */
642  const int PREFETCH_ROWS = 4;
643  for (int p = 0; p < PREFETCH_ROWS && r0 + p < r1; ++p) {
644  const char *row_ptr = (const char *)(blocks + (r0 + p) * blocks_per_row);
645  _mm_prefetch(row_ptr, _MM_HINT_T0);
646  _mm_prefetch(row_ptr + 64, _MM_HINT_T0);
647  }
648 
649  for (int row = r0; row < r1; ++row) {
650  /* Prefetch rows ahead */
651  if (row + PREFETCH_ROWS < r1) {
652  const char *prefetch_ptr = (const char *)(blocks + (row + PREFETCH_ROWS) * blocks_per_row);
653  _mm_prefetch(prefetch_ptr, _MM_HINT_T0);
654  _mm_prefetch(prefetch_ptr + 64, _MM_HINT_T0);
655  }
656 
657  /* Use SIMD dot product for this row */
658 #if defined(__AVX512F__)
659  /* Call single-row AVX512 implementation */
660  gemv_q5_0_avx512(&y[row], (const char *)blocks + row * blocks_per_row * sizeof(block_q5_0), x, 1, K);
661 #elif defined(__AVX2__)
662  gemv_q5_0_avx2(&y[row], (const char *)blocks + row * blocks_per_row * sizeof(block_q5_0), x, 1, K);
663 #elif defined(__AVX__)
664  gemv_q5_0_avx(&y[row], (const char *)blocks + row * blocks_per_row * sizeof(block_q5_0), x, 1, K);
665 #else
666  gemv_q5_0_sse_v2(&y[row], (const char *)blocks + row * blocks_per_row * sizeof(block_q5_0), x, 1, K);
667 #endif
668  }
669 #else
670  /* Fallback to reference parallel */
671  gemv_q5_0_parallel(y, W, x, M, K, ith, nth);
672 #endif
673 }
void gemv_q5_0_parallel(float *y, const void *W, const float *x, int M, int K, int ith, int nth)
Parallel reference GEMV for Q5_0 × FP32.

References gemv_q5_0_parallel(), and QK5_0.

◆ gemv_q5_0_q8_0()

void gemv_q5_0_q8_0 ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Matrix-vector multiply with Q5_0 weights and Q8_0 input.

Parameters
yOutput vector [M]
WWeight matrix in Q5_0 format [M x K]
x_q8Input vector in Q8_0 format [K]
MNumber of output rows
KNumber of columns (must be multiple of 32)

Definition at line 1529 of file gemm_kernels_q5_0.c.

1533 {
1534  const block_q5_0 *w_blocks = (const block_q5_0 *)W;
1535  const block_q8_0 *x_blocks = (const block_q8_0 *)x_q8;
1536  const int blocks_per_row = K / QK5_0;
1537 
1538  for (int row = 0; row < M; row++) {
1539  vec_dot_q5_0_q8_0(K, &y[row],
1540  &w_blocks[row * blocks_per_row],
1541  x_blocks);
1542  }
1543 }
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q5_0 x Q8_0.

References QK5_0, and vec_dot_q5_0_q8_0().

◆ gemv_q5_1()

void gemv_q5_1 ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Auto-dispatch GEMV.

Definition at line 184 of file gemm_kernels_q5_1.c.

188 {
189 #ifdef __AVX512F__
190  gemv_q5_1_avx512(y, W, x, M, K);
191 #else
192  gemv_q5_1_ref(y, W, x, M, K);
193 #endif
194 }
void gemv_q5_1_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q5_1 weights (scalar reference)

References gemv_q5_1_ref().

◆ gemv_q5_k()

void gemv_q5_k ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Definition at line 199 of file gemm_kernels_q5_k.c.

200 {
201 #if defined(__AVX512F__)
202  /* TODO: AVX-512 implementation */
203  gemv_q5_k_ref(y, W, x, M, K);
204 #elif defined(__AVX2__)
205  /* TODO: AVX-2 implementation */
206  gemv_q5_k_ref(y, W, x, M, K);
207 #elif defined(__AVX__)
208  /* TODO: AVX implementation */
209  gemv_q5_k_ref(y, W, x, M, K);
210 #elif defined(__SSE4_1__)
211  /* TODO: SSE4.1 implementation */
212  gemv_q5_k_ref(y, W, x, M, K);
213 #else
214  gemv_q5_k_ref(y, W, x, M, K);
215 #endif
216 }
void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K)

References gemv_q5_k_ref().

◆ gemv_q6_k()

void gemv_q6_k ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Definition at line 169 of file gemm_kernels_q6k.c.

173 {
174  if (!y || !W || !x) {
175  return;
176  }
177  if (M <= 0 || K <= 0) {
178  return;
179  }
180  // TEMPORARILY DISABLE NEW AVX KERNELS - USE REFERENCE ONLY
181 
182  const block_q6_K *blocks = (const block_q6_K *)W;
183  const int blocks_per_row = K / QK_K;
184 
185  for (int row = 0; row < M; ++row) {
186  const block_q6_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
187 #if defined(__AVX__) && !defined(__AVX512F__)
188  y[row] = dot_q6_k_avx(w_row, x, K);
189 #else
190  y[row] = dot_q6_k_ref(w_row, x, K);
191 #endif
192  }
193 }
static float dot_q6_k_ref(const block_q6_K *w, const float *x, int K)

References dot_q6_k_ref(), and QK_K.

Referenced by gemm_q6_k().

◆ gemv_q6_k_q8_k()

void gemv_q6_k_q8_k ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

GEMV: y = W @ x where W is Q6_K and x is Q8_K.

Definition at line 980 of file gemm_kernels_q6k_q8k.c.

984 {
985  /* AVX-512 uses same algorithm as AVX2 (matches llama.cpp) */
986 #if defined(__AVX512F__) && defined(__AVX512BW__)
987  gemv_q6_k_q8_k_avx512(y, W, x_q8, M, K);
988 #elif defined(__AVX2__)
989  gemv_q6_k_q8_k_avx2(y, W, x_q8, M, K);
990 #elif defined(__AVX__)
991  gemv_q6_k_q8_k_avx(y, W, x_q8, M, K);
992 #elif defined(__SSSE3__)
993  gemv_q6_k_q8_k_sse(y, W, x_q8, M, K);
994 #else
995  gemv_q6_k_q8_k_ref(y, W, x_q8, M, K);
996 #endif
997 }
void gemv_q6_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_sse(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_avx(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_avx2(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_avx512(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q6_k_q8_k_avx(), gemv_q6_k_q8_k_avx2(), gemv_q6_k_q8_k_avx512(), gemv_q6_k_q8_k_ref(), and gemv_q6_k_q8_k_sse().

◆ gemv_q6_k_q8_k_parallel()

void gemv_q6_k_q8_k_parallel ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K,
int  ith,
int  nth 
)

Parallel reference GEMV for Q6_K × Q8_K.

Caller provides ith (thread index) and nth (total threads). Each thread processes rows [r0, r1).

Definition at line 1014 of file gemm_kernels_q6k_q8k.c.

1019 {
1020  if (!y || !W || !x_q8 || M <= 0 || K <= 0) return;
1021  if (ith < 0 || nth <= 0 || ith >= nth) return;
1022 
1023  /* Compute row range for this thread */
1024  const int dr = (M + nth - 1) / nth;
1025  const int r0 = dr * ith;
1026  const int r1 = (r0 + dr < M) ? (r0 + dr) : M;
1027 
1028  if (r0 >= M) return;
1029 
1030  const block_q6_K *blocks = (const block_q6_K *)W;
1031  const block_q8_K *x = (const block_q8_K *)x_q8;
1032  const int blocks_per_row = K / QK_K;
1033 
1034  for (int row = r0; row < r1; ++row) {
1035  const block_q6_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
1036  y[row] = dot_q6_k_q8_k_ref(w_row, x, K);
1037  }
1038 }
static float dot_q6_k_q8_k_ref(const block_q6_K *w, const block_q8_K *x, int K)
Scalar dot product for Q6_K x Q8_K.

References dot_q6_k_q8_k_ref(), and QK_K.

◆ gemv_q6_k_q8_k_parallel_simd()

void gemv_q6_k_q8_k_parallel_simd ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K,
int  ith,
int  nth 
)

Parallel SIMD GEMV for Q6_K × Q8_K.

Uses best available SIMD (AVX/SSE) with row prefetching. Caller provides ith/nth from OpenMP region.

Definition at line 1046 of file gemm_kernels_q6k_q8k.c.

1051 {
1052  if (!y || !W || !x_q8 || M <= 0 || K <= 0) return;
1053  if (ith < 0 || nth <= 0 || ith >= nth) return;
1054 
1055  const int dr = (M + nth - 1) / nth;
1056  const int r0 = dr * ith;
1057  const int r1 = (r0 + dr < M) ? (r0 + dr) : M;
1058 
1059  if (r0 >= M) return;
1060 
1061  const block_q6_K *blocks = (const block_q6_K *)W;
1062  const block_q8_K *x = (const block_q8_K *)x_q8;
1063  const int blocks_per_row = K / QK_K;
1064 
1065 #if defined(__AVX__) || defined(__SSSE3__)
1066  /* Prefetch first few rows */
1067  const int PREFETCH_ROWS = 4;
1068  for (int p = 0; p < PREFETCH_ROWS && r0 + p < r1; ++p) {
1069  const char *row_ptr = (const char *)(blocks + (r0 + p) * blocks_per_row);
1070  _mm_prefetch(row_ptr, _MM_HINT_T0);
1071  _mm_prefetch(row_ptr + 64, _MM_HINT_T0);
1072  }
1073 
1074  for (int row = r0; row < r1; ++row) {
1075  /* Prefetch rows ahead */
1076  if (row + PREFETCH_ROWS < r1) {
1077  const char *prefetch_ptr = (const char *)(blocks + (row + PREFETCH_ROWS) * blocks_per_row);
1078  _mm_prefetch(prefetch_ptr, _MM_HINT_T0);
1079  _mm_prefetch(prefetch_ptr + 64, _MM_HINT_T0);
1080  }
1081 
1082  const block_q6_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
1083 #if defined(__AVX2__)
1084  y[row] = dot_q6_k_q8_k_avx2(w_row, x, K);
1085 #elif defined(__AVX__)
1086  y[row] = dot_q6_k_q8_k_avx(w_row, x, K);
1087 #else
1088  y[row] = dot_q6_k_q8_k_sse(w_row, x, K);
1089 #endif
1090  }
1091 #else
1092  /* Fallback to reference */
1093  for (int row = r0; row < r1; ++row) {
1094  const block_q6_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
1095  y[row] = dot_q6_k_q8_k_ref(w_row, x, K);
1096  }
1097 #endif
1098 }

References dot_q6_k_q8_k_ref(), and QK_K.

◆ gemv_q8_0()

void gemv_q8_0 ( float *  y,
const void *  W,
const float *  x,
int  M,
int  K 
)

Auto-dispatch GEMV for Q8_0 weights based on CPU features.

Dispatch priority (best available):

  1. AVX-512 (512-bit vectors) - Intel Skylake-X+
  2. AVX2+FMA (256-bit vectors) - Intel Haswell+
  3. AVX (256-bit vectors) - Intel Sandy Bridge+
  4. SSE4.1 (128-bit vectors) - Intel Nehalem+
  5. Reference (scalar) - Fallback

Uses ck_features.h for standardized feature detection.

Parameters
yOutput vector [M]
WWeight matrix in Q8_0 format [M x K]
xInput vector [K]
MNumber of output rows
KNumber of input columns (hidden dimension)

Definition at line 630 of file gemm_kernels_q8_0.c.

634 {
635 // Dispatch order: AVX512 > AVX2 > AVX > SSE > ref
636 #if defined(__AVX512F__)
637  gemv_q8_0_avx512(y, W, x, M, K);
638 #elif defined(__AVX2__)
639  gemv_q8_0_avx2(y, W, x, M, K);
640 #elif defined(__AVX__)
641  gemv_q8_0_avx(y, W, x, M, K);
642 #elif defined(__SSE4_1__)
643  gemv_q8_0_sse(y, W, x, M, K);
644 #else
645  gemv_q8_0_ref(y, W, x, M, K);
646 #endif
647 }
void gemv_q8_0_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q8_0 weights (scalar reference)

References gemv_q8_0_ref().

◆ gemv_q8_0_q8_0()

void gemv_q8_0_q8_0 ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Matrix-vector multiply with Q8_0 weights and Q8_0 input.

Parameters
yOutput vector [M]
WWeight matrix in Q8_0 format [M x K]
x_q8Input vector in Q8_0 format [K]
MNumber of output rows
KNumber of columns (must be multiple of 32)

Definition at line 1042 of file gemm_kernels_q8_0.c.

1046 {
1047  const block_q8_0 *w_blocks = (const block_q8_0 *)W;
1048  const block_q8_0 *x_blocks = (const block_q8_0 *)x_q8;
1049  const int blocks_per_row = K / QK8_0;
1050 
1051  for (int row = 0; row < M; row++) {
1052  vec_dot_q8_0_q8_0(K, &y[row],
1053  &w_blocks[row * blocks_per_row],
1054  x_blocks);
1055  }
1056 }
void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q8_0 x Q8_0.

References QK8_0, and vec_dot_q8_0_q8_0().

◆ im2patch()

void im2patch ( const float *  image,
float *  patches,
int  C,
int  H,
int  W,
int  P 
)

im2patch: Transforms an image into a sequence of flattened patches.

Image Layout: [C, H, W] (Row-major: W is fastest moving) Output Layout: [num_patches, C * P * P]

num_patches = (H/P) * (W/P) P = patch_size

Definition at line 28 of file vision_kernels.c.

31 {
32  int num_patches_h = H / P;
33  int num_patches_w = W / P;
34  int patch_dim = C * P * P;
35 
36  // ph, pw: patch grid coordinates
37  for (int ph = 0; ph < num_patches_h; ++ph) {
38  for (int pw = 0; pw < num_patches_w; ++pw) {
39 
40  int patch_idx = ph * num_patches_w + pw;
41  float *dst_patch = patches + (size_t)patch_idx * patch_dim;
42 
43  // For each patch, grab pixels from all channels
44  for (int c = 0; c < C; ++c) {
45  for (int py = 0; py < P; ++py) {
46  int y = ph * P + py;
47  int x = pw * P;
48 
49  // Input row start in the image
50  const float *src_row = image + (size_t)c * H * W + (size_t)y * W + x;
51 
52  // Destination row in the flattened patch sequence
53  float *dst_row = dst_patch + (size_t)c * P * P + (size_t)py * P;
54 
55  // Copy P pixels (one row of the patch)
56  memcpy(dst_row, src_row, P * sizeof(float));
57  }
58  }
59  }
60  }
61 }

References C.

◆ im2patch_bf16()

void im2patch_bf16 ( const uint16_t *  image,
uint16_t *  patches,
int  C,
int  H,
int  W,
int  P 
)

Definition at line 22 of file vision_kernels_bf16.c.

28 {
29  if (!image || !patches || C <= 0 || H <= 0 || W <= 0 || P <= 0) {
30  return;
31  }
32 
33  int num_patches_h = H / P;
34  int num_patches_w = W / P;
35  int patch_dim = C * P * P;
36 
37  for (int ph = 0; ph < num_patches_h; ++ph) {
38  for (int pw = 0; pw < num_patches_w; ++pw) {
39  int patch_idx = ph * num_patches_w + pw;
40  uint16_t *dst_patch = patches + (size_t)patch_idx * (size_t)patch_dim;
41 
42  for (int c = 0; c < C; ++c) {
43  for (int py = 0; py < P; ++py) {
44  int y = ph * P + py;
45  int x = pw * P;
46 
47  const uint16_t *src_row = image + (size_t)c * (size_t)H * (size_t)W + (size_t)y * (size_t)W + (size_t)x;
48  uint16_t *dst_row = dst_patch + (size_t)c * (size_t)P * (size_t)P + (size_t)py * (size_t)P;
49 
50  memcpy(dst_row, src_row, (size_t)P * sizeof(uint16_t));
51  }
52  }
53  }
54  }
55 }

References C.

◆ kv_cache_repack_head_major_inplace()

void kv_cache_repack_head_major_inplace ( float *  buf,
int  num_heads,
int  tokens,
int  cache_capacity,
int  aligned_head_dim 
)

Definition at line 28 of file kv_cache_kernels.c.

33 {
34  if (!buf) {
35  return;
36  }
37  if (num_heads <= 0 || tokens <= 0 || cache_capacity <= 0 || aligned_head_dim <= 0) {
38  return;
39  }
40  if (tokens > cache_capacity) {
41  tokens = cache_capacity;
42  }
43  if (tokens == cache_capacity) {
44  return;
45  }
46 
47  const size_t old_head_stride = (size_t)tokens * (size_t)aligned_head_dim;
48  const size_t new_head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
49  const size_t bytes = (size_t)tokens * (size_t)aligned_head_dim * sizeof(float);
50 
51  // Move head blocks from high to low to avoid overwriting source data
52  // for heads that have not yet been moved.
53  for (int h = num_heads - 1; h >= 0; --h) {
54  float *src = buf + (size_t)h * old_head_stride;
55  float *dst = buf + (size_t)h * new_head_stride;
56  memmove(dst, src, bytes);
57  }
58 }

Referenced by qwen2_0_5b_decode_forward_prefill_impl().

◆ kv_cache_store()

void kv_cache_store ( float *__restrict  kv_cache_k,
float *__restrict  kv_cache_v,
const float *__restrict  k,
const float *__restrict  v,
int  layer,
int  pos,
int  num_kv_heads,
int  head_dim,
int  max_seq_len 
)

Definition at line 101 of file kv_cache_kernels.c.

110 {
111  (void)layer;
112  kv_cache_write_head_major(
113  kv_cache_k, kv_cache_v,
114  num_kv_heads,
115  pos,
116  max_seq_len,
117  head_dim,
118  head_dim);
119 }
void kv_cache_write_head_major(const float *__restrict k_token, const float *__restrict v_token, float *__restrict k_cache, float *__restrict v_cache, int num_kv_heads, int token_index, int cache_capacity, int head_dim, int aligned_head_dim)

References kv_cache_write_head_major().

◆ kv_cache_write_head_major()

void kv_cache_write_head_major ( const float *__restrict  k_token,
const float *__restrict  v_token,
float *__restrict  k_cache,
float *__restrict  v_cache,
int  num_kv_heads,
int  token_index,
int  cache_capacity,
int  head_dim,
int  aligned_head_dim 
)

Definition at line 60 of file kv_cache_kernels.c.

69 {
70  if (!k_token || !v_token || !k_cache || !v_cache) {
71  return;
72  }
73  if (num_kv_heads <= 0 || token_index < 0 || cache_capacity <= 0) {
74  return;
75  }
76  if (token_index >= cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) {
77  return;
78  }
79 
80  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
81  const size_t token_stride = (size_t)aligned_head_dim;
82 
83  for (int h = 0; h < num_kv_heads; ++h) {
84  const float *k_src = k_token + (size_t)h * token_stride;
85  const float *v_src = v_token + (size_t)h * token_stride;
86 
87  float *k_dst = k_cache + (size_t)h * head_stride + (size_t)token_index * token_stride;
88  float *v_dst = v_cache + (size_t)h * head_stride + (size_t)token_index * token_stride;
89 
90  for (int d = 0; d < head_dim; ++d) {
91  k_dst[d] = k_src[d];
92  v_dst[d] = v_src[d];
93  }
94  for (int d = head_dim; d < aligned_head_dim; ++d) {
95  k_dst[d] = 0.0f;
96  v_dst[d] = 0.0f;
97  }
98  }
99 }

Referenced by ck_layer_forward_rmsnorm_swiglu_decode(), ck_layer_forward_rmsnorm_swiglu_decode_fused(), ck_layer_forward_rmsnorm_swiglu_decode_fused_attn_impl(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), ck_layer_forward_rmsnorm_swiglu_decode_quant(), kv_cache_store(), mega_fused_attention_decode(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().

◆ layernorm_backward_kernel()

void layernorm_backward_kernel ( const float *  d_output,
const float *  input,
const float *  gamma,
const float *  mean,
const float *  rstd,
float *  d_input,
float *  d_gamma,
float *  d_beta,
int  tokens,
int  d_model,
int  aligned_embed_dim 
)

Definition at line 668 of file layernorm_kernels.c.

677 {
678  int T = tokens;
679  int D = d_model;
680  int aligned_D = aligned_embed_dim;
681 
682  // Per-token input gradients
683  for (int t = 0; t < T; ++t) {
684  float mean_t = mean[t];
685  float rstd_t = rstd[t];
686 
687  float d_y_gamma_sum = 0.0f;
688  float d_y_gamma_xhat_sum = 0.0f;
689 
690  // First pass: compute sums
691  for (int d = 0; d < D; ++d) {
692  float x = input[t * aligned_D + d];
693  float x_hat = (x - mean_t) * rstd_t;
694  float d_y = d_output[t * aligned_D + d];
695  float d_y_gamma = d_y * gamma[d];
696 
697  d_y_gamma_sum += d_y_gamma;
698  d_y_gamma_xhat_sum += d_y_gamma * x_hat;
699  }
700 
701  // Second pass: compute input gradients
702  float scale = rstd_t / (float)D;
703  for (int d = 0; d < D; ++d) {
704  float x = input[t * aligned_D + d];
705  float x_hat = (x - mean_t) * rstd_t;
706  float d_y = d_output[t * aligned_D + d];
707 
708  d_input[t * aligned_D + d] =
709  scale * ((float)D * d_y * gamma[d] - d_y_gamma_sum - x_hat * d_y_gamma_xhat_sum);
710  }
711 
712  // Zero padding for aligned dimension beyond D
713  for (int d = D; d < aligned_D; ++d) {
714  d_input[t * aligned_D + d] = 0.0f;
715  }
716  }
717 
718  // Parameter gradients (gamma, beta)
719  for (int d = 0; d < D; ++d) {
720  float gamma_grad = 0.0f;
721  float beta_grad = 0.0f;
722 
723  for (int t = 0; t < T; ++t) {
724  float x = input[t * aligned_D + d];
725  float x_hat = (x - mean[t]) * rstd[t];
726  float d_y = d_output[t * aligned_D + d];
727 
728  gamma_grad += d_y * x_hat;
729  beta_grad += d_y;
730  }
731 
732  d_gamma[d] += gamma_grad;
733  d_beta[d] += beta_grad;
734  }
735 }

Referenced by layernorm_backward_kernel_bf16().

◆ layernorm_backward_kernel_bf16()

void layernorm_backward_kernel_bf16 ( const uint16_t *  d_output,
const uint16_t *  input,
const float *  gamma,
const float *  mean,
const float *  rstd,
uint16_t *  d_input,
float *  d_gamma,
float *  d_beta,
int  tokens,
int  d_model,
int  aligned_embed_dim,
float *  scratch_d_output,
float *  scratch_input,
float *  scratch_d_input 
)

Definition at line 84 of file layernorm_kernels_bf16.c.

96 {
97  if (!scratch_d_output || !scratch_input || !scratch_d_input) return;
98 
99  size_t total = (size_t)tokens * (size_t)aligned_embed_dim;
100 
101  bf16_tensor_to_float(d_output, scratch_d_output, total);
102  bf16_tensor_to_float(input, scratch_input, total);
103 
104  layernorm_backward_kernel(scratch_d_output, scratch_input, gamma, mean, rstd,
105  scratch_d_input, d_gamma, d_beta,
106  tokens, d_model, aligned_embed_dim);
107 
108  float_tensor_to_bf16(scratch_d_input, d_input, total);
109 }
void layernorm_backward_kernel(const float *d_output, const float *input, const float *gamma, const float *mean, const float *rstd, float *d_input, float *d_gamma, float *d_beta, int tokens, int d_model, int aligned_embed_dim)

References bf16_tensor_to_float(), float_tensor_to_bf16(), and layernorm_backward_kernel().

◆ layernorm_forward_rolled_slice()

void layernorm_forward_rolled_slice ( const float *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
float *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

Definition at line 274 of file layernorm_kernels.c.

284 {
285 #if defined(__AVX512F__)
286  layernorm_forward_rolled_slice_avx512(input_slice_base, gamma, beta,
287  output_slice_base, mean_cache_slice, rstd_cache_slice,
288  num_tokens_in_slice, d_model, aligned_embed_dim, eps);
289 #elif defined(__AVX2__) || defined(__AVX__)
290  layernorm_forward_rolled_slice_avx256(input_slice_base, gamma, beta,
291  output_slice_base, mean_cache_slice, rstd_cache_slice,
292  num_tokens_in_slice, d_model, aligned_embed_dim, eps);
293 #else
294  layernorm_naive_serial(input_slice_base, gamma, beta,
295  output_slice_base, mean_cache_slice, rstd_cache_slice,
296  num_tokens_in_slice, d_model, aligned_embed_dim, eps);
297 #endif
298 }
void layernorm_naive_serial(const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)

References layernorm_naive_serial().

Referenced by layernorm_forward_rolled_slice_bf16().

◆ layernorm_forward_rolled_slice_bf16()

void layernorm_forward_rolled_slice_bf16 ( const uint16_t *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
uint16_t *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
int  aligned_embed_dim,
float  eps,
float *  scratch_input,
float *  scratch_output 
)

Definition at line 30 of file layernorm_kernels_bf16.c.

42 {
43  if (!scratch_input || !scratch_output) return;
44 
45  size_t total = (size_t)num_tokens_in_slice * (size_t)aligned_embed_dim;
46 
47  bf16_tensor_to_float(input_slice_base, scratch_input, total);
48  layernorm_forward_rolled_slice(scratch_input, gamma, beta,
49  scratch_output, mean_cache_slice, rstd_cache_slice,
50  num_tokens_in_slice, d_model, aligned_embed_dim, eps);
51  float_tensor_to_bf16(scratch_output, output_slice_base, total);
52 }
void layernorm_forward_rolled_slice(const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps)

References bf16_tensor_to_float(), float_tensor_to_bf16(), and layernorm_forward_rolled_slice().

◆ layernorm_forward_unrolled_slice()

void layernorm_forward_unrolled_slice ( const float *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
float *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
float  eps 
)

Definition at line 598 of file layernorm_kernels.c.

607 {
608 #if defined(__AVX512F__)
609  layernorm_forward_unrolled_slice_avx512(input_slice_base, gamma, beta,
610  output_slice_base, mean_cache_slice, rstd_cache_slice,
611  num_tokens_in_slice, d_model, eps);
612 #elif defined(__AVX2__) || defined(__AVX__)
613  layernorm_forward_unrolled_slice_avx256(input_slice_base, gamma, beta,
614  output_slice_base, mean_cache_slice, rstd_cache_slice,
615  num_tokens_in_slice, d_model, eps);
616 #else
617  layernorm_forward_unrolled_slice_scalar(input_slice_base, gamma, beta,
618  output_slice_base, mean_cache_slice, rstd_cache_slice,
619  num_tokens_in_slice, d_model, eps);
620 #endif
621 }
static void layernorm_forward_unrolled_slice_scalar(const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)

References layernorm_forward_unrolled_slice_scalar().

Referenced by layernorm_forward_unrolled_slice_bf16().

◆ layernorm_forward_unrolled_slice_bf16()

void layernorm_forward_unrolled_slice_bf16 ( const uint16_t *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
uint16_t *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
float  eps,
float *  scratch_input,
float *  scratch_output 
)

Definition at line 57 of file layernorm_kernels_bf16.c.

68 {
69  if (!scratch_input || !scratch_output) return;
70 
71  size_t total = (size_t)num_tokens_in_slice * (size_t)d_model;
72 
73  bf16_tensor_to_float(input_slice_base, scratch_input, total);
74  layernorm_forward_unrolled_slice(scratch_input, gamma, beta,
75  scratch_output, mean_cache_slice, rstd_cache_slice,
76  num_tokens_in_slice, d_model, eps);
77  float_tensor_to_bf16(scratch_output, output_slice_base, total);
78 }
void layernorm_forward_unrolled_slice(const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)

References bf16_tensor_to_float(), float_tensor_to_bf16(), and layernorm_forward_unrolled_slice().

◆ layernorm_naive_serial()

void layernorm_naive_serial ( const float *  input,
const float *  gamma,
const float *  beta,
float *  output,
float *  mean_cache,
float *  rstd_cache,
int  tokens,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

Definition at line 51 of file layernorm_kernels.c.

59 {
60  for (int t = 0; t < tokens; ++t) {
61  const float *in_ptr = input + t * aligned_embed_dim;
62  float *out_ptr = output + t * aligned_embed_dim;
63 
64  float sum_val = 0.0f;
65  for (int i = 0; i < d_model; ++i) {
66  sum_val += in_ptr[i];
67  }
68  float mean = sum_val / (float)d_model;
69 
70  float sum_sq_diff = 0.0f;
71  for (int i = 0; i < d_model; ++i) {
72  float diff = in_ptr[i] - mean;
73  sum_sq_diff += diff * diff;
74  }
75  float variance = sum_sq_diff / (float)d_model + eps;
76 
77  double var_double = (double)variance;
78  float inv_std = (float)(1.0 / sqrt(var_double));
79 
80  for (int i = 0; i < d_model; ++i) {
81  float normalized_val = (in_ptr[i] - mean) * inv_std;
82  out_ptr[i] = normalized_val * gamma[i] + beta[i];
83  }
84 
85  if (mean_cache) {
86  mean_cache[t] = mean;
87  }
88  if (rstd_cache) {
89  rstd_cache[t] = inv_std;
90  }
91  /* Keep aligned padding quiet so future GEMMs see deterministic memory. */
92  if (aligned_embed_dim > d_model) {
93  /* Keep padded lanes zeroed so subsequent GEMMs never read stale data. */
94  for (int i = d_model; i < aligned_embed_dim; ++i) {
95  out_ptr[i] = 0.0f;
96  }
97  }
98  }
99 }

Referenced by layernorm_forward_rolled_slice().

◆ layernorm_naive_serial_matched_precision()

void layernorm_naive_serial_matched_precision ( const float *  input,
const float *  gamma,
const float *  beta,
float *  output,
float *  mean_cache,
float *  rstd_cache,
int  tokens,
int  d_model,
float  eps 
)

Definition at line 624 of file layernorm_kernels.c.

631 {
632  for (int t = 0; t < tokens; ++t) {
633  const float *in_ptr = input + t * d_model;
634  float *out_ptr = output + t * d_model;
635 
636  float sum_val = 0.0f;
637  for (int i = 0; i < d_model; ++i) {
638  sum_val += in_ptr[i];
639  }
640  float mean = sum_val / (float)d_model;
641 
642  float sum_sq_diff = 0.0f;
643  for (int i = 0; i < d_model; ++i) {
644  float diff = in_ptr[i] - mean;
645  sum_sq_diff += diff * diff;
646  }
647  float variance = sum_sq_diff / (float)d_model + eps;
648 
649  double var_double = (double)variance;
650  float inv_std = (float)(1.0 / sqrt(var_double));
651 
652  for (int i = 0; i < d_model; ++i) {
653  float normalized_val = (in_ptr[i] - mean) * inv_std;
654  out_ptr[i] = normalized_val * gamma[i] + beta[i];
655  }
656 
657  if (mean_cache) {
658  mean_cache[t] = mean;
659  }
660  if (rstd_cache) {
661  rstd_cache[t] = inv_std;
662  }
663  }
664 }

Referenced by layernorm_forward_unrolled_slice_scalar().

◆ mlp_token_parallel()

void mlp_token_parallel ( const float *  input,
const float *  W_fc1,
const float *  b_fc1,
const float *  W_fc2,
const float *  b_fc2,
float *  fc1_output,
float *  output,
int  T,
int  aligned_dim,
int  num_threads 
)

Definition at line 41 of file mlp_kernels.c.

51 {
52  int D = aligned_dim;
53  int fourD = 4 * D;
54 
55  // FC1: [T × D] · [D × 4D] -> [T × 4D]
56  // Our GEMM layout: A[M×K], B[N×K], so B is [4D × D].
57  gemm_blocked_serial(input, W_fc1, b_fc1,
58  fc1_output,
59  T, // M
60  fourD, // N
61  D); // K
62 
63  // GELU in-place on FC1 output
64  gelu_fast_inplace(fc1_output, (size_t)T * (size_t)fourD);
65 
66  // FC2: [T × 4D] · [4D × D] -> [T × D]
67  gemm_blocked_serial(fc1_output, W_fc2, b_fc2,
68  output,
69  T, // M
70  D, // N
71  fourD); // K
72 }
void gelu_fast_inplace(float *data, size_t n)
Definition: gelu_kernels.c:132
void gemm_blocked_serial(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:661

References gelu_fast_inplace(), and gemm_blocked_serial().

◆ mlp_token_parallel_bf16()

void mlp_token_parallel_bf16 ( const uint16_t *  input,
const uint16_t *  W_fc1,
const uint16_t *  b_fc1,
const uint16_t *  W_fc2,
const uint16_t *  b_fc2,
float *  fc1_output,
float *  output,
int  T,
int  aligned_dim,
int  num_threads,
float *  scratch_bias1_f,
float *  scratch_bias2_f,
uint16_t *  scratch_fc1_bf16 
)

Optimized MLP Forward (BF16 weights, FP32 activations)

Caller-provided scratch buffers: scratch_bias1_f: [4*D] floats scratch_bias2_f: [D] floats scratch_fc1_bf16: [T * 4*D] uint16_t (BF16)

Definition at line 91 of file mlp_kernels_bf16.c.

104 {
105  if (!input || !W_fc1 || !b_fc1 || !W_fc2 || !b_fc2 || !fc1_output || !output) return;
106  if (!scratch_bias1_f || !scratch_bias2_f || !scratch_fc1_bf16) return;
107 
108  (void)num_threads;
109  const int D = aligned_dim;
110  const int fourD = 4 * D;
111 
112  /* Convert biases to FP32 */
113  for (int i = 0; i < fourD; ++i) {
114  scratch_bias1_f[i] = bf16_to_float(b_fc1[i]);
115  }
116  for (int i = 0; i < D; ++i) {
117  scratch_bias2_f[i] = bf16_to_float(b_fc2[i]);
118  }
119 
120  /* FC1: [T, D] x [4D, D].T -> [T, 4D] */
121  gemm_bf16_fp32out(input, W_fc1, scratch_bias1_f, fc1_output, T, fourD, D);
122 
123  /* GELU activation */
124 #if defined(__AVX512F__)
125  #pragma omp parallel for
126  for (int t = 0; t < T; ++t) {
127  float *row = fc1_output + (size_t)t * fourD;
128  int j = 0;
129  for (; j <= fourD - 16; j += 16) {
130  __m512 x = _mm512_loadu_ps(row + j);
131  _mm512_storeu_ps(row + j, gelu_avx512(x));
132  }
133  for (; j < fourD; ++j) {
134  row[j] = gelu_scalar(row[j]);
135  }
136  }
137 #else
138  for (int t = 0; t < T; ++t) {
139  for (int j = 0; j < fourD; ++j) {
140  fc1_output[t * fourD + j] = gelu_scalar(fc1_output[t * fourD + j]);
141  }
142  }
143 #endif
144 
145  /* Convert FP32 activations to BF16 */
146 #if defined(__AVX512F__)
147  #pragma omp parallel for
148  for (int t = 0; t < T; ++t) {
149  float *src = fc1_output + (size_t)t * fourD;
150  uint16_t *dst = scratch_fc1_bf16 + (size_t)t * fourD;
151  int j = 0;
152  for (; j <= fourD - 16; j += 16) {
153  __m512 fp32 = _mm512_loadu_ps(src + j);
154  __m512i as_int = _mm512_castps_si512(fp32);
155  __m512i lsb = _mm512_srli_epi32(as_int, 16);
156  lsb = _mm512_and_si512(lsb, _mm512_set1_epi32(1));
157  __m512i rounding = _mm512_add_epi32(_mm512_set1_epi32(0x7FFF), lsb);
158  __m512i rounded = _mm512_add_epi32(as_int, rounding);
159  __m512i shifted = _mm512_srli_epi32(rounded, 16);
160  __m256i bf16 = _mm512_cvtepi32_epi16(shifted);
161  _mm256_storeu_si256((__m256i *)(dst + j), bf16);
162  }
163  for (; j < fourD; ++j) {
164  dst[j] = float_to_bf16(src[j]);
165  }
166  }
167 #else
168  for (size_t i = 0; i < (size_t)T * fourD; ++i) {
169  scratch_fc1_bf16[i] = float_to_bf16(fc1_output[i]);
170  }
171 #endif
172 
173  /* FC2: BF16 GEMM with FP32 output */
174  gemm_bf16_fp32out(scratch_fc1_bf16, W_fc2, scratch_bias2_f, output, T, D, fourD);
175 }
static float gelu_scalar(float x)
void gemm_bf16_fp32out(const uint16_t *A, const uint16_t *B, const float *bias, float *C, int M, int N, int K)

References bf16_to_float(), float_to_bf16(), gelu_scalar(), and gemm_bf16_fp32out().

◆ mlp_token_parallel_bf16_fp32act()

void mlp_token_parallel_bf16_fp32act ( const uint16_t *  input,
const uint16_t *  W_fc1,
const uint16_t *  b_fc1,
const uint16_t *  W_fc2,
const uint16_t *  b_fc2,
float *  fc1_output,
float *  output,
int  T,
int  aligned_dim,
int  num_threads,
float *  scratch_input_f,
float *  scratch_bias1_f,
float *  scratch_bias2_f,
uint16_t *  scratch_fc1_bf16 
)

Alternative: Fully FP32 activations throughout

Caller-provided scratch buffers: scratch_input_f: [T * D] floats scratch_bias1_f: [4*D] floats scratch_bias2_f: [D] floats scratch_fc1_bf16: [T * 4*D] uint16_t (BF16)

Definition at line 186 of file mlp_kernels_bf16.c.

200 {
201  if (!input || !W_fc1 || !b_fc1 || !W_fc2 || !b_fc2 || !fc1_output || !output) return;
202  if (!scratch_input_f || !scratch_bias1_f || !scratch_bias2_f || !scratch_fc1_bf16) return;
203 
204  (void)num_threads;
205  const int D = aligned_dim;
206  const int fourD = 4 * D;
207 
208  /* Convert input and biases to FP32 */
209  bf16_tensor_to_float(input, scratch_input_f, (size_t)T * D);
210  bf16_tensor_to_float(b_fc1, scratch_bias1_f, fourD);
211  bf16_tensor_to_float(b_fc2, scratch_bias2_f, D);
212 
213  /* FC1 */
214  gemm_bf16_fp32out(input, W_fc1, scratch_bias1_f, fc1_output, T, fourD, D);
215 
216  /* GELU */
217 #if defined(__AVX512F__)
218  #pragma omp parallel for
219  for (int t = 0; t < T; ++t) {
220  float *row = fc1_output + (size_t)t * fourD;
221  int j = 0;
222  for (; j <= fourD - 16; j += 16) {
223  __m512 x = _mm512_loadu_ps(row + j);
224  _mm512_storeu_ps(row + j, gelu_avx512(x));
225  }
226  for (; j < fourD; ++j) {
227  row[j] = gelu_scalar(row[j]);
228  }
229  }
230 #else
231  for (size_t i = 0; i < (size_t)T * fourD; ++i) {
232  fc1_output[i] = gelu_scalar(fc1_output[i]);
233  }
234 #endif
235 
236  /* Convert fc1_output to BF16 for FC2 */
237  float_tensor_to_bf16(fc1_output, scratch_fc1_bf16, (size_t)T * fourD);
238  gemm_bf16_fp32out(scratch_fc1_bf16, W_fc2, scratch_bias2_f, output, T, D, fourD);
239 }

References bf16_tensor_to_float(), float_tensor_to_bf16(), gelu_scalar(), and gemm_bf16_fp32out().

◆ mlp_token_parallel_exact()

void mlp_token_parallel_exact ( const float *  input,
const float *  W_fc1,
const float *  b_fc1,
const float *  W_fc2,
const float *  b_fc2,
float *  fc1_output,
float *  output,
int  T,
int  aligned_dim,
int  num_threads 
)

Definition at line 76 of file mlp_kernels.c.

86 {
87  (void)num_threads;
88  int D = aligned_dim;
89  int fourD = 4 * D;
90 
91  // FC1: [T × D] · [D × 4D] -> [T × 4D]
92  gemm_blocked_serial(input, W_fc1, b_fc1,
93  fc1_output,
94  T, // M
95  fourD, // N
96  D); // K
97 
98  // Exact GELU using standard library tanhf
99  gelu_exact_inplace(fc1_output, (size_t)T * (size_t)fourD);
100 
101  // FC2: [T × 4D] · [4D × D] -> [T × D]
102  gemm_blocked_serial(fc1_output, W_fc2, b_fc2,
103  output,
104  T, // M
105  D, // N
106  fourD); // K
107 }

References gelu_exact_inplace(), and gemm_blocked_serial().

◆ moe_accumulate_expert_f32()

void moe_accumulate_expert_f32 ( float *  output,
const float *  expert_output,
float  routing_weight,
int  hidden_dim 
)

Accumulate expert output: output += routing_weight * expert_output.

Parameters
output — Token output buffer [hidden_dim], accumulated in place
expert_output — Expert's output for this token [hidden_dim]
routing_weight — Softmax routing weight for this expert
hidden_dim — Hidden dimension

Definition at line 256 of file axpy_kernels.c.

260 {
261  axpy_f32(output, expert_output, routing_weight, hidden_dim);
262 }

References axpy_f32().

◆ patch2im()

void patch2im ( const float *  d_patches,
float *  d_image,
int  C,
int  H,
int  W,
int  P 
)

patch2im: Accumulates gradients from patches back into the image. (Backward pass)

d_patches: [num_patches, C * P * P] d_image: [C, H, W] (Accumulated)

Definition at line 69 of file vision_kernels.c.

72 {
73  int num_patches_h = H / P;
74  int num_patches_w = W / P;
75  int patch_dim = C * P * P;
76 
77  // Zero out the image first as we are accumulating gradients
78  memset(d_image, 0, (size_t)C * H * W * sizeof(float));
79 
80  for (int ph = 0; ph < num_patches_h; ++ph) {
81  for (int pw = 0; pw < num_patches_w; ++pw) {
82 
83  int patch_idx = ph * num_patches_w + pw;
84  const float *src_patch = d_patches + (size_t)patch_idx * patch_dim;
85 
86  for (int c = 0; c < C; ++c) {
87  for (int py = 0; py < P; ++py) {
88  int y = ph * P + py;
89  int x = pw * P;
90 
91  float *dst_row = d_image + (size_t)c * H * W + (size_t)y * W + x;
92  const float *src_row = src_patch + (size_t)c * P * P + (size_t)py * P;
93 
94  // Add the patch gradient to the image gradient
95  for (int px = 0; px < P; ++px) {
96  dst_row[px] += src_row[px];
97  }
98  }
99  }
100  }
101  }
102 }

References C.

◆ patch2im_bf16()

void patch2im_bf16 ( const uint16_t *  d_patches,
uint16_t *  d_image,
int  C,
int  H,
int  W,
int  P 
)

Definition at line 57 of file vision_kernels_bf16.c.

63 {
64  if (!d_patches || !d_image || C <= 0 || H <= 0 || W <= 0 || P <= 0) {
65  return;
66  }
67 
68  int num_patches_h = H / P;
69  int num_patches_w = W / P;
70  int patch_dim = C * P * P;
71 
72  memset(d_image, 0, (size_t)C * (size_t)H * (size_t)W * sizeof(uint16_t));
73 
74  for (int ph = 0; ph < num_patches_h; ++ph) {
75  for (int pw = 0; pw < num_patches_w; ++pw) {
76  int patch_idx = ph * num_patches_w + pw;
77  const uint16_t *src_patch = d_patches + (size_t)patch_idx * (size_t)patch_dim;
78 
79  for (int c = 0; c < C; ++c) {
80  for (int py = 0; py < P; ++py) {
81  int y = ph * P + py;
82  int x = pw * P;
83 
84  uint16_t *dst_row = d_image + (size_t)c * (size_t)H * (size_t)W + (size_t)y * (size_t)W + (size_t)x;
85  const uint16_t *src_row = src_patch + (size_t)c * (size_t)P * (size_t)P + (size_t)py * (size_t)P;
86 
87  for (int px = 0; px < P; ++px) {
88  float acc = bf16_to_float(dst_row[px]) + bf16_to_float(src_row[px]);
89  dst_row[px] = float_to_bf16(acc);
90  }
91  }
92  }
93  }
94  }
95 }

References bf16_to_float(), C, and float_to_bf16().

◆ quantize_batch_q8_0()

void quantize_batch_q8_0 ( const float *  x,
void *  vy,
int  num_rows,
int  k 
)

Batch quantize FP32 to Q8_0 format (row-major output)

Quantizes multiple rows of FP32 data to Q8_0 format, placing each row's Q8_0 output at the correct byte offset for GEMM compatibility.

Memory layout: Input: [num_rows, k] FP32, row-major (stride = k * sizeof(float)) Output: [num_rows, q8_row_bytes] Q8_0, row-major (stride = q8_row_bytes)

where q8_row_bytes = (k / 32) * sizeof(block_q8_0) = (k / 32) * 34

Parameters
x — Input FP32 values [num_rows * k]
vy — Output Q8_0 blocks [num_rows * (k/32) blocks]
num_rows — Number of rows (batch size / tokens)
k — Elements per row (must be multiple of 32)

Definition at line 192 of file gemm_kernels_q8_0.c.

193 {
194  const size_t row_bytes_in = (size_t)k * sizeof(float);
195  const size_t row_bytes_out = (size_t)(k / QK8_0) * sizeof(block_q8_0);
196 
197  uint8_t *out = (uint8_t *)vy;
198  const uint8_t *in = (const uint8_t *)x;
199 
 200  for (int row = 0; row < num_rows; ++row) {
 201  quantize_row_q8_0(
 202  (const float *)(in + row * row_bytes_in),
203  (void *)(out + row * row_bytes_out),
204  k
205  );
206  }
207 }
void quantize_row_q8_0(const float *x, void *vy, int k)
Quantize FP32 to Q8_0 format (scalar reference)

References QK8_0, and quantize_row_q8_0().

◆ quantize_batch_q8_k()

void quantize_batch_q8_k ( const float *  x,
void *  vy,
int  num_rows,
int  k 
)

Batch quantize FP32 to Q8_K format (row-major output)

Same as quantize_batch_q8_0 but for Q8_K format (super-blocks).

Parameters
x — Input FP32 values [num_rows * k]
vy — Output Q8_K blocks
num_rows — Number of rows (batch size / tokens)
k — Elements per row (must be multiple of 256)

Definition at line 219 of file gemm_kernels_q8_0.c.

220 {
221  /* Q8_K: 256 elements per super-block, each block is larger */
222  const size_t row_bytes_in = (size_t)k * sizeof(float);
 223  /* Q8_K block size = 2 (d) + 256 (qs) + 32 (bsums) = ~290 bytes for 256 elements */
224  /* Actual: sizeof(block_q8_K) from ckernel_quant.h */
225  const size_t row_bytes_out = (size_t)(k / 256) * sizeof(block_q8_K);
226 
227  uint8_t *out = (uint8_t *)vy;
228  const uint8_t *in = (const uint8_t *)x;
229 
 230  for (int row = 0; row < num_rows; ++row) {
 231  quantize_row_q8_k(
 232  (const float *)(in + row * row_bytes_in),
233  (void *)(out + row * row_bytes_out),
234  k
235  );
236  }
237 }
void quantize_row_q8_k(const float *x, void *vy, int k)

References quantize_row_q8_k().

◆ quantize_row_q8_0()

void quantize_row_q8_0 ( const float *  x,
void *  vy,
int  k 
)

Quantize FP32 to Q8_0 format (scalar reference)

Parameters
x — Input FP32 values
vy — Output Q8_0 blocks
k — Number of elements (must be multiple of 32)

Definition at line 59 of file gemm_kernels_q8_0.c.

60 {
61  block_q8_0 *y = (block_q8_0 *)vy;
62  const int nb = k / QK8_0; /* QK8_0 = 32 */
63 
64 #if defined(__AVX__)
65  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
66  const __m256 v_half = _mm256_set1_ps(0.5f);
67  const __m256 v_min = _mm256_set1_ps(-127.0f);
68  const __m256 v_max = _mm256_set1_ps(127.0f);
69 
70  for (int i = 0; i < nb; i++) {
71  __m256 v0 = _mm256_loadu_ps(x + 0);
72  __m256 v1 = _mm256_loadu_ps(x + 8);
73  __m256 v2 = _mm256_loadu_ps(x + 16);
74  __m256 v3 = _mm256_loadu_ps(x + 24);
75  x += QK8_0;
76 
77  __m256 max_abs = _mm256_andnot_ps(sign_bit, v0);
78  max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v1));
79  max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v2));
80  max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v3));
81 
82  __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(max_abs, 1),
83  _mm256_castps256_ps128(max_abs));
84  max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
85  max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
86  const float max_scalar = _mm_cvtss_f32(max4);
87 
88  const float d = max_scalar / 127.0f;
89  const float id = max_scalar != 0.0f ? 127.0f / max_scalar : 0.0f;
90  y[i].d = CK_FP32_TO_FP16(d);
91 
92  const __m256 mul = _mm256_set1_ps(id);
93  v0 = _mm256_mul_ps(v0, mul);
94  v1 = _mm256_mul_ps(v1, mul);
95  v2 = _mm256_mul_ps(v2, mul);
96  v3 = _mm256_mul_ps(v3, mul);
97 
98  v0 = _mm256_min_ps(_mm256_max_ps(v0, v_min), v_max);
99  v1 = _mm256_min_ps(_mm256_max_ps(v1, v_min), v_max);
100  v2 = _mm256_min_ps(_mm256_max_ps(v2, v_min), v_max);
101  v3 = _mm256_min_ps(_mm256_max_ps(v3, v_min), v_max);
102 
103  /* Round half away from zero to match the scalar path */
104  v0 = _mm256_add_ps(v0, _mm256_or_ps(_mm256_and_ps(v0, sign_bit), v_half));
105  v1 = _mm256_add_ps(v1, _mm256_or_ps(_mm256_and_ps(v1, sign_bit), v_half));
106  v2 = _mm256_add_ps(v2, _mm256_or_ps(_mm256_and_ps(v2, sign_bit), v_half));
107  v3 = _mm256_add_ps(v3, _mm256_or_ps(_mm256_and_ps(v3, sign_bit), v_half));
108 
109  __m256i i0 = _mm256_cvttps_epi32(v0);
110  __m256i i1 = _mm256_cvttps_epi32(v1);
111  __m256i i2 = _mm256_cvttps_epi32(v2);
112  __m256i i3 = _mm256_cvttps_epi32(v3);
113 
114 #if defined(__AVX2__)
115  i0 = _mm256_packs_epi32(i0, i1);
116  i2 = _mm256_packs_epi32(i2, i3);
117  i0 = _mm256_packs_epi16(i0, i2);
118 
119  const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
120  i0 = _mm256_permutevar8x32_epi32(i0, perm);
121  _mm256_storeu_si256((__m256i *)y[i].qs, i0);
122 #else
123  __m128i ni0 = _mm256_castsi256_si128(i0);
124  __m128i ni1 = _mm256_extractf128_si256(i0, 1);
125  __m128i ni2 = _mm256_castsi256_si128(i1);
126  __m128i ni3 = _mm256_extractf128_si256(i1, 1);
127  __m128i ni4 = _mm256_castsi256_si128(i2);
128  __m128i ni5 = _mm256_extractf128_si256(i2, 1);
129  __m128i ni6 = _mm256_castsi256_si128(i3);
130  __m128i ni7 = _mm256_extractf128_si256(i3, 1);
131 
132  ni0 = _mm_packs_epi32(ni0, ni1);
133  ni2 = _mm_packs_epi32(ni2, ni3);
134  ni4 = _mm_packs_epi32(ni4, ni5);
135  ni6 = _mm_packs_epi32(ni6, ni7);
136 
137  ni0 = _mm_packs_epi16(ni0, ni2);
138  ni4 = _mm_packs_epi16(ni4, ni6);
139 
140  _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
141  _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
142 #endif
143  }
144 #else
145  for (int i = 0; i < nb; i++) {
146  const float *xb = x + i * QK8_0;
147 
148  /* Find max absolute value in block */
149  float amax = 0.0f;
150  for (int j = 0; j < QK8_0; j++) {
151  float av = xb[j] >= 0 ? xb[j] : -xb[j];
152  if (av > amax) amax = av;
153  }
154 
155  /* Compute scale: d = max / 127 */
156  float d = amax / 127.0f;
157  float id = d != 0.0f ? 127.0f / amax : 0.0f;
158 
159  /* Store scale as FP16 */
160  y[i].d = CK_FP32_TO_FP16(d);
161 
162  /* Quantize values */
163  for (int j = 0; j < QK8_0; j++) {
164  float v = xb[j] * id;
165  /* Round to nearest int and clamp to [-127, 127] */
166  int q = (int)(v + (v >= 0 ? 0.5f : -0.5f));
167  if (q > 127) q = 127;
168  if (q < -127) q = -127;
169  y[i].qs[j] = (int8_t)q;
170  }
171  }
172 #endif
173 }
#define CK_FP32_TO_FP16(x)
int32_t id
Definition: tokenizer.h:315

References CK_FP32_TO_FP16, block_q8_0::d, id, QK8_0, and block_q8_0::qs.

Referenced by fused_mlp_swiglu_prefill_w1w2_quant(), fused_rmsnorm_qkv_prefill_head_major_quant(), and quantize_attn_out_head_major_q8_0().

◆ quantize_row_q8_k()

void quantize_row_q8_k ( const float *  x,
void *  vy,
int  k 
)

Definition at line 107 of file gemm_kernels_q4k_q8k.c.

107  {
108 #if defined(__SSE4_1__)
109  quantize_row_q8_k_sse(x, vy, k);
110 #else
111  quantize_row_q8_k_ref(x, vy, k);
112 #endif
113 }
void quantize_row_q8_k_sse(const float *x, void *vy, int k)
void quantize_row_q8_k_ref(const float *x, void *vy, int k)

References quantize_row_q8_k_ref(), and quantize_row_q8_k_sse().

Referenced by ck_attention_project_head_major_q4_k_q8_k(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), ck_mlp_swiglu_forward_q4_k_q8_k(), ck_mlp_swiglu_forward_q4_k_q8_k_prefill(), ck_qkv_project_head_major_q4_k_q8_k(), decode_layer_parallel(), fused_mlp_swiglu_prefill_w1w2_quant(), fused_rmsnorm_qkv_prefill_head_major_quant(), mlp_parallel(), model_decode_token(), model_forward_prefill_impl(), model_layer_0_decode(), model_layer_10_decode(), model_layer_11_decode(), model_layer_12_decode(), model_layer_13_decode(), model_layer_14_decode(), model_layer_15_decode(), model_layer_16_decode(), model_layer_17_decode(), model_layer_18_decode(), model_layer_19_decode(), model_layer_1_decode(), model_layer_20_decode(), model_layer_21_decode(), model_layer_22_decode(), model_layer_23_decode(), model_layer_2_decode(), model_layer_3_decode(), model_layer_4_decode(), model_layer_5_decode(), model_layer_6_decode(), model_layer_7_decode(), model_layer_8_decode(), model_layer_9_decode(), qwen2_0_5b_decode_decode_token(), qwen2_0_5b_decode_forward_prefill_impl(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), qwen2_0_5b_decode_layer_9_decode(), and unfused_rmsnorm_linear_q4k_ref().

◆ relu_backward()

void relu_backward ( const float *  input,
const float *  d_output,
float *  d_input,
size_t  n 
)

Definition at line 84 of file relu_kernels.c.

88 {
89  size_t i = 0;
90 
91 #if defined(__AVX512F__)
92  __m512 vzero = _mm512_setzero_ps();
93  for (; i + 15 < n; i += 16) {
94  __m512 vx = _mm512_loadu_ps(input + i);
95  __m512 vdy = _mm512_loadu_ps(d_output + i);
96  __mmask16 mask = _mm512_cmp_ps_mask(vx, vzero, _CMP_GT_OQ);
97  __m512 vdx = _mm512_maskz_mov_ps(mask, vdy);
98  _mm512_storeu_ps(d_input + i, vdx);
99  }
100 #elif defined(__AVX2__) || defined(__AVX__)
101  __m256 vzero = _mm256_setzero_ps();
102  for (; i + 7 < n; i += 8) {
103  __m256 vx = _mm256_loadu_ps(input + i);
104  __m256 vdy = _mm256_loadu_ps(d_output + i);
105  // Result is all 1s (0xFFFFFFFF) if true, 0 if false.
106  __m256 mask = _mm256_cmp_ps(vx, vzero, _CMP_GT_OQ);
107  __m256 vdx = _mm256_and_ps(mask, vdy);
108  _mm256_storeu_ps(d_input + i, vdx);
109  }
110 #endif
111 
112  // Scalar fallback
113  for (; i < n; ++i) {
114  d_input[i] = (input[i] > 0.0f) ? d_output[i] : 0.0f;
115  }
116 }
int32_t int32_t int32_t int32_t int32_t mask
Definition: tokenizer.h:233

References mask.

◆ relu_backward_bf16()

void relu_backward_bf16 ( const uint16_t *  input,
const uint16_t *  d_output,
uint16_t *  d_input,
size_t  n 
)

Definition at line 45 of file relu_kernels_bf16.c.

49 {
50  if (!input || !d_output || !d_input) {
51  return;
52  }
53  for (size_t i = 0; i < n; ++i) {
54  float x = bf16_to_float(input[i]);
55  float dy = bf16_to_float(d_output[i]);
56  d_input[i] = float_to_bf16(x > 0.0f ? dy : 0.0f);
57  }
58 }

References bf16_to_float(), and float_to_bf16().

◆ relu_forward()

void relu_forward ( const float *  input,
float *  output,
size_t  n 
)

Definition at line 26 of file relu_kernels.c.

27 {
28  size_t i = 0;
29 
30 #if defined(__AVX512F__)
31  __m512 vzero = _mm512_setzero_ps();
32  for (; i + 15 < n; i += 16) {
33  __m512 vx = _mm512_loadu_ps(input + i);
34  __m512 vy = _mm512_max_ps(vx, vzero);
35  _mm512_storeu_ps(output + i, vy);
36  }
37 #elif defined(__AVX2__) || defined(__AVX__)
38  __m256 vzero = _mm256_setzero_ps();
39  for (; i + 7 < n; i += 8) {
40  __m256 vx = _mm256_loadu_ps(input + i);
41  __m256 vy = _mm256_max_ps(vx, vzero);
42  _mm256_storeu_ps(output + i, vy);
43  }
44 #endif
45 
46  // Scalar fallback
47  for (; i < n; ++i) {
48  float x = input[i];
49  output[i] = (x > 0.0f) ? x : 0.0f;
50  }
51 }

◆ relu_forward_bf16()

void relu_forward_bf16 ( const uint16_t *  input,
uint16_t *  output,
size_t  n 
)

Definition at line 23 of file relu_kernels_bf16.c.

24 {
25  if (!input || !output) {
26  return;
27  }
28  for (size_t i = 0; i < n; ++i) {
29  float x = bf16_to_float(input[i]);
30  output[i] = float_to_bf16(x > 0.0f ? x : 0.0f);
31  }
32 }

References bf16_to_float(), and float_to_bf16().

◆ relu_forward_inplace()

void relu_forward_inplace ( float *  data,
size_t  n 
)

Definition at line 54 of file relu_kernels.c.

55 {
56  size_t i = 0;
57 
58 #if defined(__AVX512F__)
59  __m512 vzero = _mm512_setzero_ps();
60  for (; i + 15 < n; i += 16) {
61  __m512 vx = _mm512_loadu_ps(data + i);
62  __m512 vy = _mm512_max_ps(vx, vzero);
63  _mm512_storeu_ps(data + i, vy);
64  }
65 #elif defined(__AVX2__) || defined(__AVX__)
66  __m256 vzero = _mm256_setzero_ps();
67  for (; i + 7 < n; i += 8) {
68  __m256 vx = _mm256_loadu_ps(data + i);
69  __m256 vy = _mm256_max_ps(vx, vzero);
70  _mm256_storeu_ps(data + i, vy);
71  }
72 #endif
73 
74  // Scalar fallback
75  for (; i < n; ++i) {
76  float x = data[i];
77  if (x < 0.0f) {
78  data[i] = 0.0f;
79  }
80  }
81 }

◆ relu_forward_inplace_bf16()

void relu_forward_inplace_bf16 ( uint16_t *  data,
size_t  n 
)

Definition at line 34 of file relu_kernels_bf16.c.

35 {
36  if (!data) {
37  return;
38  }
39  for (size_t i = 0; i < n; ++i) {
40  float x = bf16_to_float(data[i]);
41  data[i] = float_to_bf16(x > 0.0f ? x : 0.0f);
42  }
43 }

References bf16_to_float(), and float_to_bf16().

◆ rmsnorm_backward()

/**
 * RMSNorm backward pass.
 *
 * Given upstream gradient dY, forward input X, scale gamma and the cached
 * reciprocal RMS (rstd = 1/sqrt(mean(x^2)+eps)) per token, computes with
 * x_hat_i = x_i * rstd:
 *   m         = mean_j(dY_j * gamma_j * x_hat_j)
 *   dX_i      = rstd * (dY_i * gamma_i - x_hat_i * m)
 *   dGamma_i += dY_i * x_hat_i   (summed over tokens; zeroed here first)
 *
 * Rows are spaced aligned_embed_dim floats apart; the [d_model, aligned)
 * padding columns of d_input are zeroed.
 *
 * Fix: added the NULL-pointer guard that the bf16/int8/int4 variants of this
 * kernel already perform, for consistent defensive behavior.
 */
void rmsnorm_backward(const float *d_output,
                      const float *input,
                      const float *gamma,
                      const float *rstd_cache,
                      float *d_input,
                      float *d_gamma,
                      int tokens,
                      int d_model,
                      int aligned_embed_dim)
{
    if (!d_output || !input || !gamma || !rstd_cache || !d_input || !d_gamma) {
        return;
    }

    int T = tokens;
    int D = d_model;
    int aligned = aligned_embed_dim;

    // Zero parameter gradients before accumulation.
#if defined(__AVX512F__)
    {
        int d = 0;
        for (; d + 16 <= D; d += 16) {
            _mm512_storeu_ps(&d_gamma[d], _mm512_setzero_ps());
        }
        for (; d < D; ++d) {
            d_gamma[d] = 0.0f;
        }
    }
#elif defined(__AVX__)
    {
        int d = 0;
        for (; d + 8 <= D; d += 8) {
            _mm256_storeu_ps(&d_gamma[d], _mm256_setzero_ps());
        }
        for (; d < D; ++d) {
            d_gamma[d] = 0.0f;
        }
    }
#else
    for (int d = 0; d < D; ++d) {
        d_gamma[d] = 0.0f;
    }
#endif

    for (int t = 0; t < T; ++t) {
        const float *x = input + (size_t)t * aligned;
        const float *dY = d_output + (size_t)t * aligned;
        float *dX = d_input + (size_t)t * aligned;

        float rstd = rstd_cache[t];

#if defined(__AVX512F__)
        // Pass 1: m = (1/D) * sum_j (dY_j * gamma_j * x_hat_j)
        __m512 rstd_vec = _mm512_set1_ps(rstd);
        __m512 sum_vec = _mm512_setzero_ps();
        int d = 0;

        for (; d + 16 <= D; d += 16) {
            __m512 xv = _mm512_loadu_ps(&x[d]);
            __m512 dyv = _mm512_loadu_ps(&dY[d]);
            __m512 gv = _mm512_loadu_ps(&gamma[d]);
            __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);
            __m512 prod = _mm512_mul_ps(dyv, gv);
            sum_vec = _mm512_fmadd_ps(prod, x_hat, sum_vec);
        }
        float sum_dY_g_xhat = _mm512_reduce_add_ps(sum_vec);

        for (; d < D; ++d) {
            float x_hat = x[d] * rstd;
            sum_dY_g_xhat += dY[d] * gamma[d] * x_hat;
        }
        float m = sum_dY_g_xhat / (float)D;

        // Pass 2: write dX and accumulate dGamma.
        __m512 m_vec = _mm512_set1_ps(m);
        d = 0;
        for (; d + 16 <= D; d += 16) {
            __m512 xv = _mm512_loadu_ps(&x[d]);
            __m512 dyv = _mm512_loadu_ps(&dY[d]);
            __m512 gv = _mm512_loadu_ps(&gamma[d]);
            __m512 dgv = _mm512_loadu_ps(&d_gamma[d]);

            __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);

            // dX = rstd * (dY * gamma - x_hat * m)
            __m512 dy_g = _mm512_mul_ps(dyv, gv);
            __m512 xhat_m = _mm512_mul_ps(x_hat, m_vec);
            __m512 diff = _mm512_sub_ps(dy_g, xhat_m);
            __m512 dxv = _mm512_mul_ps(rstd_vec, diff);
            _mm512_storeu_ps(&dX[d], dxv);

            // d_gamma += dY * x_hat
            dgv = _mm512_fmadd_ps(dyv, x_hat, dgv);
            _mm512_storeu_ps(&d_gamma[d], dgv);
        }
        for (; d < D; ++d) {
            float x_hat = x[d] * rstd;
            float dy = dY[d];
            dX[d] = rstd * (dy * gamma[d] - x_hat * m);
            d_gamma[d] += dy * x_hat;
        }

#elif defined(__AVX__)
        // Pass 1: m = (1/D) * sum_j (dY_j * gamma_j * x_hat_j)
        __m256 rstd_vec = _mm256_set1_ps(rstd);
        __m256 sum_vec = _mm256_setzero_ps();
        int d = 0;

        for (; d + 8 <= D; d += 8) {
            __m256 xv = _mm256_loadu_ps(&x[d]);
            __m256 dyv = _mm256_loadu_ps(&dY[d]);
            __m256 gv = _mm256_loadu_ps(&gamma[d]);
            __m256 x_hat = _mm256_mul_ps(xv, rstd_vec);
            // No FMA in AVX1: mul + mul + add.
            __m256 prod = _mm256_mul_ps(dyv, gv);
            __m256 prod2 = _mm256_mul_ps(prod, x_hat);
            sum_vec = _mm256_add_ps(sum_vec, prod2);
        }
        float sum_dY_g_xhat = hsum256_ps_rmsnorm(sum_vec);

        for (; d < D; ++d) {
            float x_hat = x[d] * rstd;
            sum_dY_g_xhat += dY[d] * gamma[d] * x_hat;
        }
        float m = sum_dY_g_xhat / (float)D;

        // Pass 2: write dX and accumulate dGamma.
        __m256 m_vec = _mm256_set1_ps(m);
        d = 0;
        for (; d + 8 <= D; d += 8) {
            __m256 xv = _mm256_loadu_ps(&x[d]);
            __m256 dyv = _mm256_loadu_ps(&dY[d]);
            __m256 gv = _mm256_loadu_ps(&gamma[d]);
            __m256 dgv = _mm256_loadu_ps(&d_gamma[d]);

            __m256 x_hat = _mm256_mul_ps(xv, rstd_vec);

            // dX = rstd * (dY * gamma - x_hat * m)
            __m256 dy_g = _mm256_mul_ps(dyv, gv);
            __m256 xhat_m = _mm256_mul_ps(x_hat, m_vec);
            __m256 diff = _mm256_sub_ps(dy_g, xhat_m);
            __m256 dxv = _mm256_mul_ps(rstd_vec, diff);
            _mm256_storeu_ps(&dX[d], dxv);

            // d_gamma += dY * x_hat
            __m256 dy_xhat = _mm256_mul_ps(dyv, x_hat);
            dgv = _mm256_add_ps(dgv, dy_xhat);
            _mm256_storeu_ps(&d_gamma[d], dgv);
        }
        for (; d < D; ++d) {
            float x_hat = x[d] * rstd;
            float dy = dY[d];
            dX[d] = rstd * (dy * gamma[d] - x_hat * m);
            d_gamma[d] += dy * x_hat;
        }

#else
        // Scalar fallback: accumulate the mean term in double for accuracy.
        double sum_dY_g_xhat = 0.0;
        for (int d = 0; d < D; ++d) {
            float x_hat = x[d] * rstd;
            sum_dY_g_xhat += (double)dY[d] * (double)gamma[d] * (double)x_hat;
        }
        float m = (float)(sum_dY_g_xhat / (double)D);

        for (int d = 0; d < D; ++d) {
            float x_hat = x[d] * rstd;
            float dy = dY[d];
            dX[d] = rstd * (dy * gamma[d] - x_hat * m);
            d_gamma[d] += dy * x_hat;
        }
#endif

        // Zero padding gradients (if any).
        for (int d = D; d < aligned; ++d) {
            dX[d] = 0.0f;
        }
    }
}

Referenced by ck_layer_backward_rmsnorm_swiglu(), rmsnorm_backward_int4(), and rmsnorm_backward_int8().

◆ rmsnorm_backward_bf16()

/*
 * RMSNorm backward pass on bf16 activations (fp32 gamma and gamma-gradient).
 *
 * With x_hat = x * rstd (rstd cached per token by the forward pass):
 *   m         = mean_j(dY_j * gamma_j * x_hat_j)
 *   dX        = rstd * (dY * gamma - x_hat * m)      (stored as bf16)
 *   dGamma_i += dY_i * x_hat_i                        (fp32, zeroed here first)
 *
 * Rows are aligned_embed_dim elements apart; padding columns of d_input are
 * zeroed. bf16_loadu_cvt_fp32 / fp32_cvt_storeu_bf16 are the project's
 * AVX-512 bf16<->fp32 helpers; bf16_to_float / float_to_bf16 the scalar ones.
 */
void rmsnorm_backward_bf16(const uint16_t *d_output,
                           const uint16_t *input,
                           const float *gamma,
                           const float *rstd_cache,
                           uint16_t *d_input,
                           float *d_gamma,
                           int tokens,
                           int d_model,
                           int aligned_embed_dim)
{
    int T = tokens;
    int D = d_model;
    int aligned = aligned_embed_dim;

    if (!d_output || !input || !gamma || !rstd_cache || !d_input || !d_gamma) {
        return;
    }

    // Zero parameter gradients before accumulation.
#if defined(__AVX512F__)
    {
        int d = 0;
        for (; d + 16 <= D; d += 16) {
            _mm512_storeu_ps(&d_gamma[d], _mm512_setzero_ps());
        }
        for (; d < D; ++d) {
            d_gamma[d] = 0.0f;
        }
    }
#else
    for (int d = 0; d < D; ++d) {
        d_gamma[d] = 0.0f;
    }
#endif

    for (int t = 0; t < T; ++t) {
        const uint16_t *x_bf16 = input + (size_t)t * aligned;
        const uint16_t *dY_bf16 = d_output + (size_t)t * aligned;
        uint16_t *dX_bf16 = d_input + (size_t)t * aligned;
        float rstd = rstd_cache[t];

#if defined(__AVX512F__)
        // Pass 1: m = (1/D) * sum_j (dY_j * gamma_j * x_hat_j)
        __m512 rstd_vec = _mm512_set1_ps(rstd);
        __m512 sum_vec = _mm512_setzero_ps();
        int d = 0;

        for (; d + 16 <= D; d += 16) {
            __m512 xv = bf16_loadu_cvt_fp32(&x_bf16[d]);
            __m512 dyv = bf16_loadu_cvt_fp32(&dY_bf16[d]);
            __m512 gv = _mm512_loadu_ps(&gamma[d]);
            __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);
            // sum += dY * gamma * x_hat
            __m512 prod = _mm512_mul_ps(dyv, gv);
            sum_vec = _mm512_fmadd_ps(prod, x_hat, sum_vec);
        }
        float sum_dY_g_xhat = _mm512_reduce_add_ps(sum_vec);

        // Scalar remainder of pass 1.
        for (; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            float x_hat = x * rstd;
            float dy = bf16_to_float(dY_bf16[d]);
            sum_dY_g_xhat += dy * gamma[d] * x_hat;
        }
        float m = sum_dY_g_xhat / (float)D;

        // Pass 2: write dX (bf16) and accumulate dGamma (fp32).
        __m512 m_vec = _mm512_set1_ps(m);
        d = 0;
        for (; d + 16 <= D; d += 16) {
            __m512 xv = bf16_loadu_cvt_fp32(&x_bf16[d]);
            __m512 dyv = bf16_loadu_cvt_fp32(&dY_bf16[d]);
            __m512 gv = _mm512_loadu_ps(&gamma[d]);
            __m512 dgv = _mm512_loadu_ps(&d_gamma[d]);

            __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);

            // dX = rstd * (dY * gamma - x_hat * m)
            __m512 dy_g = _mm512_mul_ps(dyv, gv);
            __m512 xhat_m = _mm512_mul_ps(x_hat, m_vec);
            __m512 diff = _mm512_sub_ps(dy_g, xhat_m);
            __m512 dxv = _mm512_mul_ps(rstd_vec, diff);
            fp32_cvt_storeu_bf16(&dX_bf16[d], dxv);

            // d_gamma += dY * x_hat
            dgv = _mm512_fmadd_ps(dyv, x_hat, dgv);
            _mm512_storeu_ps(&d_gamma[d], dgv);
        }
        // Scalar remainder of pass 2.
        for (; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            float x_hat = x * rstd;
            float dy = bf16_to_float(dY_bf16[d]);
            float dx = rstd * (dy * gamma[d] - x_hat * m);
            dX_bf16[d] = float_to_bf16(dx);
            d_gamma[d] += dy * x_hat;
        }

#else
        // Scalar fallback: accumulate the mean term in double for accuracy.
        double sum_dY_g_xhat = 0.0;
        for (int d = 0; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            float x_hat = x * rstd;
            float dy = bf16_to_float(dY_bf16[d]);
            sum_dY_g_xhat += (double)dy * (double)gamma[d] * (double)x_hat;
        }
        float m = (float)(sum_dY_g_xhat / (double)D);

        for (int d = 0; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            float x_hat = x * rstd;
            float dy = bf16_to_float(dY_bf16[d]);
            float dx = rstd * (dy * gamma[d] - x_hat * m);
            dX_bf16[d] = float_to_bf16(dx);
            d_gamma[d] += dy * x_hat;
        }
#endif

        // Zero padding gradients.
        for (int d = D; d < aligned; ++d) {
            dX_bf16[d] = 0;
        }
    }
}

References bf16_to_float(), and float_to_bf16().

◆ rmsnorm_backward_int4()

/*
 * RMSNorm backward for int4-packed activations.
 *
 * Dequantizes d_output and input into fp32 scratch buffers, runs the fp32
 * backward kernel, then re-quantizes d_input. d_gamma stays fp32.
 * Each scratch buffer must hold tokens * aligned_embed_dim floats.
 *
 * Fix: removed the redundant d_gamma-clearing loop — rmsnorm_backward zeroes
 * d_gamma itself before accumulating, so the extra pass did duplicate work.
 */
void rmsnorm_backward_int4(const uint8_t *d_output,
                           const uint8_t *input,
                           const float *gamma,
                           const float *rstd_cache,
                           uint8_t *d_input,
                           float *d_gamma,
                           int tokens,
                           int d_model,
                           int aligned_embed_dim,
                           float *scratch_d_output,
                           float *scratch_input,
                           float *scratch_d_input)
{
    if (!d_output || !input || !gamma || !rstd_cache || !d_input || !d_gamma) return;
    if (!scratch_d_output || !scratch_input || !scratch_d_input) return;

    size_t total = (size_t)tokens * (size_t)aligned_embed_dim;

    convert_int4_to_float(d_output, scratch_d_output, total);
    convert_int4_to_float(input, scratch_input, total);

    /* rmsnorm_backward zeroes d_gamma before accumulation. */
    rmsnorm_backward(scratch_d_output,
                     scratch_input,
                     gamma,
                     rstd_cache,
                     scratch_d_input,
                     d_gamma,
                     tokens,
                     d_model,
                     aligned_embed_dim);

    convert_float_to_int4(scratch_d_input, d_input, total);
}
void rmsnorm_backward(const float *d_output, const float *input, const float *gamma, const float *rstd_cache, float *d_input, float *d_gamma, int tokens, int d_model, int aligned_embed_dim)
static void convert_int4_to_float(const uint8_t *src, float *dst, size_t count)
static void convert_float_to_int4(const float *src, uint8_t *dst, size_t count)

References convert_float_to_int4(), convert_int4_to_float(), and rmsnorm_backward().

◆ rmsnorm_backward_int8()

/*
 * RMSNorm backward for int8-quantized activations.
 *
 * Dequantizes d_output and input into fp32 scratch buffers, runs the fp32
 * backward kernel, then re-quantizes d_input. d_gamma stays fp32.
 * Each scratch buffer must hold tokens * aligned_embed_dim floats.
 *
 * Fix: removed the redundant d_gamma-clearing loop — rmsnorm_backward zeroes
 * d_gamma itself before accumulating, so the extra pass did duplicate work.
 */
void rmsnorm_backward_int8(const int8_t *d_output,
                           const int8_t *input,
                           const float *gamma,
                           const float *rstd_cache,
                           int8_t *d_input,
                           float *d_gamma,
                           int tokens,
                           int d_model,
                           int aligned_embed_dim,
                           float *scratch_d_output,
                           float *scratch_input,
                           float *scratch_d_input)
{
    if (!d_output || !input || !gamma || !rstd_cache || !d_input || !d_gamma) return;
    if (!scratch_d_output || !scratch_input || !scratch_d_input) return;

    size_t total = (size_t)tokens * (size_t)aligned_embed_dim;

    convert_int8_to_float(d_output, scratch_d_output, total);
    convert_int8_to_float(input, scratch_input, total);

    /* rmsnorm_backward zeroes d_gamma before accumulation. */
    rmsnorm_backward(scratch_d_output,
                     scratch_input,
                     gamma,
                     rstd_cache,
                     scratch_d_input,
                     d_gamma,
                     tokens,
                     d_model,
                     aligned_embed_dim);

    convert_float_to_int8(scratch_d_input, d_input, total);
}
static void convert_int8_to_float(const int8_t *src, float *dst, size_t count)
static void convert_float_to_int8(const float *src, int8_t *dst, size_t count)

References convert_float_to_int8(), convert_int8_to_float(), and rmsnorm_backward().

◆ rmsnorm_forward()

void rmsnorm_forward ( const float *  input,
const float *  gamma,
float *  output,
float *  rstd_cache,
int  tokens,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

RMSNorm forward pass

Test:

test_rmsnorm.py::TestRMSNormForward::test_fp32_tokens

test_rmsnorm.py::TestRMSNormForward::test_fp32_single

test_rmsnorm.py::TestRMSNormForward::test_perf_rolled

test_layernorm.py::TestLayerNormForward::test_rmsnorm_compat

test_parity.py::test_rmsnorm_parity

RMSNorm: y[i] = gamma[i] * x[i] / sqrt(mean(x^2) + eps)

After changes: make test && make llamacpp-parity-full

Definition at line 50 of file rmsnorm_kernels.c.

58 {
59  int T = tokens;
60  int D = d_model;
61  int aligned = aligned_embed_dim;
62 
63  for (int t = 0; t < T; ++t) {
64  const float *x = input + (size_t)t * aligned;
65  float *y = output + (size_t)t * aligned;
66 
67 #if defined(__AVX512F__)
68  // AVX-512: Process 16 floats at a time
69  __m512 sum_sq_vec = _mm512_setzero_ps();
70  int d = 0;
71 
72  // Vectorized sum of squares
73  for (; d + 16 <= D; d += 16) {
74  __m512 xv = _mm512_loadu_ps(&x[d]);
75  sum_sq_vec = _mm512_fmadd_ps(xv, xv, sum_sq_vec);
76  }
77  float sum_sq = _mm512_reduce_add_ps(sum_sq_vec);
78 
79  // Handle remaining elements
80  for (; d < D; ++d) {
81  sum_sq += x[d] * x[d];
82  }
83 
84  float mean_sq = sum_sq / (float)D;
85  float rstd = 1.0f / sqrtf(mean_sq + eps);
86  if (rstd_cache) {
87  rstd_cache[t] = rstd;
88  }
89 
90  // Apply normalization and scale (vectorized)
91  __m512 rstd_vec = _mm512_set1_ps(rstd);
92  d = 0;
93  for (; d + 16 <= D; d += 16) {
94  __m512 xv = _mm512_loadu_ps(&x[d]);
95  __m512 gv = _mm512_loadu_ps(&gamma[d]);
96  __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);
97  __m512 yv = _mm512_mul_ps(x_hat, gv);
98  _mm512_storeu_ps(&y[d], yv);
99  }
100  // Handle remaining elements
101  for (; d < D; ++d) {
102  y[d] = x[d] * rstd * gamma[d];
103  }
104 
105 #elif defined(__AVX__)
106  // AVX: Process 8 floats at a time
107  __m256 sum_sq_vec = _mm256_setzero_ps();
108  int d = 0;
109 
110  // Vectorized sum of squares (no FMA in AVX1, use mul + add)
111  for (; d + 8 <= D; d += 8) {
112  __m256 xv = _mm256_loadu_ps(&x[d]);
113  __m256 xv_sq = _mm256_mul_ps(xv, xv);
114  sum_sq_vec = _mm256_add_ps(sum_sq_vec, xv_sq);
115  }
116  float sum_sq = hsum256_ps_rmsnorm(sum_sq_vec);
117 
118  // Handle remaining elements
119  for (; d < D; ++d) {
120  sum_sq += x[d] * x[d];
121  }
122 
123  float mean_sq = sum_sq / (float)D;
124  float rstd = 1.0f / sqrtf(mean_sq + eps);
125  if (rstd_cache) {
126  rstd_cache[t] = rstd;
127  }
128 
129  // Apply normalization and scale (vectorized)
130  __m256 rstd_vec = _mm256_set1_ps(rstd);
131  d = 0;
132  for (; d + 8 <= D; d += 8) {
133  __m256 xv = _mm256_loadu_ps(&x[d]);
134  __m256 gv = _mm256_loadu_ps(&gamma[d]);
135  __m256 x_hat = _mm256_mul_ps(xv, rstd_vec);
136  __m256 yv = _mm256_mul_ps(x_hat, gv);
137  _mm256_storeu_ps(&y[d], yv);
138  }
139  // Handle remaining elements
140  for (; d < D; ++d) {
141  y[d] = x[d] * rstd * gamma[d];
142  }
143 
144 #else
145  // Scalar fallback
146  double sum_sq = 0.0;
147  for (int d = 0; d < D; ++d) {
148  double v = (double)x[d];
149  sum_sq += v * v;
150  }
151  double mean_sq = sum_sq / (double)D;
152  double r = sqrt(mean_sq + (double)eps);
153  float rstd = (float)(1.0 / r);
154  if (rstd_cache) {
155  rstd_cache[t] = rstd;
156  }
157 
158  // Apply normalization and scale
159  for (int d = 0; d < D; ++d) {
160  float x_hat = x[d] * rstd;
161  y[d] = x_hat * gamma[d];
162  }
163 #endif
164 
165  // Zero padding (if any)
166  for (int d = D; d < aligned; ++d) {
167  y[d] = 0.0f;
168  }
169  }
170 }

Referenced by ck_layer_forward_rmsnorm_swiglu(), ck_layer_forward_rmsnorm_swiglu_decode(), ck_layer_forward_rmsnorm_swiglu_decode_fused(), ck_layer_forward_rmsnorm_swiglu_decode_fused_attn_impl(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), ck_layer_forward_rmsnorm_swiglu_decode_quant(), ck_layer_forward_rmsnorm_swiglu_q4_k(), ck_layer_forward_rmsnorm_swiglu_quant(), ck_layer_forward_rmsnorm_swiglu_ref(), mega_fused_outproj_mlp_prefill(), model_decode_token(), model_forward_prefill_impl(), model_layer_0_decode(), model_layer_0_prefill(), model_layer_10_decode(), model_layer_10_prefill(), model_layer_11_decode(), model_layer_11_prefill(), model_layer_12_decode(), model_layer_12_prefill(), model_layer_13_decode(), model_layer_13_prefill(), model_layer_14_decode(), model_layer_14_prefill(), model_layer_15_decode(), model_layer_15_prefill(), model_layer_16_decode(), model_layer_16_prefill(), model_layer_17_decode(), model_layer_17_prefill(), model_layer_18_decode(), model_layer_18_prefill(), model_layer_19_decode(), model_layer_19_prefill(), model_layer_1_decode(), model_layer_1_prefill(), model_layer_20_decode(), model_layer_20_prefill(), model_layer_21_decode(), model_layer_21_prefill(), model_layer_22_decode(), model_layer_22_prefill(), model_layer_23_decode(), model_layer_23_prefill(), model_layer_2_decode(), model_layer_2_prefill(), model_layer_3_decode(), model_layer_3_prefill(), model_layer_4_decode(), model_layer_4_prefill(), model_layer_5_decode(), model_layer_5_prefill(), model_layer_6_decode(), model_layer_6_prefill(), model_layer_7_decode(), model_layer_7_prefill(), model_layer_8_decode(), model_layer_8_prefill(), model_layer_9_decode(), model_layer_9_prefill(), qwen2_0_5b_decode_decode_token(), qwen2_0_5b_decode_forward_prefill_impl(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_0_prefill(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_10_prefill(), qwen2_0_5b_decode_layer_11_decode(), 
qwen2_0_5b_decode_layer_11_prefill(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_12_prefill(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_13_prefill(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_14_prefill(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_15_prefill(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_16_prefill(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_17_prefill(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_18_prefill(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_19_prefill(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_1_prefill(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_20_prefill(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_21_prefill(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_22_prefill(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_23_prefill(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_2_prefill(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_3_prefill(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_4_prefill(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_5_prefill(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_6_prefill(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_7_prefill(), qwen2_0_5b_decode_layer_8_decode(), qwen2_0_5b_decode_layer_8_prefill(), qwen2_0_5b_decode_layer_9_decode(), qwen2_0_5b_decode_layer_9_prefill(), rmsnorm_forward_int4(), and rmsnorm_forward_int8().

◆ rmsnorm_forward_bf16()

/*
 * RMSNorm forward on bf16 activations with fp32 gamma:
 *   y[i] = gamma[i] * x[i] / sqrt(mean(x^2) + eps), stored as bf16.
 *
 * Rows are aligned_embed_dim elements apart; padding columns of the output
 * are zeroed. If rstd_cache is non-NULL, the per-token reciprocal RMS is
 * written for the backward pass. bf16_loadu_cvt_fp32 / fp32_cvt_storeu_bf16
 * are the project's AVX-512 bf16<->fp32 helpers.
 */
void rmsnorm_forward_bf16(const uint16_t *input,
                          const float *gamma,
                          uint16_t *output,
                          float *rstd_cache,
                          int tokens,
                          int d_model,
                          int aligned_embed_dim,
                          float eps)
{
    int T = tokens;
    int D = d_model;
    int aligned = aligned_embed_dim;

    for (int t = 0; t < T; ++t) {
        const uint16_t *x_bf16 = input + (size_t)t * aligned;
        float *rstd_ptr = rstd_cache ? (rstd_cache + t) : NULL;
        uint16_t *out_bf16 = output + (size_t)t * aligned;

#if defined(__AVX512F__)
        // AVX-512: 16 elements per iteration; math in fp32.
        __m512 sum_sq_vec = _mm512_setzero_ps();
        int d = 0;

        // Vectorized sum of squares.
        for (; d + 16 <= D; d += 16) {
            __m512 xv = bf16_loadu_cvt_fp32(&x_bf16[d]);
            sum_sq_vec = _mm512_fmadd_ps(xv, xv, sum_sq_vec);
        }
        float sum_sq = _mm512_reduce_add_ps(sum_sq_vec);

        // Scalar remainder.
        for (; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            sum_sq += x * x;
        }

        float mean_sq = sum_sq / (float)D;
        float rstd = 1.0f / sqrtf(mean_sq + eps);
        if (rstd_ptr) {
            *rstd_ptr = rstd;
        }

        // Normalize and scale, storing bf16.
        __m512 rstd_vec = _mm512_set1_ps(rstd);
        d = 0;
        for (; d + 16 <= D; d += 16) {
            __m512 xv = bf16_loadu_cvt_fp32(&x_bf16[d]);
            __m512 gv = _mm512_loadu_ps(&gamma[d]);
            __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);
            __m512 yv = _mm512_mul_ps(x_hat, gv);
            fp32_cvt_storeu_bf16(&out_bf16[d], yv);
        }
        // Scalar remainder.
        for (; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            float y = x * rstd * gamma[d];
            out_bf16[d] = float_to_bf16(y);
        }

#else
        // Scalar fallback with double accumulation for accuracy.
        double sum_sq = 0.0;
        for (int d = 0; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            sum_sq += (double)x * (double)x;
        }
        double mean_sq = sum_sq / (double)D;
        double r = sqrt(mean_sq + (double)eps);
        float rstd = (float)(1.0 / r);
        if (rstd_ptr) {
            *rstd_ptr = rstd;
        }

        for (int d = 0; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            float x_hat = x * rstd;
            float y = x_hat * gamma[d];
            out_bf16[d] = float_to_bf16(y);
        }
#endif

        // Zero bf16 padding columns.
        for (int d = D; d < aligned; ++d) {
            out_bf16[d] = 0;
        }
    }
}

References bf16_to_float(), and float_to_bf16().

◆ rmsnorm_forward_int4()

/*
 * RMSNorm forward for int4-packed activations: dequantize to fp32 scratch,
 * normalize with the fp32 kernel, re-quantize to int4. Scratch buffers hold
 * tokens * aligned_embed_dim floats each; rstd_cache may be NULL.
 */
void rmsnorm_forward_int4(const uint8_t *input,
                          const float *gamma,
                          uint8_t *output,
                          float *rstd_cache,
                          int tokens,
                          int d_model,
                          int aligned_embed_dim,
                          float eps,
                          float *scratch_input,
                          float *scratch_output)
{
    if (input == NULL || gamma == NULL || output == NULL) {
        return;
    }
    if (scratch_input == NULL || scratch_output == NULL) {
        return;
    }

    const size_t count = (size_t)tokens * (size_t)aligned_embed_dim;

    convert_int4_to_float(input, scratch_input, count);
    rmsnorm_forward(scratch_input, gamma, scratch_output, rstd_cache,
                    tokens, d_model, aligned_embed_dim, eps);
    convert_float_to_int4(scratch_output, output, count);
}
void rmsnorm_forward(const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)

References convert_float_to_int4(), convert_int4_to_float(), and rmsnorm_forward().

◆ rmsnorm_forward_int8()

/*
 * RMSNorm forward for int8-quantized activations: dequantize to fp32 scratch,
 * normalize with the fp32 kernel, re-quantize to int8. Scratch buffers hold
 * tokens * aligned_embed_dim floats each; rstd_cache may be NULL.
 */
void rmsnorm_forward_int8(const int8_t *input,
                          const float *gamma,
                          int8_t *output,
                          float *rstd_cache,
                          int tokens,
                          int d_model,
                          int aligned_embed_dim,
                          float eps,
                          float *scratch_input,
                          float *scratch_output)
{
    if (input == NULL || gamma == NULL || output == NULL) {
        return;
    }
    if (scratch_input == NULL || scratch_output == NULL) {
        return;
    }

    const size_t count = (size_t)tokens * (size_t)aligned_embed_dim;

    convert_int8_to_float(input, scratch_input, count);
    rmsnorm_forward(scratch_input, gamma, scratch_output, rstd_cache,
                    tokens, d_model, aligned_embed_dim, eps);
    convert_float_to_int8(scratch_output, output, count);
}

References convert_float_to_int8(), convert_int8_to_float(), and rmsnorm_forward().

◆ rope_backward()

/**
 * RoPE backward: inverse rotation (rotate by -theta).
 *
 * Since cos(-t)=cos(t) and sin(-t)=-sin(t), per (d0, d1) pair split at
 * half_dim:
 *   d_x[i]            =  d0 * c + d1 * s
 *   d_x[i + half_dim] = -d0 * s + d1 * c
 *
 * Layout is head-major: each head is num_tokens rows of aligned_head_dim
 * floats; cos_cache/sin_cache hold half_dim values per absolute position
 * (pos_offset + t). Padding columns [head_dim, aligned_head_dim) of d_x are
 * zeroed.
 *
 * Fix: cache-row indexing `pos * half_dim` was computed in int arithmetic
 * and could overflow for very long contexts; it is now done in size_t,
 * matching the size_t indexing used elsewhere in this kernel.
 */
void rope_backward(const float *d_out,
                   float *d_x,
                   const float *cos_cache,
                   const float *sin_cache,
                   int num_heads,
                   int num_tokens,
                   int head_dim,
                   int aligned_head_dim,
                   int pos_offset)
{
    size_t head_stride = (size_t)num_tokens * (size_t)aligned_head_dim;
    int half_dim = head_dim / 2;

    for (int h = 0; h < num_heads; ++h) {
        for (int t = 0; t < num_tokens; ++t) {
            int pos = pos_offset + t;
            const float *cos_row = cos_cache + (size_t)pos * (size_t)half_dim;
            const float *sin_row = sin_cache + (size_t)pos * (size_t)half_dim;

            size_t idx = (size_t)h * head_stride + (size_t)t * (size_t)aligned_head_dim;
            const float *d_out_row = d_out + idx;
            float *d_x_row = d_x + idx;

#if defined(__AVX512F__)
            int i = 0;
            for (; i + 16 <= half_dim; i += 16) {
                __m512 d0 = _mm512_loadu_ps(&d_out_row[i]);
                __m512 d1 = _mm512_loadu_ps(&d_out_row[i + half_dim]);
                __m512 c = _mm512_loadu_ps(&cos_row[i]);
                __m512 s = _mm512_loadu_ps(&sin_row[i]);

                // Inverse: d_x[i] = d0 * c + d1 * s
                __m512 r0 = _mm512_fmadd_ps(d0, c, _mm512_mul_ps(d1, s));
                // Inverse: d_x[i+half] = -d0 * s + d1 * c
                __m512 r1 = _mm512_fmsub_ps(d1, c, _mm512_mul_ps(d0, s));

                _mm512_storeu_ps(&d_x_row[i], r0);
                _mm512_storeu_ps(&d_x_row[i + half_dim], r1);
            }
            for (; i < half_dim; ++i) {
                float d0 = d_out_row[i];
                float d1 = d_out_row[i + half_dim];
                float c = cos_row[i];
                float s = sin_row[i];
                d_x_row[i] = d0 * c + d1 * s;
                d_x_row[i + half_dim] = -d0 * s + d1 * c;
            }

#elif defined(__AVX__)
            int i = 0;
            for (; i + 8 <= half_dim; i += 8) {
                __m256 d0 = _mm256_loadu_ps(&d_out_row[i]);
                __m256 d1 = _mm256_loadu_ps(&d_out_row[i + half_dim]);
                __m256 c = _mm256_loadu_ps(&cos_row[i]);
                __m256 s = _mm256_loadu_ps(&sin_row[i]);

                // Inverse: d_x[i] = d0 * c + d1 * s
                __m256 d0c = _mm256_mul_ps(d0, c);
                __m256 d1s = _mm256_mul_ps(d1, s);
                __m256 r0 = _mm256_add_ps(d0c, d1s);

                // Inverse: d_x[i+half] = -d0 * s + d1 * c = d1 * c - d0 * s
                __m256 d1c = _mm256_mul_ps(d1, c);
                __m256 d0s = _mm256_mul_ps(d0, s);
                __m256 r1 = _mm256_sub_ps(d1c, d0s);

                _mm256_storeu_ps(&d_x_row[i], r0);
                _mm256_storeu_ps(&d_x_row[i + half_dim], r1);
            }
            for (; i < half_dim; ++i) {
                float d0 = d_out_row[i];
                float d1 = d_out_row[i + half_dim];
                float c = cos_row[i];
                float s = sin_row[i];
                d_x_row[i] = d0 * c + d1 * s;
                d_x_row[i + half_dim] = -d0 * s + d1 * c;
            }

#else
            // Scalar fallback: inverse rotation, rotate by -theta.
            for (int i = 0; i < half_dim; ++i) {
                float d0 = d_out_row[i];
                float d1 = d_out_row[i + half_dim];
                float c = cos_row[i];
                float s = sin_row[i];

                d_x_row[i] = d0 * c + d1 * s;
                d_x_row[i + half_dim] = -d0 * s + d1 * c;
            }
#endif

            // Zero padding columns.
            for (int i = head_dim; i < aligned_head_dim; ++i) {
                d_x_row[i] = 0.0f;
            }
        }
    }
}

Referenced by rope_backward_bf16(), and rope_backward_qk().

◆ rope_backward_bf16()

/*
 * RoPE backward for bf16 gradients: widen to fp32 scratch, apply the fp32
 * inverse rotation, narrow back to bf16. Each scratch buffer must hold
 * num_heads * num_tokens * aligned_head_dim floats.
 */
void rope_backward_bf16(const uint16_t *d_out,
                        uint16_t *d_x,
                        const float *cos_cache,
                        const float *sin_cache,
                        int num_heads,
                        int num_tokens,
                        int head_dim,
                        int aligned_head_dim,
                        int pos_offset,
                        float *scratch_d_out,
                        float *scratch_d_x)
{
    if (scratch_d_out == NULL || scratch_d_x == NULL) {
        return;
    }

    const size_t count =
        (size_t)num_heads * (size_t)num_tokens * (size_t)aligned_head_dim;

    bf16_tensor_to_float(d_out, scratch_d_out, count);
    rope_backward(scratch_d_out, scratch_d_x, cos_cache, sin_cache,
                  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset);
    float_tensor_to_bf16(scratch_d_x, d_x, count);
}
void rope_backward(const float *d_out, float *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:238

References bf16_tensor_to_float(), float_tensor_to_bf16(), and rope_backward().

Referenced by rope_backward_qk_bf16().

◆ rope_backward_inplace()

/*
 * RoPE backward, in place: overwrite d_x with the inverse-rotated gradients
 * (rotate by -theta). Saves a buffer when d_x == d_out is acceptable.
 *
 * Per (d0, d1) pair split at half_dim:
 *   d_x[i]            =  d0 * c + d1 * s
 *   d_x[i + half_dim] = -d0 * s + d1 * c
 *
 * Head-major layout: each head is num_tokens rows of aligned_head_dim floats;
 * cos_cache/sin_cache hold half_dim values per absolute position
 * (pos_offset + t). Padding columns [head_dim, aligned_head_dim) are zeroed.
 *
 * NOTE(review): `pos * half_dim` below is int arithmetic (same pattern as
 * rope_backward) — could overflow for extremely long contexts; confirm limits.
 */
void rope_backward_inplace(float *d_x,
                           const float *cos_cache,
                           const float *sin_cache,
                           int num_heads,
                           int num_tokens,
                           int head_dim,
                           int aligned_head_dim,
                           int pos_offset)
{
    size_t head_stride = (size_t)num_tokens * (size_t)aligned_head_dim;
    int half_dim = head_dim / 2;

    for (int h = 0; h < num_heads; ++h) {
        for (int t = 0; t < num_tokens; ++t) {
            int pos = pos_offset + t;
            const float *cos_row = cos_cache + pos * half_dim;
            const float *sin_row = sin_cache + pos * half_dim;

            float *d_row = d_x + h * head_stride + (size_t)t * (size_t)aligned_head_dim;

#if defined(__AVX512F__)
            int i = 0;
            for (; i + 16 <= half_dim; i += 16) {
                __m512 d0 = _mm512_loadu_ps(&d_row[i]);
                __m512 d1 = _mm512_loadu_ps(&d_row[i + half_dim]);
                __m512 c = _mm512_loadu_ps(&cos_row[i]);
                __m512 s = _mm512_loadu_ps(&sin_row[i]);

                // r0 = d0*c + d1*s ; r1 = d1*c - d0*s
                __m512 r0 = _mm512_fmadd_ps(d0, c, _mm512_mul_ps(d1, s));
                __m512 r1 = _mm512_fmsub_ps(d1, c, _mm512_mul_ps(d0, s));

                _mm512_storeu_ps(&d_row[i], r0);
                _mm512_storeu_ps(&d_row[i + half_dim], r1);
            }
            // Scalar remainder.
            for (; i < half_dim; ++i) {
                float d0 = d_row[i];
                float d1 = d_row[i + half_dim];
                float c = cos_row[i];
                float s = sin_row[i];
                d_row[i] = d0 * c + d1 * s;
                d_row[i + half_dim] = -d0 * s + d1 * c;
            }

#elif defined(__AVX__)
            int i = 0;
            for (; i + 8 <= half_dim; i += 8) {
                __m256 d0 = _mm256_loadu_ps(&d_row[i]);
                __m256 d1 = _mm256_loadu_ps(&d_row[i + half_dim]);
                __m256 c = _mm256_loadu_ps(&cos_row[i]);
                __m256 s = _mm256_loadu_ps(&sin_row[i]);

                // No FMA in AVX1: compose from mul/add/sub.
                __m256 d0c = _mm256_mul_ps(d0, c);
                __m256 d1s = _mm256_mul_ps(d1, s);
                __m256 r0 = _mm256_add_ps(d0c, d1s);

                __m256 d1c = _mm256_mul_ps(d1, c);
                __m256 d0s = _mm256_mul_ps(d0, s);
                __m256 r1 = _mm256_sub_ps(d1c, d0s);

                _mm256_storeu_ps(&d_row[i], r0);
                _mm256_storeu_ps(&d_row[i + half_dim], r1);
            }
            // Scalar remainder.
            for (; i < half_dim; ++i) {
                float d0 = d_row[i];
                float d1 = d_row[i + half_dim];
                float c = cos_row[i];
                float s = sin_row[i];
                d_row[i] = d0 * c + d1 * s;
                d_row[i + half_dim] = -d0 * s + d1 * c;
            }

#else
            for (int i = 0; i < half_dim; ++i) {
                float d0 = d_row[i];
                float d1 = d_row[i + half_dim];
                float c = cos_row[i];
                float s = sin_row[i];

                // Inverse rotation: rotate by -theta
                d_row[i] = d0 * c + d1 * s;
                d_row[i + half_dim] = -d0 * s + d1 * c;
            }
#endif

            // Zero padding columns.
            for (int i = head_dim; i < aligned_head_dim; ++i) {
                d_row[i] = 0.0f;
            }
        }
    }
}

◆ rope_backward_qk()

/*
 * Combined RoPE backward for the Q and K gradient streams.
 * Q uses num_heads; K uses the (possibly smaller, GQA) num_kv_heads.
 * Both apply the same inverse rotation via rope_backward.
 */
void rope_backward_qk(const float *d_q_out,
                      const float *d_k_out,
                      float *d_q,
                      float *d_k,
                      const float *cos_cache,
                      const float *sin_cache,
                      int num_heads,
                      int num_kv_heads,
                      int num_tokens,
                      int head_dim,
                      int aligned_head_dim,
                      int pos_offset)
{
    /* Query-side gradients. */
    rope_backward(d_q_out, d_q, cos_cache, sin_cache,
                  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset);
    /* Key-side gradients (KV head count may differ under GQA). */
    rope_backward(d_k_out, d_k, cos_cache, sin_cache,
                  num_kv_heads, num_tokens, head_dim, aligned_head_dim, pos_offset);
}
void rope_backward(const float *d_out, float *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:238

References rope_backward().

Referenced by ck_layer_backward_rmsnorm_swiglu().

◆ rope_backward_qk_bf16()

void rope_backward_qk_bf16 ( const uint16_t *  d_q_out,
const uint16_t *  d_k_out,
uint16_t *  d_q,
uint16_t *  d_k,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch_dq_out,
float *  scratch_dq,
float *  scratch_dk_out,
float *  scratch_dk 
)

Definition at line 103 of file rope_kernels_bf16.c.

119 {
120  if (!d_q_out || !d_k_out || !d_q || !d_k) return;
121 
122  rope_backward_bf16(d_q_out, d_q, cos_cache, sin_cache,
123  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset,
124  scratch_dq_out, scratch_dq);
125  rope_backward_bf16(d_k_out, d_k, cos_cache, sin_cache,
126  num_kv_heads, num_tokens, head_dim, aligned_head_dim, pos_offset,
127  scratch_dk_out, scratch_dk);
128 }
void rope_backward_bf16(const uint16_t *d_out, uint16_t *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_d_out, float *scratch_d_x)

References rope_backward_bf16().

◆ rope_forward()

void rope_forward ( float *  x,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset 
)

RoPE forward (head-major layout, in-place)

Test:

test_rope.py::TestRoPEForward::test_rope_forward

test_rope.py::TestRoPEForward::test_rope_vs_separate

test_parity.py::test_rope_parity

Applies rotary position embeddings in-place to a Q or K tensor. Layout: x is [num_heads, num_tokens, head_dim], head-major.

After changes: make test && make llamacpp-parity-full

Definition at line 180 of file rope_kernels.c.

188 {
189  size_t head_stride = (size_t)num_tokens * (size_t)aligned_head_dim;
190 
191  for (int h = 0; h < num_heads; ++h) {
192  rope_apply_head(x + h * head_stride,
193  cos_cache, sin_cache,
194  num_tokens, head_dim, aligned_head_dim, pos_offset);
195  }
196 }
static void rope_apply_head(float *x, const float *cos_cache, const float *sin_cache, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:79

References rope_apply_head().

Referenced by model_layer_0_decode(), model_layer_10_decode(), model_layer_11_decode(), model_layer_12_decode(), model_layer_13_decode(), model_layer_14_decode(), model_layer_15_decode(), model_layer_16_decode(), model_layer_17_decode(), model_layer_18_decode(), model_layer_19_decode(), model_layer_1_decode(), model_layer_20_decode(), model_layer_21_decode(), model_layer_22_decode(), model_layer_23_decode(), model_layer_2_decode(), model_layer_3_decode(), model_layer_4_decode(), model_layer_5_decode(), model_layer_6_decode(), model_layer_7_decode(), model_layer_8_decode(), model_layer_9_decode(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), qwen2_0_5b_decode_layer_9_decode(), rope_forward_bf16(), and rope_forward_qk().

◆ rope_forward_bf16()

void rope_forward_bf16 ( uint16_t *  x,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch 
)

Definition at line 28 of file rope_kernels_bf16.c.

37 {
38  if (!scratch) return;
39 
40  size_t total = (size_t)num_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
41 
42  bf16_tensor_to_float(x, scratch, total);
43  rope_forward(scratch, cos_cache, sin_cache,
44  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset);
45  float_tensor_to_bf16(scratch, x, total);
46 }
void rope_forward(float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:180

References bf16_tensor_to_float(), float_tensor_to_bf16(), and rope_forward().

Referenced by rope_forward_qk_bf16().

◆ rope_forward_qk()

void rope_forward_qk ( float *  q,
float *  k,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset 
)

RoPE forward for both Q and K (common inference pattern)

Test:

test_rope.py::TestRoPEForward::test_rope_forward_qk

test_fused_attention_decode.py::TestFusedAttentionDecode::test_qk_rope

test_parity.py::test_rope_qk_parity

Combined RoPE forward for both Q and K in one call. Layouts: q is [num_heads, num_tokens, head_dim]; k is [num_kv_heads, num_tokens, head_dim].

After changes: make test && make llamacpp-parity-full

Definition at line 448 of file rope_kernels.c.

458 {
459  rope_forward(q, cos_cache, sin_cache, num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset);
460  rope_forward(k, cos_cache, sin_cache, num_kv_heads, num_tokens, head_dim, aligned_head_dim, pos_offset);
461 }
void rope_forward(float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:180

References rope_forward().

Referenced by ck_layer_forward_rmsnorm_swiglu(), ck_layer_forward_rmsnorm_swiglu_decode(), ck_layer_forward_rmsnorm_swiglu_decode_fused(), ck_layer_forward_rmsnorm_swiglu_decode_fused_attn_impl(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), ck_layer_forward_rmsnorm_swiglu_decode_quant(), ck_layer_forward_rmsnorm_swiglu_q4_k(), ck_layer_forward_rmsnorm_swiglu_quant(), ck_layer_forward_rmsnorm_swiglu_ref(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_0_prefill(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_10_prefill(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_11_prefill(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_12_prefill(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_13_prefill(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_14_prefill(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_15_prefill(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_16_prefill(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_17_prefill(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_18_prefill(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_19_prefill(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_1_prefill(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_20_prefill(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_21_prefill(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_22_prefill(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_23_prefill(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_2_prefill(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_3_prefill(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_4_prefill(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_5_prefill(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_6_prefill(), 
qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_7_prefill(), qwen2_0_5b_decode_layer_8_decode(), qwen2_0_5b_decode_layer_8_prefill(), qwen2_0_5b_decode_layer_9_decode(), and qwen2_0_5b_decode_layer_9_prefill().

◆ rope_forward_qk_bf16()

void rope_forward_qk_bf16 ( uint16_t *  q,
uint16_t *  k,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch_q,
float *  scratch_k 
)

Definition at line 79 of file rope_kernels_bf16.c.

91 {
92  if (!q || !k) return;
93 
94  rope_forward_bf16(q, cos_cache, sin_cache,
95  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset, scratch_q);
96  rope_forward_bf16(k, cos_cache, sin_cache,
97  num_kv_heads, num_tokens, head_dim, aligned_head_dim, pos_offset, scratch_k);
98 }
void rope_forward_bf16(uint16_t *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch)

References rope_forward_bf16().

◆ rope_forward_qk_strided()

void rope_forward_qk_strided ( float *  q,
float *  k,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
int  q_stride_tokens,
int  k_stride_tokens 
)

RoPE forward for both Q and K with custom strides (KV cache layouts)

Test:

test_rope.py::TestRoPEForward::test_rope_forward_qk_strided

test_kv_cache_attention.py::TestKVCacheAttention::test_qk_rope_strided

Combined QK RoPE with configurable strides for KV cache layouts.

After changes: make test

Definition at line 472 of file rope_kernels.c.

484 {
485  rope_forward_strided(q, cos_cache, sin_cache, num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset, q_stride_tokens);
486  rope_forward_strided(k, cos_cache, sin_cache, num_kv_heads, num_tokens, head_dim, aligned_head_dim, pos_offset, k_stride_tokens);
487 }
void rope_forward_strided(float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, int head_stride_tokens)
Definition: rope_kernels.c:207

References rope_forward_strided().

Referenced by mega_fused_attention_prefill(), mega_fused_attention_prefill_q8_0(), model_layer_0_prefill(), model_layer_10_prefill(), model_layer_11_prefill(), model_layer_12_prefill(), model_layer_13_prefill(), model_layer_14_prefill(), model_layer_15_prefill(), model_layer_16_prefill(), model_layer_17_prefill(), model_layer_18_prefill(), model_layer_19_prefill(), model_layer_1_prefill(), model_layer_20_prefill(), model_layer_21_prefill(), model_layer_22_prefill(), model_layer_23_prefill(), model_layer_2_prefill(), model_layer_3_prefill(), model_layer_4_prefill(), model_layer_5_prefill(), model_layer_6_prefill(), model_layer_7_prefill(), model_layer_8_prefill(), model_layer_9_prefill(), qwen2_0_5b_decode_layer_0_prefill(), qwen2_0_5b_decode_layer_10_prefill(), qwen2_0_5b_decode_layer_11_prefill(), qwen2_0_5b_decode_layer_12_prefill(), qwen2_0_5b_decode_layer_13_prefill(), qwen2_0_5b_decode_layer_14_prefill(), qwen2_0_5b_decode_layer_15_prefill(), qwen2_0_5b_decode_layer_16_prefill(), qwen2_0_5b_decode_layer_17_prefill(), qwen2_0_5b_decode_layer_18_prefill(), qwen2_0_5b_decode_layer_19_prefill(), qwen2_0_5b_decode_layer_1_prefill(), qwen2_0_5b_decode_layer_20_prefill(), qwen2_0_5b_decode_layer_21_prefill(), qwen2_0_5b_decode_layer_22_prefill(), qwen2_0_5b_decode_layer_23_prefill(), qwen2_0_5b_decode_layer_2_prefill(), qwen2_0_5b_decode_layer_3_prefill(), qwen2_0_5b_decode_layer_4_prefill(), qwen2_0_5b_decode_layer_5_prefill(), qwen2_0_5b_decode_layer_6_prefill(), qwen2_0_5b_decode_layer_7_prefill(), qwen2_0_5b_decode_layer_8_prefill(), and qwen2_0_5b_decode_layer_9_prefill().

◆ rope_forward_strided()

void rope_forward_strided ( float *  x,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
int  head_stride_tokens 
)

RoPE forward with custom head stride (for KV cache layouts)

Test:

test_rope.py::TestRoPEForward::test_rope_strided

test_kv_cache_attention.py::TestKVCacheAttention::test_rope_decode

Variant with configurable head_stride_tokens for non-contiguous head layouts.

After changes: make test

Definition at line 207 of file rope_kernels.c.

216 {
217  size_t head_stride = (size_t)head_stride_tokens * (size_t)aligned_head_dim;
218 
219  for (int h = 0; h < num_heads; ++h) {
220  rope_apply_head(x + h * head_stride,
221  cos_cache, sin_cache,
222  num_tokens, head_dim, aligned_head_dim, pos_offset);
223  }
224 }

References rope_apply_head().

Referenced by rope_forward_qk_strided().

◆ rope_precompute_cache()

void rope_precompute_cache ( float *  cos_cache,
float *  sin_cache,
int  max_seq_len,
int  head_dim,
float  base 
)

Precompute RoPE cos/sin cache

Test:

test_rope.py::TestRoPECache::test_cache_computation

test_rope.py::TestRoPECache::test_cache_values

Precomputes cos(m * theta_i) and sin(m * theta_i) for positions 0..max_seq_len-1. cos_cache, sin_cache: [max_seq_len, head_dim/2]

After changes: make test

Definition at line 52 of file rope_kernels.c.

57 {
58  int half_dim = head_dim / 2;
59 
60  long double base_ld = (long double)base;
61  long double head_dim_ld = (long double)head_dim;
62  long double log_base = logl(base_ld);
63  for (int pos = 0; pos < max_seq_len; ++pos) {
64  for (int i = 0; i < half_dim; ++i) {
65  long double exponent = ((long double)(2 * i)) / head_dim_ld;
66  long double freq = expl(-exponent * log_base);
67  float freq_f = (float)freq;
68  float angle_f = (float)pos * freq_f;
69  cos_cache[pos * half_dim + i] = cosf(angle_f);
70  sin_cache[pos * half_dim + i] = sinf(angle_f);
71  }
72  }
73 }

◆ scal_copy_f32()

void scal_copy_f32 ( float *  y,
const float *  x,
float  alpha,
int  n 
)

Scaled copy: y = alpha * x.

Parameters
yOutput vector [n]
xInput vector [n]
alphaScalar multiplier
nVector length

Definition at line 105 of file axpy_kernels.c.

109 {
110  if (!y || !x || n <= 0) {
111  return;
112  }
113 
114  int i = 0;
115 
116 #ifdef __AVX512F__
117  __m512 valpha = _mm512_set1_ps(alpha);
118  for (; i + 16 <= n; i += 16) {
119  __m512 vx = _mm512_loadu_ps(&x[i]);
120  __m512 vy = _mm512_mul_ps(vx, valpha);
121  _mm512_storeu_ps(&y[i], vy);
122  }
123 #endif
124 
125 #ifdef __AVX2__
126  __m256 valpha256 = _mm256_set1_ps(alpha);
127  for (; i + 8 <= n; i += 8) {
128  __m256 vx = _mm256_loadu_ps(&x[i]);
129  __m256 vy = _mm256_mul_ps(vx, valpha256);
130  _mm256_storeu_ps(&y[i], vy);
131  }
132 #endif
133 
134  for (; i < n; i++) {
135  y[i] = alpha * x[i];
136  }
137 }

Referenced by weighted_sum_f32().

◆ sigmoid_backward()

void sigmoid_backward ( const float *  input,
const float *  d_output,
float *  d_input,
size_t  n 
)

Definition at line 138 of file sigmoid_kernels.c.

142 {
143 #if defined(__AVX512F__)
144  sigmoid_backward_avx512(input, d_output, d_input, n);
145 #else
146  for (size_t i = 0; i < n; ++i) {
147  float x = input[i];
148  float s = sigmoid_scalar(x);
149  float s_prime = s * (1.0f - s);
150  d_input[i] = d_output[i] * s_prime;
151  }
152 #endif
153 }
float sigmoid_scalar(float x)

References sigmoid_scalar().

Referenced by sigmoid_backward_bf16().

◆ sigmoid_backward_bf16()

void sigmoid_backward_bf16 ( const uint16_t *  input,
const uint16_t *  d_output,
uint16_t *  d_input,
size_t  n,
float *  scratch_input,
float *  scratch_d_output,
float *  scratch_d_input 
)

Definition at line 45 of file sigmoid_kernels_bf16.c.

52 {
53  if (!input || !d_output || !d_input || n == 0) return;
54  if (!scratch_input || !scratch_d_output || !scratch_d_input) return;
55 
56  bf16_tensor_to_float(input, scratch_input, n);
57  bf16_tensor_to_float(d_output, scratch_d_output, n);
58  sigmoid_backward(scratch_input, scratch_d_output, scratch_d_input, n);
59  float_tensor_to_bf16(scratch_d_input, d_input, n);
60 }
void sigmoid_backward(const float *input, const float *d_output, float *d_input, size_t n)

References bf16_tensor_to_float(), float_tensor_to_bf16(), and sigmoid_backward().

◆ sigmoid_forward()

void sigmoid_forward ( const float *  input,
float *  output,
size_t  n 
)

Definition at line 122 of file sigmoid_kernels.c.

125 {
126 #if defined(__AVX512F__)
127  sigmoid_forward_avx512(input, output, n);
128 #else
129  for (size_t i = 0; i < n; ++i) {
130  output[i] = sigmoid_scalar(input[i]);
131  }
132 #endif
133 }

References sigmoid_scalar().

Referenced by sigmoid_forward_bf16().

◆ sigmoid_forward_bf16()

void sigmoid_forward_bf16 ( const uint16_t *  input,
uint16_t *  output,
size_t  n,
float *  scratch_input,
float *  scratch_output 
)

Definition at line 27 of file sigmoid_kernels_bf16.c.

32 {
33  if (!input || !output || n == 0) return;
34  if (!scratch_input || !scratch_output) return;
35 
36  bf16_tensor_to_float(input, scratch_input, n);
37  sigmoid_forward(scratch_input, scratch_output, n);
38  float_tensor_to_bf16(scratch_output, output, n);
39 }
void sigmoid_forward(const float *input, float *output, size_t n)

References bf16_tensor_to_float(), float_tensor_to_bf16(), and sigmoid_forward().

◆ sigmoid_scalar()

float sigmoid_scalar ( float  x)

◆ softmax_cross_entropy_loss()

void softmax_cross_entropy_loss ( const float *  logits,
const int32_t *  targets,
int  tokens,
int  vocab_size,
float *  d_logits,
float *  loss_out 
)

Definition at line 21 of file loss_kernels.c.

27 {
28  if (!logits || !targets || !d_logits || tokens <= 0 || vocab_size <= 0) {
29  if (loss_out) {
30  *loss_out = 0.0f;
31  }
32  return;
33  }
34 
35  double total_loss = 0.0;
36 
37  for (int t = 0; t < tokens; ++t) {
38  const float *row = logits + (size_t)t * (size_t)vocab_size;
39  float *drow = d_logits + (size_t)t * (size_t)vocab_size;
40  int target = targets[t];
41 
42  float max_logit = row[0];
43  for (int v = 1; v < vocab_size; ++v) {
44  if (row[v] > max_logit) {
45  max_logit = row[v];
46  }
47  }
48 
49  double sum_exp = 0.0;
50  for (int v = 0; v < vocab_size; ++v) {
51  float e = expf(row[v] - max_logit);
52  drow[v] = e;
53  sum_exp += e;
54  }
55 
56  float inv_sum = 1.0f / (float)sum_exp;
57  for (int v = 0; v < vocab_size; ++v) {
58  drow[v] *= inv_sum;
59  }
60 
61  if (target >= 0 && target < vocab_size) {
62  total_loss += -logf(drow[target] + 1e-10f);
63  drow[target] -= 1.0f;
64  }
65 
66  float scale = 1.0f / (float)tokens;
67  for (int v = 0; v < vocab_size; ++v) {
68  drow[v] *= scale;
69  }
70  }
71 
72  if (loss_out) {
73  *loss_out = (float)(total_loss / (double)tokens);
74  }
75 }

References vocab_size.

Referenced by softmax_cross_entropy_loss_bf16().

◆ softmax_cross_entropy_loss_bf16()

void softmax_cross_entropy_loss_bf16 ( const uint16_t *  logits,
const int32_t *  targets,
int  tokens,
int  vocab_size,
uint16_t *  d_logits,
float *  loss_out,
float *  scratch_logits,
float *  scratch_d_logits 
)

Definition at line 25 of file loss_kernels_bf16.c.

33 {
34  if (!logits || !targets || !d_logits || tokens <= 0 || vocab_size <= 0) {
35  if (loss_out) *loss_out = 0.0f;
36  return;
37  }
38  if (!scratch_logits || !scratch_d_logits) {
39  if (loss_out) *loss_out = 0.0f;
40  return;
41  }
42 
43  const size_t count = (size_t)tokens * (size_t)vocab_size;
44 
45  bf16_tensor_to_float(logits, scratch_logits, count);
46  softmax_cross_entropy_loss(scratch_logits, targets, tokens, vocab_size, scratch_d_logits, loss_out);
47  float_tensor_to_bf16(scratch_d_logits, d_logits, count);
48 }
void softmax_cross_entropy_loss(const float *logits, const int32_t *targets, int tokens, int vocab_size, float *d_logits, float *loss_out)
Definition: loss_kernels.c:21

References bf16_tensor_to_float(), float_tensor_to_bf16(), softmax_cross_entropy_loss(), and vocab_size.

◆ swiglu_backward()

void swiglu_backward ( const float *  input,
const float *  d_output,
float *  d_input,
int  tokens,
int  dim 
)

SwiGLU backward pass

Test:

test_swiglu.py::TestSwiGLUBackward::test_backward_tokens

test_swiglu.py::TestSwiGLUBackward::test_backward_single

test_parity.py::test_swiglu_backward_parity

Computes dGate and dUp given dY. dGate = dy * b * silu'(a), dUp = dy * silu(a)

After changes: make test && make llamacpp-parity-full

Definition at line 215 of file swiglu_kernels.c.

220 {
221  int T = tokens;
222  int D = dim;
223 
224  for (int t = 0; t < T; ++t) {
225  const float *row = input + (size_t)t * (2 * D);
226  const float *dy_row = d_output + (size_t)t * D;
227  float *dx_row = d_input + (size_t)t * (2 * D);
228  int d = 0;
229 
230 #if defined(__AVX512F__)
231  // AVX-512: Process 16 floats at a time
232  __m512 one = _mm512_set1_ps(1.0f);
233  for (; d + 16 <= D; d += 16) {
234  __m512 a = _mm512_loadu_ps(&row[d]); // gate
235  __m512 b = _mm512_loadu_ps(&row[D + d]); // value
236  __m512 dy = _mm512_loadu_ps(&dy_row[d]);
237 
238  __m512 s = sigmoid512_fast(a); // sigmoid(a)
239  __m512 silu = _mm512_mul_ps(a, s); // silu(a) = a * s
240  __m512 s_prime = _mm512_mul_ps(s, _mm512_sub_ps(one, s)); // s * (1 - s)
241  __m512 silu_prime = _mm512_fmadd_ps(a, s_prime, s); // s + a * s_prime
242 
243  // dA = dy * b * silu_prime
244  __m512 dA = _mm512_mul_ps(dy, _mm512_mul_ps(b, silu_prime));
245  // dB = dy * silu
246  __m512 dB = _mm512_mul_ps(dy, silu);
247 
248  _mm512_storeu_ps(&dx_row[d], dA);
249  _mm512_storeu_ps(&dx_row[D + d], dB);
250  }
251 #elif defined(__AVX2__)
252  // AVX2: Process 8 floats at a time
253  __m256 one = _mm256_set1_ps(1.0f);
254  for (; d + 8 <= D; d += 8) {
255  __m256 a = _mm256_loadu_ps(&row[d]); // gate
256  __m256 b = _mm256_loadu_ps(&row[D + d]); // value
257  __m256 dy = _mm256_loadu_ps(&dy_row[d]);
258 
259  __m256 s = sigmoid256_fast(a); // sigmoid(a)
260  __m256 silu = _mm256_mul_ps(a, s); // silu(a) = a * s
261  __m256 s_prime = _mm256_mul_ps(s, _mm256_sub_ps(one, s)); // s * (1 - s)
262  __m256 silu_prime = _mm256_fmadd_ps(a, s_prime, s); // s + a * s_prime
263 
264  // dA = dy * b * silu_prime
265  __m256 dA = _mm256_mul_ps(dy, _mm256_mul_ps(b, silu_prime));
266  // dB = dy * silu
267  __m256 dB = _mm256_mul_ps(dy, silu);
268 
269  _mm256_storeu_ps(&dx_row[d], dA);
270  _mm256_storeu_ps(&dx_row[D + d], dB);
271  }
272 #elif defined(__AVX__)
273  // AVX1: Vectorize arithmetic, use scalar sigmoid
274  __m256 one = _mm256_set1_ps(1.0f);
275  float a_arr[8] __attribute__((aligned(32)));
276  float s_arr[8] __attribute__((aligned(32)));
277 
278  for (; d + 8 <= D; d += 8) {
279  __m256 a = _mm256_loadu_ps(&row[d]); // gate
280  __m256 b = _mm256_loadu_ps(&row[D + d]); // value
281  __m256 dy = _mm256_loadu_ps(&dy_row[d]);
282 
283  // Compute sigmoid scalarly
284  _mm256_store_ps(a_arr, a);
285  for (int j = 0; j < 8; ++j) {
286  s_arr[j] = sigmoid_scalar(a_arr[j]);
287  }
288  __m256 s = _mm256_load_ps(s_arr);
289 
290  __m256 silu = _mm256_mul_ps(a, s); // silu(a) = a * s
291  __m256 s_prime = _mm256_mul_ps(s, _mm256_sub_ps(one, s)); // s * (1 - s)
292  // silu_prime = s + a * s_prime (no FMA in AVX1)
293  __m256 a_s_prime = _mm256_mul_ps(a, s_prime);
294  __m256 silu_prime = _mm256_add_ps(s, a_s_prime);
295 
296  // dA = dy * b * silu_prime
297  __m256 dA = _mm256_mul_ps(dy, _mm256_mul_ps(b, silu_prime));
298  // dB = dy * silu
299  __m256 dB = _mm256_mul_ps(dy, silu);
300 
301  _mm256_storeu_ps(&dx_row[d], dA);
302  _mm256_storeu_ps(&dx_row[D + d], dB);
303  }
304 #endif
305 
306  // Scalar fallback for remaining elements
307  for (; d < D; ++d) {
308  float a = row[d]; // gate
309  float b = row[D + d]; // value
310  float dy = dy_row[d];
311 
312  float s = sigmoid_scalar(a); // sigmoid(a)
313  float silu = a * s; // silu(a)
314  float s_prime = s * (1.0f - s); // sigmoid'(a)
315  float silu_prime = s + a * s_prime; // silu'(a)
316 
317  float dA = dy * b * silu_prime;
318  float dB = dy * silu;
319 
320  dx_row[d] = dA;
321  dx_row[D + d] = dB;
322  }
323  }
324 }
float sigmoid_scalar(float x)

References __attribute__(), sigmoid_scalar(), and silu().

Referenced by ck_layer_backward_rmsnorm_swiglu().

◆ swiglu_backward_bf16()

void swiglu_backward_bf16 ( const uint16_t *  input,
const uint16_t *  d_output,
uint16_t *  d_input,
int  tokens,
int  dim 
)

Definition at line 108 of file swiglu_kernels_bf16.c.

113 {
114  if (!input || !d_output || !d_input || tokens <= 0 || dim <= 0) {
115  return;
116  }
117 
118  const int T = tokens;
119  const int D = dim;
120 
121  for (int t = 0; t < T; ++t) {
122  const uint16_t *row = input + (size_t)t * (size_t)(2 * D);
123  const uint16_t *dy_row = d_output + (size_t)t * (size_t)D;
124  uint16_t *dx_row = d_input + (size_t)t * (size_t)(2 * D);
125  int d = 0;
126 
127 #if defined(__AVX512F__)
128  // AVX-512: Process 16 floats at a time
129  __m512 one = _mm512_set1_ps(1.0f);
130  for (; d + 16 <= D; d += 16) {
131  __m512 a = bf16_loadu_cvt_fp32(&row[d]); // gate
132  __m512 b = bf16_loadu_cvt_fp32(&row[D + d]); // value
133  __m512 dy = bf16_loadu_cvt_fp32(&dy_row[d]);
134 
135  __m512 s = sigmoid512_fast_bf16(a); // sigmoid(a)
136  __m512 silu = _mm512_mul_ps(a, s); // silu(a) = a * s
137  __m512 s_prime = _mm512_mul_ps(s, _mm512_sub_ps(one, s)); // s * (1 - s)
138  __m512 silu_prime = _mm512_fmadd_ps(a, s_prime, s); // s + a * s_prime
139 
140  // dA = dy * b * silu_prime
141  __m512 dA = _mm512_mul_ps(dy, _mm512_mul_ps(b, silu_prime));
142  // dB = dy * silu
143  __m512 dB = _mm512_mul_ps(dy, silu);
144 
145  fp32_cvt_storeu_bf16(&dx_row[d], dA);
146  fp32_cvt_storeu_bf16(&dx_row[D + d], dB);
147  }
148 #endif
149 
150  // Scalar fallback for remaining elements
151  for (; d < D; ++d) {
152  float a = bf16_to_float(row[d]);
153  float b = bf16_to_float(row[D + d]);
154  float dy = bf16_to_float(dy_row[d]);
155 
156  float s = sigmoid_scalar(a);
157  float silu = a * s;
158  float s_prime = s * (1.0f - s);
159  float silu_prime = s + a * s_prime;
160 
161  float dA = dy * b * silu_prime;
162  float dB = dy * silu;
163 
164  dx_row[d] = float_to_bf16(dA);
165  dx_row[D + d] = float_to_bf16(dB);
166  }
167  }
168 }

References bf16_to_float(), float_to_bf16(), sigmoid_scalar(), and silu().

◆ swiglu_backward_exact()

void swiglu_backward_exact ( const float *  input,
const float *  d_output,
float *  d_input,
int  tokens,
int  dim 
)

SwiGLU backward pass (exact version using stdlib sigmoid)

Test:

test_swiglu.py::TestSwiGLUBackward::test_exact_vs_fast

test_swiglu.py::TestSwiGLUBackward::test_exact_single

Uses standard library expf for numerical accuracy reference.

After changes: make test

Definition at line 373 of file swiglu_kernels.c.

378 {
379  int T = tokens;
380  int D = dim;
381 
382  for (int t = 0; t < T; ++t) {
383  const float *row = input + (size_t)t * (2 * D);
384  const float *dy_row = d_output + (size_t)t * D;
385  float *dx_row = d_input + (size_t)t * (2 * D);
386 
387  for (int d = 0; d < D; ++d) {
388  float a = row[d]; // gate
389  float b = row[D + d]; // value
390  float dy = dy_row[d];
391 
392  // Use standard library expf via sigmoid_scalar
393  float s = sigmoid_scalar(a); // sigmoid(a)
394  float silu = a * s; // silu(a)
395  float s_prime = s * (1.0f - s); // sigmoid'(a)
396  float silu_prime = s + a * s_prime; // silu'(a)
397 
398  float dA = dy * b * silu_prime;
399  float dB = dy * silu;
400 
401  dx_row[d] = dA;
402  dx_row[D + d] = dB;
403  }
404  }
405 }

References sigmoid_scalar(), and silu().

◆ swiglu_forward()

void swiglu_forward ( const float *  input,
float *  output,
int  tokens,
int  dim 
)

SwiGLU forward pass

Test:

test_swiglu.py::TestSwiGLUForward::test_forward_tokens

test_swiglu.py::TestSwiGLUForward::test_forward_single

test_mlp.py::TestMLPForward::test_swiglu_mlp

test_fused_swiglu_decode.py::TestFusedSwiGLUDecode::test_fused_swiglu_decode

test_parity.py::test_swiglu_parity

SwiGLU: y = silu(gate) * up where silu(x) = x * sigmoid(x)

After changes: make test && make llamacpp-parity-full

Definition at line 131 of file swiglu_kernels.c.

135 {
136  int T = tokens;
137  int D = dim;
138 
139  for (int t = 0; t < T; ++t) {
140  const float *row = input + (size_t)t * (2 * D);
141  float *out_row = output + (size_t)t * D;
142  int d = 0;
143 
144 #if defined(__AVX512F__)
145  // AVX-512: Process 16 floats at a time
146  for (; d + 16 <= D; d += 16) {
147  __m512 a = _mm512_loadu_ps(&row[d]); // gate
148  __m512 b = _mm512_loadu_ps(&row[D + d]); // value
149 
150  __m512 s = sigmoid512_fast(a); // sigmoid(a)
151  __m512 silu = _mm512_mul_ps(a, s); // silu(a) = a * sigmoid(a)
152  __m512 y = _mm512_mul_ps(silu, b); // y = silu(a) * b
153 
154  _mm512_storeu_ps(&out_row[d], y);
155  }
156 #elif defined(__AVX2__)
157  // AVX2: Process 8 floats at a time
158  for (; d + 8 <= D; d += 8) {
159  __m256 a = _mm256_loadu_ps(&row[d]); // gate
160  __m256 b = _mm256_loadu_ps(&row[D + d]); // value
161 
162  __m256 s = sigmoid256_fast(a); // sigmoid(a)
163  __m256 silu = _mm256_mul_ps(a, s); // silu(a) = a * sigmoid(a)
164  __m256 y = _mm256_mul_ps(silu, b); // y = silu(a) * b
165 
166  _mm256_storeu_ps(&out_row[d], y);
167  }
168 #elif defined(__AVX__)
169  // AVX1: Vectorize arithmetic, use scalar sigmoid
170  float a_arr[8] __attribute__((aligned(32)));
171  float s_arr[8] __attribute__((aligned(32)));
172 
173  for (; d + 8 <= D; d += 8) {
174  __m256 a = _mm256_loadu_ps(&row[d]); // gate
175  __m256 b = _mm256_loadu_ps(&row[D + d]); // value
176 
177  // Compute sigmoid scalarly
178  _mm256_store_ps(a_arr, a);
179  for (int j = 0; j < 8; ++j) {
180  s_arr[j] = sigmoid_scalar(a_arr[j]);
181  }
182  __m256 s = _mm256_load_ps(s_arr);
183 
184  __m256 silu = _mm256_mul_ps(a, s); // silu(a) = a * sigmoid(a)
185  __m256 y = _mm256_mul_ps(silu, b); // y = silu(a) * b
186 
187  _mm256_storeu_ps(&out_row[d], y);
188  }
189 #endif
190 
191  // Scalar fallback for remaining elements
192  for (; d < D; ++d) {
193  float a = row[d]; // gate
194  float b = row[D + d]; // value
195 
196  float s = sigmoid_scalar(a); // sigmoid(a)
197  float silu = a * s; // silu(a) = a * sigmoid(a)
198 
199  out_row[d] = silu * b;
200  }
201  }
202 }

References __attribute__(), sigmoid_scalar(), and silu().

Referenced by ck_mlp_swiglu_forward(), ck_mlp_swiglu_forward_q4_k(), ck_mlp_swiglu_forward_q4_k_q8_k(), ck_mlp_swiglu_forward_q4_k_q8_k_prefill(), ck_mlp_swiglu_forward_quant(), ck_mlp_swiglu_forward_ref(), model_layer_0_decode(), model_layer_0_prefill(), model_layer_10_decode(), model_layer_10_prefill(), model_layer_11_decode(), model_layer_11_prefill(), model_layer_12_decode(), model_layer_12_prefill(), model_layer_13_decode(), model_layer_13_prefill(), model_layer_14_decode(), model_layer_14_prefill(), model_layer_15_decode(), model_layer_15_prefill(), model_layer_16_decode(), model_layer_16_prefill(), model_layer_17_decode(), model_layer_17_prefill(), model_layer_18_decode(), model_layer_18_prefill(), model_layer_19_decode(), model_layer_19_prefill(), model_layer_1_decode(), model_layer_1_prefill(), model_layer_20_decode(), model_layer_20_prefill(), model_layer_21_decode(), model_layer_21_prefill(), model_layer_22_decode(), model_layer_22_prefill(), model_layer_23_decode(), model_layer_23_prefill(), model_layer_2_decode(), model_layer_2_prefill(), model_layer_3_decode(), model_layer_3_prefill(), model_layer_4_decode(), model_layer_4_prefill(), model_layer_5_decode(), model_layer_5_prefill(), model_layer_6_decode(), model_layer_6_prefill(), model_layer_7_decode(), model_layer_7_prefill(), model_layer_8_decode(), model_layer_8_prefill(), model_layer_9_decode(), model_layer_9_prefill(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_0_prefill(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_10_prefill(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_11_prefill(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_12_prefill(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_13_prefill(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_14_prefill(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_15_prefill(), qwen2_0_5b_decode_layer_16_decode(), 
qwen2_0_5b_decode_layer_16_prefill(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_17_prefill(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_18_prefill(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_19_prefill(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_1_prefill(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_20_prefill(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_21_prefill(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_22_prefill(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_23_prefill(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_2_prefill(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_3_prefill(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_4_prefill(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_5_prefill(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_6_prefill(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_7_prefill(), qwen2_0_5b_decode_layer_8_decode(), qwen2_0_5b_decode_layer_8_prefill(), qwen2_0_5b_decode_layer_9_decode(), and qwen2_0_5b_decode_layer_9_prefill().

◆ swiglu_forward_bf16()

/**
 * SwiGLU forward pass over bf16 data.
 *
 * Input layout is [tokens, 2*dim]: the first dim entries of each row are the
 * gate activations, the next dim entries the values. Output is [tokens, dim]
 * with out[t][d] = silu(gate) * value, where silu(a) = a * sigmoid(a).
 *
 * Silently returns on NULL pointers or non-positive sizes.
 */
void swiglu_forward_bf16(const uint16_t *input,
                         uint16_t *output,
                         int tokens,
                         int dim)
{
    if (!input || !output || tokens <= 0 || dim <= 0) {
        return;
    }

    for (int t = 0; t < tokens; ++t) {
        const uint16_t *row = input + (size_t)t * (size_t)(2 * dim);
        uint16_t *out_row = output + (size_t)t * (size_t)dim;
        int d = 0;

#if defined(__AVX512F__)
        /* Vector path: 16 lanes per iteration, bf16 widened to fp32. */
        for (; d + 16 <= dim; d += 16) {
            __m512 gate = bf16_loadu_cvt_fp32(&row[d]);
            __m512 val  = bf16_loadu_cvt_fp32(&row[dim + d]);

            __m512 sig  = sigmoid512_fast_bf16(gate);
            __m512 act  = _mm512_mul_ps(gate, sig);     /* silu(gate) */
            __m512 prod = _mm512_mul_ps(act, val);      /* silu(gate) * value */

            fp32_cvt_storeu_bf16(&out_row[d], prod);
        }
#endif

        /* Scalar tail (and full path when AVX-512 is unavailable). */
        for (; d < dim; ++d) {
            float gate = bf16_to_float(row[d]);
            float val = bf16_to_float(row[dim + d]);
            float sig = sigmoid_scalar(gate);
            float act = gate * sig;
            out_row[d] = float_to_bf16(act * val);
        }
    }
}

References bf16_to_float(), float_to_bf16(), sigmoid_scalar(), and silu().

◆ swiglu_forward_exact()

/**
 * SwiGLU forward pass (exact version using stdlib sigmoid).
 *
 * Reference implementation relying on sigmoid_scalar (standard-library expf)
 * for numerical accuracy; compare against the fast/bf16 variants.
 *
 * Input layout is [tokens, 2*dim]: gate activations in the first dim entries
 * of each row, values in the next dim. Output is [tokens, dim] with
 * out[t][d] = silu(gate) * value.
 *
 * Test: test_swiglu.py::TestSwiGLUForward::test_exact_vs_fast
 *       test_swiglu.py::TestSwiGLUForward::test_exact_single
 * After changes: make test
 */
void swiglu_forward_exact(const float *input,
                          float *output,
                          int tokens,
                          int dim)
{
    /* Guard added for consistency with swiglu_forward_bf16. */
    if (!input || !output || tokens <= 0 || dim <= 0) {
        return;
    }

    for (int t = 0; t < tokens; ++t) {
        /* size_t casts match the bf16 variant and avoid int-product overflow
         * in the row-offset computation for very large tensors. */
        const float *row = input + (size_t)t * (size_t)(2 * dim);
        float *out_row = output + (size_t)t * (size_t)dim;

        for (int d = 0; d < dim; ++d) {
            float a = row[d];       /* gate  */
            float b = row[dim + d]; /* value */

            float s = sigmoid_scalar(a); /* sigmoid(a) = 1/(1+expf(-a)) */
            float silu = a * s;          /* silu(a) = a * sigmoid(a)    */

            out_row[d] = silu * b;
        }
    }
}

References sigmoid_scalar(), and silu().

◆ topk_batched_f32()

/**
 * Batched top-K selection for multiple tokens.
 *
 * @param scores      Input scores [num_tokens, n_experts]
 * @param num_tokens  Number of tokens
 * @param n_experts   Number of experts
 * @param k           Number of experts to select per token
 * @param indices     Output: selected expert indices [num_tokens, k]
 * @param weights     Output: routing weights [num_tokens, k]
 *                    (may be NULL: indices only, no softmax)
 */
void topk_batched_f32(const float *scores,
                      int num_tokens,
                      int n_experts,
                      int k,
                      int *indices,
                      float *weights)
{
    if (!scores || !indices || num_tokens <= 0 || n_experts <= 0 || k <= 0) {
        return;
    }

    for (int t = 0; t < num_tokens; t++) {
        /* Cast to size_t before multiplying so the per-token offsets cannot
         * overflow int for large batches (matches the other kernels here). */
        const float *token_scores = scores + (size_t)t * (size_t)n_experts;
        int *token_indices = indices + (size_t)t * (size_t)k;

        if (weights) {
            float *token_weights = weights + (size_t)t * (size_t)k;
            topk_softmax_f32(token_scores, n_experts, k,
                             token_indices, token_weights);
        } else {
            topk_f32(token_scores, n_experts, k, token_indices, NULL);
        }
    }
}
void topk_f32(const float *scores, int n, int k, int *indices, float *values)
Find top-K indices and values from a score vector.
Definition: topk_kernels.c:49
void topk_softmax_f32(const float *scores, int n, int k, int *indices, float *weights)
Find top-K indices with softmax-normalized weights.
Definition: topk_kernels.c:134

References topk_f32(), and topk_softmax_f32().

◆ topk_f32()

/* Position of the smallest element in v[0..k-1]. */
static int topk_min_pos(const float *v, int k)
{
    int m = 0;
    for (int i = 1; i < k; i++) {
        if (v[i] < v[m]) {
            m = i;
        }
    }
    return m;
}

/**
 * Find top-K indices and values from a score vector.
 *
 * @param scores  Input scores [n]
 * @param n       Number of scores (e.g., number of experts)
 * @param k       Number of top scores to select (clamped to n)
 * @param indices Output: indices of top-K scores [k], sorted descending by value
 * @param values  Output: top-K score values [k], sorted descending (may be NULL)
 */
void topk_f32(const float *scores, int n, int k, int *indices, float *values)
{
    if (!scores || !indices || n <= 0 || k <= 0) {
        return;
    }
    if (k > n) {
        k = n;
    }

    /* NOTE(review): VLA sized by caller-supplied k — fine for small expert
     * counts, but worth bounding if k can ever be large. */
    float held[k];

    /* Seed the running top-k with the first k scores. */
    for (int i = 0; i < k; i++) {
        indices[i] = i;
        held[i] = scores[i];
    }
    int worst = topk_min_pos(held, k);

    /* Each remaining score evicts the current minimum if it beats it. */
    for (int i = k; i < n; i++) {
        if (scores[i] > held[worst]) {
            indices[worst] = i;
            held[worst] = scores[i];
            worst = topk_min_pos(held, k);
        }
    }

    /* Order the winners descending (insertion sort; k is small). */
    for (int i = 1; i < k; i++) {
        float v = held[i];
        int ix = indices[i];
        int j = i;
        while (j > 0 && held[j - 1] < v) {
            held[j] = held[j - 1];
            indices[j] = indices[j - 1];
            j--;
        }
        held[j] = v;
        indices[j] = ix;
    }

    if (values) {
        for (int i = 0; i < k; i++) {
            values[i] = held[i];
        }
    }
}

Referenced by topk_batched_f32(), and topk_softmax_f32().

◆ topk_softmax_f32()

/**
 * Find top-K indices with softmax-normalized weights.
 *
 * @param scores  Input scores [n] (router logits)
 * @param n       Number of scores
 * @param k       Number of top scores to select (clamped to n)
 * @param indices Output: indices of top-K scores [k]
 * @param weights Output: softmax-normalized weights for selected [k], sum to 1.0
 */
void topk_softmax_f32(const float *scores, int n, int k,
                      int *indices, float *weights)
{
    if (!scores || !indices || !weights || n <= 0 || k <= 0) {
        return;
    }
    if (k > n) {
        k = n;
    }

    /* Top-K selection first; softmax runs only over the winners. */
    float picked[k];
    topk_f32(scores, n, k, indices, picked);

    /* Max-subtraction keeps expf in a safe range (numerical stability). */
    float peak = picked[0];
    for (int i = 1; i < k; i++) {
        if (picked[i] > peak) {
            peak = picked[i];
        }
    }

    /* Exponentiate and accumulate the partition sum. */
    float total = 0.0f;
    for (int i = 0; i < k; i++) {
        weights[i] = expf(picked[i] - peak);
        total += weights[i];
    }

    /* Single division, then multiply through (same rounding as original). */
    float scale = 1.0f / total;
    for (int i = 0; i < k; i++) {
        weights[i] *= scale;
    }
}

References topk_f32().

Referenced by topk_batched_f32().

◆ unfused_rmsnorm_qkv_prefill()

void unfused_rmsnorm_qkv_prefill ( const float *  x,
const float *  gamma,
const float *  Wq,
const float *  Wk,
const float *  Wv,
float *  x_norm,
float *  Q,
float *  K,
float *  V,
int  seq_len,
int  hidden,
int  q_dim,
int  kv_dim,
float  eps 
)

Unfused version for benchmarking comparison.

Unfused version for benchmarking comparison.

Definition at line 667 of file prefill_fused_gemm.c.

682 {
683  /* Step 1: Full RMSNorm → writes x_norm to memory */
684  rmsnorm_tile(x, gamma, x_norm, seq_len, hidden, hidden, eps);
685 
686  /* Step 2: Separate GEMMs with N-outer tiling for weight reuse */
687  /* Q projection */
688  for (int n_start = 0; n_start < q_dim; n_start += PREFILL_TILE_N) {
689  int tile_n = (n_start + PREFILL_TILE_N <= q_dim)
690  ? PREFILL_TILE_N : (q_dim - n_start);
691  const float *W_tile = Wq + (size_t)n_start * hidden;
692 
693  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
694  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
695  ? PREFILL_TILE_M : (seq_len - m_start);
696  const float *x_tile = x_norm + (size_t)m_start * hidden;
697  float *out_tile = Q + (size_t)m_start * q_dim + n_start;
698  gemm_tile_nt_strided(x_tile, W_tile, out_tile,
699  tile_m, tile_n, hidden, q_dim);
700  }
701  }
702 
703  /* K projection */
704  for (int n_start = 0; n_start < kv_dim; n_start += PREFILL_TILE_N) {
705  int tile_n = (n_start + PREFILL_TILE_N <= kv_dim)
706  ? PREFILL_TILE_N : (kv_dim - n_start);
707  const float *W_tile = Wk + (size_t)n_start * hidden;
708 
709  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
710  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
711  ? PREFILL_TILE_M : (seq_len - m_start);
712  const float *x_tile = x_norm + (size_t)m_start * hidden;
713  float *out_tile = K + (size_t)m_start * kv_dim + n_start;
714  gemm_tile_nt_strided(x_tile, W_tile, out_tile,
715  tile_m, tile_n, hidden, kv_dim);
716  }
717  }
718 
719  /* V projection */
720  for (int n_start = 0; n_start < kv_dim; n_start += PREFILL_TILE_N) {
721  int tile_n = (n_start + PREFILL_TILE_N <= kv_dim)
722  ? PREFILL_TILE_N : (kv_dim - n_start);
723  const float *W_tile = Wv + (size_t)n_start * hidden;
724 
725  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
726  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
727  ? PREFILL_TILE_M : (seq_len - m_start);
728  const float *x_tile = x_norm + (size_t)m_start * hidden;
729  float *out_tile = V + (size_t)m_start * kv_dim + n_start;
730  gemm_tile_nt_strided(x_tile, W_tile, out_tile,
731  tile_m, tile_n, hidden, kv_dim);
732  }
733  }
734 }
#define PREFILL_TILE_N

References gemm_tile_nt_strided(), PREFILL_TILE_M, PREFILL_TILE_N, and rmsnorm_tile().

◆ vec_dot_q6_k_q8_k()

void vec_dot_q6_k_q8_k ( int  n,
float *  s,
const void *  vx,
const void *  vy 
)

Q6_K x Q8_K dot product (single row)

Definition at line 954 of file gemm_kernels_q6k_q8k.c.

955 {
956  if (!s || !vx || !vy || n <= 0) {
957  return;
958  }
959 
960  const block_q6_K *x = (const block_q6_K *)vx;
961  const block_q8_K *y = (const block_q8_K *)vy;
962 
963  /* Dispatch based on available SIMD */
964 #if defined(__AVX512F__) && defined(__AVX512BW__)
965  *s = dot_q6_k_q8_k_avx512(x, y, n);
966 #elif defined(__AVX2__)
967  *s = dot_q6_k_q8_k_avx2(x, y, n);
968 #elif defined(__AVX__) && !defined(__AVX2__)
969  *s = dot_q6_k_q8_k_avx(x, y, n);
970 #elif defined(__SSSE3__)
971  *s = dot_q6_k_q8_k_sse(x, y, n);
972 #else
973  *s = dot_q6_k_q8_k_ref(x, y, n);
974 #endif
975 }

References dot_q6_k_q8_k_ref().

◆ weighted_sum_f32()

/**
 * Weighted sum of k vectors: y = sum_i(weights[i] * vectors[i]).
 *
 * @param y       Output vector [n]
 * @param vectors Array of k input vector pointers, each [n]
 * @param weights Array of k scalar weights
 * @param k       Number of vectors to combine
 * @param n       Vector length
 */
void weighted_sum_f32(float *y,
                      const float **vectors,
                      const float *weights,
                      int k,
                      int n)
{
    if (!y || !vectors || !weights || k <= 0 || n <= 0) {
        return;
    }

    /* Seed the accumulator: y = weights[0] * vectors[0]. */
    scal_copy_f32(y, vectors[0], weights[0], n);

    /* Fold in the remaining vectors: y += weights[v] * vectors[v]. */
    for (int v = 1; v < k; v++) {
        axpy_f32(y, vectors[v], weights[v], n);
    }
}
void scal_copy_f32(float *y, const float *x, float alpha, int n)
Scaled copy: y = alpha * x.
Definition: axpy_kernels.c:105

References axpy_f32(), and scal_copy_f32().