← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ck_parity_api.c File Reference

C-Kernel-Engine Parity Testing API Implementation. More...

#include "ck_parity_api.h"
#include "ckernel_quant.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>

Go to the source code of this file.

Functions

void attention_forward_causal_head_major_gqa_flash_strided (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens)
 
void attention_forward_causal_head_major_gqa_flash_strided_sliding (const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens, int sliding_window)
 
void attention_forward_decode_head_major_gqa_flash_sliding (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim, int sliding_window)
 
int ck_get_block_q4_k_size (void)
 Get Q4_K block size in bytes. More...
 
int ck_get_block_q5_1_size (void)
 Get Q5_1 block size in bytes (24 bytes per 32 weights) More...
 
int ck_get_block_q5_k_size (void)
 Get Q5_K block size in bytes (176 bytes per 256 weights) More...
 
int ck_get_block_q6_k_size (void)
 Get Q6_K block size in bytes. More...
 
int ck_get_block_q8_k_size (void)
 Get Q8_K block size in bytes. More...
 
int ck_get_qk5_1 (void)
 Get QK5_1 (elements per Q5_1 block) More...
 
int ck_get_qk_k (void)
 Get QK_K (elements per super-block) More...
 
void ck_test_attention_causal (const float *q, const float *k, const float *v, float *out, int num_heads, int num_kv_heads, int tokens, int seq_len, int head_dim)
 Multi-head causal attention for prefill (head-major layout) More...
 
void ck_test_attention_decode_sliding (const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int sliding_window)
 Test sliding-window attention (decode mode) More...
 
void ck_test_attention_sliding_window (const float *q, const float *k, const float *v, float *out, int num_heads, int num_kv_heads, int tokens, int seq_len, int head_dim, int sliding_window)
 Test sliding-window attention (prefill) More...
 
void ck_test_dequant_q4_0 (const void *src, float *dst, int n)
 Dequantize Q4_0 data to FP32. More...
 
void ck_test_dequant_q4_k (const void *src, float *dst, int n)
 Dequantize Q4_K data to FP32. More...
 
void ck_test_dequant_q5_1 (const void *src, float *dst, int n)
 Dequantize Q5_1 data to FP32. More...
 
void ck_test_dequant_q6_k (const void *src, float *dst, int n)
 Dequantize Q6_K data to FP32. More...
 
void ck_test_geglu (const float *x, float *out, int n_tokens, int dim)
 Test GeGLU activation. More...
 
void ck_test_geglu_backward (const float *x, const float *d_out, float *d_x, int n_tokens, int dim)
 Test GeGLU backward. More...
 
void ck_test_gemm_q4_k (const void *weight_q4k, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q4_K GEMM - batched matrix multiply with quantized weights. More...
 
void ck_test_gemm_q5_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Test Q5_0 x Q8_0 GEMM (batch matrix multiply) More...
 
void ck_test_gemm_q5_1 (const void *weight_q5_1, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Test Q5_1 x Q8_0 GEMM (batch matrix multiply) More...
 
void ck_test_gemm_q5_k (const void *weight_q5_k, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Test Q5_K x Q8_K GEMM (batch matrix multiply) More...
 
void ck_test_gemm_q6_k (const void *weight_q6k, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Test Q6_K x Q8_K GEMM (batch matrix multiply) More...
 
void ck_test_gemm_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Test Q8_0 x Q8_0 GEMM (batch matrix multiply) More...
 
void ck_test_gemv_q4_k (const void *weight_q4k, const float *input_f32, float *output, int cols)
 Q4_K GEMV - dot product of quantized weights and FP32 input. More...
 
void ck_test_gemv_q5_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols)
 Q5_0 GEMV - matrix-vector multiply with Q5_0 weights. More...
 
void ck_test_gemv_q5_0_q8_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols)
 Q5_0 x Q8_0 quantized GEMV - matches llama.cpp's approach. More...
 
void ck_test_gemv_q5_1 (const void *weight_q5_1, const float *input_f32, float *output, int rows, int cols)
 Q5_1 GEMV - matrix-vector multiply with Q5_1 weights (32-element blocks) More...
 
void ck_test_gemv_q5_k (const void *weight_q5_k, const float *input_f32, float *output, int rows, int cols)
 Q5_K GEMV - matrix-vector multiply with Q5_K weights (256-element super-blocks) More...
 
void ck_test_gemv_q6_k (const void *weight_q6k, const float *input_f32, float *output, int cols)
 Q6_K GEMV. More...
 
void ck_test_gemv_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols)
 Q8_0 GEMV - matrix-vector multiply with Q8_0 weights. More...
 
void ck_test_gemv_q8_0_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols)
 Q8_0 x Q8_0 quantized GEMV - matches llama.cpp's approach. More...
 
void ck_test_outproj_mlp_fused_q5_0 (const float *attn_out, const float *residual, const float *ln2_gamma, const void *wo, const void *w1, const void *w2, float *output, int tokens, int num_heads, int head_dim, int embed_dim, int intermediate, float eps, int w2_is_q6k)
 Test mega-fused OutProj + MLP kernel (Q5_0 weights) More...
 
void ck_test_quantize_q8_k (const float *src, void *dst, int n)
 Quantize FP32 to Q8_K (for activations) More...
 
void ck_test_rmsnorm (const float *input, const float *weight, float *output, int n_tokens, int dim, float eps)
 RMSNorm. More...
 
void ck_test_rope (float *q, float *k, int n_tokens, int n_heads, int n_heads_kv, int head_dim, int pos_offset, float theta)
 RoPE (Rotary Position Embedding) More...
 
void ck_test_rope_interleaved (float *q, float *k, int n_tokens, int n_heads, int n_heads_kv, int head_dim, int pos_offset, float theta)
 RoPE with interleaved format (for llama.cpp compatibility) More...
 
void ck_test_softmax (const float *input, float *output, int n)
 Softmax (simple, non-causal) More...
 
void ck_test_swiglu (const float *gate_up, float *output, int n_tokens, int intermediate_dim)
 SwiGLU activation. More...
 
void ck_test_vec_dot_q5_0_q8_0 (const void *weight_q5_0, const void *input_q8_0, float *output, int cols)
 Direct Q5_0 x Q8_0 dot product test (takes pre-quantized Q8_0 input) More...
 
void ck_test_vec_dot_q8_0_q8_0 (const void *weight_q8_0, const void *input_q8_0, float *output, int cols)
 Direct Q8_0 x Q8_0 dot product test (takes pre-quantized Q8_0 input) More...
 
void dequant_q4_0_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q4_0 row (multiple blocks) More...
 
void dequant_q4_k_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q4_K row (multiple blocks) More...
 
void dequant_q5_1_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q5_1 row (multiple blocks) More...
 
void dequant_q6_k_row (const void *src, float *dst, size_t n_elements)
 Dequantize Q6_K row (multiple blocks) More...
 
void geglu_backward_fp32 (const float *x, const float *d_out, float *d_x, int n_tokens, int dim)
 
void geglu_forward_fp32 (const float *x, float *out, int tokens, int dim)
 
void gemm_nt_q4_k_q8_k (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q5_0_q8_0 (const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)
 Batch GEMM with Q5_0 weights and Q8_0 activations for prefill. More...
 
void gemm_nt_q5_1 (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 GEMM with transposed Q5_1 weights: C = A @ B^T. More...
 
void gemm_nt_q5_k (const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_nt_q6_k_q8_k (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
 NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K. More...
 
void gemm_nt_q8_0_q8_0 (const void *A_q8, const void *B_q8, const float *bias, float *C, int M, int N, int K)
 gemm_nt_q8_0_q8_0 with optional bias (matches header signature) More...
 
void gemv_q4_k_q8_k (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q5_0 (float *y, const void *W, const float *x, int M, int K)
 Auto-dispatch GEMV for Q5_0 weights based on CPU features. More...
 
void gemv_q5_0_q8_0 (float *y, const void *W, const void *x_q8, int M, int K)
 Matrix-vector multiply with Q5_0 weights and Q8_0 input. More...
 
void gemv_q5_1 (float *y, const void *W, const float *x, int M, int K)
 Auto-dispatch GEMV. More...
 
void gemv_q5_k (float *y, const void *W, const float *x, int M, int K)
 
void gemv_q6_k_q8_k (float *y, const void *W, const void *x_q8, int M, int K)
 GEMV: y = W @ x where W is Q6_K and x is Q8_K. More...
 
void gemv_q8_0 (float *y, const void *W, const float *x, int M, int K)
 Auto-dispatch GEMV for Q8_0 weights based on CPU features. More...
 
void gemv_q8_0_q8_0 (float *y, const void *W, const void *x_q8, int M, int K)
 Matrix-vector multiply with Q8_0 weights and Q8_0 input. More...
 
void mega_fused_outproj_mlp_prefill (float *output, const float *attn_out, const float *residual, const float *ln2_gamma, const void *wo, const float *bo, int wo_dt, const void *w1, const float *b1, int w1_dt, const void *w2, const float *b2, int w2_dt, int tokens, int embed_dim, int aligned_embed_dim, int num_heads, int aligned_head_dim, int intermediate_dim, int aligned_intermediate_dim, float eps, void *scratch)
 
size_t mega_fused_outproj_mlp_prefill_scratch_size (int tokens, int aligned_embed_dim, int num_heads, int aligned_head_dim, int aligned_intermediate_dim)
 Get scratch buffer size for mega_fused_outproj_mlp_prefill. More...
 
void quantize_row_q8_0 (const float *x, void *vy, int k)
 Quantize FP32 to Q8_0 format (scalar reference) More...
 
void quantize_row_q8_k (const float *x, void *vy, int k)
 
void rmsnorm_forward (const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
 
void rope_forward_qk (float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
 
void rope_precompute_cache (float *cos_cache, float *sin_cache, int max_seq_len, int head_dim, float base)
 
void swiglu_forward (const float *input, float *output, int tokens, int dim)
 
void vec_dot_q5_0_q8_0 (int n, float *s, const void *vx, const void *vy)
 Auto-dispatch quantized dot product Q5_0 x Q8_0. More...
 
void vec_dot_q8_0_q8_0 (int n, float *s, const void *vx, const void *vy)
 Auto-dispatch quantized dot product Q8_0 x Q8_0. More...
 

Detailed Description

C-Kernel-Engine Parity Testing API Implementation.

Wraps CK kernels for parity testing against llama.cpp/ggml.

Definition in file ck_parity_api.c.

Function Documentation

◆ attention_forward_causal_head_major_gqa_flash_strided()

void attention_forward_causal_head_major_gqa_flash_strided ( const float *  q,
const float *  k,
const float *  v,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  kv_stride_tokens 
)

Flash attention forward with custom KV stride (for KV cache)

Test:

test_flash_attention.py::TestFlashAttention::test_flash_strided

test_kv_cache_attention.py::TestKVCacheAttention::test_flash_attention

Variant with configurable kv_stride_tokens for KV cache layouts where K/V may not be contiguous in memory.

After changes: make test

Definition at line 859 of file attention_kernels.c.

869 {
870  if (!q || !k || !v || !output) {
871  return;
872  }
873  if (num_heads <= 0 || num_kv_heads <= 0 || num_tokens <= 0) {
874  return;
875  }
876  if (kv_stride_tokens < num_tokens) {
877  return;
878  }
879 
880  const float scale = 1.0f / sqrtf((float)head_dim);
881  const int T = num_tokens;
882  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
883 
884  // Select SIMD implementation based on compile-time CPU features
885 #if defined(__AVX512F__)
886  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx512
887 #elif defined(__AVX2__)
888  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx2
889 #elif defined(__AVX__)
890  #define FLASH_QUERY_IMPL attention_flash_query_causal_avx
891 #else
892  #define FLASH_QUERY_IMPL attention_flash_query_causal
893 #endif
894 
895  for (int h = 0; h < num_heads; ++h) {
896  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
897  const float *k_head = k + (size_t)kv_head * kv_head_stride;
898  const float *v_head = v + (size_t)kv_head * kv_head_stride;
899 
900  for (int i = 0; i < T; ++i) {
901  const float *q_vec = q + qkv_index(h, i, 0, T, aligned_head_dim);
902  float *out_vec = output + qkv_index(h, i, 0, T, aligned_head_dim);
903  FLASH_QUERY_IMPL(q_vec, k_head, v_head,
904  /*kv_tokens=*/i + 1,
905  head_dim, aligned_head_dim,
906  scale, out_vec);
907  }
908  }
909 
910 #undef FLASH_QUERY_IMPL
911 }
static size_t qkv_index(int h, int t, int d, int num_tokens, int aligned_head_dim)
#define FLASH_QUERY_IMPL

Referenced by ck_test_attention_causal().

◆ attention_forward_causal_head_major_gqa_flash_strided_sliding()

void attention_forward_causal_head_major_gqa_flash_strided_sliding ( const float *  q,
const float *  k,
const float *  v,
float *  output,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  kv_stride_tokens,
int  sliding_window 
)

Flash attention forward with sliding window (prefill)

Test:
test_attention.py::TestAttentionForward::test_sliding_window_prefill

Sliding-window attention for prefill: each token attends to the last W tokens. When sliding_window <= 0, behaves like regular causal attention.

After changes: make test

Definition at line 1316 of file attention_kernels.c.

1328 {
1329  if (!q || !k || !v || !output) {
1330  return;
1331  }
1332  if (num_heads <= 0 || num_kv_heads <= 0 || num_tokens <= 0) {
1333  return;
1334  }
1335  if (kv_stride_tokens < num_tokens) {
1336  return;
1337  }
1338 
1339  const float scale = 1.0f / sqrtf((float)head_dim);
1340  const int T = num_tokens;
1341  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
1342 
1343 #if defined(__AVX512F__)
1344  #define SLIDING_FLASH_IMPL attention_flash_query_sliding_avx512
1345 #elif defined(__AVX2__)
1346  #define SLIDING_FLASH_IMPL attention_flash_query_sliding_avx2
1347 #elif defined(__AVX__)
1348  #define SLIDING_FLASH_IMPL attention_flash_query_sliding_avx
1349 #else
1350  #define SLIDING_FLASH_IMPL attention_flash_query_sliding
1351 #endif
1352 
1353  for (int h = 0; h < num_heads; ++h) {
1354  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1355  const float *k_head = k + (size_t)kv_head * kv_head_stride;
1356  const float *v_head = v + (size_t)kv_head * kv_head_stride;
1357 
1358  for (int i = 0; i < T; ++i) {
1359  const float *q_vec = q + qkv_index(h, i, 0, T, aligned_head_dim);
1360  float *out_vec = output + qkv_index(h, i, 0, T, aligned_head_dim);
1361  SLIDING_FLASH_IMPL(q_vec, k_head, v_head,
1362  /*query_pos=*/i,
1363  /*kv_tokens=*/T,
1364  head_dim, aligned_head_dim,
1365  scale, out_vec,
1366  sliding_window);
1367  }
1368  }
1369 
1370 #undef SLIDING_FLASH_IMPL
1371 }
#define SLIDING_FLASH_IMPL

Referenced by ck_test_attention_sliding_window().

◆ attention_forward_decode_head_major_gqa_flash_sliding()

void attention_forward_decode_head_major_gqa_flash_sliding ( const float *  q_token,
const float *  k_cache,
const float *  v_cache,
float *  out_token,
int  num_heads,
int  num_kv_heads,
int  kv_tokens,
int  cache_capacity,
int  head_dim,
int  aligned_head_dim,
int  sliding_window 
)

Flash attention decode with sliding window

Test:
test_attention.py::TestAttentionForward::test_sliding_window_decode

Single query token attends to the last W tokens in the KV cache. For decode: effective_kv_tokens = min(kv_tokens, sliding_window)

After changes: make test

Definition at line 1382 of file attention_kernels.c.

1394 {
1395  if (!q_token || !k_cache || !v_cache || !out_token) {
1396  return;
1397  }
1398  if (num_heads <= 0 || num_kv_heads <= 0 || cache_capacity <= 0) {
1399  return;
1400  }
1401  if (kv_tokens <= 0 || kv_tokens > cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) {
1402  return;
1403  }
1404 
1405  const float scale = 1.0f / sqrtf((float)head_dim);
1406  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
1407 
1408  // Compute effective KV tokens based on sliding window
1409  int effective_kv_tokens = kv_tokens;
1410  if (sliding_window > 0 && sliding_window < kv_tokens) {
1411  effective_kv_tokens = sliding_window;
1412  }
1413 
1414  // Guard against empty window (shouldn't happen with kv_tokens >= 1)
1415  if (effective_kv_tokens <= 0) {
1416  return;
1417  }
1418 
1419  // Offset to start reading from the last effective_kv_tokens entries
1420  int kv_start_offset = kv_tokens - effective_kv_tokens;
1421 
1422 #if defined(__AVX512F__)
1423  #define SLIDING_DECODE_IMPL attention_flash_query_sliding_avx512
1424 #elif defined(__AVX2__)
1425  #define SLIDING_DECODE_IMPL attention_flash_query_sliding_avx2
1426 #elif defined(__AVX__)
1427  #define SLIDING_DECODE_IMPL attention_flash_query_sliding_avx
1428 #else
1429  #define SLIDING_DECODE_IMPL attention_flash_query_sliding
1430 #endif
1431 
1432  for (int h = 0; h < num_heads; ++h) {
1433  int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
1434  const float *q_head = q_token + (size_t)h * (size_t)aligned_head_dim;
1435  // Offset K/V pointer to start from the first token in the sliding window
1436  const float *k_head = k_cache + (size_t)kv_head * head_stride
1437  + (size_t)kv_start_offset * (size_t)aligned_head_dim;
1438  const float *v_head = v_cache + (size_t)kv_head * head_stride
1439  + (size_t)kv_start_offset * (size_t)aligned_head_dim;
1440  float *out_head = out_token + (size_t)h * (size_t)aligned_head_dim;
1441 
1442  // Use query_pos relative to the windowed KV (last token = effective_kv_tokens - 1)
1443  // sliding_window = 0 since we've already windowed via K/V pointer offset
1444  SLIDING_DECODE_IMPL(q_head, k_head, v_head,
1445  /*query_pos=*/effective_kv_tokens - 1,
1446  /*kv_tokens=*/effective_kv_tokens,
1447  head_dim, aligned_head_dim,
1448  scale, out_head,
1449  /*sliding_window=*/0);
1450  }
1451 
1452 #undef SLIDING_DECODE_IMPL
1453 }
#define SLIDING_DECODE_IMPL

Referenced by ck_test_attention_decode_sliding().

◆ ck_get_block_q4_k_size()

int ck_get_block_q4_k_size ( void  )

Get Q4_K block size in bytes.

Definition at line 961 of file ck_parity_api.c.

962 {
963  return sizeof(block_q4_K);
964 }

◆ ck_get_block_q5_1_size()

int ck_get_block_q5_1_size ( void  )

Get Q5_1 block size in bytes (24 bytes per 32 weights)

Definition at line 986 of file ck_parity_api.c.

987 {
988  return sizeof(block_q5_1);
989 }

◆ ck_get_block_q5_k_size()

int ck_get_block_q5_k_size ( void  )

Get Q5_K block size in bytes (176 bytes per 256 weights)

Definition at line 981 of file ck_parity_api.c.

982 {
983  return sizeof(block_q5_K);
984 }

◆ ck_get_block_q6_k_size()

int ck_get_block_q6_k_size ( void  )

Get Q6_K block size in bytes.

Definition at line 966 of file ck_parity_api.c.

967 {
968  return sizeof(block_q6_K);
969 }

◆ ck_get_block_q8_k_size()

int ck_get_block_q8_k_size ( void  )

Get Q8_K block size in bytes.

Definition at line 971 of file ck_parity_api.c.

972 {
973  return sizeof(block_q8_K);
974 }

◆ ck_get_qk5_1()

int ck_get_qk5_1 ( void  )

Get QK5_1 (elements per Q5_1 block)

Definition at line 991 of file ck_parity_api.c.

992 {
993  return QK5_1;
994 }
#define QK5_1
Definition: ckernel_quant.h:84

References QK5_1.

◆ ck_get_qk_k()

int ck_get_qk_k ( void  )

Get QK_K (elements per super-block)

Definition at line 976 of file ck_parity_api.c.

977 {
978  return QK_K;
979 }
#define QK_K

References QK_K.

◆ ck_test_attention_causal()

void ck_test_attention_causal ( const float *  q,
const float *  k,
const float *  v,
float *  out,
int  num_heads,
int  num_kv_heads,
int  tokens,
int  seq_len,
int  head_dim 
)

Multi-head causal attention for prefill (head-major layout)

Layout (head-major, matches llama.cpp test): Q: [num_heads, tokens, head_dim] K: [num_kv_heads, seq_len, head_dim] V: [num_kv_heads, seq_len, head_dim] out: [num_heads, tokens, head_dim]

Supports GQA (grouped-query attention) where num_heads > num_kv_heads. Causal masking: token t can only attend to positions 0..t (inclusive).

Parameters
q — Query [num_heads, tokens, head_dim]
k — Key [num_kv_heads, seq_len, head_dim]
v — Value [num_kv_heads, seq_len, head_dim]
out — Output [num_heads, tokens, head_dim]
num_heads — Number of query heads
num_kv_heads — Number of key/value heads (for GQA)
tokens — Number of query tokens
seq_len — Key/value sequence length (for prefill: seq_len == tokens)
head_dim — Dimension per head

Definition at line 736 of file ck_parity_api.c.

745 {
746  /* For prefill, seq_len == tokens, and kv_stride == tokens.
747  * The CK kernel expects strided KV layout with kv_stride_tokens parameter.
748  * For parity testing with contiguous tensors, kv_stride = seq_len.
749  */
751  q, k, v, out,
752  num_heads, num_kv_heads, tokens,
753  head_dim, head_dim, /* aligned_head_dim = head_dim for testing */
754  seq_len /* kv_stride_tokens = seq_len for contiguous KV */
755  );
756 }
void attention_forward_causal_head_major_gqa_flash_strided(const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens)

References attention_forward_causal_head_major_gqa_flash_strided().

◆ ck_test_attention_decode_sliding()

void ck_test_attention_decode_sliding ( const float *  q_token,
const float *  k_cache,
const float *  v_cache,
float *  out_token,
int  num_heads,
int  num_kv_heads,
int  kv_tokens,
int  cache_capacity,
int  head_dim,
int  sliding_window 
)

Test sliding-window attention (decode mode)

Single query token attending to KV cache with sliding window.

Definition at line 794 of file ck_parity_api.c.

804 {
806  q_token, k_cache, v_cache, out_token,
807  num_heads, num_kv_heads,
808  kv_tokens, cache_capacity, head_dim, head_dim,
809  sliding_window
810  );
811 }
void attention_forward_decode_head_major_gqa_flash_sliding(const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim, int sliding_window)

References attention_forward_decode_head_major_gqa_flash_sliding().

◆ ck_test_attention_sliding_window()

void ck_test_attention_sliding_window ( const float *  q,
const float *  k,
const float *  v,
float *  out,
int  num_heads,
int  num_kv_heads,
int  tokens,
int  seq_len,
int  head_dim,
int  sliding_window 
)

Test sliding-window attention (prefill)

Layout (head-major, matching CK-Engine): Q: [num_heads, tokens, head_dim] K: [num_kv_heads, seq_len, head_dim] V: [num_kv_heads, seq_len, head_dim] out: [num_heads, tokens, head_dim]

Each token attends only to the last sliding_window tokens.

Definition at line 769 of file ck_parity_api.c.

779 {
781  q, k, v, out,
782  num_heads, num_kv_heads, tokens,
783  head_dim, head_dim, /* aligned_head_dim = head_dim for testing */
784  seq_len, /* kv_stride_tokens = seq_len for contiguous KV */
785  sliding_window
786  );
787 }
void attention_forward_causal_head_major_gqa_flash_strided_sliding(const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens, int sliding_window)

References attention_forward_causal_head_major_gqa_flash_strided_sliding().

◆ ck_test_dequant_q4_0()

void ck_test_dequant_q4_0 ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q4_0 data to FP32.

Definition at line 122 of file ck_parity_api.c.

123 {
124  dequant_q4_0_row(src, dst, (size_t)n);
125 }
void dequant_q4_0_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_0 row (multiple blocks)

References dequant_q4_0_row().

◆ ck_test_dequant_q4_k()

void ck_test_dequant_q4_k ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q4_K data to FP32.

Parameters
src — Input Q4_K blocks
dst — Output FP32 values
n — Number of elements (must be multiple of 256)

Definition at line 112 of file ck_parity_api.c.

113 {
114  dequant_q4_k_row(src, dst, (size_t)n);
115 }
void dequant_q4_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_K row (multiple blocks)

References dequant_q4_k_row().

◆ ck_test_dequant_q5_1()

void ck_test_dequant_q5_1 ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q5_1 data to FP32.

Definition at line 127 of file ck_parity_api.c.

128 {
129  dequant_q5_1_row(src, dst, (size_t)n);
130 }
void dequant_q5_1_row(const void *src, float *dst, size_t n_elements)
Dequantize Q5_1 row (multiple blocks)

References dequant_q5_1_row().

◆ ck_test_dequant_q6_k()

void ck_test_dequant_q6_k ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q6_K data to FP32.

Definition at line 117 of file ck_parity_api.c.

118 {
119  dequant_q6_k_row(src, dst, (size_t)n);
120 }
void dequant_q6_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q6_K row (multiple blocks)

References dequant_q6_k_row().

◆ ck_test_geglu()

void ck_test_geglu ( const float *  x,
float *  out,
int  n_tokens,
int  dim 
)

Test GeGLU activation.

Computes: output = GELU(a) * b where input contains [a, b] concatenated along the last dimension.

Definition at line 819 of file ck_parity_api.c.

823 {
824  geglu_forward_fp32(x, out, n_tokens, dim);
825 }
void geglu_forward_fp32(const float *x, float *out, int tokens, int dim)
Definition: gelu_kernels.c:623

References geglu_forward_fp32().

◆ ck_test_geglu_backward()

void ck_test_geglu_backward ( const float *  x,
const float *  d_out,
float *  d_x,
int  n_tokens,
int  dim 
)

Test GeGLU backward.

Computes gradients dL/dx given dL/d(out) where out = GELU(a) * b

Definition at line 832 of file ck_parity_api.c.

837 {
838  geglu_backward_fp32(x, d_out, d_x, n_tokens, dim);
839 }
void geglu_backward_fp32(const float *x, const float *d_out, float *d_x, int n_tokens, int dim)
Definition: gelu_kernels.c:843

References geglu_backward_fp32().

◆ ck_test_gemm_q4_k()

void ck_test_gemm_q4_k ( const void *  weight_q4k,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q4_K GEMM - batched matrix multiply with quantized weights.

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])

Parameters
weight_q4k — Q4_K quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 256)
n_tokens — Batch size

Definition at line 392 of file ck_parity_api.c.

396 {
397  /* Allocate Q8_K buffer for quantized activations */
398  int n_blocks_per_row = cols / CK_QK_K;
399  block_q8_K *q8_data = (block_q8_K *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_K));
400  if (!q8_data) {
401  memset(output, 0, n_tokens * rows * sizeof(float));
402  return;
403  }
404 
405  /* Quantize all input tokens */
406  for (int t = 0; t < n_tokens; t++) {
407  quantize_row_q8_k(input_f32 + t * cols,
408  q8_data + t * n_blocks_per_row, cols);
409  }
410 
411  /* Use gemm_nt_q4_k_q8_k: C[M,N] = A[M,K] * B[N,K]^T
412  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
413  * So: M = n_tokens, N = rows, K = cols
414  */
415  gemm_nt_q4_k_q8_k(q8_data, weight_q4k, NULL, output, n_tokens, rows, cols);
416 
417  free(q8_data);
418 }
void quantize_row_q8_k(const float *x, void *vy, int k)
void gemm_nt_q4_k_q8_k(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
#define CK_QK_K
Definition: ck_parity_api.h:28

References CK_QK_K, gemm_nt_q4_k_q8_k(), and quantize_row_q8_k().

◆ ck_test_gemm_q5_0()

void ck_test_gemm_q5_0 ( const void *  weight_q5_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Test Q5_0 x Q8_0 GEMM (batch matrix multiply)

Q5_0 GEMM - batched matrix multiply with Q5_0 weights (32-element blocks)

Used for MLP W1 (gate/up projection) and attention Q/K with Q5_0 weights.

Definition at line 491 of file ck_parity_api.c.

495 {
496  /* Allocate Q8_0 buffer for quantized activations */
497  int n_blocks_per_row = cols / CK_QK8_0;
498  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_0));
499  if (!q8_data) {
500  memset(output, 0, n_tokens * rows * sizeof(float));
501  return;
502  }
503 
504  /* Quantize all input tokens */
505  for (int t = 0; t < n_tokens; t++) {
506  quantize_row_q8_0(input_f32 + t * cols,
507  q8_data + t * n_blocks_per_row, cols);
508  }
509 
510  /* Use gemm_nt_q5_0_q8_0: C[M,N] = A[M,K] * B[N,K]^T
511  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
512  * So: M = n_tokens, N = rows, K = cols
513  */
514  gemm_nt_q5_0_q8_0(q8_data, weight_q5_0, NULL, output, n_tokens, rows, cols);
515 
516  free(q8_data);
517 }
void gemm_nt_q5_0_q8_0(const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)
Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.
void quantize_row_q8_0(const float *x, void *vy, int k)
Quantize FP32 to Q8_0 format (scalar reference)
#define CK_QK8_0
Definition: ck_parity_api.h:30

References CK_QK8_0, gemm_nt_q5_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemm_q5_1()

void ck_test_gemm_q5_1 ( const void *  weight_q5_1,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Test Q5_1 x Q8_0 GEMM (batch matrix multiply)

Q5_1 GEMM - batched matrix multiply with Q5_1 weights (32-element blocks)

Used for MLP W1 (gate/up projection) and attention Q/K with Q5_1 weights. gemm_nt_q5_1 expects FP32 activations (not quantized).

Definition at line 542 of file ck_parity_api.c.

546 {
547  /* gemm_nt_q5_1 expects FP32 activations, not quantized.
548  * Pass input_f32 directly as-is (already FP32).
549  */
550  gemm_nt_q5_1(input_f32, weight_q5_1, NULL, output, n_tokens, rows, cols);
551 }
void gemm_nt_q5_1(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
GEMM with transposed Q5_1 weights: C = A @ B^T.

References gemm_nt_q5_1().

◆ ck_test_gemm_q5_k()

void ck_test_gemm_q5_k ( const void *  weight_q5_k,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Test Q5_K x Q8_K GEMM (batch matrix multiply)

Q5_K GEMM - batched matrix multiply with Q5_K weights (256-element super-blocks)

Used for MLP W1 (gate/up projection) and attention Q/K with Q5_K weights. gemm_nt_q5_k expects FP32 activations (not quantized).

Definition at line 525 of file ck_parity_api.c.

529 {
530  /* gemm_nt_q5_k expects FP32 activations, not quantized.
531  * Pass input_f32 directly as-is (already FP32).
532  */
533  gemm_nt_q5_k(input_f32, weight_q5_k, NULL, output, n_tokens, rows, cols);
534 }
void gemm_nt_q5_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

References gemm_nt_q5_k().

◆ ck_test_gemm_q6_k()

void ck_test_gemm_q6_k ( const void *  weight_q6k,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Test Q6_K x Q8_K GEMM (batch matrix multiply)

Q6_K GEMM - batched matrix multiply with Q6_K weights.

Used for MLP W2 (down projection) with Q6_K weights.

Definition at line 425 of file ck_parity_api.c.

429 {
430  /* Allocate Q8_K buffer for quantized activations */
431  int n_blocks_per_row = cols / CK_QK_K;
432  block_q8_K *q8_data = (block_q8_K *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_K));
433  if (!q8_data) {
434  memset(output, 0, n_tokens * rows * sizeof(float));
435  return;
436  }
437 
438  /* Quantize all input tokens */
439  for (int t = 0; t < n_tokens; t++) {
440  quantize_row_q8_k(input_f32 + t * cols,
441  q8_data + t * n_blocks_per_row, cols);
442  }
443 
444  /* Use gemm_nt_q6_k_q8_k: C[M,N] = A[M,K] * B[N,K]^T
445  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
446  * So: M = n_tokens, N = rows, K = cols
447  */
448  gemm_nt_q6_k_q8_k(q8_data, weight_q6k, NULL, output, n_tokens, rows, cols);
449 
450  free(q8_data);
451 }
void gemm_nt_q6_k_q8_k(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K.

References CK_QK_K, gemm_nt_q6_k_q8_k(), and quantize_row_q8_k().

◆ ck_test_gemm_q8_0()

void ck_test_gemm_q8_0 ( const void *  weight_q8_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Test Q8_0 x Q8_0 GEMM (batch matrix multiply)

Q8_0 GEMM - batched matrix multiply with Q8_0 weights (32-element blocks)

Used for attention V projection with Q8_0 weights.

Definition at line 458 of file ck_parity_api.c.

462 {
463  /* Allocate Q8_0 buffer for quantized activations */
464  int n_blocks_per_row = cols / CK_QK8_0;
465  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_0));
466  if (!q8_data) {
467  memset(output, 0, n_tokens * rows * sizeof(float));
468  return;
469  }
470 
471  /* Quantize all input tokens */
472  for (int t = 0; t < n_tokens; t++) {
473  quantize_row_q8_0(input_f32 + t * cols,
474  q8_data + t * n_blocks_per_row, cols);
475  }
476 
477  /* Use gemm_nt_q8_0_q8_0: C[M,N] = A[M,K] * B[N,K]^T
478  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
479  * So: M = n_tokens, N = rows, K = cols
480  */
481  gemm_nt_q8_0_q8_0(q8_data, weight_q8_0, NULL, output, n_tokens, rows, cols);
482 
483  free(q8_data);
484 }
void gemm_nt_q8_0_q8_0(const void *A_q8, const void *B_q8, const float *bias, float *C, int M, int N, int K)
gemm_nt_q8_0_q8_0 with optional bias (matches header signature)

References CK_QK8_0, gemm_nt_q8_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q4_k()

void ck_test_gemv_q4_k ( const void *  weight_q4k,
const float *  input_f32,
float *  output,
int  cols 
)

Q4_K GEMV - dot product of quantized weights and FP32 input.

Internally quantizes input to Q8_K, then computes dot product.

Parameters
weight_q4kQ4_K quantized weights [cols]
input_f32FP32 input vector [cols]
outputOutput scalar [1]
colsNumber of columns (must be multiple of 256)

Definition at line 145 of file ck_parity_api.c.

149 {
150  /* Allocate Q8_K buffer for quantized activations */
151  int n_blocks = cols / CK_QK_K;
152  block_q8_K *q8_data = (block_q8_K *)malloc(n_blocks * sizeof(block_q8_K));
153  if (!q8_data) {
154  *output = 0.0f;
155  return;
156  }
157 
158  /* Quantize input to Q8_K */
159  quantize_row_q8_k(input_f32, q8_data, cols);
160 
161  /* Compute dot product using GEMV with M=1 */
162  gemv_q4_k_q8_k(output, weight_q4k, q8_data, 1, cols);
163 
164  free(q8_data);
165 }
void gemv_q4_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)

References CK_QK_K, gemv_q4_k_q8_k(), and quantize_row_q8_k().

◆ ck_test_gemv_q5_0()

void ck_test_gemv_q5_0 ( const void *  weight_q5_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_0 GEMV - matrix-vector multiply with Q5_0 weights.

Parameters
weight_q5_0Q5_0 quantized weights [rows * cols]
input_f32FP32 input vector [cols]
outputFP32 output vector [rows]
rowsNumber of output rows
colsNumber of columns (must be multiple of 32)

Definition at line 192 of file ck_parity_api.c.

196 {
197  /* Match llama.cpp's test_gemv_q5_0:
198  * 1. Quantize input to Q8_0 format
199  * 2. Use quantized dot product (vec_dot_q5_0_q8_0)
200  *
201  * This ensures parity with llama.cpp which always uses the
202  * quantized path, NOT the FP32 dequantization path.
203  */
204  int n_blocks = cols / CK_QK8_0;
205  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
206  if (!q8_data) {
207  for (int r = 0; r < rows; r++) output[r] = 0.0f;
208  return;
209  }
210 
211  /* Quantize input to Q8_0 */
212  quantize_row_q8_0(input_f32, q8_data, cols);
213 
214  /* Call the quantized GEMV kernel (same as ck_test_gemv_q5_0_q8_0) */
215  gemv_q5_0_q8_0(output, weight_q5_0, q8_data, rows, cols);
216 
217  free(q8_data);
218 }
void gemv_q5_0_q8_0(float *y, const void *W, const void *x_q8, int M, int K)
Matrix-vector multiply with Q5_0 weights and Q8_0 input.

References CK_QK8_0, gemv_q5_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q5_0_q8_0()

void ck_test_gemv_q5_0_q8_0 ( const void *  weight_q5_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_0 x Q8_0 quantized GEMV - matches llama.cpp's approach.

This version quantizes the input to Q8_0 first, then uses integer dot products (like llama.cpp does). Use this for parity testing.

Parameters
weight_q5_0Q5_0 quantized weights [rows * cols]
input_f32FP32 input vector [cols] - will be quantized to Q8_0
outputFP32 output vector [rows]
rowsNumber of output rows
colsNumber of columns (must be multiple of 32)

Definition at line 248 of file ck_parity_api.c.

252 {
253  /* This matches llama.cpp's approach:
254  * 1. Quantize input to Q8_0 format
255  * 2. Use quantized dot product (integer math)
256  * 3. Scale at the end
257  */
258  int n_blocks = cols / CK_QK8_0;
259  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
260  if (!q8_data) {
261  for (int r = 0; r < rows; r++) output[r] = 0.0f;
262  return;
263  }
264 
265  /* Quantize input to Q8_0 */
266  quantize_row_q8_0(input_f32, q8_data, cols);
267 
268  /* Call the quantized GEMV kernel */
269  gemv_q5_0_q8_0(output, weight_q5_0, q8_data, rows, cols);
270 
271  free(q8_data);
272 }

References CK_QK8_0, gemv_q5_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q5_1()

void ck_test_gemv_q5_1 ( const void *  weight_q5_1,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_1 GEMV - matrix-vector multiply with Q5_1 weights (32-element blocks)

Uses raw FP32 activations: unlike ck_test_gemv_q5_0(), the input is NOT pre-quantized to Q8_0, because gemv_q5_1() takes a float* input directly.

Parameters
weight_q5_1Q5_1 quantized weights [rows * cols]
input_f32FP32 input vector [cols]
outputFP32 output vector [rows]
rowsNumber of output rows
colsNumber of columns (must be multiple of 32)

Definition at line 333 of file ck_parity_api.c.

337 {
338  /*
339  * IMPORTANT: gemv_q5_1() expects raw FP32 activations, NOT pre-quantized Q8_0.
340  * See comment in ck_test_gemv_q5_k() above for explanation.
341  */
342  for (int r = 0; r < rows; r++) {
343  gemv_q5_1(&output[r],
344  (const char *)weight_q5_1 + r * (cols / QK5_1) * sizeof(block_q5_1),
345  input_f32, 1, cols);
346  }
347 }
void gemv_q5_1(float *y, const void *W, const float *x, int M, int K)
Auto-dispatch GEMV.

References gemv_q5_1(), and QK5_1.

◆ ck_test_gemv_q5_k()

void ck_test_gemv_q5_k ( const void *  weight_q5_k,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_K GEMV - matrix-vector multiply with Q5_K weights (256-element super-blocks)

Uses raw FP32 activations: unlike ck_test_gemv_q4_k(), the input is NOT pre-quantized to Q8_K, because gemv_q5_k() takes a float* input directly.

Parameters
weight_q5_kQ5_K quantized weights [rows * cols]
input_f32FP32 input vector [cols]
outputFP32 output vector [rows]
rowsNumber of output rows
colsNumber of columns (must be multiple of 256)

Definition at line 301 of file ck_parity_api.c.

305 {
306  /*
307  * IMPORTANT: gemv_q5_k() expects raw FP32 activations, NOT pre-quantized Q8_K.
308  *
309  * This is different from gemv_q4_k_q8_k() and gemv_q5_0_q8_0() which are
310  * "quantized dot product" kernels that take block_q8_K or block_q8_0 input.
311  *
312  * WHY THIS IS ERROR-PRONE:
313  * When copying from ck_test_gemv_q5_0() (which calls gemv_q5_0_q8_0),
314  * it is natural to assume Q5_K also needs pre-quantization. But the
315  * function name tells you: gemv_q5_k() takes float*, while
316  * gemv_q5_0_q8_0() takes block_q8_0*. If the kernel name does not
317  * have "_q8_0" or "_q8_k" suffix, it expects FP32 input.
318  *
319  * PARITY NOTE:
320  * llama.cpp reference uses ggml_vec_dot_q5_K_q8_K which quantizes
321  * the input to Q8_K internally. Our FP32 path will have slightly
322  * different numerical results. Use tolerance ~1e-2 for comparison.
323  * To get exact parity, implement gemv_q5_k_q8_k() (quantized dot product).
324  */
325  for (int r = 0; r < rows; r++) {
326  gemv_q5_k(&output[r],
327  (const char *)weight_q5_k + r * (cols / CK_QK_K) * sizeof(block_q5_K),
328  input_f32, 1, cols);
329  }
330 }
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)

References CK_QK_K, and gemv_q5_k().

◆ ck_test_gemv_q6_k()

void ck_test_gemv_q6_k ( const void *  weight_q6k,
const float *  input_f32,
float *  output,
int  cols 
)

Q6_K GEMV.

Definition at line 167 of file ck_parity_api.c.

171 {
172  /* Q6_K GEMV is not yet implemented in CK - provide reference impl */
173  /* For now, dequantize and compute in FP32 */
174  float *weight_f32 = (float *)malloc(cols * sizeof(float));
175  if (!weight_f32) {
176  *output = 0.0f;
177  return;
178  }
179 
180  dequant_q6_k_row(weight_q6k, weight_f32, cols);
181 
182  /* Dot product in FP32 */
183  double sum = 0.0;
184  for (int i = 0; i < cols; i++) {
185  sum += (double)weight_f32[i] * (double)input_f32[i];
186  }
187  *output = (float)sum;
188 
189  free(weight_f32);
190 }

References dequant_q6_k_row().

◆ ck_test_gemv_q8_0()

void ck_test_gemv_q8_0 ( const void *  weight_q8_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q8_0 GEMV - matrix-vector multiply with Q8_0 weights.

Parameters
weight_q8_0Q8_0 quantized weights [rows * cols]
input_f32FP32 input vector [cols]
outputFP32 output vector [rows]
rowsNumber of output rows
colsNumber of columns (must be multiple of 32)

Definition at line 220 of file ck_parity_api.c.

224 {
225  /* Match llama.cpp's test_gemv_q8_0:
226  * 1. Quantize input to Q8_0 format
227  * 2. Use quantized dot product (vec_dot_q8_0_q8_0)
228  *
229  * This ensures parity with llama.cpp which always uses the
230  * quantized path, NOT the FP32 dequantization path.
231  */
232  int n_blocks = cols / CK_QK8_0;
233  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
234  if (!q8_data) {
235  for (int r = 0; r < rows; r++) output[r] = 0.0f;
236  return;
237  }
238 
239  /* Quantize input to Q8_0 */
240  quantize_row_q8_0(input_f32, q8_data, cols);
241 
242  /* Call the quantized GEMV kernel (same as ck_test_gemv_q8_0_q8_0) */
243  gemv_q8_0_q8_0(output, weight_q8_0, q8_data, rows, cols);
244 
245  free(q8_data);
246 }
void gemv_q8_0_q8_0(float *y, const void *W, const void *x_q8, int M, int K)
Matrix-vector multiply with Q8_0 weights and Q8_0 input.

References CK_QK8_0, gemv_q8_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q8_0_q8_0()

void ck_test_gemv_q8_0_q8_0 ( const void *  weight_q8_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q8_0 x Q8_0 quantized GEMV - matches llama.cpp's approach.

This version quantizes the input to Q8_0 first, then uses integer dot products (like llama.cpp does). Use this for parity testing.

Parameters
weight_q8_0Q8_0 quantized weights [rows * cols]
input_f32FP32 input vector [cols] - will be quantized to Q8_0
outputFP32 output vector [rows]
rowsNumber of output rows
colsNumber of columns (must be multiple of 32)

Definition at line 274 of file ck_parity_api.c.

278 {
279  /* This matches llama.cpp's approach:
280  * 1. Quantize input to Q8_0 format
281  * 2. Use quantized dot product (integer math)
282  * 3. Scale at the end
283  */
284  int n_blocks = cols / CK_QK8_0;
285  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
286  if (!q8_data) {
287  for (int r = 0; r < rows; r++) output[r] = 0.0f;
288  return;
289  }
290 
291  /* Quantize input to Q8_0 */
292  quantize_row_q8_0(input_f32, q8_data, cols);
293 
294  /* Call the quantized GEMV kernel */
295  gemv_q8_0_q8_0(output, weight_q8_0, q8_data, rows, cols);
296 
297  free(q8_data);
298 }

References CK_QK8_0, gemv_q8_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_outproj_mlp_fused_q5_0()

void ck_test_outproj_mlp_fused_q5_0 ( const float *  attn_out,
const float *  residual,
const float *  ln2_gamma,
const void *  wo,
const void *  w1,
const void *  w2,
float *  output,
int  tokens,
int  num_heads,
int  head_dim,
int  embed_dim,
int  intermediate,
float  eps,
int  w2_is_q6k 
)

Test mega-fused OutProj + MLP kernel (Q5_0 weights)

This is a simplified wrapper for parity testing that:

  • Uses Q5_0 for W_o and W1 weights
  • Uses Q4_K for W2 weights
  • Allocates scratch internally
Parameters
attn_outAttention output [num_heads, tokens, head_dim] (FP32, head-major)
residualResidual input [tokens, embed_dim] (FP32)
ln2_gammaRMSNorm gamma [embed_dim] (FP32)
woOutProj weights [embed_dim, embed_dim] (Q5_0)
w1MLP W1 weights [2*intermediate, embed_dim] (Q5_0)
w2MLP W2 weights [embed_dim, intermediate] (Q4_K or Q6_K)
outputOutput [tokens, embed_dim] (FP32)
tokensNumber of tokens
num_headsNumber of attention heads
head_dimDimension per head
embed_dimEmbedding dimension (= num_heads * head_dim)
intermediateMLP intermediate dimension
epsRMSNorm epsilon
w2_is_q6kIf true, W2 is Q6_K; if false, W2 is Q4_K

Definition at line 894 of file ck_parity_api.c.

909 {
910  /* CK uses dtype enum: CK_DT_Q5_0 = 11, CK_DT_Q4_K = 7, CK_DT_Q6_K = 8 */
911  const int CK_DT_Q5_0_VAL = 11;
912  const int CK_DT_Q4_K_VAL = 7;
913  const int CK_DT_Q6_K_VAL = 8;
914 
915  /* For parity testing, aligned = actual (no padding) */
916  int aligned_embed_dim = embed_dim;
917  int aligned_head_dim = head_dim;
918  int aligned_intermediate = intermediate;
919 
920  /* Ensure intermediate is multiple of 256 (QK_K) for K-quants */
921  if ((intermediate % 256) != 0) {
922  aligned_intermediate = ((intermediate + 255) / 256) * 256;
923  }
924 
925  /* Allocate scratch */
926  size_t scratch_size = mega_fused_outproj_mlp_prefill_scratch_size(
927  tokens, aligned_embed_dim, num_heads, aligned_head_dim, aligned_intermediate);
928 
929  void *scratch = malloc(scratch_size);
930  if (!scratch) {
931  return;
932  }
933 
 934  /* Call the mega-fused kernel */
 935  mega_fused_outproj_mlp_prefill(
 936  output,
937  attn_out,
938  residual,
939  ln2_gamma,
940  wo, NULL, CK_DT_Q5_0_VAL, /* W_o with Q5_0 */
941  w1, NULL, CK_DT_Q5_0_VAL, /* W1 with Q5_0 */
942  w2, NULL, w2_is_q6k ? CK_DT_Q6_K_VAL : CK_DT_Q4_K_VAL, /* W2 with Q4_K or Q6_K */
943  tokens,
944  embed_dim,
945  aligned_embed_dim,
946  num_heads,
947  aligned_head_dim,
948  intermediate,
949  aligned_intermediate,
950  eps,
951  scratch
952  );
953 
954  free(scratch);
955 }
void mega_fused_outproj_mlp_prefill(float *output, const float *attn_out, const float *residual, const float *ln2_gamma, const void *wo, const float *bo, int wo_dt, const void *w1, const float *b1, int w1_dt, const void *w2, const float *b2, int w2_dt, int tokens, int embed_dim, int aligned_embed_dim, int num_heads, int aligned_head_dim, int intermediate_dim, int aligned_intermediate_dim, float eps, void *scratch)
size_t mega_fused_outproj_mlp_prefill_scratch_size(int tokens, int aligned_embed_dim, int num_heads, int aligned_head_dim, int aligned_intermediate_dim)
Get scratch buffer size for mega_fused_outproj_mlp_prefill.

References mega_fused_outproj_mlp_prefill(), and mega_fused_outproj_mlp_prefill_scratch_size().

◆ ck_test_quantize_q8_k()

void ck_test_quantize_q8_k ( const float *  src,
void *  dst,
int  n 
)

Quantize FP32 to Q8_K (for activations)

Parameters
srcInput FP32 values
dstOutput Q8_K blocks
nNumber of elements (must be multiple of 256)

Definition at line 136 of file ck_parity_api.c.

137 {
138  quantize_row_q8_k(src, dst, n);
139 }

References quantize_row_q8_k().

◆ ck_test_rmsnorm()

void ck_test_rmsnorm ( const float *  input,
const float *  weight,
float *  output,
int  n_tokens,
int  dim,
float  eps 
)

RMSNorm.

Computes: output = (input / rms(input)) * weight where rms(x) = sqrt(mean(x^2) + eps)

Parameters
inputInput tensor [n_tokens, dim]
weightNormalization weights [dim]
outputOutput tensor [n_tokens, dim]
n_tokensNumber of tokens
dimHidden dimension
epsEpsilon for numerical stability

Definition at line 557 of file ck_parity_api.c.

561 {
562  /* CK rmsnorm_forward has aligned_embed_dim parameter
563  * For testing, use dim as aligned_embed_dim (no padding) */
564  rmsnorm_forward(input, weight, output, NULL, n_tokens, dim, dim, eps);
565 }
void rmsnorm_forward(const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)

References rmsnorm_forward().

◆ ck_test_rope()

void ck_test_rope ( float *  q,
float *  k,
int  n_tokens,
int  n_heads,
int  n_heads_kv,
int  head_dim,
int  pos_offset,
float  theta 
)

RoPE (Rotary Position Embedding)

Applies rotary position embeddings to Q and K tensors.

NOTE: CK uses rotate-half format (split first/second halves) while some implementations use interleaved format. The test harness should account for this.

Parameters
qQuery tensor [n_tokens, n_heads * head_dim], modified in-place
kKey tensor [n_tokens, n_heads_kv * head_dim], modified in-place
n_tokensNumber of tokens
n_headsNumber of query heads
n_heads_kvNumber of key/value heads
head_dimDimension per head
pos_offsetStarting position for RoPE
thetaRoPE base frequency (typically 10000.0)

Definition at line 567 of file ck_parity_api.c.

570 {
571  /* Precompute cos/sin cache */
572  int half_dim = head_dim / 2;
573  int max_seq = pos_offset + n_tokens;
574 
575  float *cos_cache = (float *)malloc(max_seq * half_dim * sizeof(float));
576  float *sin_cache = (float *)malloc(max_seq * half_dim * sizeof(float));
577  if (!cos_cache || !sin_cache) {
578  free(cos_cache);
579  free(sin_cache);
580  return;
581  }
582 
583  rope_precompute_cache(cos_cache, sin_cache, max_seq, head_dim, theta);
584 
585  /* CK RoPE expects layout [num_heads, num_tokens, head_dim]
586  * Reshape from [n_tokens, n_heads * head_dim] to [n_heads, n_tokens, head_dim]
587  */
588  float *q_reorder = (float *)malloc(n_heads * n_tokens * head_dim * sizeof(float));
589  float *k_reorder = (float *)malloc(n_heads_kv * n_tokens * head_dim * sizeof(float));
590 
591  if (q_reorder && k_reorder) {
592  /* Reorder Q: [T, H*D] -> [H, T, D] */
593  for (int t = 0; t < n_tokens; t++) {
594  for (int h = 0; h < n_heads; h++) {
595  for (int d = 0; d < head_dim; d++) {
596  q_reorder[h * n_tokens * head_dim + t * head_dim + d] =
597  q[t * n_heads * head_dim + h * head_dim + d];
598  }
599  }
600  }
601 
602  /* Reorder K: [T, H_kv*D] -> [H_kv, T, D] */
603  for (int t = 0; t < n_tokens; t++) {
604  for (int h = 0; h < n_heads_kv; h++) {
605  for (int d = 0; d < head_dim; d++) {
606  k_reorder[h * n_tokens * head_dim + t * head_dim + d] =
607  k[t * n_heads_kv * head_dim + h * head_dim + d];
608  }
609  }
610  }
611 
612  /* Apply RoPE */
613  rope_forward_qk(q_reorder, k_reorder,
614  cos_cache, sin_cache,
615  n_heads, n_heads_kv, n_tokens,
616  head_dim, head_dim, pos_offset);
617 
618  /* Reorder back: [H, T, D] -> [T, H*D] */
619  for (int t = 0; t < n_tokens; t++) {
620  for (int h = 0; h < n_heads; h++) {
621  for (int d = 0; d < head_dim; d++) {
622  q[t * n_heads * head_dim + h * head_dim + d] =
623  q_reorder[h * n_tokens * head_dim + t * head_dim + d];
624  }
625  }
626  }
627 
628  for (int t = 0; t < n_tokens; t++) {
629  for (int h = 0; h < n_heads_kv; h++) {
630  for (int d = 0; d < head_dim; d++) {
631  k[t * n_heads_kv * head_dim + h * head_dim + d] =
632  k_reorder[h * n_tokens * head_dim + t * head_dim + d];
633  }
634  }
635  }
636  }
637 
638  free(q_reorder);
639  free(k_reorder);
640  free(cos_cache);
641  free(sin_cache);
642 }
void rope_precompute_cache(float *cos_cache, float *sin_cache, int max_seq_len, int head_dim, float base)
Definition: rope_kernels.c:52
void rope_forward_qk(float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:448

References rope_forward_qk(), and rope_precompute_cache().

◆ ck_test_rope_interleaved()

void ck_test_rope_interleaved ( float *  q,
float *  k,
int  n_tokens,
int  n_heads,
int  n_heads_kv,
int  head_dim,
int  pos_offset,
float  theta 
)

RoPE with interleaved format (for llama.cpp compatibility)

Uses interleaved format: (x0, x1) -> (x0*cos - x1*sin, x0*sin + x1*cos)

Definition at line 644 of file ck_parity_api.c.

647 {
648  /* Interleaved RoPE format (matches llama.cpp):
649  * (x0, x1) -> (x0*cos - x1*sin, x0*sin + x1*cos)
650  * Applied to consecutive pairs of elements
651  */
652 
653  /* Precompute inverse frequencies */
654  float *inv_freq = (float *)malloc((head_dim / 2) * sizeof(float));
655  if (!inv_freq) return;
656 
657  for (int i = 0; i < head_dim / 2; i++) {
658  inv_freq[i] = 1.0f / powf(theta, (float)(2 * i) / head_dim);
659  }
660 
661  /* Apply RoPE to Q */
662  for (int t = 0; t < n_tokens; t++) {
663  int pos = pos_offset + t;
664  for (int h = 0; h < n_heads; h++) {
665  float *qh = q + t * n_heads * head_dim + h * head_dim;
666 
667  for (int i = 0; i < head_dim / 2; i++) {
668  float freq = pos * inv_freq[i];
669  float cos_val = cosf(freq);
670  float sin_val = sinf(freq);
671 
672  /* Interleaved format */
673  float x0 = qh[i * 2];
674  float x1 = qh[i * 2 + 1];
675  qh[i * 2] = x0 * cos_val - x1 * sin_val;
676  qh[i * 2 + 1] = x0 * sin_val + x1 * cos_val;
677  }
678  }
679  }
680 
681  /* Apply RoPE to K */
682  for (int t = 0; t < n_tokens; t++) {
683  int pos = pos_offset + t;
684  for (int h = 0; h < n_heads_kv; h++) {
685  float *kh = k + t * n_heads_kv * head_dim + h * head_dim;
686 
687  for (int i = 0; i < head_dim / 2; i++) {
688  float freq = pos * inv_freq[i];
689  float cos_val = cosf(freq);
690  float sin_val = sinf(freq);
691 
692  float x0 = kh[i * 2];
693  float x1 = kh[i * 2 + 1];
694  kh[i * 2] = x0 * cos_val - x1 * sin_val;
695  kh[i * 2 + 1] = x0 * sin_val + x1 * cos_val;
696  }
697  }
698  }
699 
700  free(inv_freq);
701 }

◆ ck_test_softmax()

void ck_test_softmax ( const float *  input,
float *  output,
int  n 
)

Softmax (simple, non-causal)

Computes: output[i] = exp(input[i]) / sum(exp(input))

Parameters
inputInput tensor [n]
outputOutput tensor [n]
nNumber of elements

Definition at line 710 of file ck_parity_api.c.

711 {
712  /* Find max for numerical stability */
713  float max_val = input[0];
714  for (int i = 1; i < n; i++) {
715  if (input[i] > max_val) max_val = input[i];
716  }
717 
718  /* Compute exp and sum */
719  float sum = 0.0f;
720  for (int i = 0; i < n; i++) {
721  output[i] = expf(input[i] - max_val);
722  sum += output[i];
723  }
724 
725  /* Normalize */
726  float inv_sum = 1.0f / sum;
727  for (int i = 0; i < n; i++) {
728  output[i] *= inv_sum;
729  }
730 }

◆ ck_test_swiglu()

void ck_test_swiglu ( const float *  gate_up,
float *  output,
int  n_tokens,
int  intermediate_dim 
)

SwiGLU activation.

Computes: output = SiLU(gate) * up where SiLU(x) = x * sigmoid(x)

Parameters
gate_upInput tensor [n_tokens, 2 * intermediate_dim] Layout: [gate_0..gate_D-1, up_0..up_D-1] per token
outputOutput tensor [n_tokens, intermediate_dim]
n_tokensNumber of tokens
intermediate_dimIntermediate dimension

Definition at line 703 of file ck_parity_api.c.

706 {
707  swiglu_forward(gate_up, output, n_tokens, intermediate_dim);
708 }
void swiglu_forward(const float *input, float *output, int tokens, int dim)

References swiglu_forward().

◆ ck_test_vec_dot_q5_0_q8_0()

void ck_test_vec_dot_q5_0_q8_0 ( const void *  weight_q5_0,
const void *  input_q8_0,
float *  output,
int  cols 
)

Direct Q5_0 x Q8_0 dot product test (takes pre-quantized Q8_0 input)

Direct Q5_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)

This is a "direct" test that bypasses FP32-to-Q8_0 conversion. Useful for isolating kernel bugs from quantization bugs.

Parameters
weight_q5_0Q5_0 quantized weights [cols]
input_q8_0Q8_0 quantized input [cols] (pre-quantized!)
outputOutput scalar [1]
colsNumber of elements (must be multiple of 32)

Definition at line 364 of file ck_parity_api.c.

368 {
369  vec_dot_q5_0_q8_0(cols, output, weight_q5_0, input_q8_0);
370 }
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q5_0 x Q8_0.

References vec_dot_q5_0_q8_0().

◆ ck_test_vec_dot_q8_0_q8_0()

void ck_test_vec_dot_q8_0_q8_0 ( const void *  weight_q8_0,
const void *  input_q8_0,
float *  output,
int  cols 
)

Direct Q8_0 x Q8_0 dot product test (takes pre-quantized Q8_0 input)

Direct Q8_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)

Parameters
weight_q8_0Q8_0 quantized weights [cols]
input_q8_0Q8_0 quantized input [cols] (pre-quantized!)
outputOutput scalar [1]
colsNumber of elements (must be multiple of 32)

Definition at line 380 of file ck_parity_api.c.

384 {
385  vec_dot_q8_0_q8_0(cols, output, weight_q8_0, input_q8_0);
386 }
void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q8_0 x Q8_0.

References vec_dot_q8_0_q8_0().

◆ dequant_q4_0_row()

void dequant_q4_0_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q4_0 row (multiple blocks)

Parameters
srcQ4_0 data
dstFP32 output
n_elementsNumber of elements to dequantize

Definition at line 61 of file dequant_kernels.c.

62 {
63  const block_q4_0 *blocks = (const block_q4_0 *)src;
64  const size_t n_blocks = n_elements / QK4_0;
65 
66  for (size_t b = 0; b < n_blocks; b++) {
67  dequant_q4_0_block(&blocks[b], &dst[b * QK4_0]);
68  }
69 }
#define QK4_0
Definition: ckernel_quant.h:35
void dequant_q4_0_block(const block_q4_0 *block, float *output)
Dequantize a single Q4_0 block to FP32.

Referenced by ck_test_dequant_q4_0(), and dequant_row().

◆ dequant_q4_k_row()

void dequant_q4_k_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q4_K row (multiple blocks)

Definition at line 370 of file dequant_kernels.c.

371 {
372  const block_q4_K *blocks = (const block_q4_K *)src;
373  const size_t n_blocks = n_elements / QK_K;
374 
375  for (size_t b = 0; b < n_blocks; b++) {
376  dequant_q4_k_block(&blocks[b], &dst[b * QK_K]);
377  }
378 }
void dequant_q4_k_block(const block_q4_K *block, float *output)
Dequantize a single Q4_K block to FP32.

Referenced by ck_test_dequant_q4_k(), and dequant_row().

◆ dequant_q5_1_row()

void dequant_q5_1_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q5_1 row (multiple blocks)

Definition at line 255 of file dequant_kernels.c.

256 {
257  const block_q5_1 *blocks = (const block_q5_1 *)src;
258  const size_t n_blocks = n_elements / QK5_1;
259 
260  for (size_t b = 0; b < n_blocks; b++) {
261  dequant_q5_1_block(&blocks[b], &dst[b * QK5_1]);
262  }
263 }
void dequant_q5_1_block(const block_q5_1 *block, float *output)
Dequantize a single Q5_1 block to FP32.

Referenced by ck_test_dequant_q5_1(), and dequant_row().

◆ dequant_q6_k_row()

void dequant_q6_k_row ( const void *  src,
float *  dst,
size_t  n_elements 
)

Dequantize Q6_K row (multiple blocks)

Definition at line 420 of file dequant_kernels.c.

421 {
422  const block_q6_K *blocks = (const block_q6_K *)src;
423  const size_t n_blocks = n_elements / QK_K;
424 
425  for (size_t b = 0; b < n_blocks; b++) {
426  dequant_q6_k_block(&blocks[b], &dst[b * QK_K]);
427  }
428 }
void dequant_q6_k_block(const block_q6_K *block, float *output)
Dequantize a single Q6_K block to FP32.

Referenced by ck_test_dequant_q6_k(), ck_test_gemv_q6_k(), and dequant_row().

◆ geglu_backward_fp32()

void geglu_backward_fp32 ( const float *  x,
const float *  d_out,
float *  d_x,
int  tokens,
int  dim 
)

GeGLU backward pass (fp32)

Test:
test_geglu.py::TestGeGLU::test_geglu_backward_fp32

dL/dx given dL/d(out) where out = GELU(a) * b Chain rule: dL/da = dL/dout * d(GELU)/da * b dL/db = dL/dout * GELU(a)

After changes: make test

Definition at line 843 of file gelu_kernels.c.

848 {
849  const float sqrt_2_over_pi = 0.7978845608f;
850  const float coeff = 0.044715f;
851 
852  const int inner_dim = dim * 2;
853 
854  for (int t = 0; t < tokens; ++t) {
855  const float *x_ptr = x + (size_t)t * inner_dim;
856  const float *d_out_ptr = d_out + (size_t)t * dim;
857  float *d_x_ptr = d_x + (size_t)t * inner_dim;
858 
859  for (int d = 0; d < dim; ++d) {
860  float a = x_ptr[d];
861  float b = x_ptr[dim + d];
862  float dout = d_out_ptr[d];
863 
864  // GELU(a) derivative components
865  float a2 = a * a;
866  float a3 = a2 * a;
867  float g = sqrt_2_over_pi * (a + coeff * a3);
868  float tanh_g = tanhf(g);
869  float sech2_g = 1.0f - tanh_g * tanh_g;
870  float g_prime = sqrt_2_over_pi * (1.0f + 3.0f * coeff * a2);
871 
872  // d(GELU)/da = 0.5 * (1 + tanh(g)) + 0.5 * a * sech^2(g) * g'
873  float d_gelu = 0.5f * (1.0f + tanh_g) + 0.5f * a * sech2_g * g_prime;
874 
875  // dL/da = dL/dout * d(GELU)/da * b
876  d_x_ptr[d] = dout * d_gelu * b;
877 
878  // dL/db = dL/dout * GELU(a)
879  float gelu_a = 0.5f * a * (1.0f + tanh_g);
880  d_x_ptr[dim + d] = dout * gelu_a;
881  }
882  }
883 }

Referenced by ck_test_geglu_backward().

◆ geglu_forward_fp32()

void geglu_forward_fp32 ( const float *  x,
float *  out,
int  tokens,
int  dim 
)

GeGLU forward pass (fp32)

Test:
test_geglu.py::TestGeGLU::test_geglu_forward_fp32

Computes out = GELU(a) * b where x = [a, b] along last dimension. Input shape: [tokens, 2 * dim], Output shape: [tokens, dim]

After changes: make test

Definition at line 623 of file gelu_kernels.c.

624 {
625  const float sqrt_2_over_pi = 0.7978845608f;
626  const float coeff = 0.044715f;
627 
628  const int inner_dim = dim * 2;
629 
630 #if defined(__AVX512F__)
631  const __m512 sqrt_2_pi_vec = _mm512_set1_ps(sqrt_2_over_pi);
632  const __m512 coeff_vec = _mm512_set1_ps(coeff);
633  const __m512 half_vec = _mm512_set1_ps(0.5f);
634  const __m512 one_vec = _mm512_set1_ps(1.0f);
635 
636  for (int t = 0; t < tokens; ++t) {
637  const float *x_ptr = x + (size_t)t * inner_dim;
638  float *out_ptr = out + (size_t)t * dim;
639 
640  int d = 0;
641  // Process first half (a) with GELU, second half (b) directly
642  for (; d + 32 <= dim; d += 32) {
643  // Load a (first half of inner_dim)
644  __m512 a0 = _mm512_loadu_ps(&x_ptr[d]);
645  __m512 a1 = _mm512_loadu_ps(&x_ptr[d + 16]);
646 
647  // Compute GELU(a)
648  __m512 a0_sq = _mm512_mul_ps(a0, a0);
649  __m512 a0_cu = _mm512_mul_ps(a0_sq, a0);
650  __m512 a1_sq = _mm512_mul_ps(a1, a1);
651  __m512 a1_cu = _mm512_mul_ps(a1_sq, a1);
652 
653  // inner = sqrt(2/pi) * (a + 0.044715 * a^3)
654  __m512 inner0 = _mm512_fmadd_ps(coeff_vec, a0_cu, a0);
655  __m512 inner1 = _mm512_fmadd_ps(coeff_vec, a1_cu, a1);
656  inner0 = _mm512_mul_ps(sqrt_2_pi_vec, inner0);
657  inner1 = _mm512_mul_ps(sqrt_2_pi_vec, inner1);
658 
659  // tanh(inner)
660  __m512 tanh0 = tanh512_fast(inner0);
661  __m512 tanh1 = tanh512_fast(inner1);
662 
663  // GELU = 0.5 * a * (1 + tanh)
664  __m512 gelu0 = _mm512_mul_ps(half_vec, _mm512_mul_ps(a0, _mm512_add_ps(one_vec, tanh0)));
665  __m512 gelu1 = _mm512_mul_ps(half_vec, _mm512_mul_ps(a1, _mm512_add_ps(one_vec, tanh1)));
666 
667  // Load b (second half of inner_dim)
668  __m512 b0 = _mm512_loadu_ps(&x_ptr[dim + d]);
669  __m512 b1 = _mm512_loadu_ps(&x_ptr[dim + d + 16]);
670 
671  // out = GELU(a) * b
672  _mm512_storeu_ps(&out_ptr[d], _mm512_mul_ps(gelu0, b0));
673  _mm512_storeu_ps(&out_ptr[d + 16], _mm512_mul_ps(gelu1, b1));
674  }
675  // Handle remaining
676  for (; d < dim; ++d) {
677  float a = x_ptr[d];
678  float b = x_ptr[dim + d];
679  float a3 = a * a * a;
680  float inner = sqrt_2_over_pi * (a + coeff * a3);
681  float gelu_a = 0.5f * a * (1.0f + tanhf(inner));
682  out_ptr[d] = gelu_a * b;
683  }
684  }
685 
686 #elif defined(__AVX2__)
687  const __m256 sqrt_2_pi_vec = _mm256_set1_ps(sqrt_2_over_pi);
688  const __m256 coeff_vec = _mm256_set1_ps(coeff);
689  const __m256 half_vec = _mm256_set1_ps(0.5f);
690  const __m256 one_vec = _mm256_set1_ps(1.0f);
691 
692  for (int t = 0; t < tokens; ++t) {
693  const float *x_ptr = x + (size_t)t * inner_dim;
694  float *out_ptr = out + (size_t)t * dim;
695 
696  int d = 0;
697  for (; d + 16 <= dim; d += 16) {
698  // Load a
699  __m256 a0 = _mm256_loadu_ps(&x_ptr[d]);
700  __m256 a1 = _mm256_loadu_ps(&x_ptr[d + 8]);
701 
702  // GELU(a)
703  __m256 a0_sq = _mm256_mul_ps(a0, a0);
704  __m256 a0_cu = _mm256_mul_ps(a0_sq, a0);
705  __m256 a1_sq = _mm256_mul_ps(a1, a1);
706  __m256 a1_cu = _mm256_mul_ps(a1_sq, a1);
707 
708  __m256 inner0 = _mm256_fmadd_ps(coeff_vec, a0_cu, a0);
709  __m256 inner1 = _mm256_fmadd_ps(coeff_vec, a1_cu, a1);
710  inner0 = _mm256_mul_ps(sqrt_2_pi_vec, inner0);
711  inner1 = _mm256_mul_ps(sqrt_2_pi_vec, inner1);
712 
713  __m256 tanh0 = tanh256_fast(inner0);
714  __m256 tanh1 = tanh256_fast(inner1);
715 
716  __m256 gelu0 = _mm256_mul_ps(half_vec, _mm256_mul_ps(a0, _mm256_add_ps(one_vec, tanh0)));
717  __m256 gelu1 = _mm256_mul_ps(half_vec, _mm256_mul_ps(a1, _mm256_add_ps(one_vec, tanh1)));
718 
719  // b
720  __m256 b0 = _mm256_loadu_ps(&x_ptr[dim + d]);
721  __m256 b1 = _mm256_loadu_ps(&x_ptr[dim + d + 8]);
722 
723  _mm256_storeu_ps(&out_ptr[d], _mm256_mul_ps(gelu0, b0));
724  _mm256_storeu_ps(&out_ptr[d + 8], _mm256_mul_ps(gelu1, b1));
725  }
726  for (; d < dim; ++d) {
727  float a = x_ptr[d];
728  float b = x_ptr[dim + d];
729  float a3 = a * a * a;
730  float inner = sqrt_2_over_pi * (a + coeff * a3);
731  float gelu_a = 0.5f * a * (1.0f + tanhf(inner));
732  out_ptr[d] = gelu_a * b;
733  }
734  }
735 
736 #elif defined(__AVX__)
737  const __m256 sqrt_2_pi_vec = _mm256_set1_ps(sqrt_2_over_pi);
738  const __m256 coeff_vec = _mm256_set1_ps(coeff);
739  const __m256 half_vec = _mm256_set1_ps(0.5f);
740  const __m256 one_vec = _mm256_set1_ps(1.0f);
741 
742  float inner_arr[8] __attribute__((aligned(32)));
743  float tanh_arr[8] __attribute__((aligned(32)));
744 
745  for (int t = 0; t < tokens; ++t) {
746  const float *x_ptr = x + (size_t)t * inner_dim;
747  float *out_ptr = out + (size_t)t * dim;
748 
749  int d = 0;
750  for (; d + 8 <= dim; d += 8) {
751  __m256 a = _mm256_loadu_ps(&x_ptr[d]);
752  __m256 a_sq = _mm256_mul_ps(a, a);
753  __m256 a_cu = _mm256_mul_ps(a_sq, a);
754 
755  __m256 coeff_a_cu = _mm256_mul_ps(coeff_vec, a_cu);
756  __m256 inner = _mm256_mul_ps(sqrt_2_pi_vec, _mm256_add_ps(a, coeff_a_cu));
757 
758  _mm256_store_ps(inner_arr, inner);
759  for (int j = 0; j < 8; ++j) {
760  tanh_arr[j] = tanhf(inner_arr[j]);
761  }
762  __m256 tanh_val = _mm256_load_ps(tanh_arr);
763 
764  __m256 gelu = _mm256_mul_ps(half_vec, _mm256_mul_ps(a, _mm256_add_ps(one_vec, tanh_val)));
765  __m256 b = _mm256_loadu_ps(&x_ptr[dim + d]);
766 
767  _mm256_storeu_ps(&out_ptr[d], _mm256_mul_ps(gelu, b));
768  }
769  for (; d < dim; ++d) {
770  float a = x_ptr[d];
771  float b = x_ptr[dim + d];
772  float a3 = a * a * a;
773  float inner = sqrt_2_over_pi * (a + coeff * a3);
774  float gelu_a = 0.5f * a * (1.0f + tanhf(inner));
775  out_ptr[d] = gelu_a * b;
776  }
777  }
778 
779 #else
780  // Scalar fallback
781  for (int t = 0; t < tokens; ++t) {
782  const float *x_ptr = x + (size_t)t * inner_dim;
783  float *out_ptr = out + (size_t)t * dim;
784 
785  for (int d = 0; d < dim; ++d) {
786  float a = x_ptr[d];
787  float b = x_ptr[dim + d];
788  float a3 = a * a * a;
789  float inner = sqrt_2_over_pi * (a + coeff * a3);
790  float gelu_a = 0.5f * a * (1.0f + tanhf(inner));
791  out_ptr[d] = gelu_a * b;
792  }
793  }
794 #endif
795 }
__attribute__((visibility("default"))) CKTokenizer *ck_tokenizer_create(CKTokenizerType type)

Referenced by ck_test_geglu(), and geglu_forward_bf16().

◆ gemm_nt_q4_k_q8_k()

/**
 * NT GEMM: C = A @ B^T (+ bias) with Q8_K activations and Q4_K weights.
 *
 * Thin wrapper: forwards to the core gemm_q4_k_q8_k kernel with the output
 * and batch dimensions swapped, then broadcasts the optional bias over rows.
 *
 * @param A_q8 Activations in Q8_K format [M x K]
 * @param B    Weights in Q4_K format [N x K]
 * @param bias Optional bias [N], NULL if unused
 * @param C    Output matrix [M x N], row-major fp32
 */
void gemm_nt_q4_k_q8_k(const void *A_q8, const void *B, const float *bias,
                       float *C, int M, int N, int K)
{
    /* Reject NULL buffers and degenerate shapes up front. */
    if (A_q8 == NULL || B == NULL || C == NULL) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    gemm_q4_k_q8_k(C, B, A_q8, /*M_out=*/N, /*N_batch=*/M, K);

    if (bias == NULL) {
        return;
    }
    for (int m = 0; m < M; ++m) {
        float *c_row = C + (size_t)m * (size_t)N;
        for (int j = 0; j < N; ++j) {
            c_row[j] += bias[j];
        }
    }
}
void gemm_q4_k_q8_k(float *Y, const void *W, const void *X_q8, int M, int N, int K)
#define C(color)
Definition: show_config.c:39

Referenced by ck_test_gemm_q4_k().

◆ gemm_nt_q5_0_q8_0()

void gemm_nt_q5_0_q8_0 ( const void *  A_q8,
const void *  B_q5,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.

Computes C = A @ B^T + bias where: A: [M x K] Q8_0 quantized activations (M tokens, K features) B: [N x K] Q5_0 quantized weights (N outputs, K features) C: [M x N] FP32 output

This is the INT8 batch kernel for prefill, using pre-quantized activations to avoid FP32->Q8_0 conversion overhead per operation.

Parameters
A_q8Input activations in Q8_0 format [M rows of K/32 blocks each]
B_q5Weights in Q5_0 format [N rows of K/32 blocks each]
biasOptional bias vector [N], NULL if not used
COutput matrix [M x N], row-major FP32
MBatch size (number of tokens)
NOutput dimension (number of output features)
KInput dimension (must be multiple of 32)

Definition at line 1617 of file gemm_kernels_q5_0.c.

1625 {
1626  const block_q5_0 *weights = (const block_q5_0 *)B_q5;
1627  const block_q8_0 *inputs = (const block_q8_0 *)A_q8;
1628  const int blocks_per_row = K / QK5_0;
1629 
1630  for (int m = 0; m < M; m++) {
1631  const block_q8_0 *input_row = &inputs[m * blocks_per_row];
1632 
1633  for (int n = 0; n < N; n++) {
1634  const block_q5_0 *weight_row = &weights[n * blocks_per_row];
1635  float *out = &C[m * N + n];
1636 
1637  /* Dispatches to vec_dot_q5_0_q8_0_avx (2x block unrolled) on AVX */
1638  vec_dot_q5_0_q8_0(K, out, weight_row, input_row);
1639 
1640  if (bias) {
1641  *out += bias[n];
1642  }
1643  }
1644  }
1645 }
#define QK5_0
Definition: ckernel_quant.h:67
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q5_0 x Q8_0.

Referenced by ck_test_gemm_q5_0().

◆ gemm_nt_q5_1()

void gemm_nt_q5_1 ( const float *  A,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

GEMM with transposed Q5_1 weights: C = A @ B^T.

Parameters
AInput activations [M x K], row-major FP32
BWeight matrix in Q5_1 format [N x K], row-major quantized
biasOptional bias [N], NULL if not used
COutput [M x N], row-major FP32
MBatch size (number of tokens)
NOutput dimension
KInput dimension

Definition at line 309 of file gemm_kernels_q5_1.c.

314 {
315  const block_q5_1 *blocks = (const block_q5_1 *)B;
316  const int blocks_per_row = K / QK5_1;
317 
318  for (int m = 0; m < M; m++) {
319  const float *a_row = &A[m * K];
320 
321  for (int n = 0; n < N; n++) {
322  float sum = 0.0f;
323 
324  for (int b = 0; b < blocks_per_row; b++) {
325  const block_q5_1 *block = &blocks[n * blocks_per_row + b];
326  const float d = CK_FP16_TO_FP32(block->d);
327  const float min = CK_FP16_TO_FP32(block->m);
328  const float *ap = &a_row[b * QK5_1];
329 
330  uint32_t qh;
331  memcpy(&qh, block->qh, sizeof(qh));
332 
333  /* First 16 weights: low nibbles, high bits from qh[0:15] */
334  for (int j = 0; j < QK5_1 / 2; j++) {
335  const int lo = (block->qs[j] & 0x0F);
336  const int hi = ((qh >> j) & 1) << 4;
337  sum += (d * (float)(lo | hi) + min) * ap[j];
338  }
339 
340  /* Second 16 weights: high nibbles, high bits from qh[16:31] */
341  for (int j = 0; j < QK5_1 / 2; j++) {
342  const int lo = (block->qs[j] >> 4);
343  const int hi = ((qh >> (j + 16)) & 1) << 4;
344  sum += (d * (float)(lo | hi) + min) * ap[j + QK5_1 / 2];
345  }
346  }
347 
348  C[m * N + n] = sum + (bias ? bias[n] : 0.0f);
349  }
350  }
351 }
#define CK_FP16_TO_FP32(x)
uint8_t qs[32/2]
Definition: ckernel_quant.h:90
uint8_t qh[4]
Definition: ckernel_quant.h:89
ck_half m
Definition: ckernel_quant.h:88
ck_half d
Definition: ckernel_quant.h:87

Referenced by ck_test_gemm_q5_1().

◆ gemm_nt_q5_k()

/**
 * GEMM with transposed Q5_K weights: C = A @ B^T (+ bias).
 *
 * Every SIMD tier currently falls through to the scalar reference kernel,
 * so the per-ISA preprocessor ladder is collapsed into a single call.
 */
void gemm_nt_q5_k(const float *A, const void *B, const float *bias,
                  float *C, int M, int N, int K)
{
    /* TODO: add AVX-512 / AVX2 / AVX / SSE4.1 implementations and restore
     * compile-time feature dispatch once they exist. */
    gemm_nt_q5_k_ref(A, B, bias, C, M, N, K);
}
void gemm_nt_q5_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

Referenced by ck_test_gemm_q5_k().

◆ gemm_nt_q6_k_q8_k()

/**
 * NT GEMM: C = A @ B^T (+ bias) with Q8_K activations and Q6_K weights.
 *
 * Typical inference pattern:
 *   A_q8: activations in Q8_K format [M x K]
 *   B:    weights in Q6_K format [N x K]
 *   C:    output [M x N]
 * Delegates to gemm_q6_k_q8_k with output/batch dims swapped, then
 * broadcasts the optional bias across rows.
 *
 * @param A_q8 Input activations in Q8_K format
 * @param B    Weight matrix in Q6_K format
 * @param bias Optional bias vector [N]
 * @param C    Output matrix
 * @param M    Batch size (number of tokens)
 * @param N    Output dimension
 * @param K    Input dimension
 */
void gemm_nt_q6_k_q8_k(const void *A_q8, const void *B, const float *bias,
                       float *C, int M, int N, int K)
{
    /* Reject NULL buffers and degenerate shapes up front. */
    if (A_q8 == NULL || B == NULL || C == NULL) {
        return;
    }
    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    gemm_q6_k_q8_k(C, B, A_q8, /*M_out=*/N, /*N_batch=*/M, K);

    if (bias == NULL) {
        return;
    }
    for (int m = 0; m < M; ++m) {
        float *c_row = C + (size_t)m * (size_t)N;
        for (int j = 0; j < N; ++j) {
            c_row[j] += bias[j];
        }
    }
}
void gemm_q6_k_q8_k(float *Y, const void *W, const void *X_q8, int M, int N, int K)
GEMM: Y = W @ X^T where W is Q6_K and X is Q8_K.

Referenced by ck_test_gemm_q6_k().

◆ gemm_nt_q8_0_q8_0()

/**
 * gemm_nt_q8_0_q8_0 with optional bias (matches header signature).
 *
 * C[m,n] = A[m,K] @ B[n,K]^T + bias[n]. The GEMM core is chosen at compile
 * time from the best available ISA; bias is added afterwards in fp32.
 */
void gemm_nt_q8_0_q8_0(const void *A, const void *B, const float *bias,
                       float *C, int M, int N, int K)
{
    /* GEMM core, best available ISA first */
#if defined(__AVX512VNNI__)
    gemm_nt_q8_0_q8_0_vnni(A, B, C, M, N, K);
#elif defined(__AVX512F__)
    gemm_nt_q8_0_q8_0_avx512(A, B, C, M, N, K);
#elif defined(__AVX2__)
    gemm_nt_q8_0_q8_0_avx2(A, B, C, M, N, K);
#elif defined(__AVX__)
    gemm_nt_q8_0_q8_0_avx(A, B, C, M, N, K);
#else
    gemm_nt_q8_0_q8_0_ref(A, B, C, M, N, K);
#endif

    /* Optional bias, broadcast over every output row */
    if (bias == NULL) {
        return;
    }
    for (int m = 0; m < M; m++) {
        for (int j = 0; j < N; j++) {
            C[(size_t)m * N + j] += bias[j];
        }
    }
}
void gemm_nt_q8_0_q8_0_ref(const void *A, const void *B, float *C, int M, int N, int K)
Scalar reference: gemm_nt_q8_0_q8_0.

Referenced by ck_test_gemm_q8_0().

◆ gemv_q4_k_q8_k()

/**
 * GEMV: y = W @ x with Q4_K weights and Q8_K input.
 *
 * Compile-time dispatch to the best available ISA variant; all variants
 * share the same signature and semantics.
 */
void gemv_q4_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)
{
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
    /* VNNI INT8 dot-product acceleration: best for single-token decode */
    gemv_q4_k_q8_k_vnni(y, W, x_q8, M, K);
#elif defined(__AVX2__)
    gemv_q4_k_q8_k_avx2(y, W, x_q8, M, K);
#elif defined(__AVX__)
    /* AVX variant uses maddubs_epi16 (more efficient than the SSE path) */
    gemv_q4_k_q8_k_avx(y, W, x_q8, M, K);
#elif defined(__SSE4_1__)
    gemv_q4_k_q8_k_sse(y, W, x_q8, M, K);
#else
    gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
#endif
}
void gemv_q4_k_q8_k_avx2(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_vnni(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_avx(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_sse(float *y, const void *W, const void *x_q8, int M, int K)

Referenced by ck_test_gemv_q4_k().

◆ gemv_q5_0()

/**
 * Auto-dispatch GEMV for Q5_0 weights based on compile-time CPU features.
 *
 * Priority: AVX-512 > AVX2 > AVX > SSE4.1 > scalar reference.
 *
 * @param y Output vector [M]
 * @param W Weight matrix in Q5_0 format [M x K]
 * @param x Input vector [K]
 * @param M Number of output rows
 * @param K Number of input columns (hidden dimension)
 */
void gemv_q5_0(float *y, const void *W, const float *x, int M, int K)
{
#if defined(__AVX512F__)
    gemv_q5_0_avx512(y, W, x, M, K);
#elif defined(__AVX2__)
    gemv_q5_0_avx2(y, W, x, M, K);
#elif defined(__AVX__)
    gemv_q5_0_avx(y, W, x, M, K);
#elif defined(__SSE4_1__)
    gemv_q5_0_sse_v2(y, W, x, M, K);
#else
    gemv_q5_0_ref(y, W, x, M, K);
#endif
}
void gemv_q5_0_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q5_0 weights (scalar reference)

Referenced by dot_q5_0(), gemm_nt_q5_0(), and gemm_q5_0().

◆ gemv_q5_0_q8_0()

void gemv_q5_0_q8_0 ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Matrix-vector multiply with Q5_0 weights and Q8_0 input.

Parameters
yOutput vector [M]
WWeight matrix in Q5_0 format [M x K]
x_q8Input vector in Q8_0 format [K]
MNumber of output rows
KNumber of columns (must be multiple of 32)

Definition at line 1529 of file gemm_kernels_q5_0.c.

1533 {
1534  const block_q5_0 *w_blocks = (const block_q5_0 *)W;
1535  const block_q8_0 *x_blocks = (const block_q8_0 *)x_q8;
1536  const int blocks_per_row = K / QK5_0;
1537 
1538  for (int row = 0; row < M; row++) {
1539  vec_dot_q5_0_q8_0(K, &y[row],
1540  &w_blocks[row * blocks_per_row],
1541  x_blocks);
1542  }
1543 }

Referenced by ck_test_gemv_q5_0(), and ck_test_gemv_q5_0_q8_0().

◆ gemv_q5_1()

/**
 * Auto-dispatch GEMV for Q5_1 weights.
 *
 * Only an AVX-512 variant exists beyond the scalar reference.
 */
void gemv_q5_1(float *y, const void *W, const float *x, int M, int K)
{
#ifdef __AVX512F__
    gemv_q5_1_avx512(y, W, x, M, K);
#else
    gemv_q5_1_ref(y, W, x, M, K);
#endif
}
void gemv_q5_1_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q5_1 weights (scalar reference)

Referenced by ck_test_gemv_q5_1(), dot_q5_1(), and gemm_q5_1().

◆ gemv_q5_k()

/**
 * GEMV with Q5_K weights: y = W @ x.
 *
 * Every SIMD tier currently falls through to the scalar reference kernel,
 * so the per-ISA preprocessor ladder is collapsed into a single call.
 */
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)
{
    /* TODO: add AVX-512 / AVX2 / AVX / SSE4.1 implementations and restore
     * compile-time feature dispatch once they exist. */
    gemv_q5_k_ref(y, W, x, M, K);
}
void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K)

Referenced by ck_test_gemv_q5_k().

◆ gemv_q6_k_q8_k()

/**
 * GEMV: y = W @ x where W is Q6_K and x is Q8_K.
 *
 * Compile-time ISA dispatch. The AVX-512 variant uses the same algorithm
 * as AVX2 (matches llama.cpp).
 */
void gemv_q6_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)
{
#if defined(__AVX512F__) && defined(__AVX512BW__)
    gemv_q6_k_q8_k_avx512(y, W, x_q8, M, K);
#elif defined(__AVX2__)
    gemv_q6_k_q8_k_avx2(y, W, x_q8, M, K);
#elif defined(__AVX__)
    gemv_q6_k_q8_k_avx(y, W, x_q8, M, K);
#elif defined(__SSSE3__)
    gemv_q6_k_q8_k_sse(y, W, x_q8, M, K);
#else
    gemv_q6_k_q8_k_ref(y, W, x_q8, M, K);
#endif
}
void gemv_q6_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_sse(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_avx(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_avx2(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q6_k_q8_k_avx512(float *y, const void *W, const void *x_q8, int M, int K)

Referenced by gemm_q6_k_q8_k().

◆ gemv_q8_0()

/**
 * Auto-dispatch GEMV for Q8_0 weights based on compile-time CPU features.
 *
 * Priority: AVX-512 > AVX2 > AVX > SSE4.1 > scalar reference.
 *
 * @param y Output vector [M]
 * @param W Weight matrix in Q8_0 format [M x K]
 * @param x Input vector [K]
 * @param M Number of output rows
 * @param K Number of input columns (hidden dimension)
 */
void gemv_q8_0(float *y, const void *W, const float *x, int M, int K)
{
#if defined(__AVX512F__)
    gemv_q8_0_avx512(y, W, x, M, K);
#elif defined(__AVX2__)
    gemv_q8_0_avx2(y, W, x, M, K);
#elif defined(__AVX__)
    gemv_q8_0_avx(y, W, x, M, K);
#elif defined(__SSE4_1__)
    gemv_q8_0_sse(y, W, x, M, K);
#else
    gemv_q8_0_ref(y, W, x, M, K);
#endif
}
void gemv_q8_0_ref(float *y, const void *W, const float *x, int M, int K)
Matrix-vector multiply with Q8_0 weights (scalar reference)

Referenced by dot_q8_0(), gemm_nt_q8_0(), and gemm_q8_0().

◆ gemv_q8_0_q8_0()

void gemv_q8_0_q8_0 ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Matrix-vector multiply with Q8_0 weights and Q8_0 input.

Parameters
yOutput vector [M]
WWeight matrix in Q8_0 format [M x K]
x_q8Input vector in Q8_0 format [K]
MNumber of output rows
KNumber of columns (must be multiple of 32)

Definition at line 1042 of file gemm_kernels_q8_0.c.

1046 {
1047  const block_q8_0 *w_blocks = (const block_q8_0 *)W;
1048  const block_q8_0 *x_blocks = (const block_q8_0 *)x_q8;
1049  const int blocks_per_row = K / QK8_0;
1050 
1051  for (int row = 0; row < M; row++) {
1052  vec_dot_q8_0_q8_0(K, &y[row],
1053  &w_blocks[row * blocks_per_row],
1054  x_blocks);
1055  }
1056 }
#define QK8_0
void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q8_0 x Q8_0.

Referenced by ck_test_gemv_q8_0(), and ck_test_gemv_q8_0_q8_0().

◆ mega_fused_outproj_mlp_prefill()

void mega_fused_outproj_mlp_prefill ( float *  output,
const float *  attn_out,
const float *  residual,
const float *  ln2_gamma,
const void *  wo,
const float *  bo,
int  wo_dt,
const void *  w1,
const float *  b1,
int  w1_dt,
const void *  w2,
const float *  b2,
int  w2_dt,
int  tokens,
int  embed_dim,
int  aligned_embed_dim,
int  num_heads,
int  aligned_head_dim,
int  intermediate_dim,
int  aligned_intermediate_dim,
float  eps,
void *  scratch 
)

◆ mega_fused_outproj_mlp_prefill_scratch_size()

size_t mega_fused_outproj_mlp_prefill_scratch_size ( int  tokens,
int  aligned_embed_dim,
int  num_heads,
int  aligned_head_dim,
int  aligned_intermediate_dim 
)

Get scratch buffer size for mega_fused_outproj_mlp_prefill.

Definition at line 159 of file mega_fused_outproj_mlp_prefill.c.

164 {
165  if (tokens <= 0 || aligned_embed_dim <= 0 || num_heads <= 0 ||
166  aligned_head_dim <= 0 || aligned_intermediate_dim <= 0) {
167  return 0;
168  }
169 
170  const size_t q8_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_0,
171  (size_t)aligned_head_dim);
172  const size_t attn_q8_bytes = (size_t)num_heads * (size_t)tokens * q8_row_bytes;
173  const size_t h1_bytes = (size_t)tokens * (size_t)aligned_embed_dim * sizeof(float);
174  const size_t ln2_bytes = h1_bytes;
175  const size_t mlp_scratch = fused_mlp_swiglu_prefill_w1w2_quant_scratch_size(
176  aligned_embed_dim, aligned_intermediate_dim);
177 
178  return align_up_size(attn_q8_bytes, 64) +
179  align_up_size(h1_bytes, 64) +
180  align_up_size(ln2_bytes, 64) +
181  align_up_size(mlp_scratch, 64);
182 }
@ CK_DT_Q8_0
Definition: ckernel_dtype.h:42
static size_t ck_dtype_row_bytes(CKDataType dt, size_t n_elements)
Calculate total bytes for n_elements of given dtype.
size_t fused_mlp_swiglu_prefill_w1w2_quant_scratch_size(int aligned_embed_dim, int aligned_intermediate_dim)
Get scratch buffer size for fused_mlp_swiglu_prefill_w1w2_quant.
static size_t align_up_size(size_t value, size_t align)

Referenced by ck_test_outproj_mlp_fused_q5_0().

◆ quantize_row_q8_0()

void quantize_row_q8_0 ( const float *  x,
void *  vy,
int  k 
)

Quantize FP32 to Q8_0 format (scalar reference)

Parameters
xInput FP32 values
vyOutput Q8_0 blocks
kNumber of elements (must be multiple of 32)

Definition at line 59 of file gemm_kernels_q8_0.c.

60 {
61  block_q8_0 *y = (block_q8_0 *)vy;
62  const int nb = k / QK8_0; /* QK8_0 = 32 */
63 
64 #if defined(__AVX__)
65  const __m256 sign_bit = _mm256_set1_ps(-0.0f);
66  const __m256 v_half = _mm256_set1_ps(0.5f);
67  const __m256 v_min = _mm256_set1_ps(-127.0f);
68  const __m256 v_max = _mm256_set1_ps(127.0f);
69 
70  for (int i = 0; i < nb; i++) {
71  __m256 v0 = _mm256_loadu_ps(x + 0);
72  __m256 v1 = _mm256_loadu_ps(x + 8);
73  __m256 v2 = _mm256_loadu_ps(x + 16);
74  __m256 v3 = _mm256_loadu_ps(x + 24);
75  x += QK8_0;
76 
77  __m256 max_abs = _mm256_andnot_ps(sign_bit, v0);
78  max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v1));
79  max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v2));
80  max_abs = _mm256_max_ps(max_abs, _mm256_andnot_ps(sign_bit, v3));
81 
82  __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(max_abs, 1),
83  _mm256_castps256_ps128(max_abs));
84  max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
85  max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
86  const float max_scalar = _mm_cvtss_f32(max4);
87 
88  const float d = max_scalar / 127.0f;
89  const float id = max_scalar != 0.0f ? 127.0f / max_scalar : 0.0f;
90  y[i].d = CK_FP32_TO_FP16(d);
91 
92  const __m256 mul = _mm256_set1_ps(id);
93  v0 = _mm256_mul_ps(v0, mul);
94  v1 = _mm256_mul_ps(v1, mul);
95  v2 = _mm256_mul_ps(v2, mul);
96  v3 = _mm256_mul_ps(v3, mul);
97 
98  v0 = _mm256_min_ps(_mm256_max_ps(v0, v_min), v_max);
99  v1 = _mm256_min_ps(_mm256_max_ps(v1, v_min), v_max);
100  v2 = _mm256_min_ps(_mm256_max_ps(v2, v_min), v_max);
101  v3 = _mm256_min_ps(_mm256_max_ps(v3, v_min), v_max);
102 
103  /* Round half away from zero to match the scalar path */
104  v0 = _mm256_add_ps(v0, _mm256_or_ps(_mm256_and_ps(v0, sign_bit), v_half));
105  v1 = _mm256_add_ps(v1, _mm256_or_ps(_mm256_and_ps(v1, sign_bit), v_half));
106  v2 = _mm256_add_ps(v2, _mm256_or_ps(_mm256_and_ps(v2, sign_bit), v_half));
107  v3 = _mm256_add_ps(v3, _mm256_or_ps(_mm256_and_ps(v3, sign_bit), v_half));
108 
109  __m256i i0 = _mm256_cvttps_epi32(v0);
110  __m256i i1 = _mm256_cvttps_epi32(v1);
111  __m256i i2 = _mm256_cvttps_epi32(v2);
112  __m256i i3 = _mm256_cvttps_epi32(v3);
113 
114 #if defined(__AVX2__)
115  i0 = _mm256_packs_epi32(i0, i1);
116  i2 = _mm256_packs_epi32(i2, i3);
117  i0 = _mm256_packs_epi16(i0, i2);
118 
119  const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
120  i0 = _mm256_permutevar8x32_epi32(i0, perm);
121  _mm256_storeu_si256((__m256i *)y[i].qs, i0);
122 #else
123  __m128i ni0 = _mm256_castsi256_si128(i0);
124  __m128i ni1 = _mm256_extractf128_si256(i0, 1);
125  __m128i ni2 = _mm256_castsi256_si128(i1);
126  __m128i ni3 = _mm256_extractf128_si256(i1, 1);
127  __m128i ni4 = _mm256_castsi256_si128(i2);
128  __m128i ni5 = _mm256_extractf128_si256(i2, 1);
129  __m128i ni6 = _mm256_castsi256_si128(i3);
130  __m128i ni7 = _mm256_extractf128_si256(i3, 1);
131 
132  ni0 = _mm_packs_epi32(ni0, ni1);
133  ni2 = _mm_packs_epi32(ni2, ni3);
134  ni4 = _mm_packs_epi32(ni4, ni5);
135  ni6 = _mm_packs_epi32(ni6, ni7);
136 
137  ni0 = _mm_packs_epi16(ni0, ni2);
138  ni4 = _mm_packs_epi16(ni4, ni6);
139 
140  _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
141  _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
142 #endif
143  }
144 #else
145  for (int i = 0; i < nb; i++) {
146  const float *xb = x + i * QK8_0;
147 
148  /* Find max absolute value in block */
149  float amax = 0.0f;
150  for (int j = 0; j < QK8_0; j++) {
151  float av = xb[j] >= 0 ? xb[j] : -xb[j];
152  if (av > amax) amax = av;
153  }
154 
155  /* Compute scale: d = max / 127 */
156  float d = amax / 127.0f;
157  float id = d != 0.0f ? 127.0f / amax : 0.0f;
158 
159  /* Store scale as FP16 */
160  y[i].d = CK_FP32_TO_FP16(d);
161 
162  /* Quantize values */
163  for (int j = 0; j < QK8_0; j++) {
164  float v = xb[j] * id;
165  /* Round to nearest int and clamp to [-127, 127] */
166  int q = (int)(v + (v >= 0 ? 0.5f : -0.5f));
167  if (q > 127) q = 127;
168  if (q < -127) q = -127;
169  y[i].qs[j] = (int8_t)q;
170  }
171  }
172 #endif
173 }
#define CK_FP32_TO_FP16(x)
int8_t qs[32]
int32_t id
Definition: tokenizer.h:315

Referenced by ck_test_gemm_q5_0(), ck_test_gemm_q8_0(), ck_test_gemv_q5_0(), ck_test_gemv_q5_0_q8_0(), ck_test_gemv_q8_0(), and ck_test_gemv_q8_0_q8_0().

◆ quantize_row_q8_k()

/**
 * Quantize a row of FP32 values into Q8_K super-blocks.
 *
 * Uses the SSE4.1 variant when available, scalar reference otherwise.
 */
void quantize_row_q8_k(const float *x, void *vy, int k)
{
#if defined(__SSE4_1__)
    quantize_row_q8_k_sse(x, vy, k);
#else
    quantize_row_q8_k_ref(x, vy, k);
#endif
}
void quantize_row_q8_k_sse(const float *x, void *vy, int k)
void quantize_row_q8_k_ref(const float *x, void *vy, int k)

Referenced by ck_test_gemm_q4_k(), ck_test_gemm_q6_k(), ck_test_gemv_q4_k(), and ck_test_quantize_q8_k().

◆ rmsnorm_forward()

void rmsnorm_forward ( const float *  input,
const float *  gamma,
float *  output,
float *  rstd_cache,
int  tokens,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

RMSNorm forward pass

Test:

test_rmsnorm.py::TestRMSNormForward::test_fp32_tokens

test_rmsnorm.py::TestRMSNormForward::test_fp32_single

test_rmsnorm.py::TestRMSNormForward::test_perf_rolled

test_layernorm.py::TestLayerNormForward::test_rmsnorm_compat

test_parity.py::test_rmsnorm_parity

RMSNorm: y[i] = gamma[i] * x[i] / sqrt(mean(x^2) + eps)

After changes: make test && make llamacpp-parity-full

Definition at line 50 of file rmsnorm_kernels.c.

58 {
59  int T = tokens;
60  int D = d_model;
61  int aligned = aligned_embed_dim;
62 
63  for (int t = 0; t < T; ++t) {
64  const float *x = input + (size_t)t * aligned;
65  float *y = output + (size_t)t * aligned;
66 
67 #if defined(__AVX512F__)
68  // AVX-512: Process 16 floats at a time
69  __m512 sum_sq_vec = _mm512_setzero_ps();
70  int d = 0;
71 
72  // Vectorized sum of squares
73  for (; d + 16 <= D; d += 16) {
74  __m512 xv = _mm512_loadu_ps(&x[d]);
75  sum_sq_vec = _mm512_fmadd_ps(xv, xv, sum_sq_vec);
76  }
77  float sum_sq = _mm512_reduce_add_ps(sum_sq_vec);
78 
79  // Handle remaining elements
80  for (; d < D; ++d) {
81  sum_sq += x[d] * x[d];
82  }
83 
84  float mean_sq = sum_sq / (float)D;
85  float rstd = 1.0f / sqrtf(mean_sq + eps);
86  if (rstd_cache) {
87  rstd_cache[t] = rstd;
88  }
89 
90  // Apply normalization and scale (vectorized)
91  __m512 rstd_vec = _mm512_set1_ps(rstd);
92  d = 0;
93  for (; d + 16 <= D; d += 16) {
94  __m512 xv = _mm512_loadu_ps(&x[d]);
95  __m512 gv = _mm512_loadu_ps(&gamma[d]);
96  __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);
97  __m512 yv = _mm512_mul_ps(x_hat, gv);
98  _mm512_storeu_ps(&y[d], yv);
99  }
100  // Handle remaining elements
101  for (; d < D; ++d) {
102  y[d] = x[d] * rstd * gamma[d];
103  }
104 
105 #elif defined(__AVX__)
106  // AVX: Process 8 floats at a time
107  __m256 sum_sq_vec = _mm256_setzero_ps();
108  int d = 0;
109 
110  // Vectorized sum of squares (no FMA in AVX1, use mul + add)
111  for (; d + 8 <= D; d += 8) {
112  __m256 xv = _mm256_loadu_ps(&x[d]);
113  __m256 xv_sq = _mm256_mul_ps(xv, xv);
114  sum_sq_vec = _mm256_add_ps(sum_sq_vec, xv_sq);
115  }
116  float sum_sq = hsum256_ps_rmsnorm(sum_sq_vec);
117 
118  // Handle remaining elements
119  for (; d < D; ++d) {
120  sum_sq += x[d] * x[d];
121  }
122 
123  float mean_sq = sum_sq / (float)D;
124  float rstd = 1.0f / sqrtf(mean_sq + eps);
125  if (rstd_cache) {
126  rstd_cache[t] = rstd;
127  }
128 
129  // Apply normalization and scale (vectorized)
130  __m256 rstd_vec = _mm256_set1_ps(rstd);
131  d = 0;
132  for (; d + 8 <= D; d += 8) {
133  __m256 xv = _mm256_loadu_ps(&x[d]);
134  __m256 gv = _mm256_loadu_ps(&gamma[d]);
135  __m256 x_hat = _mm256_mul_ps(xv, rstd_vec);
136  __m256 yv = _mm256_mul_ps(x_hat, gv);
137  _mm256_storeu_ps(&y[d], yv);
138  }
139  // Handle remaining elements
140  for (; d < D; ++d) {
141  y[d] = x[d] * rstd * gamma[d];
142  }
143 
144 #else
145  // Scalar fallback
146  double sum_sq = 0.0;
147  for (int d = 0; d < D; ++d) {
148  double v = (double)x[d];
149  sum_sq += v * v;
150  }
151  double mean_sq = sum_sq / (double)D;
152  double r = sqrt(mean_sq + (double)eps);
153  float rstd = (float)(1.0 / r);
154  if (rstd_cache) {
155  rstd_cache[t] = rstd;
156  }
157 
158  // Apply normalization and scale
159  for (int d = 0; d < D; ++d) {
160  float x_hat = x[d] * rstd;
161  y[d] = x_hat * gamma[d];
162  }
163 #endif
164 
165  // Zero padding (if any)
166  for (int d = D; d < aligned; ++d) {
167  y[d] = 0.0f;
168  }
169  }
170 }

Referenced by ck_test_rmsnorm().

◆ rope_forward_qk()

void rope_forward_qk ( float *  q,
float *  k,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset 
)

RoPE forward for both Q and K (common inference pattern)

Test:

test_rope.py::TestRoPEForward::test_rope_forward_qk

test_fused_attention_decode.py::TestFusedAttentionDecode::test_qk_rope

test_parity.py::test_rope_qk_parity

Combined RoPE forward for both Q and K in one call. Layouts: q is [num_heads, num_tokens, head_dim]; k is [num_kv_heads, num_tokens, head_dim].

After changes: make test && make llamacpp-parity-full

Definition at line 448 of file rope_kernels.c.

/**
 * Apply RoPE to both Q and K tensors in a single call.
 *
 * q: [num_heads, num_tokens, head_dim] (head-major)
 * k: [num_kv_heads, num_tokens, head_dim]
 *
 * Thin convenience wrapper: delegates to rope_forward() once per tensor;
 * the only difference between the two calls is the head count (GQA may
 * have fewer KV heads than query heads).
 */
void rope_forward_qk(float *q, float *k,
                     const float *cos_cache, const float *sin_cache,
                     int num_heads, int num_kv_heads,
                     int num_tokens, int head_dim,
                     int aligned_head_dim, int pos_offset)
{
    /* Rotate the query heads. */
    rope_forward(q, cos_cache, sin_cache, num_heads, num_tokens,
                 head_dim, aligned_head_dim, pos_offset);
    /* Rotate the key heads with the same cache and position offset. */
    rope_forward(k, cos_cache, sin_cache, num_kv_heads, num_tokens,
                 head_dim, aligned_head_dim, pos_offset);
}
void rope_forward(float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:180

Referenced by ck_test_rope().

◆ rope_precompute_cache()

void rope_precompute_cache ( float *  cos_cache,
float *  sin_cache,
int  max_seq_len,
int  head_dim,
float  base 
)

Precompute RoPE cos/sin cache

Test:

test_rope.py::TestRoPECache::test_cache_computation

test_rope.py::TestRoPECache::test_cache_values

Precomputes cos(m * theta_i) and sin(m * theta_i) for positions 0..max_seq_len-1. cos_cache, sin_cache: [max_seq_len, head_dim/2]

After changes: make test

Definition at line 52 of file rope_kernels.c.

57 {
58  int half_dim = head_dim / 2;
59 
60  long double base_ld = (long double)base;
61  long double head_dim_ld = (long double)head_dim;
62  long double log_base = logl(base_ld);
63  for (int pos = 0; pos < max_seq_len; ++pos) {
64  for (int i = 0; i < half_dim; ++i) {
65  long double exponent = ((long double)(2 * i)) / head_dim_ld;
66  long double freq = expl(-exponent * log_base);
67  float freq_f = (float)freq;
68  float angle_f = (float)pos * freq_f;
69  cos_cache[pos * half_dim + i] = cosf(angle_f);
70  sin_cache[pos * half_dim + i] = sinf(angle_f);
71  }
72  }
73 }

Referenced by ck_test_rope().

◆ swiglu_forward()

void swiglu_forward ( const float *  input,
float *  output,
int  tokens,
int  dim 
)

SwiGLU forward pass

Test:

test_swiglu.py::TestSwiGLUForward::test_forward_tokens

test_swiglu.py::TestSwiGLUForward::test_forward_single

test_mlp.py::TestMLPForward::test_swiglu_mlp

test_fused_swiglu_decode.py::TestFusedSwiGLUDecode::test_fused_swiglu_decode

test_parity.py::test_swiglu_parity

SwiGLU: y = silu(gate) * up where silu(x) = x * sigmoid(x)

After changes: make test && make llamacpp-parity-full

Definition at line 131 of file swiglu_kernels.c.

/**
 * SwiGLU activation: out[t][d] = silu(gate) * up, silu(x) = x * sigmoid(x).
 *
 * input:  [tokens, 2*dim] — the first dim entries of each row are the gate
 *         projection, the next dim entries are the up/value projection.
 * output: [tokens, dim]
 *
 * Vector width is chosen at compile time (AVX-512 / AVX2 / AVX1); the
 * trailing scalar loop finishes whatever the vector loop left over and is
 * the complete implementation on non-SIMD builds.
 */
void swiglu_forward(const float *input, float *output, int tokens, int dim)
{
    for (int t = 0; t < tokens; ++t) {
        const float *in_row = input + (size_t)t * (2 * dim);
        float *dst = output + (size_t)t * dim;
        int col = 0;

#if defined(__AVX512F__)
        /* 16-wide lanes with the vectorized fast sigmoid. */
        for (; col + 16 <= dim; col += 16) {
            __m512 gate = _mm512_loadu_ps(&in_row[col]);
            __m512 up   = _mm512_loadu_ps(&in_row[dim + col]);
            __m512 sig  = sigmoid512_fast(gate);
            /* (gate * sig) * up, same association as the scalar path */
            __m512 res  = _mm512_mul_ps(_mm512_mul_ps(gate, sig), up);
            _mm512_storeu_ps(&dst[col], res);
        }
#elif defined(__AVX2__)
        /* 8-wide lanes with the vectorized fast sigmoid. */
        for (; col + 8 <= dim; col += 8) {
            __m256 gate = _mm256_loadu_ps(&in_row[col]);
            __m256 up   = _mm256_loadu_ps(&in_row[dim + col]);
            __m256 sig  = sigmoid256_fast(gate);
            __m256 res  = _mm256_mul_ps(_mm256_mul_ps(gate, sig), up);
            _mm256_storeu_ps(&dst[col], res);
        }
#elif defined(__AVX__)
        /* AVX1 path: sigmoid is evaluated lane-by-lane through an aligned
         * staging buffer while the loads and multiplies stay vectorized. */
        float gate_lanes[8] __attribute__((aligned(32)));
        float sig_lanes[8] __attribute__((aligned(32)));

        for (; col + 8 <= dim; col += 8) {
            __m256 gate = _mm256_loadu_ps(&in_row[col]);
            __m256 up   = _mm256_loadu_ps(&in_row[dim + col]);

            _mm256_store_ps(gate_lanes, gate);
            for (int lane = 0; lane < 8; ++lane) {
                sig_lanes[lane] = sigmoid_scalar(gate_lanes[lane]);
            }
            __m256 sig = _mm256_load_ps(sig_lanes);

            __m256 res = _mm256_mul_ps(_mm256_mul_ps(gate, sig), up);
            _mm256_storeu_ps(&dst[col], res);
        }
#endif

        /* Scalar tail (and the whole row on non-SIMD builds). */
        for (; col < dim; ++col) {
            float gate = in_row[col];
            float up   = in_row[dim + col];
            dst[col] = gate * sigmoid_scalar(gate) * up;
        }
    }
}
float sigmoid_scalar(float x)
static void silu(float *x, int n)
Definition: v6_simple.c:159

Referenced by ck_test_swiglu().

◆ vec_dot_q5_0_q8_0()

void vec_dot_q5_0_q8_0 ( int  n,
float *  s,
const void *  vx,
const void *  vy 
)

Auto-dispatch quantized dot product Q5_0 x Q8_0.

Dispatch priority:

  1. AVX512 (best performance on modern Intel/AMD)
  2. AVX (256-bit float ops, works on Sandy/Ivy Bridge and newer)
  3. SSSE3 (128-bit fallback)
  4. Reference scalar (last resort)

Definition at line 1498 of file gemm_kernels_q5_0.c.

/**
 * Q5_0 x Q8_0 dot product, dispatched to the best ISA selected at
 * compile time: AVX-512 > AVX > SSSE3 > scalar reference.
 *
 * n:  element count
 * s:  out-param receiving the scalar dot-product result
 * vx: Q5_0-quantized operand blocks
 * vy: Q8_0-quantized operand blocks
 */
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)
{
#ifdef __AVX512F__
    vec_dot_q5_0_q8_0_avx512(n, s, vx, vy);
#elif defined(__AVX__)
    /* 256-bit float path (Sandy/Ivy Bridge and newer). */
    vec_dot_q5_0_q8_0_avx(n, s, vx, vy);
#elif defined(__SSSE3__)
    /* 128-bit path for older CPUs. */
    vec_dot_q5_0_q8_0_sse(n, s, vx, vy);
#else
    /* Portable scalar reference — last resort. */
    vec_dot_q5_0_q8_0_ref(n, s, vx, vy);
#endif
}
void vec_dot_q5_0_q8_0_ref(int n, float *s, const void *vx, const void *vy)
Quantized dot product: Q5_0 weights x Q8_0 input (scalar reference)

Referenced by ck_test_vec_dot_q5_0_q8_0().

◆ vec_dot_q8_0_q8_0()

void vec_dot_q8_0_q8_0 ( int  n,
float *  s,
const void *  vx,
const void *  vy 
)

Auto-dispatch quantized dot product Q8_0 x Q8_0.

Definition at line 1013 of file gemm_kernels_q8_0.c.

/**
 * Q8_0 x Q8_0 dot product, dispatched to the best ISA selected at
 * compile time: AVX-512 > AVX > SSE4.1 > scalar reference.
 *
 * n:  element count
 * s:  out-param receiving the scalar dot-product result
 * vx: Q8_0-quantized operand blocks
 * vy: Q8_0-quantized operand blocks
 */
void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy)
{
#if defined(__AVX512F__)
    vec_dot_q8_0_q8_0_avx512(n, s, vx, vy);
#elif defined(__AVX__)
    vec_dot_q8_0_q8_0_avx(n, s, vx, vy);
#elif defined(__SSE4_1__)
    vec_dot_q8_0_q8_0_sse(n, s, vx, vy);
#else
    /* Portable scalar reference — last resort. */
    vec_dot_q8_0_q8_0_ref(n, s, vx, vy);
#endif
}
void vec_dot_q8_0_q8_0_ref(int n, float *s, const void *vx, const void *vy)
Quantized dot product: Q8_0 weights x Q8_0 input (scalar reference)

Referenced by ck_test_vec_dot_q8_0_q8_0().