C-Kernel-Engine Parity Testing API. More...
#include <stddef.h>
#include <stdint.h>
Go to the source code of this file.
Macros | |
| #define | CK_BLOCK_Q4_0_SIZE 18 |
| #define | CK_BLOCK_Q4_K_SIZE 144 |
| #define | CK_BLOCK_Q5_1_SIZE 24 |
| #define | CK_BLOCK_Q5_K_SIZE 176 |
| #define | CK_BLOCK_Q6_K_SIZE 210 |
| #define | CK_BLOCK_Q8_K_SIZE 292 |
| #define | CK_QK4_0 32 /* Elements per Q4_0 block */ |
| #define | CK_QK8_0 32 /* Elements per Q8_0 block */ |
| #define | CK_QK_K 256 /* Elements per K-quant super-block */ |
Functions | |
| int | ck_get_block_q4_k_size (void) |
| Get Q4_K block size in bytes. More... | |
| int | ck_get_block_q5_1_size (void) |
| Get Q5_1 block size in bytes (24 bytes per 32 weights) More... | |
| int | ck_get_block_q5_k_size (void) |
| Get Q5_K block size in bytes (176 bytes per 256 weights) More... | |
| int | ck_get_block_q6_k_size (void) |
| Get Q6_K block size in bytes. More... | |
| int | ck_get_block_q8_k_size (void) |
| Get Q8_K block size in bytes. More... | |
| int | ck_get_qk5_1 (void) |
| Get QK5_1 (elements per Q5_1 block) More... | |
| int | ck_get_qk_k (void) |
| Get QK_K (elements per super-block) More... | |
| void | ck_test_attention_causal (const float *q, const float *k, const float *v, float *out, int num_heads, int num_kv_heads, int tokens, int seq_len, int head_dim) |
| Multi-head causal attention for prefill (head-major layout) More... | |
| void | ck_test_dequant_q4_0 (const void *src, float *dst, int n) |
| Dequantize Q4_0 data to FP32. More... | |
| void | ck_test_dequant_q4_k (const void *src, float *dst, int n) |
| Dequantize Q4_K data to FP32. More... | |
| void | ck_test_dequant_q5_1 (const void *src, float *dst, int n) |
| Dequantize Q5_1 data to FP32. More... | |
| void | ck_test_dequant_q6_k (const void *src, float *dst, int n) |
| Dequantize Q6_K data to FP32. More... | |
| void | ck_test_gemm_q4_k (const void *weight_q4k, const float *input_f32, float *output, int rows, int cols, int n_tokens) |
| Q4_K GEMM - batched matrix multiply with quantized weights. More... | |
| void | ck_test_gemm_q5_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols, int n_tokens) |
| Q5_0 GEMM - batched matrix multiply with Q5_0 weights (32-element blocks) More... | |
| void | ck_test_gemm_q5_1 (const void *weight_q5_1, const float *input_f32, float *output, int rows, int cols, int n_tokens) |
| Q5_1 GEMM - batched matrix multiply with Q5_1 weights (32-element blocks) More... | |
| void | ck_test_gemm_q5_k (const void *weight_q5_k, const float *input_f32, float *output, int rows, int cols, int n_tokens) |
| Q5_K GEMM - batched matrix multiply with Q5_K weights (256-element super-blocks) More... | |
| void | ck_test_gemm_q6_k (const void *weight_q6k, const float *input_f32, float *output, int rows, int cols, int n_tokens) |
| Q6_K GEMM - batched matrix multiply with Q6_K weights. More... | |
| void | ck_test_gemm_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols, int n_tokens) |
| Q8_0 GEMM - batched matrix multiply with Q8_0 weights (32-element blocks) More... | |
| void | ck_test_gemv_q4_k (const void *weight_q4k, const float *input_f32, float *output, int cols) |
| Q4_K GEMV - dot product of quantized weights and FP32 input. More... | |
| void | ck_test_gemv_q5_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols) |
| Q5_0 GEMV - matrix-vector multiply with Q5_0 weights. More... | |
| void | ck_test_gemv_q5_0_q8_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols) |
| Q5_0 x Q8_0 quantized GEMV - matches llama.cpp's approach. More... | |
| void | ck_test_gemv_q5_1 (const void *weight_q5_1, const float *input_f32, float *output, int rows, int cols) |
| Q5_1 GEMV - matrix-vector multiply with Q5_1 weights (32-element blocks) More... | |
| void | ck_test_gemv_q5_k (const void *weight_q5_k, const float *input_f32, float *output, int rows, int cols) |
| Q5_K GEMV - matrix-vector multiply with Q5_K weights (256-element super-blocks) More... | |
| void | ck_test_gemv_q6_k (const void *weight_q6k, const float *input_f32, float *output, int cols) |
| Q6_K GEMV. More... | |
| void | ck_test_gemv_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols) |
| Q8_0 GEMV - matrix-vector multiply with Q8_0 weights. More... | |
| void | ck_test_gemv_q8_0_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols) |
| Q8_0 x Q8_0 quantized GEMV - matches llama.cpp's approach. More... | |
| void | ck_test_outproj_mlp_fused_q5_0 (const float *attn_out, const float *residual, const float *ln2_gamma, const void *wo, const void *w1, const void *w2, float *output, int tokens, int num_heads, int head_dim, int embed_dim, int intermediate, float eps, int w2_is_q6k) |
| Test mega-fused OutProj + MLP kernel (Q5_0 weights) More... | |
| void | ck_test_quantize_q8_k (const float *src, void *dst, int n) |
| Quantize FP32 to Q8_K (for activations) More... | |
| void | ck_test_rmsnorm (const float *input, const float *weight, float *output, int n_tokens, int dim, float eps) |
| RMSNorm. More... | |
| void | ck_test_rope (float *q, float *k, int n_tokens, int n_heads, int n_heads_kv, int head_dim, int pos_offset, float theta) |
| RoPE (Rotary Position Embedding) More... | |
| void | ck_test_rope_interleaved (float *q, float *k, int n_tokens, int n_heads, int n_heads_kv, int head_dim, int pos_offset, float theta) |
| RoPE with interleaved format (for llama.cpp compatibility) More... | |
| void | ck_test_softmax (const float *input, float *output, int n) |
| Softmax (simple, non-causal) More... | |
| void | ck_test_swiglu (const float *gate_up, float *output, int n_tokens, int intermediate_dim) |
| SwiGLU activation. More... | |
| void | ck_test_vec_dot_q5_0_q8_0 (const void *weight_q5_0, const void *input_q8_0, float *output, int cols) |
| Direct Q5_0 x Q8_0 dot product (takes pre-quantized Q8_0 input) More... | |
| void | ck_test_vec_dot_q8_0_q8_0 (const void *weight_q8_0, const void *input_q8_0, float *output, int cols) |
| Direct Q8_0 x Q8_0 dot product (takes pre-quantized Q8_0 input) More... | |
C-Kernel-Engine Parity Testing API.
Exposes individual CK kernels for parity testing against llama.cpp/ggml. This API mirrors the test-kernel-parity.cpp interface in llama.cpp.
Usage: (the example code block was not captured in this rendering — see ck_parity_api.h for the full usage example)
Definition in file ck_parity_api.h.
| #define CK_BLOCK_Q4_0_SIZE 18 |
Definition at line 36 of file ck_parity_api.h.
| #define CK_BLOCK_Q4_K_SIZE 144 |
Definition at line 33 of file ck_parity_api.h.
| #define CK_BLOCK_Q5_1_SIZE 24 |
Definition at line 38 of file ck_parity_api.h.
| #define CK_BLOCK_Q5_K_SIZE 176 |
Definition at line 37 of file ck_parity_api.h.
| #define CK_BLOCK_Q6_K_SIZE 210 |
Definition at line 34 of file ck_parity_api.h.
| #define CK_BLOCK_Q8_K_SIZE 292 |
Definition at line 35 of file ck_parity_api.h.
| #define CK_QK4_0 32 /* Elements per Q4_0 block */ |
Definition at line 29 of file ck_parity_api.h.
| #define CK_QK8_0 32 /* Elements per Q8_0 block */ |
Definition at line 30 of file ck_parity_api.h.
| #define CK_QK_K 256 /* Elements per K-quant super-block */ |
Definition at line 28 of file ck_parity_api.h.
| int ck_get_block_q4_k_size | ( | void | ) |
Get Q4_K block size in bytes.
Definition at line 961 of file ck_parity_api.c.
| int ck_get_block_q5_1_size | ( | void | ) |
Get Q5_1 block size in bytes (24 bytes per 32 weights)
Definition at line 986 of file ck_parity_api.c.
| int ck_get_block_q5_k_size | ( | void | ) |
Get Q5_K block size in bytes (176 bytes per 256 weights)
Definition at line 981 of file ck_parity_api.c.
| int ck_get_block_q6_k_size | ( | void | ) |
Get Q6_K block size in bytes.
Definition at line 966 of file ck_parity_api.c.
| int ck_get_block_q8_k_size | ( | void | ) |
Get Q8_K block size in bytes.
Definition at line 971 of file ck_parity_api.c.
| int ck_get_qk5_1 | ( | void | ) |
Get QK5_1 (elements per Q5_1 block)
Definition at line 991 of file ck_parity_api.c.
References QK5_1.
| int ck_get_qk_k | ( | void | ) |
Get QK_K (elements per super-block)
Definition at line 976 of file ck_parity_api.c.
References QK_K.
| void ck_test_attention_causal | ( | const float * | q, |
| const float * | k, | ||
| const float * | v, | ||
| float * | out, | ||
| int | num_heads, | ||
| int | num_kv_heads, | ||
| int | tokens, | ||
| int | seq_len, | ||
| int | head_dim | ||
| ) |
Multi-head causal attention for prefill (head-major layout)
Layout (head-major, matches llama.cpp test): Q: [num_heads, tokens, head_dim] K: [num_kv_heads, seq_len, head_dim] V: [num_kv_heads, seq_len, head_dim] out: [num_heads, tokens, head_dim]
Supports GQA (grouped-query attention) where num_heads > num_kv_heads. Causal masking: token t can only attend to positions 0..t (inclusive).
| q | Query [num_heads, tokens, head_dim] |
| k | Key [num_kv_heads, seq_len, head_dim] |
| v | Value [num_kv_heads, seq_len, head_dim] |
| out | Output [num_heads, tokens, head_dim] |
| num_heads | Number of query heads |
| num_kv_heads | Number of key/value heads (for GQA) |
| tokens | Number of query tokens |
| seq_len | Key/value sequence length (for prefill: seq_len == tokens) |
| head_dim | Dimension per head |
Definition at line 736 of file ck_parity_api.c.
References attention_forward_causal_head_major_gqa_flash_strided().
| void ck_test_dequant_q4_0 | ( | const void * | src, |
| float * | dst, | ||
| int | n | ||
| ) |
Dequantize Q4_0 data to FP32.
Definition at line 122 of file ck_parity_api.c.
References dequant_q4_0_row().
| void ck_test_dequant_q4_k | ( | const void * | src, |
| float * | dst, | ||
| int | n | ||
| ) |
Dequantize Q4_K data to FP32.
| src | Input Q4_K blocks |
| dst | Output FP32 values |
| n | Number of elements (must be multiple of 256) |
Definition at line 112 of file ck_parity_api.c.
References dequant_q4_k_row().
| void ck_test_dequant_q5_1 | ( | const void * | src, |
| float * | dst, | ||
| int | n | ||
| ) |
Dequantize Q5_1 data to FP32.
Definition at line 127 of file ck_parity_api.c.
References dequant_q5_1_row().
| void ck_test_dequant_q6_k | ( | const void * | src, |
| float * | dst, | ||
| int | n | ||
| ) |
Dequantize Q6_K data to FP32.
Definition at line 117 of file ck_parity_api.c.
References dequant_q6_k_row().
| void ck_test_gemm_q4_k | ( | const void * | weight_q4k, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols, | ||
| int | n_tokens | ||
| ) |
Q4_K GEMM - batched matrix multiply with quantized weights.
Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])
| weight_q4k | Q4_K quantized weights [rows, cols] |
| input_f32 | FP32 input [n_tokens, cols] |
| output | FP32 output [n_tokens, rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 256) |
| n_tokens | Batch size |
Definition at line 392 of file ck_parity_api.c.
References CK_QK_K, gemm_nt_q4_k_q8_k(), and quantize_row_q8_k().
| void ck_test_gemm_q5_0 | ( | const void * | weight_q5_0, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols, | ||
| int | n_tokens | ||
| ) |
Q5_0 GEMM - batched matrix multiply with Q5_0 weights (32-element blocks)
Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])
| weight_q5_0 | Q5_0 quantized weights [rows, cols] |
| input_f32 | FP32 input [n_tokens, cols] |
| output | FP32 output [n_tokens, rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 32) |
| n_tokens | Batch size |
Q5_0 GEMM - batched matrix multiply with Q5_0 weights (32-element blocks)
Used for MLP W1 (gate/up projection) and attention Q/K with Q5_0 weights.
Definition at line 491 of file ck_parity_api.c.
References CK_QK8_0, gemm_nt_q5_0_q8_0(), and quantize_row_q8_0().
| void ck_test_gemm_q5_1 | ( | const void * | weight_q5_1, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols, | ||
| int | n_tokens | ||
| ) |
Q5_1 GEMM - batched matrix multiply with Q5_1 weights (32-element blocks)
Computes: output[t,r] = sum_k(weight[r,k] * input[t,k]) Uses Q8_0 for activations.
| weight_q5_1 | Q5_1 quantized weights [rows, cols] |
| input_f32 | FP32 input [n_tokens, cols] |
| output | FP32 output [n_tokens, rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 32) |
| n_tokens | Batch size |
Q5_1 GEMM - batched matrix multiply with Q5_1 weights (32-element blocks)
Used for MLP W1 (gate/up projection) and attention Q/K with Q5_1 weights. gemm_nt_q5_1 expects FP32 activations (not quantized).
Definition at line 542 of file ck_parity_api.c.
References gemm_nt_q5_1().
| void ck_test_gemm_q5_k | ( | const void * | weight_q5_k, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols, | ||
| int | n_tokens | ||
| ) |
Q5_K GEMM - batched matrix multiply with Q5_K weights (256-element super-blocks)
Computes: output[t,r] = sum_k(weight[r,k] * input[t,k]) Uses Q8_K for activations.
| weight_q5_k | Q5_K quantized weights [rows, cols] |
| input_f32 | FP32 input [n_tokens, cols] |
| output | FP32 output [n_tokens, rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 256) |
| n_tokens | Batch size |
Q5_K GEMM - batched matrix multiply with Q5_K weights (256-element super-blocks)
Used for MLP W1 (gate/up projection) and attention Q/K with Q5_K weights. gemm_nt_q5_k expects FP32 activations (not quantized).
Definition at line 525 of file ck_parity_api.c.
References gemm_nt_q5_k().
| void ck_test_gemm_q6_k | ( | const void * | weight_q6k, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols, | ||
| int | n_tokens | ||
| ) |
Q6_K GEMM - batched matrix multiply with Q6_K weights.
Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])
| weight_q6k | Q6_K quantized weights [rows, cols] |
| input_f32 | FP32 input [n_tokens, cols] |
| output | FP32 output [n_tokens, rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 256) |
| n_tokens | Batch size |
Q6_K GEMM - batched matrix multiply with Q6_K weights.
Used for MLP W2 (down projection) with Q6_K weights.
Definition at line 425 of file ck_parity_api.c.
References CK_QK_K, gemm_nt_q6_k_q8_k(), and quantize_row_q8_k().
| void ck_test_gemm_q8_0 | ( | const void * | weight_q8_0, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols, | ||
| int | n_tokens | ||
| ) |
Q8_0 GEMM - batched matrix multiply with Q8_0 weights (32-element blocks)
Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])
| weight_q8_0 | Q8_0 quantized weights [rows, cols] |
| input_f32 | FP32 input [n_tokens, cols] |
| output | FP32 output [n_tokens, rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 32) |
| n_tokens | Batch size |
Q8_0 GEMM - batched matrix multiply with Q8_0 weights (32-element blocks)
Used for attention V projection with Q8_0 weights.
Definition at line 458 of file ck_parity_api.c.
References CK_QK8_0, gemm_nt_q8_0_q8_0(), and quantize_row_q8_0().
| void ck_test_gemv_q4_k | ( | const void * | weight_q4k, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | cols | ||
| ) |
Q4_K GEMV - dot product of quantized weights and FP32 input.
Internally quantizes input to Q8_K, then computes dot product.
| weight_q4k | Q4_K quantized weights [cols] |
| input_f32 | FP32 input vector [cols] |
| output | Output scalar [1] |
| cols | Number of columns (must be multiple of 256) |
Definition at line 145 of file ck_parity_api.c.
References CK_QK_K, gemv_q4_k_q8_k(), and quantize_row_q8_k().
| void ck_test_gemv_q5_0 | ( | const void * | weight_q5_0, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols | ||
| ) |
Q5_0 GEMV - matrix-vector multiply with Q5_0 weights.
| weight_q5_0 | Q5_0 quantized weights [rows * cols] |
| input_f32 | FP32 input vector [cols] |
| output | FP32 output vector [rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 32) |
Definition at line 192 of file ck_parity_api.c.
References CK_QK8_0, gemv_q5_0_q8_0(), and quantize_row_q8_0().
| void ck_test_gemv_q5_0_q8_0 | ( | const void * | weight_q5_0, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols | ||
| ) |
Q5_0 x Q8_0 quantized GEMV - matches llama.cpp's approach.
This version quantizes the input to Q8_0 first, then uses integer dot products (like llama.cpp does). Use this for parity testing.
| weight_q5_0 | Q5_0 quantized weights [rows * cols] |
| input_f32 | FP32 input vector [cols] - will be quantized to Q8_0 |
| output | FP32 output vector [rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 32) |
Definition at line 248 of file ck_parity_api.c.
References CK_QK8_0, gemv_q5_0_q8_0(), and quantize_row_q8_0().
| void ck_test_gemv_q5_1 | ( | const void * | weight_q5_1, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols | ||
| ) |
Q5_1 GEMV - matrix-vector multiply with Q5_1 weights (32-element blocks)
Uses Q8_0 for activations (like Q5_0). NOTE(review): the References list below shows only gemv_q5_1() and QK5_1, with no quantize_row_q8_0() step — confirm whether this path actually quantizes activations or takes FP32 directly.
| weight_q5_1 | Q5_1 quantized weights [rows * cols] |
| input_f32 | FP32 input vector [cols] |
| output | FP32 output vector [rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 32) |
Definition at line 333 of file ck_parity_api.c.
References gemv_q5_1(), and QK5_1.
| void ck_test_gemv_q5_k | ( | const void * | weight_q5_k, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols | ||
| ) |
Q5_K GEMV - matrix-vector multiply with Q5_K weights (256-element super-blocks)
Uses Q8_K for activations (like Q4_K). NOTE(review): the References list below shows only gemv_q5_k(), with no quantize_row_q8_k() step — confirm whether this path actually quantizes activations or takes FP32 directly.
| weight_q5_k | Q5_K quantized weights [rows * cols] |
| input_f32 | FP32 input vector [cols] |
| output | FP32 output vector [rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 256) |
Definition at line 301 of file ck_parity_api.c.
References CK_QK_K, and gemv_q5_k().
| void ck_test_gemv_q6_k | ( | const void * | weight_q6k, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | cols | ||
| ) |
| void ck_test_gemv_q8_0 | ( | const void * | weight_q8_0, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols | ||
| ) |
Q8_0 GEMV - matrix-vector multiply with Q8_0 weights.
| weight_q8_0 | Q8_0 quantized weights [rows * cols] |
| input_f32 | FP32 input vector [cols] |
| output | FP32 output vector [rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 32) |
Definition at line 220 of file ck_parity_api.c.
References CK_QK8_0, gemv_q8_0_q8_0(), and quantize_row_q8_0().
| void ck_test_gemv_q8_0_q8_0 | ( | const void * | weight_q8_0, |
| const float * | input_f32, | ||
| float * | output, | ||
| int | rows, | ||
| int | cols | ||
| ) |
Q8_0 x Q8_0 quantized GEMV - matches llama.cpp's approach.
This version quantizes the input to Q8_0 first, then uses integer dot products (like llama.cpp does). Use this for parity testing.
| weight_q8_0 | Q8_0 quantized weights [rows * cols] |
| input_f32 | FP32 input vector [cols] - will be quantized to Q8_0 |
| output | FP32 output vector [rows] |
| rows | Number of output rows |
| cols | Number of columns (must be multiple of 32) |
Definition at line 274 of file ck_parity_api.c.
References CK_QK8_0, gemv_q8_0_q8_0(), and quantize_row_q8_0().
| void ck_test_outproj_mlp_fused_q5_0 | ( | const float * | attn_out, |
| const float * | residual, | ||
| const float * | ln2_gamma, | ||
| const void * | wo, | ||
| const void * | w1, | ||
| const void * | w2, | ||
| float * | output, | ||
| int | tokens, | ||
| int | num_heads, | ||
| int | head_dim, | ||
| int | embed_dim, | ||
| int | intermediate, | ||
| float | eps, | ||
| int | w2_is_q6k | ||
| ) |
Test mega-fused OutProj + MLP kernel (Q5_0 weights)
This tests the mega_fused_outproj_mlp_prefill kernel, which fuses several pipeline stages (the itemized stage list was not captured in this rendering — see ck_parity_api.h):
| attn_out | Attention output [num_heads, tokens, head_dim] (FP32, head-major) |
| residual | Residual input [tokens, embed_dim] (FP32) |
| ln2_gamma | RMSNorm gamma [embed_dim] (FP32) |
| wo | OutProj weights [embed_dim, embed_dim] (Q5_0) |
| w1 | MLP W1 weights [2*intermediate, embed_dim] (Q5_0) |
| w2 | MLP W2 weights [embed_dim, intermediate] (Q4_K or Q6_K) |
| output | Output [tokens, embed_dim] (FP32) |
| tokens | Number of tokens |
| num_heads | Number of attention heads |
| head_dim | Dimension per head |
| embed_dim | Embedding dimension (= num_heads * head_dim) |
| intermediate | MLP intermediate dimension |
| eps | RMSNorm epsilon |
| w2_is_q6k | If true, W2 is Q6_K; if false, W2 is Q4_K |
This is a simplified wrapper for parity testing (the itemized list of simplifications was not captured in this rendering — see ck_parity_api.c):
| attn_out | Attention output [num_heads, tokens, head_dim] (FP32, head-major) |
| residual | Residual input [tokens, embed_dim] (FP32) |
| ln2_gamma | RMSNorm gamma [embed_dim] (FP32) |
| wo | OutProj weights [embed_dim, embed_dim] (Q5_0) |
| w1 | MLP W1 weights [2*intermediate, embed_dim] (Q5_0) |
| w2 | MLP W2 weights [embed_dim, intermediate] (Q4_K or Q6_K) |
| output | Output [tokens, embed_dim] (FP32) |
| tokens | Number of tokens |
| num_heads | Number of attention heads |
| head_dim | Dimension per head |
| embed_dim | Embedding dimension (= num_heads * head_dim) |
| intermediate | MLP intermediate dimension |
| eps | RMSNorm epsilon |
| w2_is_q6k | If true, W2 is Q6_K; if false, W2 is Q4_K |
Definition at line 894 of file ck_parity_api.c.
References mega_fused_outproj_mlp_prefill(), and mega_fused_outproj_mlp_prefill_scratch_size().
| void ck_test_quantize_q8_k | ( | const float * | src, |
| void * | dst, | ||
| int | n | ||
| ) |
Quantize FP32 to Q8_K (for activations)
| src | Input FP32 values |
| dst | Output Q8_K blocks |
| n | Number of elements (must be multiple of 256) |
Definition at line 136 of file ck_parity_api.c.
References quantize_row_q8_k().
| void ck_test_rmsnorm | ( | const float * | input, |
| const float * | weight, | ||
| float * | output, | ||
| int | n_tokens, | ||
| int | dim, | ||
| float | eps | ||
| ) |
RMSNorm.
Computes: output = (input / rms(input)) * weight where rms(x) = sqrt(mean(x^2) + eps)
| input | Input tensor [n_tokens, dim] |
| weight | Normalization weights [dim] |
| output | Output tensor [n_tokens, dim] |
| n_tokens | Number of tokens |
| dim | Hidden dimension |
| eps | Epsilon for numerical stability |
Definition at line 557 of file ck_parity_api.c.
References rmsnorm_forward().
| void ck_test_rope | ( | float * | q, |
| float * | k, | ||
| int | n_tokens, | ||
| int | n_heads, | ||
| int | n_heads_kv, | ||
| int | head_dim, | ||
| int | pos_offset, | ||
| float | theta | ||
| ) |
RoPE (Rotary Position Embedding)
Applies rotary position embeddings to Q and K tensors.
NOTE: CK uses rotate-half format (split first/second halves) while some implementations use interleaved format. The test harness should account for this.
| q | Query tensor [n_tokens, n_heads * head_dim], modified in-place |
| k | Key tensor [n_tokens, n_heads_kv * head_dim], modified in-place |
| n_tokens | Number of tokens |
| n_heads | Number of query heads |
| n_heads_kv | Number of key/value heads |
| head_dim | Dimension per head |
| pos_offset | Starting position for RoPE |
| theta | RoPE base frequency (typically 10000.0) |
Definition at line 567 of file ck_parity_api.c.
References rope_forward_qk(), and rope_precompute_cache().
| void ck_test_rope_interleaved | ( | float * | q, |
| float * | k, | ||
| int | n_tokens, | ||
| int | n_heads, | ||
| int | n_heads_kv, | ||
| int | head_dim, | ||
| int | pos_offset, | ||
| float | theta | ||
| ) |
RoPE with interleaved format (for llama.cpp compatibility)
Uses interleaved format: (x0, x1) -> (x0*cos - x1*sin, x0*sin + x1*cos)
Definition at line 644 of file ck_parity_api.c.
| void ck_test_softmax | ( | const float * | input, |
| float * | output, | ||
| int | n | ||
| ) |
Softmax (simple, non-causal)
Computes: output[i] = exp(input[i]) / sum(exp(input))
| input | Input tensor [n] |
| output | Output tensor [n] |
| n | Number of elements |
Definition at line 710 of file ck_parity_api.c.
| void ck_test_swiglu | ( | const float * | gate_up, |
| float * | output, | ||
| int | n_tokens, | ||
| int | intermediate_dim | ||
| ) |
SwiGLU activation.
Computes: output = SiLU(gate) * up where SiLU(x) = x * sigmoid(x)
| gate_up | Input tensor [n_tokens, 2 * intermediate_dim] Layout: [gate_0..gate_D-1, up_0..up_D-1] per token |
| output | Output tensor [n_tokens, intermediate_dim] |
| n_tokens | Number of tokens |
| intermediate_dim | Intermediate dimension |
Definition at line 703 of file ck_parity_api.c.
References swiglu_forward().
| void ck_test_vec_dot_q5_0_q8_0 | ( | const void * | weight_q5_0, |
| const void * | input_q8_0, | ||
| float * | output, | ||
| int | cols | ||
| ) |
Direct Q5_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)
This is a "direct" test that bypasses FP32-to-Q8_0 conversion. Useful for isolating kernel bugs from quantization bugs.
| weight_q5_0 | Q5_0 quantized weights [cols] |
| input_q8_0 | Q8_0 quantized input [cols] (pre-quantized!) |
| output | Output scalar [1] |
| cols | Number of elements (must be multiple of 32) |
Direct Q5_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)
This is a "direct" test that bypasses FP32-to-Q8_0 conversion. Useful for isolating kernel bugs from quantization bugs.
| weight_q5_0 | Q5_0 quantized weights [cols] |
| input_q8_0 | Q8_0 quantized input [cols] (pre-quantized!) |
| output | Output scalar [1] |
| cols | Number of elements (must be multiple of 32) |
Definition at line 364 of file ck_parity_api.c.
References vec_dot_q5_0_q8_0().
| void ck_test_vec_dot_q8_0_q8_0 | ( | const void * | weight_q8_0, |
| const void * | input_q8_0, | ||
| float * | output, | ||
| int | cols | ||
| ) |
Direct Q8_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)
| weight_q8_0 | Q8_0 quantized weights [cols] |
| input_q8_0 | Q8_0 quantized input [cols] (pre-quantized!) |
| output | Output scalar [1] |
| cols | Number of elements (must be multiple of 32) |
Direct Q8_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)
| weight_q8_0 | Q8_0 quantized weights [cols] |
| input_q8_0 | Q8_0 quantized input [cols] (pre-quantized!) |
| output | Output scalar [1] |
| cols | Number of elements (must be multiple of 32) |
Definition at line 380 of file ck_parity_api.c.
References vec_dot_q8_0_q8_0().