← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ck_parity_api.h File Reference

C-Kernel-Engine Parity Testing API. More...

#include <stddef.h>
#include <stdint.h>

Go to the source code of this file.

Macros

#define CK_BLOCK_Q4_0_SIZE   18
 
#define CK_BLOCK_Q4_K_SIZE   144
 
#define CK_BLOCK_Q5_1_SIZE   24
 
#define CK_BLOCK_Q5_K_SIZE   176
 
#define CK_BLOCK_Q6_K_SIZE   210
 
#define CK_BLOCK_Q8_K_SIZE   292
 
#define CK_QK4_0   32 /* Elements per Q4_0 block */
 
#define CK_QK8_0   32 /* Elements per Q8_0 block */
 
#define CK_QK_K   256 /* Elements per K-quant super-block */
 

Functions

int ck_get_block_q4_k_size (void)
 Get Q4_K block size in bytes. More...
 
int ck_get_block_q5_1_size (void)
 Get Q5_1 block size in bytes (24 bytes per 32 weights) More...
 
int ck_get_block_q5_k_size (void)
 Get Q5_K block size in bytes (176 bytes per 256 weights) More...
 
int ck_get_block_q6_k_size (void)
 Get Q6_K block size in bytes. More...
 
int ck_get_block_q8_k_size (void)
 Get Q8_K block size in bytes. More...
 
int ck_get_qk5_1 (void)
 Get QK5_1 (elements per Q5_1 block) More...
 
int ck_get_qk_k (void)
 Get QK_K (elements per super-block) More...
 
void ck_test_attention_causal (const float *q, const float *k, const float *v, float *out, int num_heads, int num_kv_heads, int tokens, int seq_len, int head_dim)
 Multi-head causal attention for prefill (head-major layout) More...
 
void ck_test_dequant_q4_0 (const void *src, float *dst, int n)
 Dequantize Q4_0 data to FP32. More...
 
void ck_test_dequant_q4_k (const void *src, float *dst, int n)
 Dequantize Q4_K data to FP32. More...
 
void ck_test_dequant_q5_1 (const void *src, float *dst, int n)
 Dequantize Q5_1 data to FP32. More...
 
void ck_test_dequant_q6_k (const void *src, float *dst, int n)
 Dequantize Q6_K data to FP32. More...
 
void ck_test_gemm_q4_k (const void *weight_q4k, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q4_K GEMM - batched matrix multiply with quantized weights. More...
 
void ck_test_gemm_q5_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q5_0 GEMM - batched matrix multiply with Q5_0 weights (32-element blocks) More...
 
void ck_test_gemm_q5_1 (const void *weight_q5_1, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q5_1 GEMM - batched matrix multiply with Q5_1 weights (32-element blocks) More...
 
void ck_test_gemm_q5_k (const void *weight_q5_k, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q5_K GEMM - batched matrix multiply with Q5_K weights (256-element super-blocks) More...
 
void ck_test_gemm_q6_k (const void *weight_q6k, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q6_K GEMM - batched matrix multiply with Q6_K weights. More...
 
void ck_test_gemm_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q8_0 GEMM - batched matrix multiply with Q8_0 weights (32-element blocks) More...
 
void ck_test_gemv_q4_k (const void *weight_q4k, const float *input_f32, float *output, int cols)
 Q4_K GEMV - dot product of quantized weights and FP32 input. More...
 
void ck_test_gemv_q5_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols)
 Q5_0 GEMV - matrix-vector multiply with Q5_0 weights. More...
 
void ck_test_gemv_q5_0_q8_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols)
 Q5_0 x Q8_0 quantized GEMV - matches llama.cpp's approach. More...
 
void ck_test_gemv_q5_1 (const void *weight_q5_1, const float *input_f32, float *output, int rows, int cols)
 Q5_1 GEMV - matrix-vector multiply with Q5_1 weights (32-element blocks) More...
 
void ck_test_gemv_q5_k (const void *weight_q5_k, const float *input_f32, float *output, int rows, int cols)
 Q5_K GEMV - matrix-vector multiply with Q5_K weights (256-element super-blocks) More...
 
void ck_test_gemv_q6_k (const void *weight_q6k, const float *input_f32, float *output, int cols)
 Q6_K GEMV. More...
 
void ck_test_gemv_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols)
 Q8_0 GEMV - matrix-vector multiply with Q8_0 weights. More...
 
void ck_test_gemv_q8_0_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols)
 Q8_0 x Q8_0 quantized GEMV - matches llama.cpp's approach. More...
 
void ck_test_outproj_mlp_fused_q5_0 (const float *attn_out, const float *residual, const float *ln2_gamma, const void *wo, const void *w1, const void *w2, float *output, int tokens, int num_heads, int head_dim, int embed_dim, int intermediate, float eps, int w2_is_q6k)
 Test mega-fused OutProj + MLP kernel (Q5_0 weights) More...
 
void ck_test_quantize_q8_k (const float *src, void *dst, int n)
 Quantize FP32 to Q8_K (for activations) More...
 
void ck_test_rmsnorm (const float *input, const float *weight, float *output, int n_tokens, int dim, float eps)
 RMSNorm. More...
 
void ck_test_rope (float *q, float *k, int n_tokens, int n_heads, int n_heads_kv, int head_dim, int pos_offset, float theta)
 RoPE (Rotary Position Embedding) More...
 
void ck_test_rope_interleaved (float *q, float *k, int n_tokens, int n_heads, int n_heads_kv, int head_dim, int pos_offset, float theta)
 RoPE with interleaved format (for llama.cpp compatibility) More...
 
void ck_test_softmax (const float *input, float *output, int n)
 Softmax (simple, non-causal) More...
 
void ck_test_swiglu (const float *gate_up, float *output, int n_tokens, int intermediate_dim)
 SwiGLU activation. More...
 
void ck_test_vec_dot_q5_0_q8_0 (const void *weight_q5_0, const void *input_q8_0, float *output, int cols)
 Direct Q5_0 x Q8_0 dot product (takes pre-quantized Q8_0 input) More...
 
void ck_test_vec_dot_q8_0_q8_0 (const void *weight_q8_0, const void *input_q8_0, float *output, int cols)
 Direct Q8_0 x Q8_0 dot product (takes pre-quantized Q8_0 input) More...
 

Detailed Description

C-Kernel-Engine Parity Testing API.

Exposes individual CK kernels for parity testing against llama.cpp/ggml. This API mirrors the test-kernel-parity.cpp interface in llama.cpp.

Usage:

  1. Build as shared library: libck_parity.so
  2. Load from Python using ctypes
  3. Call functions with matching signatures to test-kernel-parity.cpp

Definition in file ck_parity_api.h.

Macro Definition Documentation

◆ CK_BLOCK_Q4_0_SIZE

#define CK_BLOCK_Q4_0_SIZE   18

Definition at line 36 of file ck_parity_api.h.

◆ CK_BLOCK_Q4_K_SIZE

#define CK_BLOCK_Q4_K_SIZE   144

Definition at line 33 of file ck_parity_api.h.

◆ CK_BLOCK_Q5_1_SIZE

#define CK_BLOCK_Q5_1_SIZE   24

Definition at line 38 of file ck_parity_api.h.

◆ CK_BLOCK_Q5_K_SIZE

#define CK_BLOCK_Q5_K_SIZE   176

Definition at line 37 of file ck_parity_api.h.

◆ CK_BLOCK_Q6_K_SIZE

#define CK_BLOCK_Q6_K_SIZE   210

Definition at line 34 of file ck_parity_api.h.

◆ CK_BLOCK_Q8_K_SIZE

#define CK_BLOCK_Q8_K_SIZE   292

Definition at line 35 of file ck_parity_api.h.

◆ CK_QK4_0

#define CK_QK4_0   32 /* Elements per Q4_0 block */

Definition at line 29 of file ck_parity_api.h.

◆ CK_QK8_0

#define CK_QK8_0   32 /* Elements per Q8_0 block */

Definition at line 30 of file ck_parity_api.h.

◆ CK_QK_K

#define CK_QK_K   256 /* Elements per K-quant super-block */

Definition at line 28 of file ck_parity_api.h.

Function Documentation

◆ ck_get_block_q4_k_size()

int ck_get_block_q4_k_size ( void  )

Get Q4_K block size in bytes.

Definition at line 961 of file ck_parity_api.c.

962 {
963  return sizeof(block_q4_K);
964 }

◆ ck_get_block_q5_1_size()

int ck_get_block_q5_1_size ( void  )

Get Q5_1 block size in bytes (24 bytes per 32 weights)

Definition at line 986 of file ck_parity_api.c.

987 {
988  return sizeof(block_q5_1);
989 }

◆ ck_get_block_q5_k_size()

int ck_get_block_q5_k_size ( void  )

Get Q5_K block size in bytes (176 bytes per 256 weights)

Definition at line 981 of file ck_parity_api.c.

982 {
983  return sizeof(block_q5_K);
984 }

◆ ck_get_block_q6_k_size()

int ck_get_block_q6_k_size ( void  )

Get Q6_K block size in bytes.

Definition at line 966 of file ck_parity_api.c.

967 {
968  return sizeof(block_q6_K);
969 }

◆ ck_get_block_q8_k_size()

int ck_get_block_q8_k_size ( void  )

Get Q8_K block size in bytes.

Definition at line 971 of file ck_parity_api.c.

972 {
973  return sizeof(block_q8_K);
974 }

◆ ck_get_qk5_1()

int ck_get_qk5_1 ( void  )

Get QK5_1 (elements per Q5_1 block)

Definition at line 991 of file ck_parity_api.c.

992 {
993  return QK5_1;
994 }
#define QK5_1
Definition: ckernel_quant.h:84

References QK5_1.

◆ ck_get_qk_k()

int ck_get_qk_k ( void  )

Get QK_K (elements per super-block)

Definition at line 976 of file ck_parity_api.c.

977 {
978  return QK_K;
979 }
#define QK_K

References QK_K.

◆ ck_test_attention_causal()

void ck_test_attention_causal ( const float *  q,
const float *  k,
const float *  v,
float *  out,
int  num_heads,
int  num_kv_heads,
int  tokens,
int  seq_len,
int  head_dim 
)

Multi-head causal attention for prefill (head-major layout)

Layout (head-major, matches llama.cpp test): Q: [num_heads, tokens, head_dim] K: [num_kv_heads, seq_len, head_dim] V: [num_kv_heads, seq_len, head_dim] out: [num_heads, tokens, head_dim]

Supports GQA (grouped-query attention) where num_heads > num_kv_heads. Causal masking: token t can only attend to positions 0..t (inclusive).

Parameters
q — Query [num_heads, tokens, head_dim]
k — Key [num_kv_heads, seq_len, head_dim]
v — Value [num_kv_heads, seq_len, head_dim]
out — Output [num_heads, tokens, head_dim]
num_heads — Number of query heads
num_kv_heads — Number of key/value heads (for GQA)
tokens — Number of query tokens
seq_len — Key/value sequence length (for prefill: seq_len == tokens)
head_dim — Dimension per head

Definition at line 736 of file ck_parity_api.c.

745 {
746  /* For prefill, seq_len == tokens, and kv_stride == tokens.
747  * The CK kernel expects strided KV layout with kv_stride_tokens parameter.
748  * For parity testing with contiguous tensors, kv_stride = seq_len.
749  */
751  q, k, v, out,
752  num_heads, num_kv_heads, tokens,
753  head_dim, head_dim, /* aligned_head_dim = head_dim for testing */
754  seq_len /* kv_stride_tokens = seq_len for contiguous KV */
755  );
756 }
void attention_forward_causal_head_major_gqa_flash_strided(const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens)

References attention_forward_causal_head_major_gqa_flash_strided().

◆ ck_test_dequant_q4_0()

void ck_test_dequant_q4_0 ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q4_0 data to FP32.

Definition at line 122 of file ck_parity_api.c.

123 {
124  dequant_q4_0_row(src, dst, (size_t)n);
125 }
void dequant_q4_0_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_0 row (multiple blocks)

References dequant_q4_0_row().

◆ ck_test_dequant_q4_k()

void ck_test_dequant_q4_k ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q4_K data to FP32.

Parameters
src — Input Q4_K blocks
dst — Output FP32 values
n — Number of elements (must be multiple of 256)

Definition at line 112 of file ck_parity_api.c.

113 {
114  dequant_q4_k_row(src, dst, (size_t)n);
115 }
void dequant_q4_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_K row (multiple blocks)

References dequant_q4_k_row().

◆ ck_test_dequant_q5_1()

void ck_test_dequant_q5_1 ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q5_1 data to FP32.

Definition at line 127 of file ck_parity_api.c.

128 {
129  dequant_q5_1_row(src, dst, (size_t)n);
130 }
void dequant_q5_1_row(const void *src, float *dst, size_t n_elements)
Dequantize Q5_1 row (multiple blocks)

References dequant_q5_1_row().

◆ ck_test_dequant_q6_k()

void ck_test_dequant_q6_k ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q6_K data to FP32.

Definition at line 117 of file ck_parity_api.c.

118 {
119  dequant_q6_k_row(src, dst, (size_t)n);
120 }
void dequant_q6_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q6_K row (multiple blocks)

References dequant_q6_k_row().

◆ ck_test_gemm_q4_k()

void ck_test_gemm_q4_k ( const void *  weight_q4k,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q4_K GEMM - batched matrix multiply with quantized weights.

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])

Parameters
weight_q4k — Q4_K quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 256)
n_tokens — Batch size

Definition at line 392 of file ck_parity_api.c.

396 {
397  /* Allocate Q8_K buffer for quantized activations */
398  int n_blocks_per_row = cols / CK_QK_K;
399  block_q8_K *q8_data = (block_q8_K *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_K));
400  if (!q8_data) {
401  memset(output, 0, n_tokens * rows * sizeof(float));
402  return;
403  }
404 
405  /* Quantize all input tokens */
406  for (int t = 0; t < n_tokens; t++) {
407  quantize_row_q8_k(input_f32 + t * cols,
408  q8_data + t * n_blocks_per_row, cols);
409  }
410 
411  /* Use gemm_nt_q4_k_q8_k: C[M,N] = A[M,K] * B[N,K]^T
412  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
413  * So: M = n_tokens, N = rows, K = cols
414  */
415  gemm_nt_q4_k_q8_k(q8_data, weight_q4k, NULL, output, n_tokens, rows, cols);
416 
417  free(q8_data);
418 }
void quantize_row_q8_k(const float *x, void *vy, int k)
void gemm_nt_q4_k_q8_k(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
#define CK_QK_K
Definition: ck_parity_api.h:28

References CK_QK_K, gemm_nt_q4_k_q8_k(), and quantize_row_q8_k().

◆ ck_test_gemm_q5_0()

void ck_test_gemm_q5_0 ( const void *  weight_q5_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q5_0 GEMM - batched matrix multiply with Q5_0 weights (32-element blocks)

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])

Parameters
weight_q5_0 — Q5_0 quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)
n_tokens — Batch size

Q5_0 GEMM - batched matrix multiply with Q5_0 weights (32-element blocks)

Used for MLP W1 (gate/up projection) and attention Q/K with Q5_0 weights.

Definition at line 491 of file ck_parity_api.c.

495 {
496  /* Allocate Q8_0 buffer for quantized activations */
497  int n_blocks_per_row = cols / CK_QK8_0;
498  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_0));
499  if (!q8_data) {
500  memset(output, 0, n_tokens * rows * sizeof(float));
501  return;
502  }
503 
504  /* Quantize all input tokens */
505  for (int t = 0; t < n_tokens; t++) {
506  quantize_row_q8_0(input_f32 + t * cols,
507  q8_data + t * n_blocks_per_row, cols);
508  }
509 
510  /* Use gemm_nt_q5_0_q8_0: C[M,N] = A[M,K] * B[N,K]^T
511  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
512  * So: M = n_tokens, N = rows, K = cols
513  */
514  gemm_nt_q5_0_q8_0(q8_data, weight_q5_0, NULL, output, n_tokens, rows, cols);
515 
516  free(q8_data);
517 }
void gemm_nt_q5_0_q8_0(const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)
Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.
void quantize_row_q8_0(const float *x, void *vy, int k)
Quantize FP32 to Q8_0 format (scalar reference)
#define CK_QK8_0
Definition: ck_parity_api.h:30

References CK_QK8_0, gemm_nt_q5_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemm_q5_1()

void ck_test_gemm_q5_1 ( const void *  weight_q5_1,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q5_1 GEMM - batched matrix multiply with Q5_1 weights (32-element blocks)

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k]) Uses Q8_0 for activations.

Parameters
weight_q5_1 — Q5_1 quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)
n_tokens — Batch size

Q5_1 GEMM - batched matrix multiply with Q5_1 weights (32-element blocks)

Used for MLP W1 (gate/up projection) and attention Q/K with Q5_1 weights. gemm_nt_q5_1 expects FP32 activations (not quantized).

Definition at line 542 of file ck_parity_api.c.

546 {
547  /* gemm_nt_q5_1 expects FP32 activations, not quantized.
548  * Pass input_f32 directly as-is (already FP32).
549  */
550  gemm_nt_q5_1(input_f32, weight_q5_1, NULL, output, n_tokens, rows, cols);
551 }
void gemm_nt_q5_1(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
GEMM with transposed Q5_1 weights: C = A @ B^T.

References gemm_nt_q5_1().

◆ ck_test_gemm_q5_k()

void ck_test_gemm_q5_k ( const void *  weight_q5_k,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q5_K GEMM - batched matrix multiply with Q5_K weights (256-element super-blocks)

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k]) Uses Q8_K for activations.

Parameters
weight_q5_k — Q5_K quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 256)
n_tokens — Batch size

Q5_K GEMM - batched matrix multiply with Q5_K weights (256-element super-blocks)

Used for MLP W1 (gate/up projection) and attention Q/K with Q5_K weights. gemm_nt_q5_k expects FP32 activations (not quantized).

Definition at line 525 of file ck_parity_api.c.

529 {
530  /* gemm_nt_q5_k expects FP32 activations, not quantized.
531  * Pass input_f32 directly as-is (already FP32).
532  */
533  gemm_nt_q5_k(input_f32, weight_q5_k, NULL, output, n_tokens, rows, cols);
534 }
void gemm_nt_q5_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

References gemm_nt_q5_k().

◆ ck_test_gemm_q6_k()

void ck_test_gemm_q6_k ( const void *  weight_q6k,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q6_K GEMM - batched matrix multiply with Q6_K weights.

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])

Parameters
weight_q6k — Q6_K quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 256)
n_tokens — Batch size

Q6_K GEMM - batched matrix multiply with Q6_K weights.

Used for MLP W2 (down projection) with Q6_K weights.

Definition at line 425 of file ck_parity_api.c.

429 {
430  /* Allocate Q8_K buffer for quantized activations */
431  int n_blocks_per_row = cols / CK_QK_K;
432  block_q8_K *q8_data = (block_q8_K *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_K));
433  if (!q8_data) {
434  memset(output, 0, n_tokens * rows * sizeof(float));
435  return;
436  }
437 
438  /* Quantize all input tokens */
439  for (int t = 0; t < n_tokens; t++) {
440  quantize_row_q8_k(input_f32 + t * cols,
441  q8_data + t * n_blocks_per_row, cols);
442  }
443 
444  /* Use gemm_nt_q6_k_q8_k: C[M,N] = A[M,K] * B[N,K]^T
445  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
446  * So: M = n_tokens, N = rows, K = cols
447  */
448  gemm_nt_q6_k_q8_k(q8_data, weight_q6k, NULL, output, n_tokens, rows, cols);
449 
450  free(q8_data);
451 }
void gemm_nt_q6_k_q8_k(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K.

References CK_QK_K, gemm_nt_q6_k_q8_k(), and quantize_row_q8_k().

◆ ck_test_gemm_q8_0()

void ck_test_gemm_q8_0 ( const void *  weight_q8_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q8_0 GEMM - batched matrix multiply with Q8_0 weights (32-element blocks)

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])

Parameters
weight_q8_0 — Q8_0 quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)
n_tokens — Batch size

Q8_0 GEMM - batched matrix multiply with Q8_0 weights (32-element blocks)

Used for attention V projection with Q8_0 weights.

Definition at line 458 of file ck_parity_api.c.

462 {
463  /* Allocate Q8_0 buffer for quantized activations */
464  int n_blocks_per_row = cols / CK_QK8_0;
465  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_0));
466  if (!q8_data) {
467  memset(output, 0, n_tokens * rows * sizeof(float));
468  return;
469  }
470 
471  /* Quantize all input tokens */
472  for (int t = 0; t < n_tokens; t++) {
473  quantize_row_q8_0(input_f32 + t * cols,
474  q8_data + t * n_blocks_per_row, cols);
475  }
476 
477  /* Use gemm_nt_q8_0_q8_0: C[M,N] = A[M,K] * B[N,K]^T
478  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
479  * So: M = n_tokens, N = rows, K = cols
480  */
481  gemm_nt_q8_0_q8_0(q8_data, weight_q8_0, NULL, output, n_tokens, rows, cols);
482 
483  free(q8_data);
484 }
void gemm_nt_q8_0_q8_0(const void *A_q8, const void *B_q8, const float *bias, float *C, int M, int N, int K)
gemm_nt_q8_0_q8_0 with optional bias (matches header signature)

References CK_QK8_0, gemm_nt_q8_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q4_k()

void ck_test_gemv_q4_k ( const void *  weight_q4k,
const float *  input_f32,
float *  output,
int  cols 
)

Q4_K GEMV - dot product of quantized weights and FP32 input.

Internally quantizes input to Q8_K, then computes dot product.

Parameters
weight_q4k — Q4_K quantized weights [cols]
input_f32 — FP32 input vector [cols]
output — Output scalar [1]
cols — Number of columns (must be multiple of 256)

Definition at line 145 of file ck_parity_api.c.

149 {
150  /* Allocate Q8_K buffer for quantized activations */
151  int n_blocks = cols / CK_QK_K;
152  block_q8_K *q8_data = (block_q8_K *)malloc(n_blocks * sizeof(block_q8_K));
153  if (!q8_data) {
154  *output = 0.0f;
155  return;
156  }
157 
158  /* Quantize input to Q8_K */
159  quantize_row_q8_k(input_f32, q8_data, cols);
160 
161  /* Compute dot product using GEMV with M=1 */
162  gemv_q4_k_q8_k(output, weight_q4k, q8_data, 1, cols);
163 
164  free(q8_data);
165 }
void gemv_q4_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)

References CK_QK_K, gemv_q4_k_q8_k(), and quantize_row_q8_k().

◆ ck_test_gemv_q5_0()

void ck_test_gemv_q5_0 ( const void *  weight_q5_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_0 GEMV - matrix-vector multiply with Q5_0 weights.

Parameters
weight_q5_0 — Q5_0 quantized weights [rows * cols]
input_f32 — FP32 input vector [cols]
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)

Definition at line 192 of file ck_parity_api.c.

196 {
197  /* Match llama.cpp's test_gemv_q5_0:
198  * 1. Quantize input to Q8_0 format
199  * 2. Use quantized dot product (vec_dot_q5_0_q8_0)
200  *
201  * This ensures parity with llama.cpp which always uses the
202  * quantized path, NOT the FP32 dequantization path.
203  */
204  int n_blocks = cols / CK_QK8_0;
205  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
206  if (!q8_data) {
207  for (int r = 0; r < rows; r++) output[r] = 0.0f;
208  return;
209  }
210 
211  /* Quantize input to Q8_0 */
212  quantize_row_q8_0(input_f32, q8_data, cols);
213 
214  /* Call the quantized GEMV kernel (same as ck_test_gemv_q5_0_q8_0) */
215  gemv_q5_0_q8_0(output, weight_q5_0, q8_data, rows, cols);
216 
217  free(q8_data);
218 }
void gemv_q5_0_q8_0(float *y, const void *W, const void *x_q8, int M, int K)
Matrix-vector multiply with Q5_0 weights and Q8_0 input.

References CK_QK8_0, gemv_q5_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q5_0_q8_0()

void ck_test_gemv_q5_0_q8_0 ( const void *  weight_q5_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_0 x Q8_0 quantized GEMV - matches llama.cpp's approach.

This version quantizes the input to Q8_0 first, then uses integer dot products (like llama.cpp does). Use this for parity testing.

Parameters
weight_q5_0 — Q5_0 quantized weights [rows * cols]
input_f32 — FP32 input vector [cols] - will be quantized to Q8_0
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)

Definition at line 248 of file ck_parity_api.c.

252 {
253  /* This matches llama.cpp's approach:
254  * 1. Quantize input to Q8_0 format
255  * 2. Use quantized dot product (integer math)
256  * 3. Scale at the end
257  */
258  int n_blocks = cols / CK_QK8_0;
259  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
260  if (!q8_data) {
261  for (int r = 0; r < rows; r++) output[r] = 0.0f;
262  return;
263  }
264 
265  /* Quantize input to Q8_0 */
266  quantize_row_q8_0(input_f32, q8_data, cols);
267 
268  /* Call the quantized GEMV kernel */
269  gemv_q5_0_q8_0(output, weight_q5_0, q8_data, rows, cols);
270 
271  free(q8_data);
272 }

References CK_QK8_0, gemv_q5_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q5_1()

void ck_test_gemv_q5_1 ( const void *  weight_q5_1,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_1 GEMV - matrix-vector multiply with Q5_1 weights (32-element blocks)

Uses Q8_0 for activations (like Q5_0).

Parameters
weight_q5_1 — Q5_1 quantized weights [rows * cols]
input_f32 — FP32 input vector [cols]
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)

Definition at line 333 of file ck_parity_api.c.

337 {
338  /*
339  * IMPORTANT: gemv_q5_1() expects raw FP32 activations, NOT pre-quantized Q8_0.
340  * See comment in ck_test_gemv_q5_k() above for explanation.
341  */
342  for (int r = 0; r < rows; r++) {
343  gemv_q5_1(&output[r],
344  (const char *)weight_q5_1 + r * (cols / QK5_1) * sizeof(block_q5_1),
345  input_f32, 1, cols);
346  }
347 }
void gemv_q5_1(float *y, const void *W, const float *x, int M, int K)
Auto-dispatch GEMV.

References gemv_q5_1(), and QK5_1.

◆ ck_test_gemv_q5_k()

void ck_test_gemv_q5_k ( const void *  weight_q5_k,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_K GEMV - matrix-vector multiply with Q5_K weights (256-element super-blocks)

Uses Q8_K for activations (like Q4_K).

Parameters
weight_q5_k — Q5_K quantized weights [rows * cols]
input_f32 — FP32 input vector [cols]
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 256)

Definition at line 301 of file ck_parity_api.c.

305 {
306  /*
307  * IMPORTANT: gemv_q5_k() expects raw FP32 activations, NOT pre-quantized Q8_K.
308  *
309  * This is different from gemv_q4_k_q8_k() and gemv_q5_0_q8_0() which are
310  * "quantized dot product" kernels that take block_q8_K or block_q8_0 input.
311  *
312  * WHY THIS IS ERROR-PRONE:
313  * When copying from ck_test_gemv_q5_0() (which calls gemv_q5_0_q8_0),
314  * it is natural to assume Q5_K also needs pre-quantization. But the
315  * function name tells you: gemv_q5_k() takes float*, while
316  * gemv_q5_0_q8_0() takes block_q8_0*. If the kernel name does not
317  * have "_q8_0" or "_q8_k" suffix, it expects FP32 input.
318  *
319  * PARITY NOTE:
320  * llama.cpp reference uses ggml_vec_dot_q5_K_q8_K which quantizes
321  * the input to Q8_K internally. Our FP32 path will have slightly
322  * different numerical results. Use tolerance ~1e-2 for comparison.
323  * To get exact parity, implement gemv_q5_k_q8_k() (quantized dot product).
324  */
325  for (int r = 0; r < rows; r++) {
326  gemv_q5_k(&output[r],
327  (const char *)weight_q5_k + r * (cols / CK_QK_K) * sizeof(block_q5_K),
328  input_f32, 1, cols);
329  }
330 }
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)

References CK_QK_K, and gemv_q5_k().

◆ ck_test_gemv_q6_k()

void ck_test_gemv_q6_k ( const void *  weight_q6k,
const float *  input_f32,
float *  output,
int  cols 
)

Q6_K GEMV.

Definition at line 167 of file ck_parity_api.c.

171 {
172  /* Q6_K GEMV is not yet implemented in CK - provide reference impl */
173  /* For now, dequantize and compute in FP32 */
174  float *weight_f32 = (float *)malloc(cols * sizeof(float));
175  if (!weight_f32) {
176  *output = 0.0f;
177  return;
178  }
179 
180  dequant_q6_k_row(weight_q6k, weight_f32, cols);
181 
182  /* Dot product in FP32 */
183  double sum = 0.0;
184  for (int i = 0; i < cols; i++) {
185  sum += (double)weight_f32[i] * (double)input_f32[i];
186  }
187  *output = (float)sum;
188 
189  free(weight_f32);
190 }

References dequant_q6_k_row().

◆ ck_test_gemv_q8_0()

void ck_test_gemv_q8_0 ( const void *  weight_q8_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q8_0 GEMV - matrix-vector multiply with Q8_0 weights.

Parameters
weight_q8_0 — Q8_0 quantized weights [rows * cols]
input_f32 — FP32 input vector [cols]
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)

Definition at line 220 of file ck_parity_api.c.

224 {
225  /* Match llama.cpp's test_gemv_q8_0:
226  * 1. Quantize input to Q8_0 format
227  * 2. Use quantized dot product (vec_dot_q8_0_q8_0)
228  *
229  * This ensures parity with llama.cpp which always uses the
230  * quantized path, NOT the FP32 dequantization path.
231  */
232  int n_blocks = cols / CK_QK8_0;
233  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
234  if (!q8_data) {
235  for (int r = 0; r < rows; r++) output[r] = 0.0f;
236  return;
237  }
238 
239  /* Quantize input to Q8_0 */
240  quantize_row_q8_0(input_f32, q8_data, cols);
241 
242  /* Call the quantized GEMV kernel (same as ck_test_gemv_q8_0_q8_0) */
243  gemv_q8_0_q8_0(output, weight_q8_0, q8_data, rows, cols);
244 
245  free(q8_data);
246 }
void gemv_q8_0_q8_0(float *y, const void *W, const void *x_q8, int M, int K)
Matrix-vector multiply with Q8_0 weights and Q8_0 input.

References CK_QK8_0, gemv_q8_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q8_0_q8_0()

void ck_test_gemv_q8_0_q8_0 ( const void *  weight_q8_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q8_0 x Q8_0 quantized GEMV - matches llama.cpp's approach.

This version quantizes the input to Q8_0 first, then uses integer dot products (like llama.cpp does). Use this for parity testing.

Parameters
weight_q8_0 — Q8_0 quantized weights [rows * cols]
input_f32 — FP32 input vector [cols] - will be quantized to Q8_0
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)

Definition at line 274 of file ck_parity_api.c.

278 {
279  /* This matches llama.cpp's approach:
280  * 1. Quantize input to Q8_0 format
281  * 2. Use quantized dot product (integer math)
282  * 3. Scale at the end
283  */
284  int n_blocks = cols / CK_QK8_0;
285  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
286  if (!q8_data) {
287  for (int r = 0; r < rows; r++) output[r] = 0.0f;
288  return;
289  }
290 
291  /* Quantize input to Q8_0 */
292  quantize_row_q8_0(input_f32, q8_data, cols);
293 
294  /* Call the quantized GEMV kernel */
295  gemv_q8_0_q8_0(output, weight_q8_0, q8_data, rows, cols);
296 
297  free(q8_data);
298 }

References CK_QK8_0, gemv_q8_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_outproj_mlp_fused_q5_0()

void ck_test_outproj_mlp_fused_q5_0 ( const float *  attn_out,
const float *  residual,
const float *  ln2_gamma,
const void *  wo,
const void *  w1,
const void *  w2,
float *  output,
int  tokens,
int  num_heads,
int  head_dim,
int  embed_dim,
int  intermediate,
float  eps,
int  w2_is_q6k 
)

Test mega-fused OutProj + MLP kernel (Q5_0 weights)

This tests the mega_fused_outproj_mlp_prefill kernel which fuses:

  1. Quantize attention output (head-major) to Q8_0
  2. OutProj: attn_out @ W_o (Q5_0) → h1
  3. Residual: h1 += residual
  4. RMSNorm: h1 → ln2_out
  5. MLP: silu(ln2_out @ W_gate) * (ln2_out @ W_up) @ W2
  6. Residual: output += h1
Parameters
attn_outAttention output [num_heads, tokens, head_dim] (FP32, head-major)
residualResidual input [tokens, embed_dim] (FP32)
ln2_gammaRMSNorm gamma [embed_dim] (FP32)
woOutProj weights [embed_dim, embed_dim] (Q5_0)
w1MLP W1 weights [2*intermediate, embed_dim] (Q5_0)
w2MLP W2 weights [embed_dim, intermediate] (Q4_K or Q6_K)
outputOutput [tokens, embed_dim] (FP32)
tokensNumber of tokens
num_headsNumber of attention heads
head_dimDimension per head
embed_dimEmbedding dimension (= num_heads * head_dim)
intermediateMLP intermediate dimension
epsRMSNorm epsilon
w2_is_q6kIf nonzero, W2 is Q6_K; if zero, W2 is Q4_K

This is a simplified wrapper for parity testing that:

  • Uses Q5_0 for W_o and W1 weights
  • Uses Q4_K for W2 weights
  • Allocates scratch internally
Parameters
attn_outAttention output [num_heads, tokens, head_dim] (FP32, head-major)
residualResidual input [tokens, embed_dim] (FP32)
ln2_gammaRMSNorm gamma [embed_dim] (FP32)
woOutProj weights [embed_dim, embed_dim] (Q5_0)
w1MLP W1 weights [2*intermediate, embed_dim] (Q5_0)
w2MLP W2 weights [embed_dim, intermediate] (Q4_K or Q6_K)
outputOutput [tokens, embed_dim] (FP32)
tokensNumber of tokens
num_headsNumber of attention heads
head_dimDimension per head
embed_dimEmbedding dimension (= num_heads * head_dim)
intermediateMLP intermediate dimension
epsRMSNorm epsilon
w2_is_q6kIf nonzero, W2 is Q6_K; if zero, W2 is Q4_K

Definition at line 894 of file ck_parity_api.c.

/**
 * Parity wrapper for the mega-fused OutProj + MLP prefill kernel.
 *
 * Fuses: Q8_0 activation quantization, OutProj (Q5_0), residual add,
 * RMSNorm, SwiGLU MLP (W1 Q5_0, W2 Q4_K or Q6_K), and the final residual.
 *
 * NOTE(review): the Doxygen source listing this was recovered from dropped
 * the call-site line (`mega_fused_outproj_mlp_prefill(`); it is restored
 * here to match the prototype documented for this translation unit.
 *
 * @param attn_out     Attention output [num_heads, tokens, head_dim] (head-major FP32)
 * @param residual     Residual input [tokens, embed_dim] (FP32)
 * @param ln2_gamma    RMSNorm gamma [embed_dim] (FP32)
 * @param wo           OutProj weights [embed_dim, embed_dim] (Q5_0)
 * @param w1           MLP W1 weights [2*intermediate, embed_dim] (Q5_0)
 * @param w2           MLP W2 weights [embed_dim, intermediate] (Q4_K or Q6_K)
 * @param output       Output [tokens, embed_dim] (FP32)
 * @param tokens       Number of tokens
 * @param num_heads    Number of attention heads
 * @param head_dim     Dimension per head
 * @param embed_dim    Embedding dimension (= num_heads * head_dim)
 * @param intermediate MLP intermediate dimension
 * @param eps          RMSNorm epsilon
 * @param w2_is_q6k    Nonzero: W2 is Q6_K; zero: W2 is Q4_K
 */
void ck_test_outproj_mlp_fused_q5_0(const float *attn_out, const float *residual,
                                    const float *ln2_gamma, const void *wo,
                                    const void *w1, const void *w2, float *output,
                                    int tokens, int num_heads, int head_dim,
                                    int embed_dim, int intermediate, float eps,
                                    int w2_is_q6k)
{
    /* CK dtype enum values: CK_DT_Q5_0 = 11, CK_DT_Q4_K = 7, CK_DT_Q6_K = 8 */
    const int CK_DT_Q5_0_VAL = 11;
    const int CK_DT_Q4_K_VAL = 7;
    const int CK_DT_Q6_K_VAL = 8;

    /* For parity testing, aligned = actual (no padding)... */
    int aligned_embed_dim = embed_dim;
    int aligned_head_dim = head_dim;
    int aligned_intermediate = intermediate;

    /* ...except intermediate, which must be a multiple of 256 (QK_K) for K-quants. */
    if ((intermediate % 256) != 0) {
        aligned_intermediate = ((intermediate + 255) / 256) * 256;
    }

    /* Allocate the kernel's scratch workspace. */
    size_t scratch_size = mega_fused_outproj_mlp_prefill_scratch_size(
        tokens, aligned_embed_dim, num_heads, aligned_head_dim, aligned_intermediate);

    void *scratch = malloc(scratch_size);
    if (!scratch) {
        return; /* best-effort: silently skip on OOM, matching sibling wrappers */
    }

    /* Bias pointers are NULL (no biases in this parity path). */
    mega_fused_outproj_mlp_prefill(
        output,
        attn_out,
        residual,
        ln2_gamma,
        wo, NULL, CK_DT_Q5_0_VAL,                               /* W_o: Q5_0 */
        w1, NULL, CK_DT_Q5_0_VAL,                               /* W1: Q5_0 */
        w2, NULL, w2_is_q6k ? CK_DT_Q6_K_VAL : CK_DT_Q4_K_VAL,  /* W2: Q4_K/Q6_K */
        tokens,
        embed_dim,
        aligned_embed_dim,
        num_heads,
        aligned_head_dim,
        intermediate,
        aligned_intermediate,
        eps,
        scratch
    );

    free(scratch);
}
void mega_fused_outproj_mlp_prefill(float *output, const float *attn_out, const float *residual, const float *ln2_gamma, const void *wo, const float *bo, int wo_dt, const void *w1, const float *b1, int w1_dt, const void *w2, const float *b2, int w2_dt, int tokens, int embed_dim, int aligned_embed_dim, int num_heads, int aligned_head_dim, int intermediate_dim, int aligned_intermediate_dim, float eps, void *scratch)
size_t mega_fused_outproj_mlp_prefill_scratch_size(int tokens, int aligned_embed_dim, int num_heads, int aligned_head_dim, int aligned_intermediate_dim)
Get scratch buffer size for mega_fused_outproj_mlp_prefill.

References mega_fused_outproj_mlp_prefill(), and mega_fused_outproj_mlp_prefill_scratch_size().

◆ ck_test_quantize_q8_k()

void ck_test_quantize_q8_k ( const float *  src,
void *  dst,
int  n 
)

Quantize FP32 to Q8_K (for activations)

Parameters
srcInput FP32 values
dstOutput Q8_K blocks
nNumber of elements (must be multiple of 256)

Definition at line 136 of file ck_parity_api.c.

/**
 * Quantize FP32 activations to Q8_K blocks.
 *
 * Thin parity wrapper over the CK quantization routine.
 *
 * @param src  Input FP32 values
 * @param dst  Output Q8_K blocks
 * @param n    Number of elements (must be a multiple of 256)
 */
void ck_test_quantize_q8_k(const float *src, void *dst, int n)
{
    quantize_row_q8_k(src, dst, n);
}

References quantize_row_q8_k().

◆ ck_test_rmsnorm()

void ck_test_rmsnorm ( const float *  input,
const float *  weight,
float *  output,
int  n_tokens,
int  dim,
float  eps 
)

RMSNorm.

Computes: output = (input / rms(input)) * weight where rms(x) = sqrt(mean(x^2) + eps)

Parameters
inputInput tensor [n_tokens, dim]
weightNormalization weights [dim]
outputOutput tensor [n_tokens, dim]
n_tokensNumber of tokens
dimHidden dimension
epsEpsilon for numerical stability

Definition at line 557 of file ck_parity_api.c.

561 {
562  /* CK rmsnorm_forward has aligned_embed_dim parameter
563  * For testing, use dim as aligned_embed_dim (no padding) */
564  rmsnorm_forward(input, weight, output, NULL, n_tokens, dim, dim, eps);
565 }
void rmsnorm_forward(const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)

References rmsnorm_forward().

◆ ck_test_rope()

void ck_test_rope ( float *  q,
float *  k,
int  n_tokens,
int  n_heads,
int  n_heads_kv,
int  head_dim,
int  pos_offset,
float  theta 
)

RoPE (Rotary Position Embedding)

Applies rotary position embeddings to Q and K tensors.

NOTE: CK uses rotate-half format (split first/second halves) while some implementations use interleaved format. The test harness should account for this.

Parameters
qQuery tensor [n_tokens, n_heads * head_dim], modified in-place
kKey tensor [n_tokens, n_heads_kv * head_dim], modified in-place
n_tokensNumber of tokens
n_headsNumber of query heads
n_heads_kvNumber of key/value heads
head_dimDimension per head
pos_offsetStarting position for RoPE
thetaRoPE base frequency (typically 10000.0)

Definition at line 567 of file ck_parity_api.c.

/* Copy [tokens, heads*dim] (token-major) into [heads, tokens, dim] (head-major). */
static void ck_rope_tok_to_head(float *dst, const float *src,
                                int tokens, int heads, int dim)
{
    for (int t = 0; t < tokens; t++) {
        for (int h = 0; h < heads; h++) {
            for (int d = 0; d < dim; d++) {
                dst[((size_t)h * tokens + t) * dim + d] =
                    src[((size_t)t * heads + h) * dim + d];
            }
        }
    }
}

/* Copy [heads, tokens, dim] (head-major) back into [tokens, heads*dim]. */
static void ck_rope_head_to_tok(float *dst, const float *src,
                                int tokens, int heads, int dim)
{
    for (int t = 0; t < tokens; t++) {
        for (int h = 0; h < heads; h++) {
            for (int d = 0; d < dim; d++) {
                dst[((size_t)t * heads + h) * dim + d] =
                    src[((size_t)h * tokens + t) * dim + d];
            }
        }
    }
}

/**
 * RoPE parity wrapper (rotate-half format).
 *
 * CK's kernel expects head-major [H, T, D] tensors, so this wrapper
 * reorders from the test harness's token-major [T, H*D] layout, applies
 * RoPE, and reorders back.
 *
 * Fix vs. the original: allocation sizes are computed in size_t to avoid
 * int overflow for large (heads * tokens * head_dim) products.
 *
 * On allocation failure q/k are left unmodified (best-effort, matching
 * the original behavior).
 *
 * @param q          Query tensor [n_tokens, n_heads * head_dim], in-place
 * @param k          Key tensor [n_tokens, n_heads_kv * head_dim], in-place
 * @param n_tokens   Number of tokens
 * @param n_heads    Number of query heads
 * @param n_heads_kv Number of key/value heads
 * @param head_dim   Dimension per head
 * @param pos_offset Starting position for RoPE
 * @param theta      RoPE base frequency (typically 10000.0)
 */
void ck_test_rope(float *q, float *k, int n_tokens, int n_heads, int n_heads_kv,
                  int head_dim, int pos_offset, float theta)
{
    int half_dim = head_dim / 2;
    int max_seq = pos_offset + n_tokens;

    /* Precompute cos/sin cache (size_t math prevents int overflow). */
    float *cos_cache = (float *)malloc((size_t)max_seq * half_dim * sizeof(float));
    float *sin_cache = (float *)malloc((size_t)max_seq * half_dim * sizeof(float));
    if (!cos_cache || !sin_cache) {
        free(cos_cache);
        free(sin_cache);
        return;
    }

    rope_precompute_cache(cos_cache, sin_cache, max_seq, head_dim, theta);

    float *q_reorder =
        (float *)malloc((size_t)n_heads * n_tokens * head_dim * sizeof(float));
    float *k_reorder =
        (float *)malloc((size_t)n_heads_kv * n_tokens * head_dim * sizeof(float));

    if (q_reorder && k_reorder) {
        /* [T, H*D] -> [H, T, D] */
        ck_rope_tok_to_head(q_reorder, q, n_tokens, n_heads, head_dim);
        ck_rope_tok_to_head(k_reorder, k, n_tokens, n_heads_kv, head_dim);

        /* Apply RoPE in CK's native layout (aligned_head_dim == head_dim). */
        rope_forward_qk(q_reorder, k_reorder,
                        cos_cache, sin_cache,
                        n_heads, n_heads_kv, n_tokens,
                        head_dim, head_dim, pos_offset);

        /* [H, T, D] -> [T, H*D] */
        ck_rope_head_to_tok(q, q_reorder, n_tokens, n_heads, head_dim);
        ck_rope_head_to_tok(k, k_reorder, n_tokens, n_heads_kv, head_dim);
    }

    free(q_reorder);
    free(k_reorder);
    free(cos_cache);
    free(sin_cache);
}
void rope_precompute_cache(float *cos_cache, float *sin_cache, int max_seq_len, int head_dim, float base)
Definition: rope_kernels.c:52
void rope_forward_qk(float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:448

References rope_forward_qk(), and rope_precompute_cache().

◆ ck_test_rope_interleaved()

void ck_test_rope_interleaved ( float *  q,
float *  k,
int  n_tokens,
int  n_heads,
int  n_heads_kv,
int  head_dim,
int  pos_offset,
float  theta 
)

RoPE with interleaved format (for llama.cpp compatibility)

Uses interleaved format: (x0, x1) -> (x0*cos - x1*sin, x0*sin + x1*cos)

Definition at line 644 of file ck_parity_api.c.

647 {
648  /* Interleaved RoPE format (matches llama.cpp):
649  * (x0, x1) -> (x0*cos - x1*sin, x0*sin + x1*cos)
650  * Applied to consecutive pairs of elements
651  */
652 
653  /* Precompute inverse frequencies */
654  float *inv_freq = (float *)malloc((head_dim / 2) * sizeof(float));
655  if (!inv_freq) return;
656 
657  for (int i = 0; i < head_dim / 2; i++) {
658  inv_freq[i] = 1.0f / powf(theta, (float)(2 * i) / head_dim);
659  }
660 
661  /* Apply RoPE to Q */
662  for (int t = 0; t < n_tokens; t++) {
663  int pos = pos_offset + t;
664  for (int h = 0; h < n_heads; h++) {
665  float *qh = q + t * n_heads * head_dim + h * head_dim;
666 
667  for (int i = 0; i < head_dim / 2; i++) {
668  float freq = pos * inv_freq[i];
669  float cos_val = cosf(freq);
670  float sin_val = sinf(freq);
671 
672  /* Interleaved format */
673  float x0 = qh[i * 2];
674  float x1 = qh[i * 2 + 1];
675  qh[i * 2] = x0 * cos_val - x1 * sin_val;
676  qh[i * 2 + 1] = x0 * sin_val + x1 * cos_val;
677  }
678  }
679  }
680 
681  /* Apply RoPE to K */
682  for (int t = 0; t < n_tokens; t++) {
683  int pos = pos_offset + t;
684  for (int h = 0; h < n_heads_kv; h++) {
685  float *kh = k + t * n_heads_kv * head_dim + h * head_dim;
686 
687  for (int i = 0; i < head_dim / 2; i++) {
688  float freq = pos * inv_freq[i];
689  float cos_val = cosf(freq);
690  float sin_val = sinf(freq);
691 
692  float x0 = kh[i * 2];
693  float x1 = kh[i * 2 + 1];
694  kh[i * 2] = x0 * cos_val - x1 * sin_val;
695  kh[i * 2 + 1] = x0 * sin_val + x1 * cos_val;
696  }
697  }
698  }
699 
700  free(inv_freq);
701 }

◆ ck_test_softmax()

void ck_test_softmax ( const float *  input,
float *  output,
int  n 
)

Softmax (simple, non-causal)

Computes: output[i] = exp(input[i]) / sum(exp(input))

Parameters
inputInput tensor [n]
outputOutput tensor [n]
nNumber of elements

Definition at line 710 of file ck_parity_api.c.

711 {
712  /* Find max for numerical stability */
713  float max_val = input[0];
714  for (int i = 1; i < n; i++) {
715  if (input[i] > max_val) max_val = input[i];
716  }
717 
718  /* Compute exp and sum */
719  float sum = 0.0f;
720  for (int i = 0; i < n; i++) {
721  output[i] = expf(input[i] - max_val);
722  sum += output[i];
723  }
724 
725  /* Normalize */
726  float inv_sum = 1.0f / sum;
727  for (int i = 0; i < n; i++) {
728  output[i] *= inv_sum;
729  }
730 }

◆ ck_test_swiglu()

void ck_test_swiglu ( const float *  gate_up,
float *  output,
int  n_tokens,
int  intermediate_dim 
)

SwiGLU activation.

Computes: output = SiLU(gate) * up where SiLU(x) = x * sigmoid(x)

Parameters
gate_upInput tensor [n_tokens, 2 * intermediate_dim] Layout: [gate_0..gate_D-1, up_0..up_D-1] per token
outputOutput tensor [n_tokens, intermediate_dim]
n_tokensNumber of tokens
intermediate_dimIntermediate dimension

Definition at line 703 of file ck_parity_api.c.

/**
 * SwiGLU activation parity wrapper.
 *
 * output = SiLU(gate) * up, where SiLU(x) = x * sigmoid(x) and the input
 * packs [gate_0..gate_{D-1}, up_0..up_{D-1}] per token.
 *
 * @param gate_up          Input tensor [n_tokens, 2 * intermediate_dim]
 * @param output           Output tensor [n_tokens, intermediate_dim]
 * @param n_tokens         Number of tokens
 * @param intermediate_dim Intermediate dimension
 */
void ck_test_swiglu(const float *gate_up, float *output, int n_tokens,
                    int intermediate_dim)
{
    swiglu_forward(gate_up, output, n_tokens, intermediate_dim);
}
void swiglu_forward(const float *input, float *output, int tokens, int dim)

References swiglu_forward().

◆ ck_test_vec_dot_q5_0_q8_0()

void ck_test_vec_dot_q5_0_q8_0 ( const void *  weight_q5_0,
const void *  input_q8_0,
float *  output,
int  cols 
)

Direct Q5_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)

This is a "direct" test that bypasses FP32-to-Q8_0 conversion. Useful for isolating kernel bugs from quantization bugs.

Parameters
weight_q5_0Q5_0 quantized weights [cols]
input_q8_0Q8_0 quantized input [cols] (pre-quantized!)
outputOutput scalar [1]
colsNumber of elements (must be multiple of 32)

Direct Q5_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)

This is a "direct" test that bypasses FP32-to-Q8_0 conversion. Useful for isolating kernel bugs from quantization bugs.

Parameters
weight_q5_0Q5_0 quantized weights [cols]
input_q8_0Q8_0 quantized input [cols] (pre-quantized!)
outputOutput scalar [1]
colsNumber of elements (must be multiple of 32)

Definition at line 364 of file ck_parity_api.c.

/**
 * Direct Q5_0 x Q8_0 dot product on pre-quantized Q8_0 input.
 *
 * Bypasses FP32-to-Q8_0 conversion so kernel bugs can be isolated from
 * quantization bugs. Note the CK kernel's argument order is (n, out, x, y).
 *
 * @param weight_q5_0 Q5_0 quantized weights [cols]
 * @param input_q8_0  Q8_0 quantized input [cols] (pre-quantized)
 * @param output      Output scalar [1]
 * @param cols        Number of elements (must be a multiple of 32)
 */
void ck_test_vec_dot_q5_0_q8_0(const void *weight_q5_0, const void *input_q8_0,
                               float *output, int cols)
{
    vec_dot_q5_0_q8_0(cols, output, weight_q5_0, input_q8_0);
}
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q5_0 x Q8_0.

References vec_dot_q5_0_q8_0().

◆ ck_test_vec_dot_q8_0_q8_0()

void ck_test_vec_dot_q8_0_q8_0 ( const void *  weight_q8_0,
const void *  input_q8_0,
float *  output,
int  cols 
)

Direct Q8_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)

Parameters
weight_q8_0Q8_0 quantized weights [cols]
input_q8_0Q8_0 quantized input [cols] (pre-quantized!)
outputOutput scalar [1]
colsNumber of elements (must be multiple of 32)

Direct Q8_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)

Parameters
weight_q8_0Q8_0 quantized weights [cols]
input_q8_0Q8_0 quantized input [cols] (pre-quantized!)
outputOutput scalar [1]
colsNumber of elements (must be multiple of 32)

Definition at line 380 of file ck_parity_api.c.

/**
 * Direct Q8_0 x Q8_0 dot product on pre-quantized Q8_0 input.
 *
 * Bypasses FP32-to-Q8_0 conversion so kernel bugs can be isolated from
 * quantization bugs. Note the CK kernel's argument order is (n, out, x, y).
 *
 * @param weight_q8_0 Q8_0 quantized weights [cols]
 * @param input_q8_0  Q8_0 quantized input [cols] (pre-quantized)
 * @param output      Output scalar [1]
 * @param cols        Number of elements (must be a multiple of 32)
 */
void ck_test_vec_dot_q8_0_q8_0(const void *weight_q8_0, const void *input_q8_0,
                               float *output, int cols)
{
    vec_dot_q8_0_q8_0(cols, output, weight_q8_0, input_q8_0);
}
void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q8_0 x Q8_0.

References vec_dot_q8_0_q8_0().