← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ck_parity_api.h File Reference

C-Kernel-Engine Parity Testing API. More...

#include <stddef.h>
#include <stdint.h>

Go to the source code of this file.

Macros

#define CK_BLOCK_Q4_0_SIZE   18
 
#define CK_BLOCK_Q4_K_SIZE   144
 
#define CK_BLOCK_Q5_1_SIZE   24
 
#define CK_BLOCK_Q5_K_SIZE   176
 
#define CK_BLOCK_Q6_K_SIZE   210
 
#define CK_BLOCK_Q8_K_SIZE   292
 
#define CK_QK4_0   32 /* Elements per Q4_0 block */
 
#define CK_QK8_0   32 /* Elements per Q8_0 block */
 
#define CK_QK_K   256 /* Elements per K-quant super-block */
 

Functions

int ck_get_block_q4_k_size (void)
 Get Q4_K block size in bytes. More...
 
int ck_get_block_q5_1_size (void)
 Get Q5_1 block size in bytes (24 bytes per 32 weights) More...
 
int ck_get_block_q5_k_size (void)
 Get Q5_K block size in bytes (176 bytes per 256 weights) More...
 
int ck_get_block_q6_k_size (void)
 Get Q6_K block size in bytes. More...
 
int ck_get_block_q8_k_size (void)
 Get Q8_K block size in bytes. More...
 
int ck_get_qk5_1 (void)
 Get QK5_1 (elements per Q5_1 block) More...
 
int ck_get_qk_k (void)
 Get QK_K (elements per super-block) More...
 
void ck_test_attention_causal (const float *q, const float *k, const float *v, float *out, int num_heads, int num_kv_heads, int tokens, int seq_len, int head_dim)
 Multi-head causal attention for prefill (head-major layout) More...
 
void ck_test_dequant_q4_0 (const void *src, float *dst, int n)
 Dequantize Q4_0 data to FP32. More...
 
void ck_test_dequant_q4_k (const void *src, float *dst, int n)
 Dequantize Q4_K data to FP32. More...
 
void ck_test_dequant_q5_1 (const void *src, float *dst, int n)
 Dequantize Q5_1 data to FP32. More...
 
void ck_test_dequant_q6_k (const void *src, float *dst, int n)
 Dequantize Q6_K data to FP32. More...
 
void ck_test_gemm_q4_k (const void *weight_q4k, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q4_K GEMM - batched matrix multiply with quantized weights. More...
 
void ck_test_gemm_q5_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q5_0 GEMM - batched matrix multiply with Q5_0 weights (32-element blocks) More...
 
void ck_test_gemm_q5_1 (const void *weight_q5_1, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q5_1 GEMM - batched matrix multiply with Q5_1 weights (32-element blocks) More...
 
void ck_test_gemm_q5_k (const void *weight_q5_k, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q5_K GEMM - batched matrix multiply with Q5_K weights (256-element super-blocks) More...
 
void ck_test_gemm_q6_k (const void *weight_q6k, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q6_K GEMM - batched matrix multiply with Q6_K weights. More...
 
void ck_test_gemm_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols, int n_tokens)
 Q8_0 GEMM - batched matrix multiply with Q8_0 weights (32-element blocks) More...
 
void ck_test_gemv_q4_k (const void *weight_q4k, const float *input_f32, float *output, int cols)
 Q4_K GEMV - dot product of quantized weights and FP32 input. More...
 
void ck_test_gemv_q5_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols)
 Q5_0 GEMV - matrix-vector multiply with Q5_0 weights. More...
 
void ck_test_gemv_q5_0_q8_0 (const void *weight_q5_0, const float *input_f32, float *output, int rows, int cols)
 Q5_0 x Q8_0 quantized GEMV - matches llama.cpp's approach. More...
 
void ck_test_gemv_q5_1 (const void *weight_q5_1, const float *input_f32, float *output, int rows, int cols)
 Q5_1 GEMV - matrix-vector multiply with Q5_1 weights (32-element blocks) More...
 
void ck_test_gemv_q5_k (const void *weight_q5_k, const float *input_f32, float *output, int rows, int cols)
 Q5_K GEMV - matrix-vector multiply with Q5_K weights (256-element super-blocks) More...
 
void ck_test_gemv_q6_k (const void *weight_q6k, const float *input_f32, float *output, int cols)
 Q6_K GEMV. More...
 
void ck_test_gemv_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols)
 Q8_0 GEMV - matrix-vector multiply with Q8_0 weights. More...
 
void ck_test_gemv_q8_0_q8_0 (const void *weight_q8_0, const float *input_f32, float *output, int rows, int cols)
 Q8_0 x Q8_0 quantized GEMV - matches llama.cpp's approach. More...
 
void ck_test_outproj_mlp_fused_q5_0 (const float *attn_out, const float *residual, const float *ln2_gamma, const void *wo, const void *w1, const void *w2, float *output, int tokens, int num_heads, int head_dim, int embed_dim, int intermediate, float eps, int w2_is_q6k)
 Test mega-fused OutProj + MLP kernel (Q5_0 weights) More...
 
void ck_test_quantize_q8_k (const float *src, void *dst, int n)
 Quantize FP32 to Q8_K (for activations) More...
 
void ck_test_rmsnorm (const float *input, const float *weight, float *output, int n_tokens, int dim, float eps)
 RMSNorm. More...
 
void ck_test_rope (float *q, float *k, int n_tokens, int n_heads, int n_heads_kv, int head_dim, int pos_offset, float theta)
 RoPE (Rotary Position Embedding) More...
 
void ck_test_rope_interleaved (float *q, float *k, int n_tokens, int n_heads, int n_heads_kv, int head_dim, int pos_offset, float theta)
 RoPE with interleaved format (for llama.cpp compatibility) More...
 
void ck_test_softmax (const float *input, float *output, int n)
 Softmax (simple, non-causal) More...
 
void ck_test_swiglu (const float *gate_up, float *output, int n_tokens, int intermediate_dim)
 SwiGLU activation. More...
 
void ck_test_vec_dot_q5_0_q8_0 (const void *weight_q5_0, const void *input_q8_0, float *output, int cols)
 Direct Q5_0 x Q8_0 dot product (takes pre-quantized Q8_0 input) More...
 
void ck_test_vec_dot_q8_0_q8_0 (const void *weight_q8_0, const void *input_q8_0, float *output, int cols)
 Direct Q8_0 x Q8_0 dot product (takes pre-quantized Q8_0 input) More...
 

Detailed Description

C-Kernel-Engine Parity Testing API.

Exposes individual CK kernels for parity testing against llama.cpp/ggml. This API mirrors the test-kernel-parity.cpp interface in llama.cpp.

Usage:

  1. Build as shared library: libck_parity.so
  2. Load from Python using ctypes
  3. Call functions with matching signatures to test-kernel-parity.cpp

Definition in file ck_parity_api.h.

Macro Definition Documentation

◆ CK_BLOCK_Q4_0_SIZE

#define CK_BLOCK_Q4_0_SIZE   18

Definition at line 36 of file ck_parity_api.h.

◆ CK_BLOCK_Q4_K_SIZE

#define CK_BLOCK_Q4_K_SIZE   144

Definition at line 33 of file ck_parity_api.h.

◆ CK_BLOCK_Q5_1_SIZE

#define CK_BLOCK_Q5_1_SIZE   24

Definition at line 38 of file ck_parity_api.h.

◆ CK_BLOCK_Q5_K_SIZE

#define CK_BLOCK_Q5_K_SIZE   176

Definition at line 37 of file ck_parity_api.h.

◆ CK_BLOCK_Q6_K_SIZE

#define CK_BLOCK_Q6_K_SIZE   210

Definition at line 34 of file ck_parity_api.h.

◆ CK_BLOCK_Q8_K_SIZE

#define CK_BLOCK_Q8_K_SIZE   292

Definition at line 35 of file ck_parity_api.h.

◆ CK_QK4_0

#define CK_QK4_0   32 /* Elements per Q4_0 block */

Definition at line 29 of file ck_parity_api.h.

◆ CK_QK8_0

#define CK_QK8_0   32 /* Elements per Q8_0 block */

Definition at line 30 of file ck_parity_api.h.

◆ CK_QK_K

#define CK_QK_K   256 /* Elements per K-quant super-block */

Definition at line 28 of file ck_parity_api.h.

Function Documentation

◆ ck_get_block_q4_k_size()

int ck_get_block_q4_k_size ( void  )

Get Q4_K block size in bytes.

Definition at line 961 of file ck_parity_api.c.

962 {
963  return sizeof(block_q4_K);
964 }

◆ ck_get_block_q5_1_size()

int ck_get_block_q5_1_size ( void  )

Get Q5_1 block size in bytes (24 bytes per 32 weights)

Definition at line 986 of file ck_parity_api.c.

987 {
988  return sizeof(block_q5_1);
989 }

◆ ck_get_block_q5_k_size()

int ck_get_block_q5_k_size ( void  )

Get Q5_K block size in bytes (176 bytes per 256 weights)

Definition at line 981 of file ck_parity_api.c.

982 {
983  return sizeof(block_q5_K);
984 }

◆ ck_get_block_q6_k_size()

int ck_get_block_q6_k_size ( void  )

Get Q6_K block size in bytes.

Definition at line 966 of file ck_parity_api.c.

967 {
968  return sizeof(block_q6_K);
969 }

◆ ck_get_block_q8_k_size()

int ck_get_block_q8_k_size ( void  )

Get Q8_K block size in bytes.

Definition at line 971 of file ck_parity_api.c.

972 {
973  return sizeof(block_q8_K);
974 }

◆ ck_get_qk5_1()

int ck_get_qk5_1 ( void  )

Get QK5_1 (elements per Q5_1 block)

Definition at line 991 of file ck_parity_api.c.

992 {
993  return QK5_1;
994 }
#define QK5_1
Definition: ckernel_quant.h:84

References QK5_1.

◆ ck_get_qk_k()

int ck_get_qk_k ( void  )

Get QK_K (elements per super-block)

Definition at line 976 of file ck_parity_api.c.

977 {
978  return QK_K;
979 }
#define QK_K

References QK_K.

◆ ck_test_attention_causal()

void ck_test_attention_causal ( const float *  q,
const float *  k,
const float *  v,
float *  out,
int  num_heads,
int  num_kv_heads,
int  tokens,
int  seq_len,
int  head_dim 
)

Multi-head causal attention for prefill (head-major layout)

Layout (head-major, matches llama.cpp test): Q: [num_heads, tokens, head_dim] K: [num_kv_heads, seq_len, head_dim] V: [num_kv_heads, seq_len, head_dim] out: [num_heads, tokens, head_dim]

Supports GQA (grouped-query attention) where num_heads > num_kv_heads. Causal masking: token t can only attend to positions 0..t (inclusive).

Parameters
q — Query [num_heads, tokens, head_dim]
k — Key [num_kv_heads, seq_len, head_dim]
v — Value [num_kv_heads, seq_len, head_dim]
out — Output [num_heads, tokens, head_dim]
num_heads — Number of query heads
num_kv_heads — Number of key/value heads (for GQA)
tokens — Number of query tokens
seq_len — Key/value sequence length (for prefill: seq_len == tokens)
head_dim — Dimension per head

Definition at line 736 of file ck_parity_api.c.

745 {
746  /* For prefill, seq_len == tokens, and kv_stride == tokens.
747  * The CK kernel expects strided KV layout with kv_stride_tokens parameter.
748  * For parity testing with contiguous tensors, kv_stride = seq_len.
749  */
751  q, k, v, out,
752  num_heads, num_kv_heads, tokens,
753  head_dim, head_dim, /* aligned_head_dim = head_dim for testing */
754  seq_len /* kv_stride_tokens = seq_len for contiguous KV */
755  );
756 }
void attention_forward_causal_head_major_gqa_flash_strided(const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int kv_stride_tokens)

References attention_forward_causal_head_major_gqa_flash_strided().

◆ ck_test_dequant_q4_0()

void ck_test_dequant_q4_0 ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q4_0 data to FP32.

Definition at line 122 of file ck_parity_api.c.

123 {
124  dequant_q4_0_row(src, dst, (size_t)n);
125 }
void dequant_q4_0_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_0 row (multiple blocks)

References dequant_q4_0_row().

◆ ck_test_dequant_q4_k()

void ck_test_dequant_q4_k ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q4_K data to FP32.

Parameters
src — Input Q4_K blocks
dst — Output FP32 values
n — Number of elements (must be multiple of 256)

Definition at line 112 of file ck_parity_api.c.

113 {
114  dequant_q4_k_row(src, dst, (size_t)n);
115 }
void dequant_q4_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q4_K row (multiple blocks)

References dequant_q4_k_row().

◆ ck_test_dequant_q5_1()

void ck_test_dequant_q5_1 ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q5_1 data to FP32.

Definition at line 127 of file ck_parity_api.c.

128 {
129  dequant_q5_1_row(src, dst, (size_t)n);
130 }
void dequant_q5_1_row(const void *src, float *dst, size_t n_elements)
Dequantize Q5_1 row (multiple blocks)

References dequant_q5_1_row().

◆ ck_test_dequant_q6_k()

void ck_test_dequant_q6_k ( const void *  src,
float *  dst,
int  n 
)

Dequantize Q6_K data to FP32.

Definition at line 117 of file ck_parity_api.c.

118 {
119  dequant_q6_k_row(src, dst, (size_t)n);
120 }
void dequant_q6_k_row(const void *src, float *dst, size_t n_elements)
Dequantize Q6_K row (multiple blocks)

References dequant_q6_k_row().

◆ ck_test_gemm_q4_k()

void ck_test_gemm_q4_k ( const void *  weight_q4k,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q4_K GEMM - batched matrix multiply with quantized weights.

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])

Parameters
weight_q4k — Q4_K quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 256)
n_tokens — Batch size

Definition at line 392 of file ck_parity_api.c.

396 {
397  /* Allocate Q8_K buffer for quantized activations */
398  int n_blocks_per_row = cols / CK_QK_K;
399  block_q8_K *q8_data = (block_q8_K *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_K));
400  if (!q8_data) {
401  memset(output, 0, n_tokens * rows * sizeof(float));
402  return;
403  }
404 
405  /* Quantize all input tokens */
406  for (int t = 0; t < n_tokens; t++) {
407  quantize_row_q8_k(input_f32 + t * cols,
408  q8_data + t * n_blocks_per_row, cols);
409  }
410 
411  /* Use gemm_nt_q4_k_q8_k: C[M,N] = A[M,K] * B[N,K]^T
412  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
413  * So: M = n_tokens, N = rows, K = cols
414  */
415  gemm_nt_q4_k_q8_k(q8_data, weight_q4k, NULL, output, n_tokens, rows, cols);
416 
417  free(q8_data);
418 }
void quantize_row_q8_k(const float *x, void *vy, int k)
void gemm_nt_q4_k_q8_k(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
#define CK_QK_K
Definition: ck_parity_api.h:28

References CK_QK_K, gemm_nt_q4_k_q8_k(), and quantize_row_q8_k().

◆ ck_test_gemm_q5_0()

void ck_test_gemm_q5_0 ( const void *  weight_q5_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q5_0 GEMM - batched matrix multiply with Q5_0 weights (32-element blocks)

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])

Parameters
weight_q5_0 — Q5_0 quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)
n_tokens — Batch size

Q5_0 GEMM - batched matrix multiply with Q5_0 weights (32-element blocks)

Used for MLP W1 (gate/up projection) and attention Q/K with Q5_0 weights.

Definition at line 491 of file ck_parity_api.c.

495 {
496  /* Allocate Q8_0 buffer for quantized activations */
497  int n_blocks_per_row = cols / CK_QK8_0;
498  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_0));
499  if (!q8_data) {
500  memset(output, 0, n_tokens * rows * sizeof(float));
501  return;
502  }
503 
504  /* Quantize all input tokens */
505  for (int t = 0; t < n_tokens; t++) {
506  quantize_row_q8_0(input_f32 + t * cols,
507  q8_data + t * n_blocks_per_row, cols);
508  }
509 
510  /* Use gemm_nt_q5_0_q8_0: C[M,N] = A[M,K] * B[N,K]^T
511  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
512  * So: M = n_tokens, N = rows, K = cols
513  */
514  gemm_nt_q5_0_q8_0(q8_data, weight_q5_0, NULL, output, n_tokens, rows, cols);
515 
516  free(q8_data);
517 }
void gemm_nt_q5_0_q8_0(const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)
Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.
void quantize_row_q8_0(const float *x, void *vy, int k)
Quantize FP32 to Q8_0 format (scalar reference)
#define CK_QK8_0
Definition: ck_parity_api.h:30

References CK_QK8_0, gemm_nt_q5_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemm_q5_1()

void ck_test_gemm_q5_1 ( const void *  weight_q5_1,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q5_1 GEMM - batched matrix multiply with Q5_1 weights (32-element blocks)

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k]) Uses Q8_0 for activations.

Parameters
weight_q5_1 — Q5_1 quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)
n_tokens — Batch size

Q5_1 GEMM - batched matrix multiply with Q5_1 weights (32-element blocks)

Used for MLP W1 (gate/up projection) and attention Q/K with Q5_1 weights. gemm_nt_q5_1 expects FP32 activations (not quantized).

Definition at line 542 of file ck_parity_api.c.

546 {
547  /* gemm_nt_q5_1 expects FP32 activations, not quantized.
548  * Pass input_f32 directly as-is (already FP32).
549  */
550  gemm_nt_q5_1(input_f32, weight_q5_1, NULL, output, n_tokens, rows, cols);
551 }
void gemm_nt_q5_1(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
GEMM with transposed Q5_1 weights: C = A @ B^T.

References gemm_nt_q5_1().

◆ ck_test_gemm_q5_k()

void ck_test_gemm_q5_k ( const void *  weight_q5_k,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q5_K GEMM - batched matrix multiply with Q5_K weights (256-element super-blocks)

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k]) Uses Q8_K for activations.

Parameters
weight_q5_k — Q5_K quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 256)
n_tokens — Batch size

Q5_K GEMM - batched matrix multiply with Q5_K weights (256-element super-blocks)

Used for MLP W1 (gate/up projection) and attention Q/K with Q5_K weights. gemm_nt_q5_k expects FP32 activations (not quantized).

Definition at line 525 of file ck_parity_api.c.

529 {
530  /* gemm_nt_q5_k expects FP32 activations, not quantized.
531  * Pass input_f32 directly as-is (already FP32).
532  */
533  gemm_nt_q5_k(input_f32, weight_q5_k, NULL, output, n_tokens, rows, cols);
534 }
void gemm_nt_q5_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)

References gemm_nt_q5_k().

◆ ck_test_gemm_q6_k()

void ck_test_gemm_q6_k ( const void *  weight_q6k,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q6_K GEMM - batched matrix multiply with Q6_K weights.

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])

Parameters
weight_q6k — Q6_K quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 256)
n_tokens — Batch size

Q6_K GEMM - batched matrix multiply with Q6_K weights.

Used for MLP W2 (down projection) with Q6_K weights.

Definition at line 425 of file ck_parity_api.c.

429 {
430  /* Allocate Q8_K buffer for quantized activations */
431  int n_blocks_per_row = cols / CK_QK_K;
432  block_q8_K *q8_data = (block_q8_K *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_K));
433  if (!q8_data) {
434  memset(output, 0, n_tokens * rows * sizeof(float));
435  return;
436  }
437 
438  /* Quantize all input tokens */
439  for (int t = 0; t < n_tokens; t++) {
440  quantize_row_q8_k(input_f32 + t * cols,
441  q8_data + t * n_blocks_per_row, cols);
442  }
443 
444  /* Use gemm_nt_q6_k_q8_k: C[M,N] = A[M,K] * B[N,K]^T
445  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
446  * So: M = n_tokens, N = rows, K = cols
447  */
448  gemm_nt_q6_k_q8_k(q8_data, weight_q6k, NULL, output, n_tokens, rows, cols);
449 
450  free(q8_data);
451 }
void gemm_nt_q6_k_q8_k(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K.

References CK_QK_K, gemm_nt_q6_k_q8_k(), and quantize_row_q8_k().

◆ ck_test_gemm_q8_0()

void ck_test_gemm_q8_0 ( const void *  weight_q8_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols,
int  n_tokens 
)

Q8_0 GEMM - batched matrix multiply with Q8_0 weights (32-element blocks)

Computes: output[t,r] = sum_k(weight[r,k] * input[t,k])

Parameters
weight_q8_0 — Q8_0 quantized weights [rows, cols]
input_f32 — FP32 input [n_tokens, cols]
output — FP32 output [n_tokens, rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)
n_tokens — Batch size

Q8_0 GEMM - batched matrix multiply with Q8_0 weights (32-element blocks)

Used for attention V projection with Q8_0 weights.

Definition at line 458 of file ck_parity_api.c.

462 {
463  /* Allocate Q8_0 buffer for quantized activations */
464  int n_blocks_per_row = cols / CK_QK8_0;
465  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_tokens * n_blocks_per_row * sizeof(block_q8_0));
466  if (!q8_data) {
467  memset(output, 0, n_tokens * rows * sizeof(float));
468  return;
469  }
470 
471  /* Quantize all input tokens */
472  for (int t = 0; t < n_tokens; t++) {
473  quantize_row_q8_0(input_f32 + t * cols,
474  q8_data + t * n_blocks_per_row, cols);
475  }
476 
477  /* Use gemm_nt_q8_0_q8_0: C[M,N] = A[M,K] * B[N,K]^T
478  * Our layout: output[n_tokens, rows] = input[n_tokens, cols] * weight[rows, cols]^T
479  * So: M = n_tokens, N = rows, K = cols
480  */
481  gemm_nt_q8_0_q8_0(q8_data, weight_q8_0, NULL, output, n_tokens, rows, cols);
482 
483  free(q8_data);
484 }
void gemm_nt_q8_0_q8_0(const void *A_q8, const void *B_q8, const float *bias, float *C, int M, int N, int K)
gemm_nt_q8_0_q8_0 with optional bias (matches header signature)

References CK_QK8_0, gemm_nt_q8_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q4_k()

void ck_test_gemv_q4_k ( const void *  weight_q4k,
const float *  input_f32,
float *  output,
int  cols 
)

Q4_K GEMV - dot product of quantized weights and FP32 input.

Internally quantizes input to Q8_K, then computes dot product.

Parameters
weight_q4k — Q4_K quantized weights [cols]
input_f32 — FP32 input vector [cols]
output — Output scalar [1]
cols — Number of columns (must be multiple of 256)

Definition at line 145 of file ck_parity_api.c.

149 {
150  /* Allocate Q8_K buffer for quantized activations */
151  int n_blocks = cols / CK_QK_K;
152  block_q8_K *q8_data = (block_q8_K *)malloc(n_blocks * sizeof(block_q8_K));
153  if (!q8_data) {
154  *output = 0.0f;
155  return;
156  }
157 
158  /* Quantize input to Q8_K */
159  quantize_row_q8_k(input_f32, q8_data, cols);
160 
161  /* Compute dot product using GEMV with M=1 */
162  gemv_q4_k_q8_k(output, weight_q4k, q8_data, 1, cols);
163 
164  free(q8_data);
165 }
void gemv_q4_k_q8_k(float *y, const void *W, const void *x_q8, int M, int K)

References CK_QK_K, gemv_q4_k_q8_k(), and quantize_row_q8_k().

◆ ck_test_gemv_q5_0()

void ck_test_gemv_q5_0 ( const void *  weight_q5_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_0 GEMV - matrix-vector multiply with Q5_0 weights.

Parameters
weight_q5_0 — Q5_0 quantized weights [rows * cols]
input_f32 — FP32 input vector [cols]
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)

Definition at line 192 of file ck_parity_api.c.

196 {
197  /* Match llama.cpp's test_gemv_q5_0:
198  * 1. Quantize input to Q8_0 format
199  * 2. Use quantized dot product (vec_dot_q5_0_q8_0)
200  *
201  * This ensures parity with llama.cpp which always uses the
202  * quantized path, NOT the FP32 dequantization path.
203  */
204  int n_blocks = cols / CK_QK8_0;
205  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
206  if (!q8_data) {
207  for (int r = 0; r < rows; r++) output[r] = 0.0f;
208  return;
209  }
210 
211  /* Quantize input to Q8_0 */
212  quantize_row_q8_0(input_f32, q8_data, cols);
213 
214  /* Call the quantized GEMV kernel (same as ck_test_gemv_q5_0_q8_0) */
215  gemv_q5_0_q8_0(output, weight_q5_0, q8_data, rows, cols);
216 
217  free(q8_data);
218 }
void gemv_q5_0_q8_0(float *y, const void *W, const void *x_q8, int M, int K)
Matrix-vector multiply with Q5_0 weights and Q8_0 input.

References CK_QK8_0, gemv_q5_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q5_0_q8_0()

void ck_test_gemv_q5_0_q8_0 ( const void *  weight_q5_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_0 x Q8_0 quantized GEMV - matches llama.cpp's approach.

This version quantizes the input to Q8_0 first, then uses integer dot products (like llama.cpp does). Use this for parity testing.

Parameters
weight_q5_0 — Q5_0 quantized weights [rows * cols]
input_f32 — FP32 input vector [cols] - will be quantized to Q8_0
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)

Definition at line 248 of file ck_parity_api.c.

252 {
253  /* This matches llama.cpp's approach:
254  * 1. Quantize input to Q8_0 format
255  * 2. Use quantized dot product (integer math)
256  * 3. Scale at the end
257  */
258  int n_blocks = cols / CK_QK8_0;
259  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
260  if (!q8_data) {
261  for (int r = 0; r < rows; r++) output[r] = 0.0f;
262  return;
263  }
264 
265  /* Quantize input to Q8_0 */
266  quantize_row_q8_0(input_f32, q8_data, cols);
267 
268  /* Call the quantized GEMV kernel */
269  gemv_q5_0_q8_0(output, weight_q5_0, q8_data, rows, cols);
270 
271  free(q8_data);
272 }

References CK_QK8_0, gemv_q5_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q5_1()

void ck_test_gemv_q5_1 ( const void *  weight_q5_1,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_1 GEMV - matrix-vector multiply with Q5_1 weights (32-element blocks)

Uses Q8_0 for activations (like Q5_0).

Parameters
weight_q5_1 — Q5_1 quantized weights [rows * cols]
input_f32 — FP32 input vector [cols]
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)

Definition at line 333 of file ck_parity_api.c.

337 {
338  /*
339  * IMPORTANT: gemv_q5_1() expects raw FP32 activations, NOT pre-quantized Q8_0.
340  * See comment in ck_test_gemv_q5_k() above for explanation.
341  */
342  for (int r = 0; r < rows; r++) {
343  gemv_q5_1(&output[r],
344  (const char *)weight_q5_1 + r * (cols / QK5_1) * sizeof(block_q5_1),
345  input_f32, 1, cols);
346  }
347 }
void gemv_q5_1(float *y, const void *W, const float *x, int M, int K)
Auto-dispatch GEMV.

References gemv_q5_1(), and QK5_1.

◆ ck_test_gemv_q5_k()

void ck_test_gemv_q5_k ( const void *  weight_q5_k,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q5_K GEMV - matrix-vector multiply with Q5_K weights (256-element super-blocks)

Uses Q8_K for activations (like Q4_K).

Parameters
weight_q5_k — Q5_K quantized weights [rows * cols]
input_f32 — FP32 input vector [cols]
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 256)

Definition at line 301 of file ck_parity_api.c.

305 {
306  /*
307  * IMPORTANT: gemv_q5_k() expects raw FP32 activations, NOT pre-quantized Q8_K.
308  *
309  * This is different from gemv_q4_k_q8_k() and gemv_q5_0_q8_0() which are
310  * "quantized dot product" kernels that take block_q8_K or block_q8_0 input.
311  *
312  * WHY THIS IS ERROR-PRONE:
313  * When copying from ck_test_gemv_q5_0() (which calls gemv_q5_0_q8_0),
314  * it is natural to assume Q5_K also needs pre-quantization. But the
315  * function name tells you: gemv_q5_k() takes float*, while
316  * gemv_q5_0_q8_0() takes block_q8_0*. If the kernel name does not
317  * have "_q8_0" or "_q8_k" suffix, it expects FP32 input.
318  *
319  * PARITY NOTE:
320  * llama.cpp reference uses ggml_vec_dot_q5_K_q8_K which quantizes
321  * the input to Q8_K internally. Our FP32 path will have slightly
322  * different numerical results. Use tolerance ~1e-2 for comparison.
323  * To get exact parity, implement gemv_q5_k_q8_k() (quantized dot product).
324  */
325  for (int r = 0; r < rows; r++) {
326  gemv_q5_k(&output[r],
327  (const char *)weight_q5_k + r * (cols / CK_QK_K) * sizeof(block_q5_K),
328  input_f32, 1, cols);
329  }
330 }
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)

References CK_QK_K, and gemv_q5_k().

◆ ck_test_gemv_q6_k()

void ck_test_gemv_q6_k ( const void *  weight_q6k,
const float *  input_f32,
float *  output,
int  cols 
)

Q6_K GEMV.

Definition at line 167 of file ck_parity_api.c.

171 {
172  /* Q6_K GEMV is not yet implemented in CK - provide reference impl */
173  /* For now, dequantize and compute in FP32 */
174  float *weight_f32 = (float *)malloc(cols * sizeof(float));
175  if (!weight_f32) {
176  *output = 0.0f;
177  return;
178  }
179 
180  dequant_q6_k_row(weight_q6k, weight_f32, cols);
181 
182  /* Dot product in FP32 */
183  double sum = 0.0;
184  for (int i = 0; i < cols; i++) {
185  sum += (double)weight_f32[i] * (double)input_f32[i];
186  }
187  *output = (float)sum;
188 
189  free(weight_f32);
190 }

References dequant_q6_k_row().

◆ ck_test_gemv_q8_0()

void ck_test_gemv_q8_0 ( const void *  weight_q8_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q8_0 GEMV - matrix-vector multiply with Q8_0 weights.

Parameters
weight_q8_0 — Q8_0 quantized weights [rows * cols]
input_f32 — FP32 input vector [cols]
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)

Definition at line 220 of file ck_parity_api.c.

224 {
225  /* Match llama.cpp's test_gemv_q8_0:
226  * 1. Quantize input to Q8_0 format
227  * 2. Use quantized dot product (vec_dot_q8_0_q8_0)
228  *
229  * This ensures parity with llama.cpp which always uses the
230  * quantized path, NOT the FP32 dequantization path.
231  */
232  int n_blocks = cols / CK_QK8_0;
233  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
234  if (!q8_data) {
235  for (int r = 0; r < rows; r++) output[r] = 0.0f;
236  return;
237  }
238 
239  /* Quantize input to Q8_0 */
240  quantize_row_q8_0(input_f32, q8_data, cols);
241 
242  /* Call the quantized GEMV kernel (same as ck_test_gemv_q8_0_q8_0) */
243  gemv_q8_0_q8_0(output, weight_q8_0, q8_data, rows, cols);
244 
245  free(q8_data);
246 }
void gemv_q8_0_q8_0(float *y, const void *W, const void *x_q8, int M, int K)
Matrix-vector multiply with Q8_0 weights and Q8_0 input.

References CK_QK8_0, gemv_q8_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_gemv_q8_0_q8_0()

void ck_test_gemv_q8_0_q8_0 ( const void *  weight_q8_0,
const float *  input_f32,
float *  output,
int  rows,
int  cols 
)

Q8_0 x Q8_0 quantized GEMV - matches llama.cpp's approach.

This version quantizes the input to Q8_0 first, then uses integer dot products (like llama.cpp does). Use this for parity testing.

Parameters
weight_q8_0 — Q8_0 quantized weights [rows * cols]
input_f32 — FP32 input vector [cols] - will be quantized to Q8_0
output — FP32 output vector [rows]
rows — Number of output rows
cols — Number of columns (must be multiple of 32)

Definition at line 274 of file ck_parity_api.c.

278 {
279  /* This matches llama.cpp's approach:
280  * 1. Quantize input to Q8_0 format
281  * 2. Use quantized dot product (integer math)
282  * 3. Scale at the end
283  */
284  int n_blocks = cols / CK_QK8_0;
285  block_q8_0 *q8_data = (block_q8_0 *)malloc(n_blocks * sizeof(block_q8_0));
286  if (!q8_data) {
287  for (int r = 0; r < rows; r++) output[r] = 0.0f;
288  return;
289  }
290 
291  /* Quantize input to Q8_0 */
292  quantize_row_q8_0(input_f32, q8_data, cols);
293 
294  /* Call the quantized GEMV kernel */
295  gemv_q8_0_q8_0(output, weight_q8_0, q8_data, rows, cols);
296 
297  free(q8_data);
298 }

References CK_QK8_0, gemv_q8_0_q8_0(), and quantize_row_q8_0().

◆ ck_test_outproj_mlp_fused_q5_0()

void ck_test_outproj_mlp_fused_q5_0 ( const float *  attn_out,
const float *  residual,
const float *  ln2_gamma,
const void *  wo,
const void *  w1,
const void *  w2,
float *  output,
int  tokens,
int  num_heads,
int  head_dim,
int  embed_dim,
int  intermediate,
float  eps,
int  w2_is_q6k 
)

Test mega-fused OutProj + MLP kernel (Q5_0 weights)

This tests the mega_fused_outproj_mlp_prefill kernel which fuses:

  1. Quantize attention output (head-major) to Q8_0
  2. OutProj: attn_out @ W_o (Q5_0) → h1
  3. Residual: h1 += residual
  4. RMSNorm: h1 → ln2_out
  5. MLP: silu(ln2_out @ W_gate) * (ln2_out @ W_up) @ W2
  6. Residual: output += h1
Parameters
attn_outAttention output [num_heads, tokens, head_dim] (FP32, head-major)
residualResidual input [tokens, embed_dim] (FP32)
ln2_gammaRMSNorm gamma [embed_dim] (FP32)
woOutProj weights [embed_dim, embed_dim] (Q5_0)
w1MLP W1 weights [2*intermediate, embed_dim] (Q5_0)
w2MLP W2 weights [embed_dim, intermediate] (Q4_K or Q6_K)
outputOutput [tokens, embed_dim] (FP32)
tokensNumber of tokens
num_headsNumber of attention heads
head_dimDimension per head
embed_dimEmbedding dimension (= num_heads * head_dim)
intermediateMLP intermediate dimension
epsRMSNorm epsilon
w2_is_q6kIf nonzero, W2 is Q6_K; if zero, W2 is Q4_K

This is a simplified wrapper for parity testing that:

  • Uses Q5_0 for W_o and W1 weights
  • Uses Q4_K for W2 weights
  • Allocates scratch internally
Parameters
attn_outAttention output [num_heads, tokens, head_dim] (FP32, head-major)
residualResidual input [tokens, embed_dim] (FP32)
ln2_gammaRMSNorm gamma [embed_dim] (FP32)
woOutProj weights [embed_dim, embed_dim] (Q5_0)
w1MLP W1 weights [2*intermediate, embed_dim] (Q5_0)
w2MLP W2 weights [embed_dim, intermediate] (Q4_K or Q6_K)
outputOutput [tokens, embed_dim] (FP32)
tokensNumber of tokens
num_headsNumber of attention heads
head_dimDimension per head
embed_dimEmbedding dimension (= num_heads * head_dim)
intermediateMLP intermediate dimension
epsRMSNorm epsilon
w2_is_q6kIf nonzero, W2 is Q6_K; if zero, W2 is Q4_K

Definition at line 894 of file ck_parity_api.c.

/**
 * Parity wrapper for the mega-fused OutProj + MLP prefill kernel.
 *
 * Fuses: Q8_0 activation quantization, OutProj (Q5_0), residual add,
 * RMSNorm, SwiGLU MLP (W1 Q5_0, W2 Q4_K or Q6_K), and the final residual.
 *
 * NOTE(review): the Doxygen source listing this was recovered from dropped
 * the call-site line (`mega_fused_outproj_mlp_prefill(`); it is restored
 * here to match the prototype documented for this translation unit.
 *
 * @param attn_out     Attention output [num_heads, tokens, head_dim] (head-major FP32)
 * @param residual     Residual input [tokens, embed_dim] (FP32)
 * @param ln2_gamma    RMSNorm gamma [embed_dim] (FP32)
 * @param wo           OutProj weights [embed_dim, embed_dim] (Q5_0)
 * @param w1           MLP W1 weights [2*intermediate, embed_dim] (Q5_0)
 * @param w2           MLP W2 weights [embed_dim, intermediate] (Q4_K or Q6_K)
 * @param output       Output [tokens, embed_dim] (FP32)
 * @param tokens       Number of tokens
 * @param num_heads    Number of attention heads
 * @param head_dim     Dimension per head
 * @param embed_dim    Embedding dimension (= num_heads * head_dim)
 * @param intermediate MLP intermediate dimension
 * @param eps          RMSNorm epsilon
 * @param w2_is_q6k    Nonzero: W2 is Q6_K; zero: W2 is Q4_K
 */
void ck_test_outproj_mlp_fused_q5_0(const float *attn_out, const float *residual,
                                    const float *ln2_gamma, const void *wo,
                                    const void *w1, const void *w2, float *output,
                                    int tokens, int num_heads, int head_dim,
                                    int embed_dim, int intermediate, float eps,
                                    int w2_is_q6k)
{
    /* CK dtype enum values: CK_DT_Q5_0 = 11, CK_DT_Q4_K = 7, CK_DT_Q6_K = 8 */
    const int CK_DT_Q5_0_VAL = 11;
    const int CK_DT_Q4_K_VAL = 7;
    const int CK_DT_Q6_K_VAL = 8;

    /* For parity testing, aligned = actual (no padding)... */
    int aligned_embed_dim = embed_dim;
    int aligned_head_dim = head_dim;
    int aligned_intermediate = intermediate;

    /* ...except intermediate, which must be a multiple of 256 (QK_K) for K-quants. */
    if ((intermediate % 256) != 0) {
        aligned_intermediate = ((intermediate + 255) / 256) * 256;
    }

    /* Allocate the kernel's scratch workspace. */
    size_t scratch_size = mega_fused_outproj_mlp_prefill_scratch_size(
        tokens, aligned_embed_dim, num_heads, aligned_head_dim, aligned_intermediate);

    void *scratch = malloc(scratch_size);
    if (!scratch) {
        return; /* best-effort: silently skip on OOM, matching sibling wrappers */
    }

    /* Bias pointers are NULL (no biases in this parity path). */
    mega_fused_outproj_mlp_prefill(
        output,
        attn_out,
        residual,
        ln2_gamma,
        wo, NULL, CK_DT_Q5_0_VAL,                               /* W_o: Q5_0 */
        w1, NULL, CK_DT_Q5_0_VAL,                               /* W1: Q5_0 */
        w2, NULL, w2_is_q6k ? CK_DT_Q6_K_VAL : CK_DT_Q4_K_VAL,  /* W2: Q4_K/Q6_K */
        tokens,
        embed_dim,
        aligned_embed_dim,
        num_heads,
        aligned_head_dim,
        intermediate,
        aligned_intermediate,
        eps,
        scratch
    );

    free(scratch);
}
void mega_fused_outproj_mlp_prefill(float *output, const float *attn_out, const float *residual, const float *ln2_gamma, const void *wo, const float *bo, int wo_dt, const void *w1, const float *b1, int w1_dt, const void *w2, const float *b2, int w2_dt, int tokens, int embed_dim, int aligned_embed_dim, int num_heads, int aligned_head_dim, int intermediate_dim, int aligned_intermediate_dim, float eps, void *scratch)
size_t mega_fused_outproj_mlp_prefill_scratch_size(int tokens, int aligned_embed_dim, int num_heads, int aligned_head_dim, int aligned_intermediate_dim)
Get scratch buffer size for mega_fused_outproj_mlp_prefill.

References mega_fused_outproj_mlp_prefill(), and mega_fused_outproj_mlp_prefill_scratch_size().

◆ ck_test_quantize_q8_k()

void ck_test_quantize_q8_k ( const float *  src,
void *  dst,
int  n 
)

Quantize FP32 to Q8_K (for activations)

Parameters
srcInput FP32 values
dstOutput Q8_K blocks
nNumber of elements (must be multiple of 256)

Definition at line 136 of file ck_parity_api.c.

/**
 * Quantize FP32 activations to Q8_K blocks.
 *
 * Thin parity wrapper over the CK quantization routine.
 *
 * @param src  Input FP32 values
 * @param dst  Output Q8_K blocks
 * @param n    Number of elements (must be a multiple of 256)
 */
void ck_test_quantize_q8_k(const float *src, void *dst, int n)
{
    quantize_row_q8_k(src, dst, n);
}

References quantize_row_q8_k().

◆ ck_test_rmsnorm()

void ck_test_rmsnorm ( const float *  input,
const float *  weight,
float *  output,
int  n_tokens,
int  dim,
float  eps 
)

RMSNorm.

Computes: output = (input / rms(input)) * weight where rms(x) = sqrt(mean(x^2) + eps)

Parameters
inputInput tensor [n_tokens, dim]
weightNormalization weights [dim]
outputOutput tensor [n_tokens, dim]
n_tokensNumber of tokens
dimHidden dimension
epsEpsilon for numerical stability

Definition at line 557 of file ck_parity_api.c.

561 {
562  /* CK rmsnorm_forward has aligned_embed_dim parameter
563  * For testing, use dim as aligned_embed_dim (no padding) */
564  rmsnorm_forward(input, weight, output, NULL, n_tokens, dim, dim, eps);
565 }
void rmsnorm_forward(const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)

References rmsnorm_forward().

◆ ck_test_rope()

void ck_test_rope ( float *  q,
float *  k,
int  n_tokens,
int  n_heads,
int  n_heads_kv,
int  head_dim,
int  pos_offset,
float  theta 
)

RoPE (Rotary Position Embedding)

Applies rotary position embeddings to Q and K tensors.

NOTE: CK uses rotate-half format (split first/second halves) while some implementations use interleaved format. The test harness should account for this.

Parameters
qQuery tensor [n_tokens, n_heads * head_dim], modified in-place
kKey tensor [n_tokens, n_heads_kv * head_dim], modified in-place
n_tokensNumber of tokens
n_headsNumber of query heads
n_heads_kvNumber of key/value heads
head_dimDimension per head
pos_offsetStarting position for RoPE
thetaRoPE base frequency (typically 10000.0)

Definition at line 567 of file ck_parity_api.c.

/* Copy [tokens, heads*dim] (token-major) into [heads, tokens, dim] (head-major). */
static void ck_rope_tok_to_head(float *dst, const float *src,
                                int tokens, int heads, int dim)
{
    for (int t = 0; t < tokens; t++) {
        for (int h = 0; h < heads; h++) {
            for (int d = 0; d < dim; d++) {
                dst[((size_t)h * tokens + t) * dim + d] =
                    src[((size_t)t * heads + h) * dim + d];
            }
        }
    }
}

/* Copy [heads, tokens, dim] (head-major) back into [tokens, heads*dim]. */
static void ck_rope_head_to_tok(float *dst, const float *src,
                                int tokens, int heads, int dim)
{
    for (int t = 0; t < tokens; t++) {
        for (int h = 0; h < heads; h++) {
            for (int d = 0; d < dim; d++) {
                dst[((size_t)t * heads + h) * dim + d] =
                    src[((size_t)h * tokens + t) * dim + d];
            }
        }
    }
}

/**
 * RoPE parity wrapper (rotate-half format).
 *
 * CK's kernel expects head-major [H, T, D] tensors, so this wrapper
 * reorders from the test harness's token-major [T, H*D] layout, applies
 * RoPE, and reorders back.
 *
 * Fix vs. the original: allocation sizes are computed in size_t to avoid
 * int overflow for large (heads * tokens * head_dim) products.
 *
 * On allocation failure q/k are left unmodified (best-effort, matching
 * the original behavior).
 *
 * @param q          Query tensor [n_tokens, n_heads * head_dim], in-place
 * @param k          Key tensor [n_tokens, n_heads_kv * head_dim], in-place
 * @param n_tokens   Number of tokens
 * @param n_heads    Number of query heads
 * @param n_heads_kv Number of key/value heads
 * @param head_dim   Dimension per head
 * @param pos_offset Starting position for RoPE
 * @param theta      RoPE base frequency (typically 10000.0)
 */
void ck_test_rope(float *q, float *k, int n_tokens, int n_heads, int n_heads_kv,
                  int head_dim, int pos_offset, float theta)
{
    int half_dim = head_dim / 2;
    int max_seq = pos_offset + n_tokens;

    /* Precompute cos/sin cache (size_t math prevents int overflow). */
    float *cos_cache = (float *)malloc((size_t)max_seq * half_dim * sizeof(float));
    float *sin_cache = (float *)malloc((size_t)max_seq * half_dim * sizeof(float));
    if (!cos_cache || !sin_cache) {
        free(cos_cache);
        free(sin_cache);
        return;
    }

    rope_precompute_cache(cos_cache, sin_cache, max_seq, head_dim, theta);

    float *q_reorder =
        (float *)malloc((size_t)n_heads * n_tokens * head_dim * sizeof(float));
    float *k_reorder =
        (float *)malloc((size_t)n_heads_kv * n_tokens * head_dim * sizeof(float));

    if (q_reorder && k_reorder) {
        /* [T, H*D] -> [H, T, D] */
        ck_rope_tok_to_head(q_reorder, q, n_tokens, n_heads, head_dim);
        ck_rope_tok_to_head(k_reorder, k, n_tokens, n_heads_kv, head_dim);

        /* Apply RoPE in CK's native layout (aligned_head_dim == head_dim). */
        rope_forward_qk(q_reorder, k_reorder,
                        cos_cache, sin_cache,
                        n_heads, n_heads_kv, n_tokens,
                        head_dim, head_dim, pos_offset);

        /* [H, T, D] -> [T, H*D] */
        ck_rope_head_to_tok(q, q_reorder, n_tokens, n_heads, head_dim);
        ck_rope_head_to_tok(k, k_reorder, n_tokens, n_heads_kv, head_dim);
    }

    free(q_reorder);
    free(k_reorder);
    free(cos_cache);
    free(sin_cache);
}
void rope_precompute_cache(float *cos_cache, float *sin_cache, int max_seq_len, int head_dim, float base)
Definition: rope_kernels.c:52
void rope_forward_qk(float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:448

References rope_forward_qk(), and rope_precompute_cache().

◆ ck_test_rope_interleaved()

void ck_test_rope_interleaved ( float *  q,
float *  k,
int  n_tokens,
int  n_heads,
int  n_heads_kv,
int  head_dim,
int  pos_offset,
float  theta 
)

RoPE with interleaved format (for llama.cpp compatibility)

Uses interleaved format: (x0, x1) -> (x0*cos - x1*sin, x0*sin + x1*cos)

Definition at line 644 of file ck_parity_api.c.

647 {
648  /* Interleaved RoPE format (matches llama.cpp):
649  * (x0, x1) -> (x0*cos - x1*sin, x0*sin + x1*cos)
650  * Applied to consecutive pairs of elements
651  */
652 
653  /* Precompute inverse frequencies */
654  float *inv_freq = (float *)malloc((head_dim / 2) * sizeof(float));
655  if (!inv_freq) return;
656 
657  for (int i = 0; i < head_dim / 2; i++) {
658  inv_freq[i] = 1.0f / powf(theta, (float)(2 * i) / head_dim);
659  }
660 
661  /* Apply RoPE to Q */
662  for (int t = 0; t < n_tokens; t++) {
663  int pos = pos_offset + t;
664  for (int h = 0; h < n_heads; h++) {
665  float *qh = q + t * n_heads * head_dim + h * head_dim;
666 
667  for (int i = 0; i < head_dim / 2; i++) {
668  float freq = pos * inv_freq[i];
669  float cos_val = cosf(freq);
670  float sin_val = sinf(freq);
671 
672  /* Interleaved format */
673  float x0 = qh[i * 2];
674  float x1 = qh[i * 2 + 1];
675  qh[i * 2] = x0 * cos_val - x1 * sin_val;
676  qh[i * 2 + 1] = x0 * sin_val + x1 * cos_val;
677  }
678  }
679  }
680 
681  /* Apply RoPE to K */
682  for (int t = 0; t < n_tokens; t++) {
683  int pos = pos_offset + t;
684  for (int h = 0; h < n_heads_kv; h++) {
685  float *kh = k + t * n_heads_kv * head_dim + h * head_dim;
686 
687  for (int i = 0; i < head_dim / 2; i++) {
688  float freq = pos * inv_freq[i];
689  float cos_val = cosf(freq);
690  float sin_val = sinf(freq);
691 
692  float x0 = kh[i * 2];
693  float x1 = kh[i * 2 + 1];
694  kh[i * 2] = x0 * cos_val - x1 * sin_val;
695  kh[i * 2 + 1] = x0 * sin_val + x1 * cos_val;
696  }
697  }
698  }
699 
700  free(inv_freq);
701 }

◆ ck_test_softmax()

void ck_test_softmax ( const float *  input,
float *  output,
int  n 
)

Softmax (simple, non-causal)

Computes: output[i] = exp(input[i]) / sum(exp(input))

Parameters
inputInput tensor [n]
outputOutput tensor [n]
nNumber of elements

Definition at line 710 of file ck_parity_api.c.

711 {
712  /* Find max for numerical stability */
713  float max_val = input[0];
714  for (int i = 1; i < n; i++) {
715  if (input[i] > max_val) max_val = input[i];
716  }
717 
718  /* Compute exp and sum */
719  float sum = 0.0f;
720  for (int i = 0; i < n; i++) {
721  output[i] = expf(input[i] - max_val);
722  sum += output[i];
723  }
724 
725  /* Normalize */
726  float inv_sum = 1.0f / sum;
727  for (int i = 0; i < n; i++) {
728  output[i] *= inv_sum;
729  }
730 }

◆ ck_test_swiglu()

void ck_test_swiglu ( const float *  gate_up,
float *  output,
int  n_tokens,
int  intermediate_dim 
)

SwiGLU activation.

Computes: output = SiLU(gate) * up where SiLU(x) = x * sigmoid(x)

Parameters
gate_upInput tensor [n_tokens, 2 * intermediate_dim] Layout: [gate_0..gate_D-1, up_0..up_D-1] per token
outputOutput tensor [n_tokens, intermediate_dim]
n_tokensNumber of tokens
intermediate_dimIntermediate dimension

Definition at line 703 of file ck_parity_api.c.

/**
 * SwiGLU activation parity wrapper.
 *
 * output = SiLU(gate) * up, where SiLU(x) = x * sigmoid(x) and the input
 * packs [gate_0..gate_{D-1}, up_0..up_{D-1}] per token.
 *
 * @param gate_up          Input tensor [n_tokens, 2 * intermediate_dim]
 * @param output           Output tensor [n_tokens, intermediate_dim]
 * @param n_tokens         Number of tokens
 * @param intermediate_dim Intermediate dimension
 */
void ck_test_swiglu(const float *gate_up, float *output, int n_tokens,
                    int intermediate_dim)
{
    swiglu_forward(gate_up, output, n_tokens, intermediate_dim);
}
void swiglu_forward(const float *input, float *output, int tokens, int dim)

References swiglu_forward().

◆ ck_test_vec_dot_q5_0_q8_0()

void ck_test_vec_dot_q5_0_q8_0 ( const void *  weight_q5_0,
const void *  input_q8_0,
float *  output,
int  cols 
)

Direct Q5_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)

This is a "direct" test that bypasses FP32-to-Q8_0 conversion. Useful for isolating kernel bugs from quantization bugs.

Parameters
weight_q5_0Q5_0 quantized weights [cols]
input_q8_0Q8_0 quantized input [cols] (pre-quantized!)
outputOutput scalar [1]
colsNumber of elements (must be multiple of 32)

Direct Q5_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)

This is a "direct" test that bypasses FP32-to-Q8_0 conversion. Useful for isolating kernel bugs from quantization bugs.

Parameters
weight_q5_0Q5_0 quantized weights [cols]
input_q8_0Q8_0 quantized input [cols] (pre-quantized!)
outputOutput scalar [1]
colsNumber of elements (must be multiple of 32)

Definition at line 364 of file ck_parity_api.c.

/**
 * Direct Q5_0 x Q8_0 dot product on pre-quantized Q8_0 input.
 *
 * Bypasses FP32-to-Q8_0 conversion so kernel bugs can be isolated from
 * quantization bugs. Note the CK kernel's argument order is (n, out, x, y).
 *
 * @param weight_q5_0 Q5_0 quantized weights [cols]
 * @param input_q8_0  Q8_0 quantized input [cols] (pre-quantized)
 * @param output      Output scalar [1]
 * @param cols        Number of elements (must be a multiple of 32)
 */
void ck_test_vec_dot_q5_0_q8_0(const void *weight_q5_0, const void *input_q8_0,
                               float *output, int cols)
{
    vec_dot_q5_0_q8_0(cols, output, weight_q5_0, input_q8_0);
}
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q5_0 x Q8_0.

References vec_dot_q5_0_q8_0().

◆ ck_test_vec_dot_q8_0_q8_0()

void ck_test_vec_dot_q8_0_q8_0 ( const void *  weight_q8_0,
const void *  input_q8_0,
float *  output,
int  cols 
)

Direct Q8_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)

Parameters
weight_q8_0Q8_0 quantized weights [cols]
input_q8_0Q8_0 quantized input [cols] (pre-quantized!)
outputOutput scalar [1]
colsNumber of elements (must be multiple of 32)

Direct Q8_0 x Q8_0 dot product (takes pre-quantized Q8_0 input)

Parameters
weight_q8_0Q8_0 quantized weights [cols]
input_q8_0Q8_0 quantized input [cols] (pre-quantized!)
outputOutput scalar [1]
colsNumber of elements (must be multiple of 32)

Definition at line 380 of file ck_parity_api.c.

/**
 * Direct Q8_0 x Q8_0 dot product on pre-quantized Q8_0 input.
 *
 * Bypasses FP32-to-Q8_0 conversion so kernel bugs can be isolated from
 * quantization bugs. Note the CK kernel's argument order is (n, out, x, y).
 *
 * @param weight_q8_0 Q8_0 quantized weights [cols]
 * @param input_q8_0  Q8_0 quantized input [cols] (pre-quantized)
 * @param output      Output scalar [1]
 * @param cols        Number of elements (must be a multiple of 32)
 */
void ck_test_vec_dot_q8_0_q8_0(const void *weight_q8_0, const void *input_q8_0,
                               float *output, int cols)
{
    vec_dot_q8_0_q8_0(cols, output, weight_q8_0, input_q8_0);
}
void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q8_0 x Q8_0.

References vec_dot_q8_0_q8_0().