← Back to C-Kernel-Engine Docs | Doxygen Source Documentation
File List
Here is a list of all files with brief descriptions:
 add_kernels_bf16.c — Element-wise addition kernels for BF16 tensors
 attention_decode_fused.c — Fused attention decode kernel (legacy v6/v6.5)
 attention_flash_true.c — Flash-style attention (online softmax, causal, streaming)
 attention_kernels.c — Attention score/softmax/output kernels with SIMD (SSE/AVX/AVX512)
 attention_mlp_fused.c — Mega-Fused Attention + MLP Block
 axpy_kernels.c — AXPY kernels for FP32: y = y + alpha * x
 backend_native.c
 bf16_utils.h
 v6.5/test_generated/ck-kernel-inference.c — AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6 - Explicit Unrolled)
 v6.6/generated/ck-kernel-inference.c — AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6 - Explicit Unrolled)
 v6.6/test_generated/ck-kernel-inference.c — AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6 - Explicit Unrolled)
 v6.5/test_generated/ck-kernel-inference.h — AUTO-GENERATED: qwen2_0.5b_decode Memory Layout
 v6.6/generated/ck-kernel-inference.h — AUTO-GENERATED: qwen2_0.5b_decode Memory Layout
 v6.6/test_generated/ck-kernel-inference.h — AUTO-GENERATED: qwen2_0.5b_decode Memory Layout
 ck-kernel-prefill.c
 ck_cli_v5.c
 ck_cli_v6.5.c
 ck_cli_v6.6.c
 ck_cli_v6.c
 ck_features.h — CPU feature detection and dispatch macros
 ck_metrics.h
 ck_model_api.h — Generic Model API - Model-agnostic interface for CK-Engine
 ck_parity_api.c — C-Kernel-Engine Parity Testing API Implementation
 ck_parity_api.h — C-Kernel-Engine Parity Testing API
 ck_threadpool.c — Persistent pthread thread pool for CK-Engine inference
 ck_threadpool.h — Persistent pthread thread pool for CK-Engine inference
 ck_tokenizer.c
 ck_tokenizer.h
 ck_tokenizer_v2.c
 v2_legacy/ck_tokenizer_v2.c
 ck_tokenizer_v6.5.c
 ck_tokenizer_v6.6.c
 ck_tokenizer_v6.c
 ckernel_alloc.c
 ckernel_alloc.h
 ckernel_alloc_v6.5.c
 ckernel_alloc_v6.6.c
 ckernel_alloc_v6.c
 ckernel_bump_v5.h
 ckernel_codegen.c
 ckernel_codegen.h
 ckernel_codegen_v2.c
 ckernel_codegen_v2.h
 ckernel_codegen_v2_dispatch.c
 ckernel_codegen_v2_emit.h
 ckernel_codegen_v2_schedule.c
 ckernel_codegen_v2_sections.c
 ckernel_codegen_v2_struct.c
 ckernel_codegen_v6.5.c
 ckernel_codegen_v6.6.c
 ckernel_codegen_v6.c
 ckernel_dtype.h
 ckernel_engine.h
 ckernel_ir.c
 ckernel_ir.h
 ckernel_ir_demo.c
 ckernel_ir_v2.c
 ckernel_ir_v2.h
 ckernel_ir_v2_builder.c
 ckernel_ir_v2_demo.c
 ckernel_ir_v2_lower.c
 ckernel_ir_v2_lower.h
 ckernel_ir_v6.5.c
 ckernel_ir_v6.6.c
 ckernel_ir_v6.c
 ckernel_kernel_specs.c
 ckernel_kernel_specs.h
 ckernel_mem_plan.c
 ckernel_mem_plan.h
 ckernel_memory_layout.h — Single-Arena Memory Layout for CPU-Optimized Inference & Training
 ckernel_model.h
 ckernel_model_layout.c
 ckernel_model_load.c
 ckernel_model_load_v4.c
 ckernel_model_load_v4.h
 ckernel_orchestration.c
 ckernel_orchestration.h
 ckernel_quant.h — Quantization block structures for weight-only quantization
 ckernel_registry.c
 ckernel_registry.h
 ckernel_section_layout.h — Section-Based Memory Layout: Header / Body / Footer Pattern
 ckernel_strict.c
 cpu_features.c
 cpu_features.h
 dequant_kernels.c — Dequantization kernels for GGML-compatible formats
 embedding_kernels.c — Token/position embedding lookup kernels
 embedding_kernels_bf16.c — Token/position embedding lookup kernels for BF16
 fp16_convert.c — FP32 <-> FP16 SIMD conversion utilities
 fused_kernels.h — Fused Kernel API for Cache-Aware Attention Fusion
 fused_rmsnorm_linear.c — Fused RMSNorm + Linear (GEMV) kernel
 gelu_kernels.c — GELU activation kernels with SIMD (SSE/AVX/AVX512)
 gelu_kernels_bf16.c — GELU activation kernels for BF16 tensors
 gemm_batch_int8.c — Batch GEMM kernels for quantized weights with INT8 activations
 gemm_fused_kernels.c — Fused GEMM Kernels with activations
 gemm_head_major_output.c — Output projection from head-major attention (NO LAYOUT CONVERSION)
 gemm_kernels.c — General matrix multiply (GEMM) kernels with SIMD (SSE/AVX/AVX512)
 gemm_kernels_amx.c — AMX (Advanced Matrix Extensions) GEMM kernels
 gemm_kernels_bf16.c — Optimized BF16 GEMM Kernels for AVX-512
 gemm_kernels_f16.c — GEMM kernels with FP16 (half-precision) weights
 gemm_kernels_q4_0.c — GEMM/GEMV kernels with Q4_0 quantized weights
 gemm_kernels_q4_1.c — GEMM/GEMV kernels with Q4_1 quantized weights
 gemm_kernels_q4k.c — GEMM/GEMV kernels with Q4_K quantized weights
 gemm_kernels_q4k_avx.c — AVX Q4_K x Q8_K matvec kernel for Sandy/Ivy Bridge
 gemm_kernels_q4k_q8k.c — Q4_K (weights) x Q8_K (activations) kernels for inference
 gemm_kernels_q4k_q8k_avx2.c — AVX2 Q4_K x Q8_K matvec kernel (inference only)
 gemm_kernels_q4k_q8k_vnni.c — VNNI Q4_K x Q8_K matvec kernel (inference only)
 gemm_kernels_q4k_sse.c — SSE4.1 Q4_K x Q8_K dot product kernels
 gemm_kernels_q5_0.c — GEMM/GEMV kernels with Q5_0 quantized weights
 gemm_kernels_q5_0_sse.c — SSE4.1 GEMM for Q5_0 quantized weights
 gemm_kernels_q5_0_sse_v2.c — SSE-optimized GEMM kernels for Q5_0 x Q8_K quantization
 gemm_kernels_q5_1.c — GEMM/GEMV kernels with Q5_1 quantized weights
 gemm_kernels_q5_k.c — GEMM/GEMV kernels with Q5_K quantized weights
 gemm_kernels_q6k.c — GEMM/GEMV kernels with Q6_K quantized weights
 gemm_kernels_q6k_q8k.c — Q6_K (weights) x Q8_K (activations) kernels for inference
 gemm_kernels_q6k_sse.c — SSE-optimized GEMM kernels for Q6_K x Q8_K quantization
 gemm_kernels_q8_0.c — GEMM/GEMV kernels with Q8_0 quantized weights
 gemm_microkernel.c — GEMM Microkernel - High-Performance Register-Blocked Matrix Multiplication
 gemv_fused_quant_bias.c — Fused GEMV kernels with online quantization and bias
 gemv_omp.c
 gemv_omp.h
 v6.5/test_generated/generic_api_test.c — AUTO-GENERATED: model Implementation (IR v6.5 - Explicit Unrolled)
 v6.6/test_generated/generic_api_test.c — AUTO-GENERATED: model Implementation (IR v6.6 - Explicit Unrolled)
 hash_table.c
 hash_table.h
 v6.5/test_generated/int8_q4k_test.c — AUTO-GENERATED: model Implementation (IR v6.5 - Explicit Unrolled)
 v6.6/test_generated/int8_q4k_test.c — AUTO-GENERATED: model Implementation (IR v6.6 - Explicit Unrolled)
 kv_cache_kernels.c — KV-cache helper kernels (head-major layout)
 layernorm_kernels.c — LayerNorm forward/backward kernels with SIMD (SSE/AVX/AVX512)
 layernorm_kernels_bf16.c — LayerNorm kernels for BF16 tensors
 loss_kernels.c — Loss function kernels (cross-entropy, etc.)
 loss_kernels_bf16.c — Loss function kernels for BF16 tensors
 mega_fused_attention.h — Mega-Fused Attention Kernel
 mega_fused_attention_avx.c — Mega-Fused Attention for AVX (256-bit) and AVX-512 (512-bit)
 mega_fused_attention_decode_q5_0.c — Mega-fused attention decode with Q5_0 weights
 mega_fused_attention_decode_q5_0.h — Mega-fused attention decode with Q5_0 weights - Header
 mega_fused_attention_prefill.c — Mega-fused prefill attention kernel
 mega_fused_attention_prefill_q8_0.c — Mega-fused prefill attention kernel with Q8_0 out-proj
 mega_fused_outproj_mlp_prefill.c — Mega-fused post-attention block for prefill
 memory_pool.c
 memory_pool.h
 mlp_fused_decode.c — Fully fused MLP decode kernel (T=1 token generation)
 mlp_kernels.c — MLP (feed-forward) kernels with SIMD (SSE/AVX/AVX512)
 mlp_kernels_bf16.c — Optimized BF16 MLP Kernels
 murmurhash3.c
 murmurhash3.h
 optimizer_kernels.c — Optimizer kernels for training (AdamW, SGD)
 optimizer_kernels_bf16.c — BF16 optimizer kernels for training
 parallel_orchestration.c — [LEGACY] Parallel decode orchestration prototype — NOT USED by v6.6
 prefill_fused_gemm.c — Fused kernels for prefill phase with proper 2D tiling
 qk_norm_kernels.c — Per-head RMSNorm on Q and K (Qwen3-style QK norm)
 quantize_row_q8_k_sse.c — SSE-optimized Q8_K row quantization kernel
 v6.5/test_generated/qwen2_int8.c — AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6.5 - Explicit Unrolled)
 v6.6/test_generated/qwen2_int8.c — AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6.6 - Explicit Unrolled)
 relu_kernels.c — ReLU activation kernels with SIMD (SSE/AVX/AVX512)
 relu_kernels_bf16.c — ReLU activation kernels for BF16 tensors
 rmsnorm_kernels.c — RMSNorm forward/backward kernels with SIMD (SSE/AVX/AVX512)
 rmsnorm_kernels_bf16.c — RMSNorm kernels for BF16 tensors
 rmsnorm_kernels_int4.c — RMSNorm kernels with INT4 output quantization
 rmsnorm_kernels_int8.c — RMSNorm kernels with INT8 output quantization
 rmsnorm_q8_k_fused.c — Fused RMSNorm + Q8_K Quantization kernel
 rmsnorm_qkv.c — Fused RMSNorm + QKV Projection
 rope_kernels.c — RoPE (Rotary Position Embedding) kernels with SIMD
 rope_kernels_bf16.c — RoPE (Rotary Position Embedding) kernels for BF16
 show_config.c
 sigmoid_kernels.c — Sigmoid activation kernels with SIMD (AVX512)
 sigmoid_kernels_bf16.c — Sigmoid activation kernels for BF16 tensors
 softmax_kernels.c — Softmax forward/backward kernels with SIMD (SSE/AVX/AVX512)
 softmax_kernels_bf16.c — Softmax kernels for BF16 tensors
 swiglu_kernels.c — SwiGLU activation kernels with SIMD (SSE/AVX/AVX512)
 swiglu_kernels_bf16.c — SwiGLU activation kernels for BF16 tensors
 system_topology.c
 system_topology.h
 v6.5/test_bump_tokenizer.c
 v6.6/test_bump_tokenizer.c
 v6.5/test_generic_api.c — Generic test/benchmark harness using ck_model_* API
 v6.6/test_generic_api.c — Generic test/benchmark harness using ck_model_* API
 v6.5/test_inference_with_bump_tokenizer.c
 v6.6/test_inference_with_bump_tokenizer.c
 test_tokenizer.c
 tokenizer.c
 tokenizer.h
 topk_kernels.c — Top-K selection kernels for MoE router dispatch
 trie.c
 include/data_structures/tries/trie.h
 src/data_structures/tries/trie.h
 true_bpe.c
 true_bpe.h
 utf8.c
 utf8.h
 v6.5_cli.c
 v6.5_inference.c
 v6.5_simple.c
 v6.6_cli.c
 v6.6_inference.c
 v6.6_simple.c
 v6_cli.c — C-Kernel-Engine v6 CLI
 v6_inference.c — C-Kernel-Engine v6 Inference
 v6_simple.c — Simplified v6 CLI using only generic kernels
 vision_kernels.c — Vision kernels (im2patch, patch embedding, etc.)
 vision_kernels_bf16.c — Vision kernels for BF16 tensors (im2patch, etc.)