| add_kernels_bf16.c | Element-wise addition kernels for BF16 tensors |
| attention_decode_fused.c | Fused attention decode kernel (legacy v6/v6.5) |
| attention_flash_true.c | Flash-style attention (online softmax, causal, streaming) |
| attention_kernels.c | Attention score/softmax/output kernels with SIMD (SSE/AVX/AVX512) |
| attention_mlp_fused.c | Mega-Fused Attention + MLP Block |
| axpy_kernels.c | AXPY kernels for FP32: y = y + alpha * x |
| backend_native.c | |
| bf16_utils.h | |
| v6.5/test_generated/ck-kernel-inference.c | AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6.5 - Explicit Unrolled) |
| v6.6/generated/ck-kernel-inference.c | AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6.6 - Explicit Unrolled) |
| v6.6/test_generated/ck-kernel-inference.c | AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6.6 - Explicit Unrolled) |
| v6.5/test_generated/ck-kernel-inference.h | AUTO-GENERATED: qwen2_0.5b_decode Memory Layout |
| v6.6/generated/ck-kernel-inference.h | AUTO-GENERATED: qwen2_0.5b_decode Memory Layout |
| v6.6/test_generated/ck-kernel-inference.h | AUTO-GENERATED: qwen2_0.5b_decode Memory Layout |
| ck-kernel-prefill.c | |
| ck_cli_v5.c | |
| ck_cli_v6.5.c | |
| ck_cli_v6.6.c | |
| ck_cli_v6.c | |
| ck_features.h | CPU feature detection and dispatch macros |
| ck_metrics.h | |
| ck_model_api.h | Generic Model API - Model-agnostic interface for CK-Engine |
| ck_parity_api.c | C-Kernel-Engine Parity Testing API Implementation |
| ck_parity_api.h | C-Kernel-Engine Parity Testing API |
| ck_threadpool.c | Persistent pthread thread pool for CK-Engine inference |
| ck_threadpool.h | Persistent pthread thread pool for CK-Engine inference |
| ck_tokenizer.c | |
| ck_tokenizer.h | |
| ck_tokenizer_v2.c | |
| v2_legacy/ck_tokenizer_v2.c | |
| ck_tokenizer_v6.5.c | |
| ck_tokenizer_v6.6.c | |
| ck_tokenizer_v6.c | |
| ckernel_alloc.c | |
| ckernel_alloc.h | |
| ckernel_alloc_v6.5.c | |
| ckernel_alloc_v6.6.c | |
| ckernel_alloc_v6.c | |
| ckernel_bump_v5.h | |
| ckernel_codegen.c | |
| ckernel_codegen.h | |
| ckernel_codegen_v2.c | |
| ckernel_codegen_v2.h | |
| ckernel_codegen_v2_dispatch.c | |
| ckernel_codegen_v2_emit.h | |
| ckernel_codegen_v2_schedule.c | |
| ckernel_codegen_v2_sections.c | |
| ckernel_codegen_v2_struct.c | |
| ckernel_codegen_v6.5.c | |
| ckernel_codegen_v6.6.c | |
| ckernel_codegen_v6.c | |
| ckernel_dtype.h | |
| ckernel_engine.h | |
| ckernel_ir.c | |
| ckernel_ir.h | |
| ckernel_ir_demo.c | |
| ckernel_ir_v2.c | |
| ckernel_ir_v2.h | |
| ckernel_ir_v2_builder.c | |
| ckernel_ir_v2_demo.c | |
| ckernel_ir_v2_lower.c | |
| ckernel_ir_v2_lower.h | |
| ckernel_ir_v6.5.c | |
| ckernel_ir_v6.6.c | |
| ckernel_ir_v6.c | |
| ckernel_kernel_specs.c | |
| ckernel_kernel_specs.h | |
| ckernel_mem_plan.c | |
| ckernel_mem_plan.h | |
| ckernel_memory_layout.h | Single-Arena Memory Layout for CPU-Optimized Inference & Training |
| ckernel_model.h | |
| ckernel_model_layout.c | |
| ckernel_model_load.c | |
| ckernel_model_load_v4.c | |
| ckernel_model_load_v4.h | |
| ckernel_orchestration.c | |
| ckernel_orchestration.h | |
| ckernel_quant.h | Quantization block structures for weight-only quantization |
| ckernel_registry.c | |
| ckernel_registry.h | |
| ckernel_section_layout.h | Section-Based Memory Layout: Header / Body / Footer Pattern |
| ckernel_strict.c | |
| cpu_features.c | |
| cpu_features.h | |
| dequant_kernels.c | Dequantization kernels for GGML-compatible formats |
| embedding_kernels.c | Token/position embedding lookup kernels |
| embedding_kernels_bf16.c | Token/position embedding lookup kernels for BF16 |
| fp16_convert.c | FP32 <-> FP16 SIMD conversion utilities |
| fused_kernels.h | Fused Kernel API for Cache-Aware Attention Fusion |
| fused_rmsnorm_linear.c | Fused RMSNorm + Linear (GEMV) kernel |
| gelu_kernels.c | GELU activation kernels with SIMD (SSE/AVX/AVX512) |
| gelu_kernels_bf16.c | GELU activation kernels for BF16 tensors |
| gemm_batch_int8.c | Batch GEMM kernels for quantized weights with INT8 activations |
| gemm_fused_kernels.c | Fused GEMM Kernels with activations |
| gemm_head_major_output.c | Output projection from head-major attention (NO LAYOUT CONVERSION) |
| gemm_kernels.c | General matrix multiply (GEMM) kernels with SIMD (SSE/AVX/AVX512) |
| gemm_kernels_amx.c | AMX (Advanced Matrix Extensions) GEMM kernels |
| gemm_kernels_bf16.c | Optimized BF16 GEMM Kernels for AVX-512 |
| gemm_kernels_f16.c | GEMM kernels with FP16 (half-precision) weights |
| gemm_kernels_q4_0.c | GEMM/GEMV kernels with Q4_0 quantized weights |
| gemm_kernels_q4_1.c | GEMM/GEMV kernels with Q4_1 quantized weights |
| gemm_kernels_q4k.c | GEMM/GEMV kernels with Q4_K quantized weights |
| gemm_kernels_q4k_avx.c | AVX Q4_K x Q8_K matvec kernel for Sandy/Ivy Bridge |
| gemm_kernels_q4k_q8k.c | Q4_K (weights) x Q8_K (activations) kernels for inference |
| gemm_kernels_q4k_q8k_avx2.c | AVX2 Q4_K x Q8_K matvec kernel (inference only) |
| gemm_kernels_q4k_q8k_vnni.c | VNNI Q4_K x Q8_K matvec kernel (inference only) |
| gemm_kernels_q4k_sse.c | SSE4.1 Q4_K x Q8_K dot product kernels |
| gemm_kernels_q5_0.c | GEMM/GEMV kernels with Q5_0 quantized weights |
| gemm_kernels_q5_0_sse.c | SSE4.1 GEMM for Q5_0 quantized weights |
| gemm_kernels_q5_0_sse_v2.c | SSE-optimized GEMM kernels for Q5_0 x Q8_K quantization |
| gemm_kernels_q5_1.c | GEMM/GEMV kernels with Q5_1 quantized weights |
| gemm_kernels_q5_k.c | GEMM/GEMV kernels with Q5_K quantized weights |
| gemm_kernels_q6k.c | GEMM/GEMV kernels with Q6_K quantized weights |
| gemm_kernels_q6k_q8k.c | Q6_K (weights) x Q8_K (activations) kernels for inference |
| gemm_kernels_q6k_sse.c | SSE-optimized GEMM kernels for Q6_K x Q8_K quantization |
| gemm_kernels_q8_0.c | GEMM/GEMV kernels with Q8_0 quantized weights |
| gemm_microkernel.c | GEMM Microkernel - High-Performance Register-Blocked Matrix Multiplication |
| gemv_fused_quant_bias.c | Fused GEMV kernels with online quantization and bias |
| gemv_omp.c | |
| gemv_omp.h | |
| v6.5/test_generated/generic_api_test.c | AUTO-GENERATED: model Implementation (IR v6.5 - Explicit Unrolled) |
| v6.6/test_generated/generic_api_test.c | AUTO-GENERATED: model Implementation (IR v6.6 - Explicit Unrolled) |
| hash_table.c | |
| hash_table.h | |
| v6.5/test_generated/int8_q4k_test.c | AUTO-GENERATED: model Implementation (IR v6.5 - Explicit Unrolled) |
| v6.6/test_generated/int8_q4k_test.c | AUTO-GENERATED: model Implementation (IR v6.6 - Explicit Unrolled) |
| kv_cache_kernels.c | KV-cache helper kernels (head-major layout) |
| layernorm_kernels.c | LayerNorm forward/backward kernels with SIMD (SSE/AVX/AVX512) |
| layernorm_kernels_bf16.c | LayerNorm kernels for BF16 tensors |
| loss_kernels.c | Loss function kernels (cross-entropy, etc.) |
| loss_kernels_bf16.c | Loss function kernels for BF16 tensors |
| mega_fused_attention.h | Mega-Fused Attention Kernel |
| mega_fused_attention_avx.c | Mega-Fused Attention for AVX (256-bit) and AVX-512 (512-bit) |
| mega_fused_attention_decode_q5_0.c | Mega-fused attention decode with Q5_0 weights |
| mega_fused_attention_decode_q5_0.h | Mega-fused attention decode with Q5_0 weights - Header |
| mega_fused_attention_prefill.c | Mega-fused prefill attention kernel |
| mega_fused_attention_prefill_q8_0.c | Mega-fused prefill attention kernel with Q8_0 out-proj |
| mega_fused_outproj_mlp_prefill.c | Mega-fused post-attention block for prefill |
| memory_pool.c | |
| memory_pool.h | |
| mlp_fused_decode.c | Fully fused MLP decode kernel (T=1 token generation) |
| mlp_kernels.c | MLP (feed-forward) kernels with SIMD (SSE/AVX/AVX512) |
| mlp_kernels_bf16.c | Optimized BF16 MLP Kernels |
| murmurhash3.c | |
| murmurhash3.h | |
| optimizer_kernels.c | Optimizer kernels for training (AdamW, SGD) |
| optimizer_kernels_bf16.c | BF16 optimizer kernels for training |
| parallel_orchestration.c | [LEGACY] Parallel decode orchestration prototype — NOT USED by v6.6 |
| prefill_fused_gemm.c | Fused kernels for prefill phase with proper 2D tiling |
| qk_norm_kernels.c | Per-head RMSNorm on Q and K (Qwen3-style QK norm) |
| quantize_row_q8_k_sse.c | SSE-optimized Q8_K row quantization kernel |
| v6.5/test_generated/qwen2_int8.c | AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6.5 - Explicit Unrolled) |
| v6.6/test_generated/qwen2_int8.c | AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6.6 - Explicit Unrolled) |
| relu_kernels.c | ReLU activation kernels with SIMD (SSE/AVX/AVX512) |
| relu_kernels_bf16.c | ReLU activation kernels for BF16 tensors |
| rmsnorm_kernels.c | RMSNorm forward/backward kernels with SIMD (SSE/AVX/AVX512) |
| rmsnorm_kernels_bf16.c | RMSNorm kernels for BF16 tensors |
| rmsnorm_kernels_int4.c | RMSNorm kernels with INT4 output quantization |
| rmsnorm_kernels_int8.c | RMSNorm kernels with INT8 output quantization |
| rmsnorm_q8_k_fused.c | Fused RMSNorm + Q8_K Quantization kernel |
| rmsnorm_qkv.c | Fused RMSNorm + QKV Projection |
| rope_kernels.c | RoPE (Rotary Position Embedding) kernels with SIMD |
| rope_kernels_bf16.c | RoPE (Rotary Position Embedding) kernels for BF16 |
| show_config.c | |
| sigmoid_kernels.c | Sigmoid activation kernels with SIMD (AVX512) |
| sigmoid_kernels_bf16.c | Sigmoid activation kernels for BF16 tensors |
| softmax_kernels.c | Softmax forward/backward kernels with SIMD (SSE/AVX/AVX512) |
| softmax_kernels_bf16.c | Softmax kernels for BF16 tensors |
| swiglu_kernels.c | SwiGLU activation kernels with SIMD (SSE/AVX/AVX512) |
| swiglu_kernels_bf16.c | SwiGLU activation kernels for BF16 tensors |
| system_topology.c | |
| system_topology.h | |
| v6.5/test_bump_tokenizer.c | |
| v6.6/test_bump_tokenizer.c | |
| v6.5/test_generic_api.c | Generic test/benchmark harness using ck_model_* API |
| v6.6/test_generic_api.c | Generic test/benchmark harness using ck_model_* API |
| v6.5/test_inference_with_bump_tokenizer.c | |
| v6.6/test_inference_with_bump_tokenizer.c | |
| test_tokenizer.c | |
| tokenizer.c | |
| tokenizer.h | |
| topk_kernels.c | Top-K selection kernels for MoE router dispatch |
| trie.c | |
| include/data_structures/tries/trie.h | |
| src/data_structures/tries/trie.h | |
| true_bpe.c | |
| true_bpe.h | |
| utf8.c | |
| utf8.h | |
| v6.5_cli.c | |
| v6.5_inference.c | |
| v6.5_simple.c | |
| v6.6_cli.c | |
| v6.6_inference.c | |
| v6.6_simple.c | |
| v6_cli.c | C-Kernel-Engine v6 CLI |
| v6_inference.c | C-Kernel-Engine v6 Inference |
| v6_simple.c | Simplified v6 CLI using only generic kernels |
| vision_kernels.c | Vision kernels (im2patch, patch embedding, etc.) |
| vision_kernels_bf16.c | Vision kernels for BF16 tensors (im2patch, etc.) |