← Back to C-Kernel-Engine Docs Doxygen Source Documentation
kernels Directory Reference

Directories

directory  fused
 

Files

file  add_kernels_bf16.c [code]
 Element-wise addition kernels for BF16 tensors.
 
file  attention_decode_fused.c [code]
 Fused attention decode kernel (legacy v6/v6.5).
 
file  attention_flash_true.c [code]
 Flash-style attention (online softmax, causal, streaming).
 
file  attention_kernels.c [code]
 Attention score/softmax/output kernels with SIMD (SSE/AVX/AVX512).
 
file  axpy_kernels.c [code]
 AXPY kernels for FP32: y = y + alpha * x.
 
file  dequant_kernels.c [code]
 Dequantization kernels for GGML-compatible formats.
 
file  embedding_kernels.c [code]
 Token/position embedding lookup kernels.
 
file  embedding_kernels_bf16.c [code]
 Token/position embedding lookup kernels for BF16.
 
file  fp16_convert.c [code]
 FP32 <-> FP16 SIMD conversion utilities.
 
file  gelu_kernels.c [code]
 GELU activation kernels with SIMD (SSE/AVX/AVX512).
 
file  gelu_kernels_bf16.c [code]
 GELU activation kernels for BF16 tensors.
 
file  gemm_batch_int8.c [code]
 Batch GEMM kernels for quantized weights with INT8 activations.
 
file  gemm_fused_kernels.c [code]
 Fused GEMM Kernels with activations.
 
file  gemm_head_major_output.c [code]
 Output projection from head-major attention (NO LAYOUT CONVERSION).
 
file  gemm_kernels.c [code]
 General matrix multiply (GEMM) kernels with SIMD (SSE/AVX/AVX512).
 
file  gemm_kernels_amx.c [code]
 AMX (Advanced Matrix Extensions) GEMM kernels.
 
file  gemm_kernels_bf16.c [code]
 Optimized BF16 GEMM Kernels for AVX-512.
 
file  gemm_kernels_f16.c [code]
 GEMM kernels with FP16 (half-precision) weights.
 
file  gemm_kernels_q4_0.c [code]
 GEMM/GEMV kernels with Q4_0 quantized weights.
 
file  gemm_kernels_q4_1.c [code]
 GEMM/GEMV kernels with Q4_1 quantized weights.
 
file  gemm_kernels_q4k.c [code]
 GEMM/GEMV kernels with Q4_K quantized weights.
 
file  gemm_kernels_q4k_avx.c [code]
 AVX Q4_K x Q8_K matvec kernel for Sandy/Ivy Bridge.
 
file  gemm_kernels_q4k_q8k.c [code]
 Q4_K (weights) x Q8_K (activations) kernels for inference.
 
file  gemm_kernels_q4k_q8k_avx2.c [code]
 AVX2 Q4_K x Q8_K matvec kernel (inference only).
 
file  gemm_kernels_q4k_q8k_vnni.c [code]
 VNNI Q4_K x Q8_K matvec kernel (inference only).
 
file  gemm_kernels_q4k_sse.c [code]
 SSE4.1 Q4_K x Q8_K dot product kernels.
 
file  gemm_kernels_q5_0.c [code]
 GEMM/GEMV kernels with Q5_0 quantized weights.
 
file  gemm_kernels_q5_0_sse.c [code]
 SSE4.1 GEMM for Q5_0 quantized weights.
 
file  gemm_kernels_q5_0_sse_v2.c [code]
 SSE-optimized GEMM kernels for Q5_0 x Q8_K quantization.
 
file  gemm_kernels_q5_1.c [code]
 GEMM/GEMV kernels with Q5_1 quantized weights.
 
file  gemm_kernels_q5_k.c [code]
 GEMM/GEMV kernels with Q5_K quantized weights.
 
file  gemm_kernels_q6k.c [code]
 GEMM/GEMV kernels with Q6_K quantized weights.
 
file  gemm_kernels_q6k_q8k.c [code]
 Q6_K (weights) x Q8_K (activations) kernels for inference.
 
file  gemm_kernels_q6k_sse.c [code]
 SSE-optimized GEMM kernels for Q6_K x Q8_K quantization.
 
file  gemm_kernels_q8_0.c [code]
 GEMM/GEMV kernels with Q8_0 quantized weights.
 
file  gemm_microkernel.c [code]
 GEMM Microkernel - High-Performance Register-Blocked Matrix Multiplication.
 
file  gemv_omp.c [code]
 
file  gemv_omp.h [code]
 
file  kv_cache_kernels.c [code]
 KV-cache helper kernels (head-major layout).
 
file  layernorm_kernels.c [code]
 LayerNorm forward/backward kernels with SIMD (SSE/AVX/AVX512).
 
file  layernorm_kernels_bf16.c [code]
 LayerNorm kernels for BF16 tensors.
 
file  loss_kernels.c [code]
 Loss function kernels (cross-entropy, etc.).
 
file  loss_kernels_bf16.c [code]
 Loss function kernels for BF16 tensors.
 
file  mlp_fused_decode.c [code]
 Fully fused MLP decode kernel (T=1 token generation).
 
file  mlp_kernels.c [code]
 MLP (feed-forward) kernels with SIMD (SSE/AVX/AVX512).
 
file  mlp_kernels_bf16.c [code]
 Optimized BF16 MLP Kernels.
 
file  optimizer_kernels.c [code]
 Optimizer kernels for training (AdamW, SGD).
 
file  optimizer_kernels_bf16.c [code]
 BF16 optimizer kernels for training.
 
file  qk_norm_kernels.c [code]
 Per-head RMSNorm on Q and K (Qwen3-style QK norm).
 
file  quantize_row_q8_k_sse.c [code]
 SSE-optimized Q8_K row quantization kernel.
 
file  relu_kernels.c [code]
 ReLU activation kernels with SIMD (SSE/AVX/AVX512).
 
file  relu_kernels_bf16.c [code]
 ReLU activation kernels for BF16 tensors.
 
file  rmsnorm_kernels.c [code]
 RMSNorm forward/backward kernels with SIMD (SSE/AVX/AVX512).
 
file  rmsnorm_kernels_bf16.c [code]
 RMSNorm kernels for BF16 tensors.
 
file  rmsnorm_kernels_int4.c [code]
 RMSNorm kernels with INT4 output quantization.
 
file  rmsnorm_kernels_int8.c [code]
 RMSNorm kernels with INT8 output quantization.
 
file  rope_kernels.c [code]
 RoPE (Rotary Position Embedding) kernels with SIMD.
 
file  rope_kernels_bf16.c [code]
 RoPE (Rotary Position Embedding) kernels for BF16.
 
file  sigmoid_kernels.c [code]
 Sigmoid activation kernels with SIMD (AVX512).
 
file  sigmoid_kernels_bf16.c [code]
 Sigmoid activation kernels for BF16 tensors.
 
file  softmax_kernels.c [code]
 Softmax forward/backward kernels with SIMD (SSE/AVX/AVX512).
 
file  softmax_kernels_bf16.c [code]
 Softmax kernels for BF16 tensors.
 
file  swiglu_kernels.c [code]
 SwiGLU activation kernels with SIMD (SSE/AVX/AVX512).
 
file  swiglu_kernels_bf16.c [code]
 SwiGLU activation kernels for BF16 tensors.
 
file  topk_kernels.c [code]
 Top-K selection kernels for MoE router dispatch.
 
file  vision_kernels.c [code]
 Vision kernels (im2patch, patch embedding, etc.).
 
file  vision_kernels_bf16.c [code]
 Vision kernels for BF16 tensors (im2patch, etc.)