Single-Arena Memory Layout for CPU-Optimized Inference & Training. More...
#include <stddef.h>
#include <stdint.h>
Go to the source code of this file.
Data Structures | |
| struct | CKLayerGradOffsets |
| struct | CKLayerOffsets |
| struct | CKModel |
| struct | CKSection |
| struct | CKSectionConfig |
Macros | |
| #define | CK_CACHE_LINE 64ULL /* CPU cache line */ |
| #define | CK_GRAD(model, section, layer) (&(model)->sections[section].grads[layer]) |
| #define | CK_HUGE_1GB (1ULL << 30) /* 1GB hugepage (NUMA optimal) */ |
| #define | CK_HUGE_2MB (2ULL << 20) /* 2MB hugepage */ |
| #define | CK_LAYER(model, section, layer) (&(model)->sections[section].layers[layer]) |
| #define | CK_PTR(model, offset) ((float*)((char*)(model)->base + (offset))) |
| #define | CK_SIMD_ALIGN 64ULL /* AVX-512 alignment */ |
Enumerations | |
| enum | CKFusionFlags { CK_FUSE_NONE = 0 , CK_FUSE_EMBED_NORM = 1 << 0 , CK_FUSE_NORM_QKV = 1 << 1 , CK_FUSE_QKV_ROPE = 1 << 2 , CK_FUSE_ATTN_PROJ = 1 << 3 , CK_FUSE_NORM_MLP = 1 << 4 , CK_FUSE_MLP_GATE_UP = 1 << 5 , CK_FUSE_MLP_ACT_DOWN = 1 << 6 , CK_FUSE_ADD_NORM = 1 << 7 , CK_FUSE_NONE = 0 , CK_FUSE_EMBED_NORM = 1 << 0 , CK_FUSE_NORM_QKV = 1 << 1 , CK_FUSE_QKV_ROPE = 1 << 2 , CK_FUSE_ATTN_PROJ = 1 << 3 , CK_FUSE_NORM_MLP = 1 << 4 , CK_FUSE_MLP_GATE_UP = 1 << 5 , CK_FUSE_MLP_ACT_DOWN = 1 << 6 , CK_FUSE_RESIDUAL_NORM = 1 << 7 } |
Functions | |
| int | ck_memory_allocate (CKModel *model, int use_hugepages) |
| void | ck_memory_free (CKModel *model) |
| size_t | ck_memory_plan (const CKSectionConfig *sections, int num_sections, int mode, uint32_t fusion_flags, CKModel *out_model) |
Single-Arena Memory Layout for CPU-Optimized Inference & Training.
The CPU doesn't care if data is a weight or activation - it just needs sequential memory access. This design:
BENEFITS:
Without fusion: rmsnorm writes to ln1_output (offset 1000); qkv_project then reads from ln1_output (offset 1000).
With fusion (rmsnorm + qkv fused): the fused kernel reads from input and writes directly to q/k/v. The ln1_output memory (offset 1000) is SKIPPED but still allocated; CPU prefetch streams over it, no penalty.
Definition in file ckernel_memory_layout.h.
| #define CK_CACHE_LINE 64ULL /* CPU cache line */ |
Definition at line 48 of file ckernel_memory_layout.h.
| #define CK_GRAD | ( | model, | |
| section, | |||
| layer | |||
| ) | (&(model)->sections[section].grads[layer]) |
Definition at line 331 of file ckernel_memory_layout.h.
| #define CK_HUGE_1GB (1ULL << 30) /* 1GB hugepage (NUMA optimal) */ |
Definition at line 50 of file ckernel_memory_layout.h.
| #define CK_HUGE_2MB (2ULL << 20) /* 2MB hugepage */ |
Definition at line 49 of file ckernel_memory_layout.h.
| #define CK_LAYER | ( | model, | |
| section, | |||
| layer | |||
| ) | (&(model)->sections[section].layers[layer]) |
Definition at line 330 of file ckernel_memory_layout.h.
| #define CK_PTR | ( | model, | |
| offset | |||
| ) | ((float*)((char*)(model)->base + (offset))) |
Definition at line 329 of file ckernel_memory_layout.h.
| #define CK_SIMD_ALIGN 64ULL /* AVX-512 alignment */ |
Definition at line 51 of file ckernel_memory_layout.h.
| enum CKFusionFlags |
Definition at line 269 of file ckernel_memory_layout.h.
| int ck_memory_allocate | ( | CKModel * | model, |
| int | use_hugepages | ||
| ) |
Allocate the planned memory.
| model | Model with planned offsets |
| use_hugepages | 0=regular malloc, 1=2MB hugepages, 2=1GB hugepages |
| void ck_memory_free | ( | CKModel * | model | ) |
Free the model memory.
| model | Model to free |
| size_t ck_memory_plan | ( | const CKSectionConfig * | sections, |
| int | num_sections, | ||
| int | mode, | ||
| uint32_t | fusion_flags, | ||
| CKModel * | out_model | ||
| ) |
Plan memory layout for a model.
| sections | Array of section configs |
| num_sections | Number of sections |
| mode | 0=inference, 1=training (includes gradients) |
| fusion_flags | Which operations to fuse |
| out_model | Output: model with all offsets computed |