← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ckernel_memory_layout.h File Reference

Single-Arena Memory Layout for CPU-Optimized Inference & Training. More...

#include <stddef.h>
#include <stdint.h>

Go to the source code of this file.

Data Structures

struct  CKLayerGradOffsets
 
struct  CKLayerOffsets
 
struct  CKModel
 
struct  CKSection
 
struct  CKSectionConfig
 

Macros

#define CK_CACHE_LINE   64ULL /* CPU cache line */
 
#define CK_GRAD(model, section, layer)   (&(model)->sections[section].grads[layer])
 
#define CK_HUGE_1GB   (1ULL << 30) /* 1GB hugepage (NUMA optimal) */
 
#define CK_HUGE_2MB   (2ULL << 20) /* 2MB hugepage */
 
#define CK_LAYER(model, section, layer)   (&(model)->sections[section].layers[layer])
 
#define CK_PTR(model, offset)   ((float*)((char*)(model)->base + (offset)))
 
#define CK_SIMD_ALIGN   64ULL /* AVX-512 alignment */
 

Enumerations

enum  CKFusionFlags {
  CK_FUSE_NONE = 0 , CK_FUSE_EMBED_NORM = 1 << 0 , CK_FUSE_NORM_QKV = 1 << 1 , CK_FUSE_QKV_ROPE = 1 << 2 ,
  CK_FUSE_ATTN_PROJ = 1 << 3 , CK_FUSE_NORM_MLP = 1 << 4 , CK_FUSE_MLP_GATE_UP = 1 << 5 , CK_FUSE_MLP_ACT_DOWN = 1 << 6 ,
  CK_FUSE_ADD_NORM = 1 << 7 , CK_FUSE_NONE = 0 , CK_FUSE_EMBED_NORM = 1 << 0 , CK_FUSE_NORM_QKV = 1 << 1 ,
  CK_FUSE_QKV_ROPE = 1 << 2 , CK_FUSE_ATTN_PROJ = 1 << 3 , CK_FUSE_NORM_MLP = 1 << 4 , CK_FUSE_MLP_GATE_UP = 1 << 5 ,
  CK_FUSE_MLP_ACT_DOWN = 1 << 6 , CK_FUSE_RESIDUAL_NORM = 1 << 7
}
 

Functions

int ck_memory_allocate (CKModel *model, int use_hugepages)
 
void ck_memory_free (CKModel *model)
 
size_t ck_memory_plan (const CKSectionConfig *sections, int num_sections, int mode, uint32_t fusion_flags, CKModel *out_model)
 

Detailed Description

Single-Arena Memory Layout for CPU-Optimized Inference & Training.

PHILOSOPHY:

The CPU doesn't care if data is a weight or activation - it just needs sequential memory access. This design:

  1. ONE contiguous allocation (hugepage-backed)
  2. ALL offsets from single base pointer
  3. Weights and activations INTERLEAVED in execution order
  4. Fusion = same offset (skip write, read directly)
  5. No fusion = stride over (memory allocated but unused)

BENEFITS:

  • No malloc/free at runtime = no corruption, no double-free
  • Prefetch layer N+1 while computing layer N (different DRAM bank)
  • NUMA-aware: 1GB hugepages map to different memory channels
  • Offset arithmetic only, no pointer chasing

FUSION EXAMPLE:

Without fusion: rmsnorm writes to ln1_output (offset 1000); qkv_project then reads from ln1_output (offset 1000).

With fusion (rmsnorm + qkv fused): the fused kernel reads from input and writes directly to q/k/v. The ln1_output memory (offset 1000) is SKIPPED but still allocated; CPU prefetch streams over it, no penalty.

Definition in file ckernel_memory_layout.h.

Macro Definition Documentation

◆ CK_CACHE_LINE

#define CK_CACHE_LINE   64ULL /* CPU cache line */

Definition at line 48 of file ckernel_memory_layout.h.

◆ CK_GRAD

#define CK_GRAD (   model,
  section,
  layer 
)    (&(model)->sections[section].grads[layer])

Definition at line 331 of file ckernel_memory_layout.h.

◆ CK_HUGE_1GB

#define CK_HUGE_1GB   (1ULL << 30) /* 1GB hugepage (NUMA optimal) */

Definition at line 50 of file ckernel_memory_layout.h.

◆ CK_HUGE_2MB

#define CK_HUGE_2MB   (2ULL << 20) /* 2MB hugepage */

Definition at line 49 of file ckernel_memory_layout.h.

◆ CK_LAYER

#define CK_LAYER (   model,
  section,
  layer 
)    (&(model)->sections[section].layers[layer])

Definition at line 330 of file ckernel_memory_layout.h.

◆ CK_PTR

#define CK_PTR (   model,
  offset 
)    ((float*)((char*)(model)->base + (offset)))

Definition at line 329 of file ckernel_memory_layout.h.

◆ CK_SIMD_ALIGN

#define CK_SIMD_ALIGN   64ULL /* AVX-512 alignment */

Definition at line 51 of file ckernel_memory_layout.h.

Enumeration Type Documentation

◆ CKFusionFlags

Enumerator
CK_FUSE_NONE 
CK_FUSE_EMBED_NORM 
CK_FUSE_NORM_QKV 
CK_FUSE_QKV_ROPE 
CK_FUSE_ATTN_PROJ 
CK_FUSE_NORM_MLP 
CK_FUSE_MLP_GATE_UP 
CK_FUSE_MLP_ACT_DOWN 
CK_FUSE_ADD_NORM 
CK_FUSE_NONE 
CK_FUSE_EMBED_NORM 
CK_FUSE_NORM_QKV 
CK_FUSE_QKV_ROPE 
CK_FUSE_ATTN_PROJ 
CK_FUSE_NORM_MLP 
CK_FUSE_MLP_GATE_UP 
CK_FUSE_MLP_ACT_DOWN 
CK_FUSE_RESIDUAL_NORM 

Definition at line 269 of file ckernel_memory_layout.h.

269  {
270  CK_FUSE_NONE = 0,
271  CK_FUSE_EMBED_NORM = 1 << 0, /* embed + first layernorm */
272  CK_FUSE_NORM_QKV = 1 << 1, /* layernorm + qkv projection */
273  CK_FUSE_QKV_ROPE = 1 << 2, /* qkv projection + rope */
274  CK_FUSE_ATTN_PROJ = 1 << 3, /* attention + output projection */
275  CK_FUSE_NORM_MLP = 1 << 4, /* layernorm + mlp gate/up */
276  CK_FUSE_MLP_GATE_UP = 1 << 5, /* gate and up projections */
277  CK_FUSE_MLP_ACT_DOWN = 1 << 6, /* activation + down projection */
278  CK_FUSE_ADD_NORM = 1 << 7, /* residual add + layernorm */
279 } CKFusionFlags;
@ CK_FUSE_NORM_QKV
@ CK_FUSE_ATTN_PROJ
@ CK_FUSE_EMBED_NORM
@ CK_FUSE_ADD_NORM
@ CK_FUSE_MLP_ACT_DOWN
@ CK_FUSE_MLP_GATE_UP
@ CK_FUSE_QKV_ROPE
@ CK_FUSE_NORM_MLP
@ CK_FUSE_NONE

Function Documentation

◆ ck_memory_allocate()

int ck_memory_allocate ( CKModel * model,
int  use_hugepages 
)

Allocate the planned memory.

Parameters
model: Model with planned offsets
use_hugepages: 0=regular malloc, 1=2MB hugepages, 2=1GB hugepages
Returns
0 on success, -1 on failure

◆ ck_memory_free()

void ck_memory_free ( CKModel * model)

Free the model memory.

Parameters
model: Model to free

◆ ck_memory_plan()

size_t ck_memory_plan ( const CKSectionConfig * sections,
int  num_sections,
int  mode,
uint32_t  fusion_flags,
CKModel * out_model 
)

Plan memory layout for a model.

Parameters
sections: Array of section configs
num_sections: Number of sections
mode: 0=inference, 1=training (includes gradients)
fusion_flags: Which operations to fuse
out_model: Output: model with all offsets computed
Returns
Total bytes needed (0 on error)