Single-Arena Memory Layout for CPU-Optimized Inference & Training. More...

#include <stddef.h>
#include <stdint.h>

Data Structures
struct	CKLayerGradOffsets

struct	CKLayerOffsets

struct	CKModel

struct	CKSection

struct	CKSectionConfig

Macros
#define	CK_CACHE_LINE 64ULL /* CPU cache line */

#define	CK_GRAD(model, section, layer) (&(model)->sections[section].grads[layer])

#define	CK_HUGE_1GB (1ULL << 30) /* 1GB hugepage (NUMA optimal) */

#define	CK_HUGE_2MB (2ULL << 20) /* 2MB hugepage */

#define	CK_LAYER(model, section, layer) (&(model)->sections[section].layers[layer])

#define	CK_PTR(model, offset) ((float *)((char *)(model)->base + (offset)))

#define	CK_SIMD_ALIGN 64ULL /* AVX-512 alignment */

Enumerations
enum	CKFusionFlags { CK_FUSE_NONE = 0 , CK_FUSE_EMBED_NORM = 1 << 0 , CK_FUSE_NORM_QKV = 1 << 1 , CK_FUSE_QKV_ROPE = 1 << 2 , CK_FUSE_ATTN_PROJ = 1 << 3 , CK_FUSE_NORM_MLP = 1 << 4 , CK_FUSE_MLP_GATE_UP = 1 << 5 , CK_FUSE_MLP_ACT_DOWN = 1 << 6 , CK_FUSE_ADD_NORM = 1 << 7 , CK_FUSE_NONE = 0 , CK_FUSE_EMBED_NORM = 1 << 0 , CK_FUSE_NORM_QKV = 1 << 1 , CK_FUSE_QKV_ROPE = 1 << 2 , CK_FUSE_ATTN_PROJ = 1 << 3 , CK_FUSE_NORM_MLP = 1 << 4 , CK_FUSE_MLP_GATE_UP = 1 << 5 , CK_FUSE_MLP_ACT_DOWN = 1 << 6 , CK_FUSE_RESIDUAL_NORM = 1 << 7 }

Functions
int	ck_memory_allocate (CKModel *model, int use_hugepages)

void	ck_memory_free (CKModel *model)

size_t	ck_memory_plan (const CKSectionConfig *sections, int num_sections, int mode, uint32_t fusion_flags, CKModel *out_model)

Detailed Description

Single-Arena Memory Layout for CPU-Optimized Inference & Training.

PHILOSOPHY:

The CPU doesn't care if data is a weight or activation - it just needs sequential memory access. This design:

ONE contiguous allocation (hugepage-backed)
ALL offsets from single base pointer
Weights and activations INTERLEAVED in execution order
Fusion = same offset (skip write, read directly)
No fusion = stride over (memory allocated but unused)

BENEFITS:

No malloc/free at runtime = no corruption, no double-free
Prefetch layer N+1 while computing layer N (different DRAM bank)
NUMA-aware: 1GB hugepages map to different memory channels
Offset arithmetic only, no pointer chasing

FUSION EXAMPLE:

Without fusion: rmsnorm writes to ln1_output (offset 1000); qkv_project then reads from ln1_output (offset 1000).

With fusion (rmsnorm + qkv fused): the fused kernel reads from input and writes directly to q/k/v. The ln1_output memory (offset 1000) is SKIPPED but still allocated; CPU prefetch streams over it, so there is no penalty.

Definition in file ckernel_memory_layout.h.

Macro Definition Documentation

◆ CK_CACHE_LINE

#define CK_CACHE_LINE 64ULL /* CPU cache line */

Definition at line 48 of file ckernel_memory_layout.h.

◆ CK_GRAD

#define CK_GRAD	(	model,
		section,
		layer
	)	(&(model)->sections[section].grads[layer])

Definition at line 331 of file ckernel_memory_layout.h.

◆ CK_HUGE_1GB

#define CK_HUGE_1GB (1ULL << 30) /* 1GB hugepage (NUMA optimal) */

Definition at line 50 of file ckernel_memory_layout.h.

◆ CK_HUGE_2MB

#define CK_HUGE_2MB (2ULL << 20) /* 2MB hugepage */

Definition at line 49 of file ckernel_memory_layout.h.

◆ CK_LAYER

#define CK_LAYER	(	model,
		section,
		layer
	)	(&(model)->sections[section].layers[layer])

Definition at line 330 of file ckernel_memory_layout.h.

◆ CK_PTR

#define CK_PTR	(	model,
		offset
	)	((float *)((char *)(model)->base + (offset)))

Definition at line 329 of file ckernel_memory_layout.h.

◆ CK_SIMD_ALIGN

#define CK_SIMD_ALIGN 64ULL /* AVX-512 alignment */

Definition at line 51 of file ckernel_memory_layout.h.

Enumeration Type Documentation

◆ CKFusionFlags

enum CKFusionFlags

Enumerator
CK_FUSE_NONE
CK_FUSE_EMBED_NORM
CK_FUSE_NORM_QKV
CK_FUSE_QKV_ROPE
CK_FUSE_ATTN_PROJ
CK_FUSE_NORM_MLP
CK_FUSE_MLP_GATE_UP
CK_FUSE_MLP_ACT_DOWN
CK_FUSE_ADD_NORM
CK_FUSE_NONE
CK_FUSE_EMBED_NORM
CK_FUSE_NORM_QKV
CK_FUSE_QKV_ROPE
CK_FUSE_ATTN_PROJ
CK_FUSE_NORM_MLP
CK_FUSE_MLP_GATE_UP
CK_FUSE_MLP_ACT_DOWN
CK_FUSE_RESIDUAL_NORM

Definition at line 269 of file ckernel_memory_layout.h.

 typedef enum {
     CK_FUSE_NONE            = 0,
     CK_FUSE_EMBED_NORM      = 1 << 0,   /* embed + first layernorm */
     CK_FUSE_NORM_QKV        = 1 << 1,   /* layernorm + qkv projection */
     CK_FUSE_QKV_ROPE        = 1 << 2,   /* qkv projection + rope */
     CK_FUSE_ATTN_PROJ       = 1 << 3,   /* attention + output projection */
     CK_FUSE_NORM_MLP        = 1 << 4,   /* layernorm + mlp gate/up */
     CK_FUSE_MLP_GATE_UP     = 1 << 5,   /* gate and up projections */
     CK_FUSE_MLP_ACT_DOWN    = 1 << 6,   /* activation + down projection */
     CK_FUSE_ADD_NORM        = 1 << 7,   /* residual add + layernorm */
 } CKFusionFlags;

Function Documentation

◆ ck_memory_allocate()

int ck_memory_allocate	(	CKModel *	model,
		int	use_hugepages
	)

Allocate the planned memory.

Parameters

model	Model with planned offsets
use_hugepages	0=regular malloc, 1=2MB hugepages, 2=1GB hugepages

Returns: 0 on success, -1 on failure

◆ ck_memory_free()

void ck_memory_free ( CKModel * model )

Free the model memory.

Parameters

model Model to free

◆ ck_memory_plan()

size_t ck_memory_plan	(	const CKSectionConfig *	sections,
		int	num_sections,
		int	mode,
		uint32_t	fusion_flags,
		CKModel *	out_model
	)

Plan memory layout for a model.

Parameters

sections	Array of section configs
num_sections	Number of sections
mode	0=inference, 1=training (includes gradients)
fusion_flags	Which operations to fuse
out_model	Output: model with all offsets computed

Returns: Total bytes needed (0 on error)

Data Structures

Macros

Enumerations

Functions

Detailed Description

PHILOSOPHY:

FUSION EXAMPLE:

Macro Definition Documentation

◆ CK_CACHE_LINE

◆ CK_GRAD

◆ CK_HUGE_1GB

◆ CK_HUGE_2MB

◆ CK_LAYER

◆ CK_PTR

◆ CK_SIMD_ALIGN

Enumeration Type Documentation

◆ CKFusionFlags

Function Documentation

◆ ck_memory_allocate()

◆ ck_memory_free()

◆ ck_memory_plan()