Section-Based Memory Layout: Header / Body / Footer Pattern. More...

#include <stddef.h>
#include <stdint.h>

Data Structures
struct	CKFooterGradOffsets

struct	CKFooterOffsets

struct	CKHeaderOffsets

struct	CKLayerGradOffsets

struct	CKLayerOffsets

struct	CKLayerOptimizerOffsets

struct	CKModel

struct	CKSection

struct	CKSectionConfig

Macros
#define	CK_ALIGN_CACHE 64ULL /* CPU cache line (64 bytes) */

#define	CK_ALIGN_HUGE_1GB (1ULL << 30) /* 1GB hugepage (NUMA optimal) */

#define	CK_ALIGN_HUGE_2MB (2ULL << 20) /* 2MB hugepage */

#define	CK_ALIGN_SIMD 64ULL /* AVX-512 register (64 bytes) */

#define	CK_FOOTER(model, s) (&(model)->sections[s].footer)

#define	CK_HEADER(model, s) (&(model)->sections[s].header)

#define	CK_LAYER(model, s, l) (&(model)->sections[s].layers[l])

#define	CK_LAYER_GRAD(model, s, l) (&(model)->sections[s].layer_grads[l])

#define	CK_PTR(model, offset) ((float *)((char *)(model)->base + (offset)))

#define	CK_PTR_BF16(model, offset) ((uint16_t *)((char *)(model)->base + (offset)))

Enumerations
enum	CKFusionFlags { CK_FUSE_NONE = 0 , CK_FUSE_EMBED_NORM = 1 << 0 , CK_FUSE_NORM_QKV = 1 << 1 , CK_FUSE_QKV_ROPE = 1 << 2 , CK_FUSE_ATTN_PROJ = 1 << 3 , CK_FUSE_NORM_MLP = 1 << 4 , CK_FUSE_MLP_GATE_UP = 1 << 5 , CK_FUSE_MLP_ACT_DOWN = 1 << 6 , CK_FUSE_ADD_NORM = 1 << 7 , CK_FUSE_NONE = 0 , CK_FUSE_EMBED_NORM = 1 << 0 , CK_FUSE_NORM_QKV = 1 << 1 , CK_FUSE_QKV_ROPE = 1 << 2 , CK_FUSE_ATTN_PROJ = 1 << 3 , CK_FUSE_NORM_MLP = 1 << 4 , CK_FUSE_MLP_GATE_UP = 1 << 5 , CK_FUSE_MLP_ACT_DOWN = 1 << 6 , CK_FUSE_RESIDUAL_NORM = 1 << 7 }

Functions
int	ck_model_allocate (CKModel *model, int hugepage_mode)

void	ck_model_free (CKModel *model)

size_t	ck_model_plan (CKModel *model, const CKSectionConfig *configs, int num_sections, int training_enabled, uint32_t fusion_flags)

void	ck_section_config_init (CKSectionConfig *config, size_t simd_align)

size_t	ck_section_plan (CKSection *section, const CKSectionConfig *config, int training_enabled, size_t base_offset)

Detailed Description

Section-Based Memory Layout: Header / Body / Footer Pattern.

PHILOSOPHY: WHY NO MEMORY REUSE?

CPU PREFETCH NEEDS PREDICTABLE PATTERNS
- Prefetcher learns: "he's streaming forward through memory"
- Reuse breaks this: same address, different data, different time
- Result: prefetcher gives up, cache misses explode
TRAINING NEEDS ALL ACTIVATIONS
- Forward: compute activations
- Backward: need EVERY activation to compute gradients
- Can't reuse what you still need!
MEMORY IS CHEAP, BANDWIDTH IS EXPENSIVE
- DDR5: ~$3/GB (1TB = $3000, trivial for training cluster)
- Bandwidth: 100-400 GB/s per socket (the real bottleneck)
- Optimize for streaming, not for saving bytes
SINGLE ALLOCATION = ZERO RUNTIME MALLOC
- No fragmentation, no free(), no double-free bugs
- Hugepage-backed: 1GB pages for NUMA optimization
- One base pointer + offsets = maximum simplicity

SECTION ARCHITECTURE

Multi-modal models have multiple SECTIONS. Each section is a complete encoder/decoder/vision/audio module with its own dimensions.

┌─────────────────────────────────────────────────────────────────┐
│                        SINGLE ALLOCATION                        │
├─────────────────────────────────────────────────────────────────┤
│ SECTION 0: Vision Encoder                                       │
│ ├── HEADER: patch_embed, pos_embed                              │
│ ├── BODY: layer[0..N] (weights + activations interleaved)       │
│ └── FOOTER: final_norm, bridge_to_text                          │
├─────────────────────────────────────────────────────────────────┤
│ SECTION 1: Text Decoder                                         │
│ ├── HEADER: token_embed, pos_embed                              │
│ ├── BODY: layer[0..N] (weights + activations interleaved)       │
│ └── FOOTER: final_norm, lm_head, logits                         │
├─────────────────────────────────────────────────────────────────┤
│ SECTION 2: Audio Encoder (optional)                             │
│ ├── HEADER: mel_embed                                           │
│ ├── BODY: layer[0..N]                                           │
│ └── FOOTER: bridge_to_text                                      │
├─────────────────────────────────────────────────────────────────┤
│ GRADIENTS (if training)                                         │
│ └── Same layout: section[0].grads, section[1].grads, ...        │
├─────────────────────────────────────────────────────────────────┤
│ OPTIMIZER STATE (if training with Adam)                         │
│ └── m[], v[] for each weight                                    │
└─────────────────────────────────────────────────────────────────┘

EXECUTION ORDER LAYOUT

Within each layer, memory is laid out in the ORDER operations execute:

weight → activation → weight → activation → ...

This is critical for CPU cache efficiency. The CPU streams forward, prefetching the next cache line while processing the current one.

Definition in file ckernel_section_layout.h.

Macro Definition Documentation

◆ CK_ALIGN_CACHE

#define CK_ALIGN_CACHE 64ULL /* CPU cache line (64 bytes) */

Definition at line 88 of file ckernel_section_layout.h.

◆ CK_ALIGN_HUGE_1GB

#define CK_ALIGN_HUGE_1GB (1ULL << 30) /* 1GB hugepage (NUMA optimal) */

Definition at line 91 of file ckernel_section_layout.h.

◆ CK_ALIGN_HUGE_2MB

#define CK_ALIGN_HUGE_2MB (2ULL << 20) /* 2MB hugepage */

Definition at line 90 of file ckernel_section_layout.h.

◆ CK_ALIGN_SIMD

#define CK_ALIGN_SIMD 64ULL /* AVX-512 register (64 bytes) */

Definition at line 89 of file ckernel_section_layout.h.

◆ CK_FOOTER

#define CK_FOOTER	(	model,
		s
	)	(&(model)->sections[s].footer)

Definition at line 541 of file ckernel_section_layout.h.

◆ CK_HEADER

#define CK_HEADER	(	model,
		s
	)	(&(model)->sections[s].header)

Definition at line 537 of file ckernel_section_layout.h.

◆ CK_LAYER

#define CK_LAYER	(	model,
		s,
		l
	)	(&(model)->sections[s].layers[l])

Definition at line 529 of file ckernel_section_layout.h.

◆ CK_LAYER_GRAD

#define CK_LAYER_GRAD	(	model,
		s,
		l
	)	(&(model)->sections[s].layer_grads[l])

Definition at line 533 of file ckernel_section_layout.h.

◆ CK_PTR

#define CK_PTR	(	model,
		offset
	)	((float *)((char *)(model)->base + (offset)))

Definition at line 522 of file ckernel_section_layout.h.

◆ CK_PTR_BF16

#define CK_PTR_BF16	(	model,
		offset
	)	((uint16_t *)((char *)(model)->base + (offset)))

Definition at line 525 of file ckernel_section_layout.h.

Enumeration Type Documentation

◆ CKFusionFlags

enum CKFusionFlags

Enumerator
CK_FUSE_NONE
CK_FUSE_EMBED_NORM
CK_FUSE_NORM_QKV
CK_FUSE_QKV_ROPE
CK_FUSE_ATTN_PROJ
CK_FUSE_NORM_MLP
CK_FUSE_MLP_GATE_UP
CK_FUSE_MLP_ACT_DOWN
CK_FUSE_ADD_NORM
CK_FUSE_NONE
CK_FUSE_EMBED_NORM
CK_FUSE_NORM_QKV
CK_FUSE_QKV_ROPE
CK_FUSE_ATTN_PROJ
CK_FUSE_NORM_MLP
CK_FUSE_MLP_GATE_UP
CK_FUSE_MLP_ACT_DOWN
CK_FUSE_RESIDUAL_NORM

Definition at line 459 of file ckernel_section_layout.h.

 typedef enum {
     CK_FUSE_NONE            = 0,
     CK_FUSE_EMBED_NORM      = 1 << 0,   /* embedding + first layernorm */
     CK_FUSE_NORM_QKV        = 1 << 1,   /* layernorm + QKV projection */
     CK_FUSE_QKV_ROPE        = 1 << 2,   /* QKV + rotary position encoding */
     CK_FUSE_ATTN_PROJ       = 1 << 3,   /* attention output + projection */
     CK_FUSE_NORM_MLP        = 1 << 4,   /* layernorm + MLP input */
     CK_FUSE_MLP_GATE_UP     = 1 << 5,   /* gate and up projections together */
     CK_FUSE_MLP_ACT_DOWN    = 1 << 6,   /* activation + down projection */
     CK_FUSE_RESIDUAL_NORM   = 1 << 7,   /* residual add + layernorm */
 } CKFusionFlags;

Function Documentation

◆ ck_model_allocate()

int ck_model_allocate	(	CKModel *	model,
		int	hugepage_mode
	)

Allocate the planned memory.

Parameters

hugepage_mode 0=normal, 1=2MB hugepages, 2=1GB hugepages

Returns: 0 on success, -1 on failure

◆ ck_model_free()

void ck_model_free ( CKModel * model )

Free the model (single free, since single allocation).

◆ ck_model_plan()

size_t ck_model_plan	(	CKModel *	model,
		const CKSectionConfig *	configs,
		int	num_sections,
		int	training_enabled,
		uint32_t	fusion_flags
	)

Plan memory layout for complete model. Returns total bytes needed.

◆ ck_section_config_init()

void ck_section_config_init	(	CKSectionConfig *	config,
		size_t	simd_align
	)

Initialize section config with computed alignments.

◆ ck_section_plan()

size_t ck_section_plan	(	CKSection *	section,
		const CKSectionConfig *	config,
		int	training_enabled,
		size_t	base_offset
	)

Plan memory layout for a single section. Returns bytes needed for this section.

Data Structures

Macros

Enumerations

Functions