← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ckernel_section_layout.h File Reference

Section-Based Memory Layout: Header / Body / Footer Pattern. More...

#include <stddef.h>
#include <stdint.h>

Go to the source code of this file.

Data Structures

struct  CKFooterGradOffsets
 
struct  CKFooterOffsets
 
struct  CKHeaderOffsets
 
struct  CKLayerGradOffsets
 
struct  CKLayerOffsets
 
struct  CKLayerOptimizerOffsets
 
struct  CKModel
 
struct  CKSection
 
struct  CKSectionConfig
 

Macros

#define CK_ALIGN_CACHE   64ULL /* CPU cache line (64 bytes) */
 
#define CK_ALIGN_HUGE_1GB   (1ULL << 30) /* 1GB hugepage (NUMA optimal) */
 
#define CK_ALIGN_HUGE_2MB   (2ULL << 20) /* 2MB hugepage */
 
#define CK_ALIGN_SIMD   64ULL /* AVX-512 register (64 bytes) */
 
#define CK_FOOTER(model, s)    (&(model)->sections[s].footer)
 
#define CK_HEADER(model, s)    (&(model)->sections[s].header)
 
#define CK_LAYER(model, s, l)    (&(model)->sections[s].layers[l])
 
#define CK_LAYER_GRAD(model, s, l)    (&(model)->sections[s].layer_grads[l])
 
#define CK_PTR(model, offset)    ((float*)((char*)(model)->base + (offset)))
 
#define CK_PTR_BF16(model, offset)    ((uint16_t*)((char*)(model)->base + (offset)))
 

Enumerations

enum  CKFusionFlags {
  CK_FUSE_NONE = 0 , CK_FUSE_EMBED_NORM = 1 << 0 , CK_FUSE_NORM_QKV = 1 << 1 , CK_FUSE_QKV_ROPE = 1 << 2 ,
  CK_FUSE_ATTN_PROJ = 1 << 3 , CK_FUSE_NORM_MLP = 1 << 4 , CK_FUSE_MLP_GATE_UP = 1 << 5 , CK_FUSE_MLP_ACT_DOWN = 1 << 6 ,
  CK_FUSE_ADD_NORM = 1 << 7 , CK_FUSE_NONE = 0 , CK_FUSE_EMBED_NORM = 1 << 0 , CK_FUSE_NORM_QKV = 1 << 1 ,
  CK_FUSE_QKV_ROPE = 1 << 2 , CK_FUSE_ATTN_PROJ = 1 << 3 , CK_FUSE_NORM_MLP = 1 << 4 , CK_FUSE_MLP_GATE_UP = 1 << 5 ,
  CK_FUSE_MLP_ACT_DOWN = 1 << 6 , CK_FUSE_RESIDUAL_NORM = 1 << 7
}
 

Functions

int ck_model_allocate (CKModel *model, int hugepage_mode)
 
void ck_model_free (CKModel *model)
 
size_t ck_model_plan (CKModel *model, const CKSectionConfig *configs, int num_sections, int training_enabled, uint32_t fusion_flags)
 
void ck_section_config_init (CKSectionConfig *config, size_t simd_align)
 
size_t ck_section_plan (CKSection *section, const CKSectionConfig *config, int training_enabled, size_t base_offset)
 

Detailed Description

Section-Based Memory Layout: Header / Body / Footer Pattern.


PHILOSOPHY: WHY NO MEMORY REUSE?

  1. CPU PREFETCH NEEDS PREDICTABLE PATTERNS
    • Prefetcher learns: "he's streaming forward through memory"
    • Reuse breaks this: same address, different data, different time
    • Result: prefetcher gives up, cache misses explode
  2. TRAINING NEEDS ALL ACTIVATIONS
    • Forward: compute activations
    • Backward: need EVERY activation to compute gradients
    • Can't reuse what you still need!
  3. MEMORY IS CHEAP, BANDWIDTH IS EXPENSIVE
    • DDR5: ~$3/GB (1TB = $3000, trivial for training cluster)
    • Bandwidth: 100-400 GB/s per socket (the real bottleneck)
    • Optimize for streaming, not for saving bytes
  4. SINGLE ALLOCATION = ZERO RUNTIME MALLOC
    • No fragmentation, no free(), no double-free bugs
    • Hugepage-backed: 1GB pages for NUMA optimization
    • One base pointer + offsets = maximum simplicity

SECTION ARCHITECTURE

Multi-modal models have multiple SECTIONS. Each section is a complete encoder/decoder/vision/audio module with its own dimensions.

┌─────────────────────────────────────────────────────────────────┐
│                        SINGLE ALLOCATION                        │
├─────────────────────────────────────────────────────────────────┤
│  SECTION 0: Vision Encoder                                      │
│  ├── HEADER: patch_embed, pos_embed                             │
│  ├── BODY: layer[0..N] (weights + activations interleaved)      │
│  └── FOOTER: final_norm, bridge_to_text                         │
├─────────────────────────────────────────────────────────────────┤
│  SECTION 1: Text Decoder                                        │
│  ├── HEADER: token_embed, pos_embed                             │
│  ├── BODY: layer[0..N] (weights + activations interleaved)      │
│  └── FOOTER: final_norm, lm_head, logits                        │
├─────────────────────────────────────────────────────────────────┤
│  SECTION 2: Audio Encoder (optional)                            │
│  ├── HEADER: mel_embed                                          │
│  ├── BODY: layer[0..N]                                          │
│  └── FOOTER: bridge_to_text                                     │
├─────────────────────────────────────────────────────────────────┤
│  GRADIENTS (if training)                                        │
│  └── Same layout: section[0].grads, section[1].grads, ...       │
├─────────────────────────────────────────────────────────────────┤
│  OPTIMIZER STATE (if training with Adam)                        │
│  └── m[], v[] for each weight                                   │
└─────────────────────────────────────────────────────────────────┘


EXECUTION ORDER LAYOUT

Within each layer, memory is laid out in the ORDER operations execute:

weight → activation → weight → activation → ...

This is critical for CPU cache efficiency. The CPU streams forward, prefetching the next cache line while processing the current one.

Definition in file ckernel_section_layout.h.

Macro Definition Documentation

◆ CK_ALIGN_CACHE

#define CK_ALIGN_CACHE   64ULL /* CPU cache line (64 bytes) */

Definition at line 88 of file ckernel_section_layout.h.

◆ CK_ALIGN_HUGE_1GB

#define CK_ALIGN_HUGE_1GB   (1ULL << 30) /* 1GB hugepage (NUMA optimal) */

Definition at line 91 of file ckernel_section_layout.h.

◆ CK_ALIGN_HUGE_2MB

#define CK_ALIGN_HUGE_2MB   (2ULL << 20) /* 2MB hugepage */

Definition at line 90 of file ckernel_section_layout.h.

◆ CK_ALIGN_SIMD

#define CK_ALIGN_SIMD   64ULL /* AVX-512 register (64 bytes) */

Definition at line 89 of file ckernel_section_layout.h.

◆ CK_FOOTER

#define CK_FOOTER (   model,
  s 
)     (&(model)->sections[s].footer)

Definition at line 541 of file ckernel_section_layout.h.

◆ CK_HEADER

#define CK_HEADER (   model,
  s 
)     (&(model)->sections[s].header)

Definition at line 537 of file ckernel_section_layout.h.

◆ CK_LAYER

#define CK_LAYER (   model,
  s,
  l 
)     (&(model)->sections[s].layers[l])

Definition at line 529 of file ckernel_section_layout.h.

◆ CK_LAYER_GRAD

#define CK_LAYER_GRAD (   model,
  s,
  l 
)     (&(model)->sections[s].layer_grads[l])

Definition at line 533 of file ckernel_section_layout.h.

◆ CK_PTR

#define CK_PTR (   model,
  offset 
)     ((float*)((char*)(model)->base + (offset)))

Definition at line 522 of file ckernel_section_layout.h.

◆ CK_PTR_BF16

#define CK_PTR_BF16 (   model,
  offset 
)     ((uint16_t*)((char*)(model)->base + (offset)))

Definition at line 525 of file ckernel_section_layout.h.

Enumeration Type Documentation

◆ CKFusionFlags

Enumerator
CK_FUSE_NONE 
CK_FUSE_EMBED_NORM 
CK_FUSE_NORM_QKV 
CK_FUSE_QKV_ROPE 
CK_FUSE_ATTN_PROJ 
CK_FUSE_NORM_MLP 
CK_FUSE_MLP_GATE_UP 
CK_FUSE_MLP_ACT_DOWN 
CK_FUSE_ADD_NORM 
CK_FUSE_NONE 
CK_FUSE_EMBED_NORM 
CK_FUSE_NORM_QKV 
CK_FUSE_QKV_ROPE 
CK_FUSE_ATTN_PROJ 
CK_FUSE_NORM_MLP 
CK_FUSE_MLP_GATE_UP 
CK_FUSE_MLP_ACT_DOWN 
CK_FUSE_RESIDUAL_NORM 

Definition at line 459 of file ckernel_section_layout.h.

459 typedef enum {
460  CK_FUSE_NONE = 0,
461  CK_FUSE_EMBED_NORM = 1 << 0, /* embedding + first layernorm */
462  CK_FUSE_NORM_QKV = 1 << 1, /* layernorm + QKV projection */
463  CK_FUSE_QKV_ROPE = 1 << 2, /* QKV + rotary position encoding */
464  CK_FUSE_ATTN_PROJ = 1 << 3, /* attention output + projection */
465  CK_FUSE_NORM_MLP = 1 << 4, /* layernorm + MLP input */
466  CK_FUSE_MLP_GATE_UP = 1 << 5, /* gate and up projections together */
467  CK_FUSE_MLP_ACT_DOWN = 1 << 6, /* activation + down projection */
468  CK_FUSE_RESIDUAL_NORM = 1 << 7, /* residual add + layernorm */
469 } CKFusionFlags;
@ CK_FUSE_NORM_QKV
@ CK_FUSE_ATTN_PROJ
@ CK_FUSE_EMBED_NORM
@ CK_FUSE_RESIDUAL_NORM
@ CK_FUSE_MLP_ACT_DOWN
@ CK_FUSE_MLP_GATE_UP
@ CK_FUSE_QKV_ROPE
@ CK_FUSE_NORM_MLP

Function Documentation

◆ ck_model_allocate()

int ck_model_allocate ( CKModel *model,
int  hugepage_mode 
)

Allocate the planned memory.

Parameters
hugepage_mode: 0=normal, 1=2MB hugepages, 2=1GB hugepages
Returns
0 on success, -1 on failure

◆ ck_model_free()

void ck_model_free ( CKModel *model)

Free the model (single free, since single allocation).

◆ ck_model_plan()

size_t ck_model_plan ( CKModel *model,
const CKSectionConfig *configs,
int  num_sections,
int  training_enabled,
uint32_t  fusion_flags 
)

Plan memory layout for complete model. Returns total bytes needed.

◆ ck_section_config_init()

void ck_section_config_init ( CKSectionConfig *config,
size_t  simd_align 
)

Initialize section config with computed alignments.

◆ ck_section_plan()

size_t ck_section_plan ( CKSection *section,
const CKSectionConfig *config,
int  training_enabled,
size_t  base_offset 
)

Plan memory layout for a single section. Returns bytes needed for this section.