/**
 * @file ckernel_memory_layout.h
 * @brief Single-Arena Memory Layout for CPU-Optimized Inference & Training
 *
 * PHILOSOPHY:
 * -----------
 * The CPU doesn't care if data is a weight or activation - it just needs
 * sequential memory access. This design:
 *
 * 1. ONE contiguous allocation (hugepage-backed)
 * 2. ALL offsets from single base pointer
 * 3. Weights and activations INTERLEAVED in execution order
 * 4. Fusion = same offset (skip write, read directly)
 * 5. No fusion = stride over (memory allocated but unused)
 *
 * BENEFITS:
 * - No malloc/free at runtime = no corruption, no double-free
 * - Prefetch layer N+1 while computing layer N (different DRAM bank)
 * - NUMA-aware: 1GB hugepages map to different memory channels
 * - Offset arithmetic only, no pointer chasing
 *
 * FUSION EXAMPLE:
 * ---------------
 * Without fusion:
 *   rmsnorm writes to ln1_output (offset 1000)
 *   qkv_project reads from ln1_output (offset 1000)
 *
 * With fusion (rmsnorm + qkv fused):
 *   fused kernel reads from input, writes directly to q/k/v
 *   ln1_output memory (offset 1000) is SKIPPED but still allocated
 *   CPU prefetch streams over it, no penalty
 */

34 #ifndef CKERNEL_MEMORY_LAYOUT_H
35 #define CKERNEL_MEMORY_LAYOUT_H
36 
37 #include <stddef.h>
38 #include <stdint.h>
39 
40 #ifdef __cplusplus
41 extern "C" {
42 #endif
43 
44 /* ============================================================================
45  * ALIGNMENT CONSTANTS
46  * ============================================================================ */
47 
48 #define CK_CACHE_LINE 64ULL /* CPU cache line */
49 #define CK_HUGE_2MB (2ULL << 20) /* 2MB hugepage */
50 #define CK_HUGE_1GB (1ULL << 30) /* 1GB hugepage (NUMA optimal) */
51 #define CK_SIMD_ALIGN 64ULL /* AVX-512 alignment */
52 
53 /* ============================================================================
54  * SECTION CONFIG (each section = encoder/decoder/vision/audio)
55  *
56  * Multi-modal models have multiple sections. Each section has its own config
57  * but shares the SAME memory arena. Bridges connect sections.
58  * ============================================================================ */
59 
60 typedef struct {
61  /* Model dimensions for this section */
62  int embed_dim; /* hidden_size */
63  int num_heads; /* attention heads */
64  int num_kv_heads; /* key-value heads (GQA) */
65  int head_dim; /* embed_dim / num_heads */
66  int intermediate_dim; /* MLP hidden size */
67  int num_layers; /* transformer layers in this section */
68  int vocab_size; /* only for sections with embeddings */
69  int max_seq_len; /* context window */
70 
71  /* Computed alignments */
72  int aligned_embed; /* embed_dim aligned to SIMD */
73  int aligned_head; /* head_dim aligned to SIMD */
74  int aligned_intermediate; /* intermediate aligned to SIMD */
76 
/* ============================================================================
 * LAYER OFFSETS - Weights and Activations INTERLEAVED in execution order
 *
 * Pattern: weight -> activation -> weight -> activation ...
 * This is the KEY insight: CPU streams through memory sequentially.
 * Whether fused or not, data is laid out in the order it's needed.
 *
 * Every member is a byte offset from CKModel.base (see CK_PTR).
 * ============================================================================ */

typedef struct {
    /* === PRE-ATTENTION NORM === */
    size_t ln1_gamma;     /* [embed] weight */
    size_t ln1_beta;      /* [embed] weight (optional, some use RMSNorm) */
    size_t ln1_output;    /* [tokens, embed] activation */

    /* === QKV PROJECTION === */
    size_t wq;            /* [embed, num_heads * head_dim] weight */
    size_t wk;            /* [embed, num_kv_heads * head_dim] weight */
    size_t wv;            /* [embed, num_kv_heads * head_dim] weight */
    size_t bq;            /* [num_heads * head_dim] bias (optional) */
    size_t bk;            /* [num_kv_heads * head_dim] bias (optional) */
    size_t bv;            /* [num_kv_heads * head_dim] bias (optional) */
    size_t q;             /* [tokens, num_heads, head_dim] activation */
    size_t k;             /* [tokens, num_kv_heads, head_dim] activation */
    size_t v;             /* [tokens, num_kv_heads, head_dim] activation */

    /* === ROPE (if applicable) === */
    size_t q_rope;        /* [tokens, num_heads, head_dim] activation */
    size_t k_rope;        /* [tokens, num_kv_heads, head_dim] activation */

    /* === ATTENTION === */
    size_t attn_scores;   /* [num_heads, tokens, tokens] activation */
    size_t attn_probs;    /* [num_heads, tokens, tokens] activation (after softmax) */
    size_t attn_output;   /* [tokens, num_heads, head_dim] activation */

    /* === OUTPUT PROJECTION === */
    size_t wo;            /* [num_heads * head_dim, embed] weight */
    size_t bo;            /* [embed] bias (optional) */
    size_t proj_output;   /* [tokens, embed] activation */

    /* === RESIDUAL 1 === */
    size_t residual1;     /* [tokens, embed] activation */

    /* === POST-ATTENTION NORM === */
    size_t ln2_gamma;     /* [embed] weight */
    size_t ln2_beta;      /* [embed] weight (optional) */
    size_t ln2_output;    /* [tokens, embed] activation */

    /* === MLP === */
    size_t mlp_gate_w;    /* [embed, intermediate] weight (gated MLP) */
    size_t mlp_up_w;      /* [embed, intermediate] weight */
    size_t mlp_down_w;    /* [intermediate, embed] weight */
    size_t mlp_gate_b;    /* [intermediate] bias (optional) */
    size_t mlp_up_b;      /* [intermediate] bias (optional) */
    size_t mlp_down_b;    /* [embed] bias (optional) */
    size_t mlp_gate_out;  /* [tokens, intermediate] activation */
    size_t mlp_up_out;    /* [tokens, intermediate] activation */
    size_t mlp_act_out;   /* [tokens, intermediate] activation (after SiLU/GELU) */
    size_t mlp_down_out;  /* [tokens, embed] activation */

    /* === RESIDUAL 2 (layer output) === */
    size_t residual2;     /* [tokens, embed] activation = layer output */

    /* === KV CACHE (decode mode) === */
    size_t k_cache;       /* [max_seq, num_kv_heads, head_dim] */
    size_t v_cache;       /* [max_seq, num_kv_heads, head_dim] */

} CKLayerOffsets; /* NOTE(review): terminator restored — lost in doc extraction */

/* ============================================================================
 * GRADIENT OFFSETS - Same pattern, follows forward offsets
 *
 * For training: gradients laid out AFTER forward activations.
 * Same interleaved pattern: d_weight, d_activation, d_weight, d_activation...
 *
 * Every member is a byte offset from CKModel.base (see CK_PTR).
 * ============================================================================ */

typedef struct {
    /* Gradients mirror forward structure */
    size_t d_ln1_gamma;
    size_t d_ln1_beta;
    size_t d_ln1_output;

    size_t d_wq, d_wk, d_wv;
    size_t d_bq, d_bk, d_bv;
    size_t d_q, d_k, d_v;

    /* NOTE(review): the attention/projection gradient members were dropped by
     * the doc extraction; restored to mirror CKLayerOffsets — verify the exact
     * names against the original header before relying on this layout. */
    size_t d_attn_scores, d_attn_probs, d_attn_output;

    size_t d_wo, d_bo;
    size_t d_proj_output; /* NOTE(review): restored — confirm against original */

    size_t d_ln2_gamma, d_ln2_beta;
    size_t d_ln2_output;

    size_t d_mlp_gate_w, d_mlp_up_w, d_mlp_down_w;
    size_t d_mlp_gate_out, d_mlp_up_out, d_mlp_down_out;

    /* Adam optimizer state (m and v) - also in same arena */
    size_t m_ln1_gamma, v_ln1_gamma;
    size_t m_wq, v_wq;
    size_t m_wk, v_wk;
    size_t m_wv, v_wv;
    size_t m_wo, v_wo;
    size_t m_ln2_gamma, v_ln2_gamma;
    size_t m_mlp_gate, v_mlp_gate;
    size_t m_mlp_up, v_mlp_up;
    size_t m_mlp_down, v_mlp_down;

} CKLayerGradOffsets; /* NOTE(review): terminator restored — lost in doc extraction */

187 /* ============================================================================
188  * SECTION - A complete encoder/decoder/vision/audio module
189  *
190  * Each section has:
191  * - Its own config (embed_dim, heads, etc.)
192  * - Embedding layer (optional - only first section usually)
193  * - N transformer layers
194  * - Bridge to next section (optional)
195  * ============================================================================ */
196 
197 typedef struct {
199 
200  /* === EMBEDDINGS (optional) === */
201  size_t token_embed; /* [vocab, embed] weight */
202  size_t pos_embed; /* [max_seq, embed] weight (if absolute) */
203  size_t embed_output; /* [tokens, embed] activation */
204 
205  /* === TRANSFORMER LAYERS === */
207  CKLayerOffsets *layers; /* Array of per-layer offsets */
208  CKLayerGradOffsets *grads; /* Array of per-layer gradient offsets (training) */
209 
210  /* === FINAL NORM === */
211  size_t final_ln_gamma; /* [embed] weight */
212  size_t final_ln_beta; /* [embed] weight */
213  size_t final_ln_output; /* [tokens, embed] activation */
214 
215  /* === OUTPUT HEAD (optional) === */
216  size_t lm_head; /* [embed, vocab] weight (may tie with token_embed) */
217  size_t logits; /* [tokens, vocab] activation */
218 
219  /* === BRIDGE TO NEXT SECTION (optional) === */
220  size_t bridge_proj_w; /* [this_embed, next_embed] weight */
221  size_t bridge_proj_b; /* [next_embed] bias */
222  size_t bridge_output; /* [tokens, next_embed] activation */
223 
224  /* === SECTION BOUNDARIES (for NUMA planning) === */
225  size_t section_start; /* Byte offset where this section starts */
226  size_t section_end; /* Byte offset where this section ends */
227 
228 } CKSection;
229 
230 /* ============================================================================
231  * MODEL - The complete multi-section model
232  *
233  * ONE allocation. ONE base pointer. ALL offsets relative to base.
234  * ============================================================================ */
235 
236 typedef struct {
237  /* === MEMORY === */
238  void *base; /* THE one and only allocation */
239  size_t total_bytes; /* Total allocated size */
240  size_t weight_bytes; /* Bytes used for weights */
241  size_t activation_bytes; /* Bytes used for activations */
242  size_t grad_bytes; /* Bytes used for gradients (0 if inference) */
243 
244  /* === SECTIONS === */
247 
248  /* === GLOBAL BUFFERS (shared across sections) === */
249  size_t rope_cos; /* [max_seq, head_dim] RoPE cosine */
250  size_t rope_sin; /* [max_seq, head_dim] RoPE sine */
251  size_t causal_mask; /* [max_seq, max_seq] attention mask */
252 
253  /* === FUSION FLAGS === */
254  uint32_t fusion_flags; /* Bitmask of enabled fusions */
255 
256  /* === RUNTIME STATE (not offsets, actual values) === */
257  int current_seq_len; /* Tokens processed so far */
258  int batch_size; /* Always 1 for now */
259 
260 } CKModel;
261 
/* ============================================================================
 * FUSION FLAGS
 *
 * When a fusion is enabled, the intermediate activation is SKIPPED.
 * Memory is still allocated (for consistency), but kernel reads/writes bypass it.
 * ============================================================================ */

typedef enum {
    CK_FUSE_NONE         = 0,      /* no fusion (restored — confirmed by index) */
    CK_FUSE_EMBED_NORM   = 1 << 0, /* embed + first layernorm */
    CK_FUSE_NORM_QKV     = 1 << 1, /* layernorm + qkv projection */
    CK_FUSE_QKV_ROPE     = 1 << 2, /* qkv projection + rope */
    CK_FUSE_ATTN_PROJ    = 1 << 3, /* attention + output projection */
    CK_FUSE_NORM_MLP     = 1 << 4, /* layernorm + mlp gate/up */
    CK_FUSE_MLP_GATE_UP  = 1 << 5, /* gate and up projections */
    CK_FUSE_MLP_ACT_DOWN = 1 << 6, /* activation + down projection */
    CK_FUSE_ADD_NORM     = 1 << 7, /* residual add + layernorm */
} CKFusionFlags;

281 /* ============================================================================
282  * MEMORY PLANNING API
283  *
284  * Two-pass allocation:
285  * 1. Dry run: compute all offsets, calculate total size
286  * 2. Allocate: single hugepage-backed mmap
287  * 3. Return model with all offsets filled in
288  * ============================================================================ */
289 
290 /**
291  * Plan memory layout for a model.
292  *
293  * @param sections Array of section configs
294  * @param num_sections Number of sections
295  * @param mode 0=inference, 1=training (includes gradients)
296  * @param fusion_flags Which operations to fuse
297  * @param out_model Output: model with all offsets computed
298  * @return Total bytes needed (0 on error)
299  */
300 size_t ck_memory_plan(const CKSectionConfig *sections,
301  int num_sections,
302  int mode,
303  uint32_t fusion_flags,
304  CKModel *out_model);
305 
306 /**
307  * Allocate the planned memory.
308  *
309  * @param model Model with planned offsets
310  * @param use_hugepages 0=regular malloc, 1=2MB hugepages, 2=1GB hugepages
311  * @return 0 on success, -1 on failure
312  */
313 int ck_memory_allocate(CKModel *model, int use_hugepages);
314 
315 /**
316  * Free the model memory.
317  *
318  * @param model Model to free
319  */
320 void ck_memory_free(CKModel *model);
321 
/* ============================================================================
 * ACCESSOR MACROS
 *
 * All access is: (float*)(model->base + offset)
 * These macros make it cleaner. The (char*) cast makes the offset arithmetic
 * byte-granular (void* arithmetic is not standard C).
 * ============================================================================ */

#define CK_PTR(model, offset) ((float*)((char*)(model)->base + (offset)))
#define CK_LAYER(model, section, layer) (&(model)->sections[section].layers[layer])
#define CK_GRAD(model, section, layer) (&(model)->sections[section].grads[layer])

/* Example usage:
 *
 *   CKModel model;
 *   ck_memory_plan(&config, 1, 0, CK_FUSE_NORM_QKV, &model);
 *   ck_memory_allocate(&model, 1); // 2MB hugepages
 *
 *   // Access layer 5 Q weights:
 *   float *wq = CK_PTR(&model, CK_LAYER(&model, 0, 5)->wq);
 *
 *   // Access layer 5 Q activation:
 *   float *q = CK_PTR(&model, CK_LAYER(&model, 0, 5)->q);
 *
 *   // With fusion (CK_FUSE_NORM_QKV), the kernel skips ln1_output
 *   // and writes directly to q/k/v. The ln1_output memory exists
 *   // but is never touched - CPU prefetch streams over it.
 */

350 #ifdef __cplusplus
351 }
352 #endif
353 
354 #endif /* CKERNEL_MEMORY_LAYOUT_H */