CK
← Back to C-Kernel-Engine Docs
Doxygen Source Documentation
ckernel_memory_layout.h
Go to the documentation of this file.
1
/**
2
* @file ckernel_memory_layout.h
3
* @brief Single-Arena Memory Layout for CPU-Optimized Inference & Training
4
*
5
* PHILOSOPHY:
6
* -----------
7
* The CPU doesn't care if data is a weight or activation - it just needs
8
* sequential memory access. This design:
9
*
10
* 1. ONE contiguous allocation (hugepage-backed)
11
* 2. ALL offsets from single base pointer
12
* 3. Weights and activations INTERLEAVED in execution order
13
* 4. Fusion = same offset (skip write, read directly)
14
* 5. No fusion = stride over (memory allocated but unused)
15
*
16
* BENEFITS:
17
* - No malloc/free at runtime = no corruption, no double-free
18
* - Prefetch layer N+1 while computing layer N (different DRAM bank)
19
* - NUMA-aware: 1GB hugepages map to different memory channels
20
* - Offset arithmetic only, no pointer chasing
21
*
22
* FUSION EXAMPLE:
23
* ---------------
24
* Without fusion:
25
* rmsnorm writes to ln1_output (offset 1000)
26
* qkv_project reads from ln1_output (offset 1000)
27
*
28
* With fusion (rmsnorm + qkv fused):
29
* fused kernel reads from input, writes directly to q/k/v
30
* ln1_output memory (offset 1000) is SKIPPED but still allocated
31
* CPU prefetch streams over it, no penalty
32
*/
33
34
#ifndef CKERNEL_MEMORY_LAYOUT_H
#define CKERNEL_MEMORY_LAYOUT_H

#include <stddef.h>   /* size_t */
#include <stdint.h>   /* uint32_t */

#ifdef __cplusplus
extern "C" {
#endif
43
44
/* ============================================================================
 * ALIGNMENT CONSTANTS
 *
 * All values are unsigned long long so they can be used directly in size_t
 * arithmetic without implicit-conversion surprises.
 * ============================================================================ */

#define CK_CACHE_LINE 64ULL          /* CPU cache line size in bytes */
#define CK_HUGE_2MB   (2ULL << 20)   /* 2 MiB hugepage */
#define CK_HUGE_1GB   (1ULL << 30)   /* 1 GiB hugepage (NUMA optimal) */
#define CK_SIMD_ALIGN 64ULL          /* AVX-512 (64-byte) vector alignment */
53
/* ============================================================================
 * SECTION CONFIG (each section = encoder/decoder/vision/audio)
 *
 * Multi-modal models have multiple sections. Each section has its own config
 * but shares the SAME memory arena. Bridges connect sections.
 * ============================================================================ */

typedef struct {
    /* Model dimensions for this section */
    int embed_dim;            /* hidden_size */
    int num_heads;            /* attention heads */
    int num_kv_heads;         /* key-value heads (GQA) */
    int head_dim;             /* embed_dim / num_heads */
    int intermediate_dim;     /* MLP hidden size */
    int num_layers;           /* transformer layers in this section */
    int vocab_size;           /* only for sections with embeddings */
    int max_seq_len;          /* context window */

    /* Computed alignments (filled in by the planner; padded up to
     * CK_SIMD_ALIGN multiples so SIMD loads never straddle boundaries) */
    int aligned_embed;        /* embed_dim aligned to SIMD */
    int aligned_head;         /* head_dim aligned to SIMD */
    int aligned_intermediate; /* intermediate_dim aligned to SIMD */
} CKSectionConfig;
77
/* ============================================================================
 * LAYER OFFSETS - Weights and Activations INTERLEAVED in execution order
 *
 * Pattern: weight -> activation -> weight -> activation ...
 * This is the KEY insight: CPU streams through memory sequentially.
 * Whether fused or not, data is laid out in the order it's needed.
 *
 * Every field is a BYTE offset from CKModel.base, not a pointer.
 * Shapes in comments are logical layouts, e.g. [tokens, embed].
 * ============================================================================ */

typedef struct {
    /* === PRE-ATTENTION NORM === */
    size_t ln1_gamma;     /* [embed] weight */
    size_t ln1_beta;      /* [embed] weight (optional, some use RMSNorm) */
    size_t ln1_output;    /* [tokens, embed] activation */

    /* === QKV PROJECTION === */
    size_t wq;            /* [embed, num_heads * head_dim] weight */
    size_t wk;            /* [embed, num_kv_heads * head_dim] weight */
    size_t wv;            /* [embed, num_kv_heads * head_dim] weight */
    size_t bq;            /* [num_heads * head_dim] bias (optional) */
    size_t bk;            /* [num_kv_heads * head_dim] bias (optional) */
    size_t bv;            /* [num_kv_heads * head_dim] bias (optional) */
    size_t q;             /* [tokens, num_heads, head_dim] activation */
    size_t k;             /* [tokens, num_kv_heads, head_dim] activation */
    size_t v;             /* [tokens, num_kv_heads, head_dim] activation */

    /* === ROPE (if applicable) === */
    size_t q_rope;        /* [tokens, num_heads, head_dim] activation */
    size_t k_rope;        /* [tokens, num_kv_heads, head_dim] activation */

    /* === ATTENTION === */
    size_t attn_scores;   /* [num_heads, tokens, tokens] activation */
    size_t attn_probs;    /* [num_heads, tokens, tokens] activation (after softmax) */
    size_t attn_output;   /* [tokens, num_heads, head_dim] activation */

    /* === OUTPUT PROJECTION === */
    size_t wo;            /* [num_heads * head_dim, embed] weight */
    size_t bo;            /* [embed] bias (optional) */
    size_t proj_output;   /* [tokens, embed] activation */

    /* === RESIDUAL 1 === */
    size_t residual1;     /* [tokens, embed] activation */

    /* === POST-ATTENTION NORM === */
    size_t ln2_gamma;     /* [embed] weight */
    size_t ln2_beta;      /* [embed] weight (optional) */
    size_t ln2_output;    /* [tokens, embed] activation */

    /* === MLP === */
    size_t mlp_gate_w;    /* [embed, intermediate] weight (gated MLP) */
    size_t mlp_up_w;      /* [embed, intermediate] weight */
    size_t mlp_down_w;    /* [intermediate, embed] weight */
    size_t mlp_gate_b;    /* [intermediate] bias (optional) */
    size_t mlp_up_b;      /* [intermediate] bias (optional) */
    size_t mlp_down_b;    /* [embed] bias (optional) */
    size_t mlp_gate_out;  /* [tokens, intermediate] activation */
    size_t mlp_up_out;    /* [tokens, intermediate] activation */
    size_t mlp_act_out;   /* [tokens, intermediate] activation (after SiLU/GELU) */
    size_t mlp_down_out;  /* [tokens, embed] activation */

    /* === RESIDUAL 2 (layer output) === */
    size_t residual2;     /* [tokens, embed] activation = layer output */

    /* === KV CACHE (decode mode) === */
    size_t k_cache;       /* [max_seq, num_kv_heads, head_dim] */
    size_t v_cache;       /* [max_seq, num_kv_heads, head_dim] */

} CKLayerOffsets;
145
/* ============================================================================
 * GRADIENT OFFSETS - Same pattern, follows forward offsets
 *
 * For training: gradients laid out AFTER forward activations.
 * Same interleaved pattern: d_weight, d_activation, d_weight, d_activation...
 *
 * All fields are byte offsets from CKModel.base, like CKLayerOffsets.
 * ============================================================================ */

typedef struct {
    /* Gradients mirror forward structure.
     * NOTE(review): d_attn_probs, d_mlp_act_out, d_q_rope/d_k_rope and
     * residual gradients have no slots here — presumably the backward
     * kernels recompute them or reuse adjacent buffers; confirm against
     * the backward-pass implementation. */
    size_t d_ln1_gamma;
    size_t d_ln1_beta;
    size_t d_ln1_output;

    size_t d_wq, d_wk, d_wv;
    size_t d_bq, d_bk, d_bv;
    size_t d_q, d_k, d_v;

    size_t d_attn_scores;
    size_t d_attn_output;

    size_t d_wo, d_bo;
    size_t d_proj_output;

    size_t d_ln2_gamma, d_ln2_beta;
    size_t d_ln2_output;

    size_t d_mlp_gate_w, d_mlp_up_w, d_mlp_down_w;
    size_t d_mlp_gate_out, d_mlp_up_out, d_mlp_down_out;

    /* Adam optimizer state (m = first moment, v = second moment) — kept in
     * the same arena as everything else.
     * NOTE(review): state exists only for the major weight matrices and the
     * norm gammas; biases and norm betas have no optimizer slots — confirm
     * that is intentional (e.g. biases excluded from Adam). */
    size_t m_ln1_gamma, v_ln1_gamma;
    size_t m_wq, v_wq;
    size_t m_wk, v_wk;
    size_t m_wv, v_wv;
    size_t m_wo, v_wo;
    size_t m_ln2_gamma, v_ln2_gamma;
    size_t m_mlp_gate, v_mlp_gate;
    size_t m_mlp_up, v_mlp_up;
    size_t m_mlp_down, v_mlp_down;

} CKLayerGradOffsets;
187
/* ============================================================================
 * SECTION - A complete encoder/decoder/vision/audio module
 *
 * Each section has:
 *   - Its own config (embed_dim, heads, etc.)
 *   - Embedding layer (optional - only first section usually)
 *   - N transformer layers
 *   - Bridge to next section (optional)
 *
 * size_t fields are byte offsets from CKModel.base.
 * ============================================================================ */

typedef struct {
    CKSectionConfig config;   /* dimensions for this section */

    /* === EMBEDDINGS (optional) === */
    size_t token_embed;       /* [vocab, embed] weight */
    size_t pos_embed;         /* [max_seq, embed] weight (if absolute) */
    size_t embed_output;      /* [tokens, embed] activation */

    /* === TRANSFORMER LAYERS === */
    int num_layers;           /* length of layers[] / grads[] */
    CKLayerOffsets *layers;   /* array of per-layer offsets */
    CKLayerGradOffsets *grads;/* array of per-layer gradient offsets (training) */

    /* === FINAL NORM === */
    size_t final_ln_gamma;    /* [embed] weight */
    size_t final_ln_beta;     /* [embed] weight */
    size_t final_ln_output;   /* [tokens, embed] activation */

    /* === OUTPUT HEAD (optional) === */
    size_t lm_head;           /* [embed, vocab] weight (may tie with token_embed) */
    size_t logits;            /* [tokens, vocab] activation */

    /* === BRIDGE TO NEXT SECTION (optional) === */
    size_t bridge_proj_w;     /* [this_embed, next_embed] weight */
    size_t bridge_proj_b;     /* [next_embed] bias */
    size_t bridge_output;     /* [tokens, next_embed] activation */

    /* === SECTION BOUNDARIES (for NUMA planning) === */
    size_t section_start;     /* byte offset where this section starts */
    size_t section_end;       /* byte offset where this section ends */

} CKSection;
230
/* ============================================================================
 * MODEL - The complete multi-section model
 *
 * ONE allocation. ONE base pointer. ALL offsets relative to base.
 * ============================================================================ */

typedef struct {
    /* === MEMORY === */
    void *base;               /* THE one and only allocation */
    size_t total_bytes;       /* total allocated size */
    size_t weight_bytes;      /* bytes used for weights */
    size_t activation_bytes;  /* bytes used for activations */
    size_t grad_bytes;        /* bytes used for gradients (0 if inference) */

    /* === SECTIONS === */
    int num_sections;         /* length of sections[] */
    CKSection *sections;

    /* === GLOBAL BUFFERS (shared across sections; byte offsets from base) === */
    size_t rope_cos;          /* [max_seq, head_dim] RoPE cosine table */
    size_t rope_sin;          /* [max_seq, head_dim] RoPE sine table */
    size_t causal_mask;       /* [max_seq, max_seq] attention mask */

    /* === FUSION FLAGS === */
    uint32_t fusion_flags;    /* bitmask of CKFusionFlags values */

    /* === RUNTIME STATE (not offsets, actual values) === */
    int current_seq_len;      /* tokens processed so far */
    int batch_size;           /* always 1 for now */

} CKModel;
262
/* ============================================================================
 * FUSION FLAGS
 *
 * When a fusion is enabled, the intermediate activation is SKIPPED.
 * Memory is still allocated (for consistency), but kernel reads/writes
 * bypass it. Flags are single bits; combine them with bitwise OR into
 * CKModel.fusion_flags.
 * ============================================================================ */

typedef enum {
    CK_FUSE_NONE         = 0,
    CK_FUSE_EMBED_NORM   = 1 << 0,  /* embed + first layernorm */
    CK_FUSE_NORM_QKV     = 1 << 1,  /* layernorm + qkv projection */
    CK_FUSE_QKV_ROPE     = 1 << 2,  /* qkv projection + rope */
    CK_FUSE_ATTN_PROJ    = 1 << 3,  /* attention + output projection */
    CK_FUSE_NORM_MLP     = 1 << 4,  /* layernorm + mlp gate/up */
    CK_FUSE_MLP_GATE_UP  = 1 << 5,  /* gate and up projections */
    CK_FUSE_MLP_ACT_DOWN = 1 << 6,  /* activation + down projection */
    CK_FUSE_ADD_NORM     = 1 << 7,  /* residual add + layernorm */
} CKFusionFlags;
281
/* ============================================================================
 * MEMORY PLANNING API
 *
 * Two-pass allocation:
 *   1. Dry run: compute all offsets, calculate total size
 *   2. Allocate: single hugepage-backed mmap
 *   3. Return model with all offsets filled in
 * ============================================================================ */

/**
 * Plan memory layout for a model.
 *
 * Computes every offset in out_model without allocating; call
 * ck_memory_allocate() afterwards to back the plan with memory.
 *
 * @param sections     Array of section configs
 * @param num_sections Number of sections
 * @param mode         0=inference, 1=training (includes gradients)
 * @param fusion_flags Which operations to fuse (CKFusionFlags bitmask)
 * @param out_model    Output: model with all offsets computed
 * @return Total bytes needed (0 on error)
 */
size_t ck_memory_plan(const CKSectionConfig *sections,
                      int num_sections,
                      int mode,
                      uint32_t fusion_flags,
                      CKModel *out_model);

/**
 * Allocate the planned memory.
 *
 * @param model         Model with planned offsets (from ck_memory_plan)
 * @param use_hugepages 0=regular malloc, 1=2MB hugepages, 2=1GB hugepages
 * @return 0 on success, -1 on failure
 */
int ck_memory_allocate(CKModel *model, int use_hugepages);

/**
 * Free the model memory (the single arena behind model->base).
 *
 * @param model Model to free
 */
void ck_memory_free(CKModel *model);
322
/* ============================================================================
 * ACCESSOR MACROS
 *
 * All access is: (float*)(model->base + offset)
 * These macros make it cleaner. The (char*) cast is required because
 * pointer arithmetic on void* is not standard C.
 * ============================================================================ */

#define CK_PTR(model, offset) ((float*)((char*)(model)->base + (offset)))
#define CK_LAYER(model, section, layer) (&(model)->sections[section].layers[layer])
#define CK_GRAD(model, section, layer) (&(model)->sections[section].grads[layer])

/* Example usage:
 *
 *   CKModel model;
 *   ck_memory_plan(&config, 1, 0, CK_FUSE_NORM_QKV, &model);
 *   ck_memory_allocate(&model, 1);  // 2MB hugepages
 *
 *   // Access layer 5 Q weights:
 *   float *wq = CK_PTR(&model, CK_LAYER(&model, 0, 5)->wq);
 *
 *   // Access layer 5 Q activation:
 *   float *q = CK_PTR(&model, CK_LAYER(&model, 0, 5)->q);
 *
 *   // With fusion (CK_FUSE_NORM_QKV), the kernel skips ln1_output
 *   // and writes directly to q/k/v. The ln1_output memory exists
 *   // but is never touched - CPU prefetch streams over it.
 */

#ifdef __cplusplus
}
#endif

#endif /* CKERNEL_MEMORY_LAYOUT_H */
ck_memory_allocate
int ck_memory_allocate(CKModel *model, int use_hugepages)
ck_memory_plan
size_t ck_memory_plan(const CKSectionConfig *sections, int num_sections, int mode, uint32_t fusion_flags, CKModel *out_model)
ck_memory_free
void ck_memory_free(CKModel *model)
CKFusionFlags
CKFusionFlags
Definition:
ckernel_memory_layout.h:269
CK_FUSE_NORM_QKV
@ CK_FUSE_NORM_QKV
Definition:
ckernel_memory_layout.h:272
CK_FUSE_ATTN_PROJ
@ CK_FUSE_ATTN_PROJ
Definition:
ckernel_memory_layout.h:274
CK_FUSE_EMBED_NORM
@ CK_FUSE_EMBED_NORM
Definition:
ckernel_memory_layout.h:271
CK_FUSE_ADD_NORM
@ CK_FUSE_ADD_NORM
Definition:
ckernel_memory_layout.h:278
CK_FUSE_MLP_ACT_DOWN
@ CK_FUSE_MLP_ACT_DOWN
Definition:
ckernel_memory_layout.h:277
CK_FUSE_MLP_GATE_UP
@ CK_FUSE_MLP_GATE_UP
Definition:
ckernel_memory_layout.h:276
CK_FUSE_QKV_ROPE
@ CK_FUSE_QKV_ROPE
Definition:
ckernel_memory_layout.h:273
CK_FUSE_NORM_MLP
@ CK_FUSE_NORM_MLP
Definition:
ckernel_memory_layout.h:275
CK_FUSE_NONE
@ CK_FUSE_NONE
Definition:
ckernel_memory_layout.h:270
CKLayerGradOffsets
Definition:
ckernel_memory_layout.h:152
CKLayerGradOffsets::d_mlp_down_out
size_t d_mlp_down_out
Definition:
ckernel_memory_layout.h:172
CKLayerGradOffsets::d_ln1_beta
size_t d_ln1_beta
Definition:
ckernel_memory_layout.h:155
CKLayerGradOffsets::m_wq
size_t m_wq
Definition:
ckernel_memory_layout.h:176
CKLayerGradOffsets::m_wk
size_t m_wk
Definition:
ckernel_memory_layout.h:177
CKLayerGradOffsets::m_wo
size_t m_wo
Definition:
ckernel_memory_layout.h:179
CKLayerGradOffsets::d_attn_output
size_t d_attn_output
Definition:
ckernel_memory_layout.h:163
CKLayerGradOffsets::d_ln1_output
size_t d_ln1_output
Definition:
ckernel_memory_layout.h:156
CKLayerGradOffsets::m_ln1_gamma
size_t m_ln1_gamma
Definition:
ckernel_memory_layout.h:175
CKLayerGradOffsets::d_proj_output
size_t d_proj_output
Definition:
ckernel_memory_layout.h:166
CKLayerGradOffsets::m_ln2_gamma
size_t m_ln2_gamma
Definition:
ckernel_memory_layout.h:180
CKLayerGradOffsets::d_ln2_beta
size_t d_ln2_beta
Definition:
ckernel_memory_layout.h:168
CKLayerGradOffsets::m_mlp_down
size_t m_mlp_down
Definition:
ckernel_memory_layout.h:183
CKLayerGradOffsets::d_bk
size_t d_bk
Definition:
ckernel_memory_layout.h:159
CKLayerGradOffsets::m_mlp_gate
size_t m_mlp_gate
Definition:
ckernel_memory_layout.h:181
CKLayerGradOffsets::d_mlp_down_w
size_t d_mlp_down_w
Definition:
ckernel_memory_layout.h:171
CKLayerGradOffsets::d_ln1_gamma
size_t d_ln1_gamma
Definition:
ckernel_memory_layout.h:154
CKLayerGradOffsets::d_wk
size_t d_wk
Definition:
ckernel_memory_layout.h:158
CKLayerGradOffsets::d_attn_scores
size_t d_attn_scores
Definition:
ckernel_memory_layout.h:162
CKLayerGradOffsets::d_bo
size_t d_bo
Definition:
ckernel_memory_layout.h:165
CKLayerGradOffsets::d_ln2_output
size_t d_ln2_output
Definition:
ckernel_memory_layout.h:169
CKLayerGradOffsets::d_k
size_t d_k
Definition:
ckernel_memory_layout.h:160
CKLayerGradOffsets::m_wv
size_t m_wv
Definition:
ckernel_memory_layout.h:178
CKLayerGradOffsets::m_mlp_up
size_t m_mlp_up
Definition:
ckernel_memory_layout.h:182
CKLayerOffsets
Definition:
ckernel_memory_layout.h:85
CKLayerOffsets::ln2_output
size_t ln2_output
Definition:
ckernel_memory_layout.h:122
CKLayerOffsets::mlp_down_out
size_t mlp_down_out
Definition:
ckernel_memory_layout.h:134
CKLayerOffsets::bq
size_t bq
Definition:
ckernel_memory_layout.h:95
CKLayerOffsets::v
size_t v
Definition:
ckernel_memory_layout.h:100
CKLayerOffsets::k
size_t k
Definition:
ckernel_memory_layout.h:99
CKLayerOffsets::proj_output
size_t proj_output
Definition:
ckernel_memory_layout.h:114
CKLayerOffsets::ln2_gamma
size_t ln2_gamma
Definition:
ckernel_memory_layout.h:120
CKLayerOffsets::mlp_down_b
size_t mlp_down_b
Definition:
ckernel_memory_layout.h:130
CKLayerOffsets::mlp_gate_b
size_t mlp_gate_b
Definition:
ckernel_memory_layout.h:128
CKLayerOffsets::q
size_t q
Definition:
ckernel_memory_layout.h:98
CKLayerOffsets::wo
size_t wo
Definition:
ckernel_memory_layout.h:112
CKLayerOffsets::bk
size_t bk
Definition:
ckernel_memory_layout.h:96
CKLayerOffsets::k_rope
size_t k_rope
Definition:
ckernel_memory_layout.h:104
CKLayerOffsets::attn_probs
size_t attn_probs
Definition:
ckernel_memory_layout.h:108
CKLayerOffsets::attn_scores
size_t attn_scores
Definition:
ckernel_memory_layout.h:107
CKLayerOffsets::residual2
size_t residual2
Definition:
ckernel_memory_layout.h:137
CKLayerOffsets::mlp_up_w
size_t mlp_up_w
Definition:
ckernel_memory_layout.h:126
CKLayerOffsets::mlp_up_b
size_t mlp_up_b
Definition:
ckernel_memory_layout.h:129
CKLayerOffsets::mlp_gate_w
size_t mlp_gate_w
Definition:
ckernel_memory_layout.h:125
CKLayerOffsets::residual1
size_t residual1
Definition:
ckernel_memory_layout.h:117
CKLayerOffsets::k_cache
size_t k_cache
Definition:
ckernel_memory_layout.h:140
CKLayerOffsets::ln1_output
size_t ln1_output
Definition:
ckernel_memory_layout.h:89
CKLayerOffsets::ln2_beta
size_t ln2_beta
Definition:
ckernel_memory_layout.h:121
CKLayerOffsets::wk
size_t wk
Definition:
ckernel_memory_layout.h:93
CKLayerOffsets::wq
size_t wq
Definition:
ckernel_memory_layout.h:92
CKLayerOffsets::q_rope
size_t q_rope
Definition:
ckernel_memory_layout.h:103
CKLayerOffsets::mlp_down_w
size_t mlp_down_w
Definition:
ckernel_memory_layout.h:127
CKLayerOffsets::mlp_gate_out
size_t mlp_gate_out
Definition:
ckernel_memory_layout.h:131
CKLayerOffsets::bo
size_t bo
Definition:
ckernel_memory_layout.h:113
CKLayerOffsets::mlp_up_out
size_t mlp_up_out
Definition:
ckernel_memory_layout.h:132
CKLayerOffsets::ln1_gamma
size_t ln1_gamma
Definition:
ckernel_memory_layout.h:87
CKLayerOffsets::v_cache
size_t v_cache
Definition:
ckernel_memory_layout.h:141
CKLayerOffsets::wv
size_t wv
Definition:
ckernel_memory_layout.h:94
CKLayerOffsets::bv
size_t bv
Definition:
ckernel_memory_layout.h:97
CKLayerOffsets::ln1_beta
size_t ln1_beta
Definition:
ckernel_memory_layout.h:88
CKLayerOffsets::attn_output
size_t attn_output
Definition:
ckernel_memory_layout.h:109
CKLayerOffsets::mlp_act_out
size_t mlp_act_out
Definition:
ckernel_memory_layout.h:133
CKModel
Definition:
ckernel_memory_layout.h:236
CKModel::total_bytes
size_t total_bytes
Definition:
ckernel_memory_layout.h:239
CKModel::batch_size
int batch_size
Definition:
ckernel_memory_layout.h:258
CKModel::base
void * base
Definition:
ckernel_memory_layout.h:238
CKModel::num_sections
int num_sections
Definition:
ckernel_memory_layout.h:245
CKModel::causal_mask
size_t causal_mask
Definition:
ckernel_memory_layout.h:251
CKModel::rope_sin
size_t rope_sin
Definition:
ckernel_memory_layout.h:250
CKModel::fusion_flags
uint32_t fusion_flags
Definition:
ckernel_memory_layout.h:254
CKModel::grad_bytes
size_t grad_bytes
Definition:
ckernel_memory_layout.h:242
CKModel::rope_cos
size_t rope_cos
Definition:
ckernel_memory_layout.h:249
CKModel::sections
CKSection * sections
Definition:
ckernel_memory_layout.h:246
CKModel::activation_bytes
size_t activation_bytes
Definition:
ckernel_memory_layout.h:241
CKModel::weight_bytes
size_t weight_bytes
Definition:
ckernel_memory_layout.h:240
CKModel::current_seq_len
int current_seq_len
Definition:
ckernel_memory_layout.h:257
CKSectionConfig
Definition:
ckernel_memory_layout.h:60
CKSectionConfig::aligned_intermediate
int aligned_intermediate
Definition:
ckernel_memory_layout.h:74
CKSectionConfig::vocab_size
int vocab_size
Definition:
ckernel_memory_layout.h:68
CKSectionConfig::num_kv_heads
int num_kv_heads
Definition:
ckernel_memory_layout.h:64
CKSectionConfig::num_heads
int num_heads
Definition:
ckernel_memory_layout.h:63
CKSectionConfig::head_dim
int head_dim
Definition:
ckernel_memory_layout.h:65
CKSectionConfig::intermediate_dim
int intermediate_dim
Definition:
ckernel_memory_layout.h:66
CKSectionConfig::max_seq_len
int max_seq_len
Definition:
ckernel_memory_layout.h:69
CKSectionConfig::aligned_head
int aligned_head
Definition:
ckernel_memory_layout.h:73
CKSectionConfig::embed_dim
int embed_dim
Definition:
ckernel_memory_layout.h:62
CKSectionConfig::aligned_embed
int aligned_embed
Definition:
ckernel_memory_layout.h:72
CKSectionConfig::num_layers
int num_layers
Definition:
ckernel_memory_layout.h:67
CKSection
Definition:
ckernel_memory_layout.h:197
CKSection::layers
CKLayerOffsets * layers
Definition:
ckernel_memory_layout.h:207
CKSection::pos_embed
size_t pos_embed
Definition:
ckernel_memory_layout.h:202
CKSection::final_ln_gamma
size_t final_ln_gamma
Definition:
ckernel_memory_layout.h:211
CKSection::grads
CKLayerGradOffsets * grads
Definition:
ckernel_memory_layout.h:208
CKSection::bridge_output
size_t bridge_output
Definition:
ckernel_memory_layout.h:222
CKSection::embed_output
size_t embed_output
Definition:
ckernel_memory_layout.h:203
CKSection::lm_head
size_t lm_head
Definition:
ckernel_memory_layout.h:216
CKSection::bridge_proj_b
size_t bridge_proj_b
Definition:
ckernel_memory_layout.h:221
CKSection::final_ln_output
size_t final_ln_output
Definition:
ckernel_memory_layout.h:213
CKSection::final_ln_beta
size_t final_ln_beta
Definition:
ckernel_memory_layout.h:212
CKSection::config
CKSectionConfig config
Definition:
ckernel_memory_layout.h:198
CKSection::num_layers
int num_layers
Definition:
ckernel_memory_layout.h:206
CKSection::token_embed
size_t token_embed
Definition:
ckernel_memory_layout.h:201
CKSection::section_start
size_t section_start
Definition:
ckernel_memory_layout.h:225
CKSection::logits
size_t logits
Definition:
ckernel_memory_layout.h:217
CKSection::section_end
size_t section_end
Definition:
ckernel_memory_layout.h:226
CKSection::bridge_proj_w
size_t bridge_proj_w
Definition:
ckernel_memory_layout.h:220
include
ckernel_memory_layout.h
Generated by
1.9.1