/*
 * ckernel_section_layout.h — recovered from the Doxygen source-browser page
 * (navigation breadcrumb converted to a comment so the file stays valid C).
 */
1 /**
2  * @file ckernel_section_layout.h
3  * @brief Section-Based Memory Layout: Header / Body / Footer Pattern
4  *
5  * =============================================================================
6  * PHILOSOPHY: WHY NO MEMORY REUSE?
7  * =============================================================================
8  *
9  * 1. CPU PREFETCH NEEDS PREDICTABLE PATTERNS
10  * - Prefetcher learns: "he's streaming forward through memory"
11  * - Reuse breaks this: same address, different data, different time
12  * - Result: prefetcher gives up, cache misses explode
13  *
14  * 2. TRAINING NEEDS ALL ACTIVATIONS
15  * - Forward: compute activations
16  * - Backward: need EVERY activation to compute gradients
17  * - Can't reuse what you still need!
18  *
19  * 3. MEMORY IS CHEAP, BANDWIDTH IS EXPENSIVE
20  * - DDR5: ~$3/GB (1TB = $3000, trivial for training cluster)
21  * - Bandwidth: 100-400 GB/s per socket (the real bottleneck)
22  * - Optimize for streaming, not for saving bytes
23  *
24  * 4. SINGLE ALLOCATION = ZERO RUNTIME MALLOC
25  * - No fragmentation, no free(), no double-free bugs
26  * - Hugepage-backed: 1GB pages for NUMA optimization
27  * - One base pointer + offsets = maximum simplicity
28  *
29  * =============================================================================
30  * SECTION ARCHITECTURE
31  * =============================================================================
32  *
33  * Multi-modal models have multiple SECTIONS. Each section is a complete
34  * encoder/decoder/vision/audio module with its own dimensions.
35  *
36  * ┌─────────────────────────────────────────────────────────────────┐
37  * │ SINGLE ALLOCATION │
38  * ├─────────────────────────────────────────────────────────────────┤
39  * │ SECTION 0: Vision Encoder │
40  * │ ├── HEADER: patch_embed, pos_embed │
41  * │ ├── BODY: layer[0..N] (weights + activations interleaved) │
42  * │ └── FOOTER: final_norm, bridge_to_text │
43  * ├─────────────────────────────────────────────────────────────────┤
44  * │ SECTION 1: Text Decoder │
45  * │ ├── HEADER: token_embed, pos_embed │
46  * │ ├── BODY: layer[0..N] (weights + activations interleaved) │
47  * │ └── FOOTER: final_norm, lm_head, logits │
48  * ├─────────────────────────────────────────────────────────────────┤
49  * │ SECTION 2: Audio Encoder (optional) │
50  * │ ├── HEADER: mel_embed │
51  * │ ├── BODY: layer[0..N] │
52  * │ └── FOOTER: bridge_to_text │
53  * ├─────────────────────────────────────────────────────────────────┤
54  * │ GRADIENTS (if training) │
55  * │ └── Same layout: section[0].grads, section[1].grads, ... │
56  * ├─────────────────────────────────────────────────────────────────┤
57  * │ OPTIMIZER STATE (if training with Adam) │
58  * │ └── m[], v[] for each weight │
59  * └─────────────────────────────────────────────────────────────────┘
60  *
61  * =============================================================================
62  * EXECUTION ORDER LAYOUT
63  * =============================================================================
64  *
65  * Within each layer, memory is laid out in the ORDER operations execute:
66  *
67  * weight → activation → weight → activation → ...
68  *
69  * This is critical for CPU cache efficiency. The CPU streams forward,
70  * prefetching the next cache line while processing the current one.
71  *
72  */
73 
74 #ifndef CKERNEL_SECTION_LAYOUT_H
75 #define CKERNEL_SECTION_LAYOUT_H
76 
77 #include <stddef.h>
78 #include <stdint.h>
79 
80 #ifdef __cplusplus
81 extern "C" {
82 #endif
83 
/* ============================================================================
 * ALIGNMENT CONSTANTS
 * ============================================================================ */

#define CK_ALIGN_CACHE    64ULL          /* one CPU cache line               */
#define CK_ALIGN_SIMD     64ULL          /* full AVX-512 register width      */
#define CK_ALIGN_HUGE_2MB (2ULL << 20)   /* 2 MiB hugepage boundary          */
#define CK_ALIGN_HUGE_1GB (1ULL << 30)   /* 1 GiB hugepage (NUMA-friendly)   */
92 
/* ============================================================================
 * SECTION CONFIG
 *
 * Each section (encoder/decoder/vision/audio) has its own dimensions.
 * A vision encoder might have embed_dim=1024, while text decoder has 4096.
 * ============================================================================ */

typedef struct {
    /* === DIMENSIONS === */
    int embed_dim;            /* hidden_size for this section */
    int num_heads;            /* attention heads */
    int num_kv_heads;         /* key-value heads (for GQA) */
    int head_dim;             /* embed_dim / num_heads */
    int intermediate_dim;     /* MLP hidden dimension */
    int num_layers;           /* transformer layers in this section */

    /* === VOCABULARY (optional, only for sections with embeddings) === */
    int vocab_size;           /* 0 if no token embedding in this section */

    /* === SEQUENCE === */
    int max_seq_len;          /* maximum sequence length */

    /* === ALIGNED DIMENSIONS (computed, for SIMD) === */
    int aligned_embed;        /* embed_dim padded to SIMD boundary */
    int aligned_head;         /* head_dim padded to SIMD boundary */
    int aligned_intermediate; /* intermediate_dim padded to SIMD boundary */

    /* === FEATURES === */
    int has_bias;             /* 1 if layers have bias terms */
    int has_rope;             /* 1 if using rotary position embeddings */
    int has_pos_embed;        /* 1 if using learned position embeddings */
    int gated_mlp;            /* 1 if MLP is gated (SwiGLU/GeGLU) */
    int norm_type;            /* 0=LayerNorm, 1=RMSNorm */
} CKSectionConfig;            /* closing restored: dropped during doc extraction */
/* ============================================================================
 * HEADER OFFSETS
 *
 * The HEADER contains embeddings that happen BEFORE the transformer layers.
 * For text: token embeddings + position embeddings
 * For vision: patch embeddings + position embeddings
 * For audio: mel-spectrogram projection
 *
 * All members are byte offsets from the model's single base pointer.
 * ============================================================================ */

typedef struct {
    /* === TOKEN/PATCH EMBEDDING === */
    size_t embed_weight;     /* [vocab_size, embed_dim] or [patch_dim, embed_dim] */
    size_t embed_output;     /* [seq_len, embed_dim] activation */

    /* === POSITION EMBEDDING (if learned) === */
    size_t pos_embed_weight; /* [max_seq_len, embed_dim] weight */
    size_t pos_embed_output; /* [seq_len, embed_dim] activation (added to embed) */

    /* === COMBINED OUTPUT === */
    size_t header_output;    /* [seq_len, embed_dim] = embed + pos (input to body) */

    /* === BYTE BOUNDARIES === */
    size_t header_start;     /* first byte of header */
    size_t header_end;       /* last byte + 1 of header */
} CKHeaderOffsets;           /* closing restored: dropped during doc extraction */
/* ============================================================================
 * LAYER OFFSETS (within BODY)
 *
 * Each layer's memory is laid out in EXECUTION ORDER:
 *   pre_norm → qkv_proj → rope → attention → out_proj → residual →
 *   post_norm → mlp_gate → mlp_up → activation → mlp_down → residual
 *
 * Weights and activations are INTERLEAVED, not separated.
 * All members are byte offsets from the model's single base pointer.
 * ============================================================================ */

typedef struct {
    /* === LAYER INPUT === */
    size_t input;       /* [seq, embed] activation (from previous layer or header) */

    /* ========== ATTENTION BLOCK ========== */

    /* Pre-Attention Normalization */
    size_t ln1_gamma;   /* [embed] weight */
    size_t ln1_beta;    /* [embed] weight (NULL if RMSNorm) */
    size_t ln1_output;  /* [seq, embed] activation */

    /* Q Projection */
    size_t wq;          /* [embed, num_heads * head_dim] weight */
    size_t bq;          /* [num_heads * head_dim] bias (optional) */
    size_t q;           /* [seq, num_heads, head_dim] activation */

    /* K Projection */
    size_t wk;          /* [embed, num_kv_heads * head_dim] weight */
    size_t bk;          /* [num_kv_heads * head_dim] bias (optional) */
    size_t k;           /* [seq, num_kv_heads, head_dim] activation */

    /* V Projection */
    size_t wv;          /* [embed, num_kv_heads * head_dim] weight */
    size_t bv;          /* [num_kv_heads * head_dim] bias (optional) */
    size_t v;           /* [seq, num_kv_heads, head_dim] activation */

    /* RoPE (if enabled) */
    size_t q_rope;      /* [seq, num_heads, head_dim] activation */
    size_t k_rope;      /* [seq, num_kv_heads, head_dim] activation */

    /* Attention Scores */
    size_t attn_scores; /* [num_heads, seq, seq] activation (Q @ K^T) */
    size_t attn_probs;  /* [num_heads, seq, seq] activation (softmax) */

    /* Attention Output */
    size_t attn_out;    /* [seq, num_heads, head_dim] activation (probs @ V) */

    /* Output Projection */
    size_t wo;          /* [num_heads * head_dim, embed] weight */
    size_t bo;          /* [embed] bias (optional) */
    size_t proj_out;    /* [seq, embed] activation */

    /* Residual Connection 1 */
    size_t residual1;   /* [seq, embed] activation (input + proj_out) */

    /* ========== MLP BLOCK ========== */

    /* Post-Attention Normalization */
    size_t ln2_gamma;   /* [embed] weight */
    size_t ln2_beta;    /* [embed] weight (NULL if RMSNorm) */
    size_t ln2_output;  /* [seq, embed] activation */

    /* MLP Gate Projection (for gated MLP like SwiGLU) */
    size_t mlp_gate_w;  /* [embed, intermediate] weight */
    size_t mlp_gate_b;  /* [intermediate] bias (optional) */
    size_t mlp_gate_out;/* [seq, intermediate] activation */

    /* MLP Up Projection */
    size_t mlp_up_w;    /* [embed, intermediate] weight */
    size_t mlp_up_b;    /* [intermediate] bias (optional) */
    size_t mlp_up_out;  /* [seq, intermediate] activation */

    /* MLP Activation (SiLU/GELU) */
    size_t mlp_act_out; /* [seq, intermediate] activation (gate * silu(up)) */

    /* MLP Down Projection */
    size_t mlp_down_w;  /* [intermediate, embed] weight */
    size_t mlp_down_b;  /* [embed] bias (optional) */
    size_t mlp_down_out;/* [seq, embed] activation */

    /* Residual Connection 2 */
    size_t residual2;   /* [seq, embed] activation (residual1 + mlp_down_out) */

    /* ========== LAYER OUTPUT ========== */
    size_t output;      /* [seq, embed] = residual2 (input to next layer) */

    /* ========== KV CACHE (decode mode) ========== */
    size_t k_cache;     /* [max_seq, num_kv_heads, head_dim] persistent */
    size_t v_cache;     /* [max_seq, num_kv_heads, head_dim] persistent */

    /* ========== BYTE BOUNDARIES ========== */
    size_t layer_start; /* first byte of this layer */
    size_t layer_end;   /* last byte + 1 of this layer */
} CKLayerOffsets;       /* closing restored: dropped during doc extraction */
/* ============================================================================
 * LAYER GRADIENT OFFSETS (for training)
 *
 * Gradients follow the same interleaved pattern.
 * Laid out AFTER all forward activations (or interleaved per-layer).
 * All members are byte offsets from the model's single base pointer.
 * ============================================================================ */

typedef struct {
    /* === ATTENTION BLOCK GRADIENTS === */
    size_t d_ln1_gamma, d_ln1_beta;
    size_t d_ln1_output;

    size_t d_wq, d_bq, d_q;
    size_t d_wk, d_bk, d_k;
    size_t d_wv, d_bv, d_v;

    size_t d_attn_scores, d_attn_probs;
    size_t d_attn_out;

    size_t d_wo, d_bo;
    size_t d_proj_out;
    size_t d_residual1;

    /* === MLP BLOCK GRADIENTS === */
    size_t d_ln2_gamma, d_ln2_beta;
    size_t d_ln2_output;

    size_t d_mlp_gate_w, d_mlp_gate_b, d_mlp_gate_out;
    size_t d_mlp_up_w, d_mlp_up_b, d_mlp_up_out;
    size_t d_mlp_down_w, d_mlp_down_b, d_mlp_down_out;

    size_t d_residual2;
    size_t d_output;

    /* === BYTE BOUNDARIES === */
    size_t grad_start;
    size_t grad_end;
} CKLayerGradOffsets;   /* closing restored: dropped during doc extraction */
/* ============================================================================
 * LAYER OPTIMIZER STATE (Adam m and v)
 *
 * One first-moment (m) and one second-moment (v) buffer per weight tensor.
 * All members are byte offsets from the model's single base pointer.
 * ============================================================================ */

typedef struct {
    /* First moment (m) for each weight */
    size_t m_ln1_gamma, m_ln1_beta;
    size_t m_wq, m_bq, m_wk, m_bk, m_wv, m_bv;
    size_t m_wo, m_bo;
    size_t m_ln2_gamma, m_ln2_beta;
    size_t m_mlp_gate_w, m_mlp_gate_b;
    size_t m_mlp_up_w, m_mlp_up_b;
    size_t m_mlp_down_w, m_mlp_down_b;

    /* Second moment (v) for each weight */
    size_t v_ln1_gamma, v_ln1_beta;
    size_t v_wq, v_bq, v_wk, v_bk, v_wv, v_bv;
    size_t v_wo, v_bo;
    size_t v_ln2_gamma, v_ln2_beta;
    size_t v_mlp_gate_w, v_mlp_gate_b;
    size_t v_mlp_up_w, v_mlp_up_b;
    size_t v_mlp_down_w, v_mlp_down_b;

    /* Byte boundaries */
    size_t opt_start;
    size_t opt_end;
} CKLayerOptimizerOffsets;  /* closing restored: dropped during doc extraction */
/* ============================================================================
 * FOOTER OFFSETS
 *
 * The FOOTER contains layers AFTER the transformer body:
 * - Final normalization
 * - Output projection (lm_head for text, classifier for vision)
 * - Bridge projection (to connect to next section)
 *
 * All members are byte offsets from the model's single base pointer.
 * ============================================================================ */

typedef struct {
    /* === FINAL NORMALIZATION === */
    size_t final_ln_gamma;  /* [embed] weight */
    size_t final_ln_beta;   /* [embed] weight (NULL if RMSNorm) */
    size_t final_ln_output; /* [seq, embed] activation */

    /* === OUTPUT HEAD (optional) === */
    size_t lm_head_w;       /* [embed, vocab] weight (may tie with embed_weight) */
    size_t lm_head_b;       /* [vocab] bias (optional) */
    size_t logits;          /* [seq, vocab] activation */

    /* === BRIDGE TO NEXT SECTION (optional) === */
    size_t bridge_w;        /* [this_embed, next_embed] weight */
    size_t bridge_b;        /* [next_embed] bias (optional) */
    size_t bridge_output;   /* [seq, next_embed] activation */

    /* === BYTE BOUNDARIES === */
    size_t footer_start;
    size_t footer_end;
} CKFooterOffsets;          /* closing restored: dropped during doc extraction */
/* ============================================================================
 * FOOTER GRADIENTS (for training)
 *
 * Gradient buffers mirroring CKFooterOffsets' weights and logits.
 * All members are byte offsets from the model's single base pointer.
 * ============================================================================ */

typedef struct {
    size_t d_final_ln_gamma, d_final_ln_beta;

    size_t d_lm_head_w, d_lm_head_b;
    size_t d_logits;

    size_t d_bridge_w, d_bridge_b;

    size_t grad_start;
    size_t grad_end;
} CKFooterGradOffsets;  /* closing restored: dropped during doc extraction */
372 /* ============================================================================
373  * SECTION - Complete Header + Body + Footer
374  *
375  * Each section is a self-contained module (encoder, decoder, vision, audio).
376  * ============================================================================ */
377 
378 typedef struct {
379  /* === CONFIGURATION === */
381  const char *name; /* "vision_encoder", "text_decoder", etc. */
382  int section_id; /* 0, 1, 2, ... */
383 
384  /* === HEADER === */
386 
387  /* === BODY (transformer layers) === */
388  int num_layers;
389  CKLayerOffsets *layers; /* Array of per-layer offsets */
390  CKLayerGradOffsets *layer_grads; /* Array of per-layer gradient offsets */
391  CKLayerOptimizerOffsets *layer_opt; /* Array of per-layer optimizer state */
392 
393  /* === FOOTER === */
396 
397  /* === GLOBAL BUFFERS FOR THIS SECTION === */
398  size_t rope_cos; /* [max_seq, head_dim/2] RoPE cosines */
399  size_t rope_sin; /* [max_seq, head_dim/2] RoPE sines */
400  size_t causal_mask; /* [max_seq, max_seq] causal attention mask */
401 
402  /* === SECTION BYTE BOUNDARIES === */
403  size_t section_start; /* First byte of this section */
404  size_t section_end; /* Last byte + 1 of this section */
409 
410 } CKSection;
411 
412 /* ============================================================================
413  * MODEL - The Complete Multi-Section Model
414  *
415  * ONE allocation. ONE base pointer. ALL offsets relative to base.
416  * ============================================================================ */
417 
418 typedef struct {
419  /* === MEMORY === */
420  void *base; /* THE single allocation */
421  size_t total_bytes; /* Total size of allocation */
422 
423  /* === BREAKDOWN === */
424  size_t weight_bytes; /* Total weights across all sections */
425  size_t activation_bytes; /* Total activations */
426  size_t grad_bytes; /* Total gradients (0 if inference) */
427  size_t opt_bytes; /* Total optimizer state (0 if inference) */
428 
429  /* === SECTIONS === */
430  int num_sections;
431  CKSection *sections;
432 
433  /* === GLOBAL SHARED BUFFERS === */
434  size_t shared_scratch; /* Scratch buffer for temp computations */
436 
437  /* === MODE FLAGS === */
438  int training_enabled; /* 0=inference, 1=training */
439  int kv_cache_enabled; /* 0=prefill, 1=decode with cache */
440  uint32_t fusion_flags; /* Bitmask of enabled kernel fusions */
441 
442  /* === RUNTIME STATE (not offsets) === */
443  int current_seq_len; /* Tokens processed in current sequence */
444  int current_pos; /* Position for decode mode */
445 
446  /* === NUMA INFO === */
447  int num_numa_nodes; /* Number of NUMA nodes detected */
448  size_t hugepage_size; /* Size of hugepages (2MB or 1GB) */
449 
450 } CKModel;
451 
/* ============================================================================
 * FUSION FLAGS
 *
 * When a fusion is enabled, intermediate activations are SKIPPED.
 * Memory is still allocated, but the fused kernel bypasses it.
 * Combine flags with bitwise OR; store the result in CKModel.fusion_flags.
 * ============================================================================ */

typedef enum {
    CK_FUSE_EMBED_NORM    = 1 << 0,  /* fuse embedding with the first layernorm   */
    CK_FUSE_NORM_QKV      = 1 << 1,  /* fuse layernorm with the QKV projection    */
    CK_FUSE_QKV_ROPE      = 1 << 2,  /* fuse QKV with rotary position encoding    */
    CK_FUSE_ATTN_PROJ     = 1 << 3,  /* fuse attention output with projection     */
    CK_FUSE_NORM_MLP      = 1 << 4,  /* fuse layernorm with the MLP input         */
    CK_FUSE_MLP_GATE_UP   = 1 << 5,  /* run gate and up projections together      */
    CK_FUSE_MLP_ACT_DOWN  = 1 << 6,  /* fuse activation with the down projection  */
    CK_FUSE_RESIDUAL_NORM = 1 << 7,  /* fuse residual add with the layernorm      */
} CKFusionFlags;
470 
471 /* ============================================================================
472  * MEMORY PLANNER API
473  *
474  * Two-phase allocation:
475  * 1. Plan: dry run to compute all offsets and total size
476  * 2. Allocate: single hugepage-backed mmap
477  * ============================================================================ */
478 
479 /**
480  * Initialize section config with computed alignments.
481  */
483 
484 /**
485  * Plan memory layout for a single section.
486  * Returns bytes needed for this section.
487  */
488 size_t ck_section_plan(CKSection *section,
489  const CKSectionConfig *config,
490  int training_enabled,
491  size_t base_offset);
492 
493 /**
494  * Plan memory layout for complete model.
495  * Returns total bytes needed.
496  */
497 size_t ck_model_plan(CKModel *model,
498  const CKSectionConfig *configs,
499  int num_sections,
500  int training_enabled,
501  uint32_t fusion_flags);
502 
503 /**
504  * Allocate the planned memory.
505  * @param hugepage_mode: 0=normal, 1=2MB hugepages, 2=1GB hugepages
506  * @return 0 on success, -1 on failure
507  */
508 int ck_model_allocate(CKModel *model, int hugepage_mode);
509 
510 /**
511  * Free the model (single free, since single allocation).
512  */
513 void ck_model_free(CKModel *model);
514 
/* ============================================================================
 * ACCESSOR MACROS
 *
 * Clean syntax for turning stored byte offsets into typed pointers, and for
 * reaching per-section / per-layer offset tables.
 * ============================================================================ */

/* Resolve a byte offset against the model's base pointer as float*. */
#define CK_PTR(model, offset) \
    ((float*)((char*)(model)->base + (offset)))

/* Same resolution, but typed as raw bf16 storage (uint16_t*). */
#define CK_PTR_BF16(model, offset) \
    ((uint16_t*)((char*)(model)->base + (offset)))

/* Offsets of layer l in section s. */
#define CK_LAYER(model, s, l) \
    (&(model)->sections[s].layers[l])

/* Gradient offsets of layer l in section s. */
#define CK_LAYER_GRAD(model, s, l) \
    (&(model)->sections[s].layer_grads[l])

/* Header offsets of section s. */
#define CK_HEADER(model, s) \
    (&(model)->sections[s].header)

/* Footer offsets of section s. */
#define CK_FOOTER(model, s) \
    (&(model)->sections[s].footer)
543 
544 /* ============================================================================
545  * EXAMPLE USAGE
546  * ============================================================================
547  *
548  * // Define a vision-language model (2 sections)
549  * CKSectionConfig configs[2] = {
550  * // Section 0: Vision Encoder (ViT-L)
551  * { .embed_dim = 1024, .num_heads = 16, .num_kv_heads = 16,
552  * .head_dim = 64, .intermediate_dim = 4096, .num_layers = 24,
553  * .vocab_size = 0, .max_seq_len = 576, // 24x24 patches
554  * .has_bias = 1, .has_rope = 0, .has_pos_embed = 1,
555  * .gated_mlp = 0, .norm_type = 0 },
556  *
557  * // Section 1: Text Decoder (LLaMA-7B style)
558  * { .embed_dim = 4096, .num_heads = 32, .num_kv_heads = 32,
559  * .head_dim = 128, .intermediate_dim = 11008, .num_layers = 32,
560  * .vocab_size = 32000, .max_seq_len = 2048,
561  * .has_bias = 0, .has_rope = 1, .has_pos_embed = 0,
562  * .gated_mlp = 1, .norm_type = 1 }
563  * };
564  *
565  * CKModel model = {0};
566  * size_t total = ck_model_plan(&model, configs, 2, 1, CK_FUSE_NORM_QKV);
567  * printf("Total memory: %.2f GB\n", total / 1e9);
568  *
569  * ck_model_allocate(&model, 2); // 1GB hugepages
570  *
571  * // Access vision encoder layer 5 Q weights:
572  * float *wq = CK_PTR(&model, CK_LAYER(&model, 0, 5)->wq);
573  *
574  * // Access text decoder layer 10 attention output:
575  * float *attn = CK_PTR(&model, CK_LAYER(&model, 1, 10)->attn_out);
576  *
577  * // When done:
578  * ck_model_free(&model); // Single free!
579  *
580  * ============================================================================
581  */
582 
583 #ifdef __cplusplus
584 }
585 #endif
586 
587 #endif /* CKERNEL_SECTION_LAYOUT_H */
/* ----------------------------------------------------------------------------
 * Doxygen cross-reference residue from the generated source-browser page,
 * commented out so the header compiles. Retained because it documents the
 * signatures and members dropped elsewhere by the extraction:
 *
 *   void   ck_model_free(CKModel *model)
 *   void   ck_section_config_init(CKSectionConfig *config, size_t simd_align)
 *   size_t ck_section_plan(CKSection *section, const CKSectionConfig *config,
 *                          int training_enabled, size_t base_offset)
 *   size_t ck_model_plan(CKModel *model, const CKSectionConfig *configs,
 *                        int num_sections, int training_enabled,
 *                        uint32_t fusion_flags)
 *   int    ck_model_allocate(CKModel *model, int hugepage_mode)
 *
 *   enum values: CK_FUSE_NORM_QKV, CK_FUSE_ATTN_PROJ, CK_FUSE_EMBED_NORM,
 *                CK_FUSE_RESIDUAL_NORM, CK_FUSE_MLP_ACT_DOWN,
 *                CK_FUSE_MLP_GATE_UP, CK_FUSE_QKV_ROPE, CK_FUSE_NORM_MLP
 *
 *   members:     size_t shared_scratch_bytes; const char *name;
 *                CKLayerOptimizerOffsets *layer_opt; CKFooterOffsets footer;
 *                size_t section_activation_bytes;
 *                CKLayerGradOffsets *layer_grads; CKHeaderOffsets header;
 *                CKFooterGradOffsets footer_grads
 *
 *   (The "const CKBPEConfig *config / true_bpe.h:171 / int vocab_size /
 *   true_bpe.h:185" entries are cross-file tooltip pollution from another
 *   header, not part of this file.)
 * -------------------------------------------------------------------------- */