← Back to C-Kernel-Engine Docs Doxygen Source Documentation
v6.6/generated/ck-kernel-inference.c
Go to the documentation of this file.
1 /**
2  * @file ck-kernel-inference.c
3  * @brief AUTO-GENERATED: qwen2_0.5b_decode Implementation (IR v6 - Explicit Unrolled)
4  *
5  * Generated: 2026-01-15T19:23:13.600148 UTC
6  * Total Memory: 3.65 GB
7  * Mode: decode
8  * Layers: 24 (fully unrolled)
9  *
10  * Per-layer quant types:
11  * Layer 0: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
12  * Layer 1: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
13  * Layer 2: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
14  * ... (21 more layers)
15  *
16  * ═══════════════════════════════════════════════════════════════════════════
17  * MANIFEST VALIDATION (from weights_manifest.json)
18  * ═══════════════════════════════════════════════════════════════════════════
19  *
20  * Layer | WQ | WK | WV | WO | W1 | W2 | BQ | BK | BV | BO
21  * ------|-------|-------|-------|-------|-------|-------|----|----|----|----|
22  * 0 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
23  * 1 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
24  * 2 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
25  * 3 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
26  * 4 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
27  * 5 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
28  * 6 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
29  * 7 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
30  * 8 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
31  * 9 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
32  * 10 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
33  * 11 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
34  * 12 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
35  * 13 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
36  * 14 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
37  * 15 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
38  * 16 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
39  * 17 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
40  * 18 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
41  * 19 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
42  * 20 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
43  * 21 | q5_0 | q5_0 | q8_0 | q5_0 | q5_0 | q6_k | ✓ | ✓ | ✓ | ○
44  * 22 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
45  * 23 | q5_0 | q5_0 | q5_0 | q5_0 | q5_0 | q4_k | ✓ | ✓ | ✓ | ○
46  *
47  * Total manifest entries: 269
48  * Attention biases: PRESENT (Qwen2-style)
49  * ═══════════════════════════════════════════════════════════════════════════
50  *
51  *
52  * DO NOT EDIT - Regenerate with build_ir_v6.py or codegen_v6.py
53  */
54 
55 #define _GNU_SOURCE /* For MAP_ANONYMOUS, MAP_HUGETLB */
56 
57 #include "ck-kernel-inference.h"
58 
59 #include "ckernel_engine.h"
60 
61 #include <stdio.h>
62 #include <stdlib.h>
63 #include <string.h>
64 #include <stdint.h>
65 #include <math.h>
66 
67 #ifdef __linux__
68 #include <sys/mman.h>
69 #endif
70 
71 #if QWEN2_0_5B_DECODE_DTYPE_BYTES != 4
72 #error "qwen2_0.5b_decode: v6 codegen currently supports fp32 only. Use --dtype=fp32."
73 #endif
74 
75 /* ============================================================================
76  * LOCAL HELPERS (no orchestration dependency)
77  * ============================================================================ */
78 
/*
 * Element-wise residual addition over a token-major activation tensor:
 *   out[t][d] = a[t][d] + b[t][d]
 * for `tokens` rows of `aligned_embed_dim` floats each. Rows are assumed
 * contiguous with stride aligned_embed_dim; `out` may alias `a` or `b`
 * (pure element-wise read-then-write per index).
 *
 * NOTE(review): the original signature line was lost in extraction; the
 * name and parameter list below are reconstructed from this file's call
 * sites (residual adds in the decode-layer functions).
 */
static void qwen2_0_5b_decode_residual_add_token_major(
    const float *a,
    const float *b,
    float *out,
    int tokens,
    int aligned_embed_dim
) {
    /* Defensive: generated callers should never pass NULL, but fail soft. */
    if (!a || !b || !out) {
        return;
    }
    for (int t = 0; t < tokens; ++t) {
        /* size_t products avoid signed-int overflow on large offsets. */
        const float *pa = a + (size_t)t * (size_t)aligned_embed_dim;
        const float *pb = b + (size_t)t * (size_t)aligned_embed_dim;
        float *pc = out + (size_t)t * (size_t)aligned_embed_dim;
        for (int d = 0; d < aligned_embed_dim; ++d) {
            pc[d] = pa[d] + pb[d];
        }
    }
}
98 
99 /* ============================================================================
100  * MAGIC HEADER
101  * ============================================================================ */
102 
/*
 * 64-byte packed header written at the base of the model arena by the
 * allocator below, used to sanity-check a mapped arena's provenance and
 * geometry before running inference.
 *
 * NOTE(review): the closing "} MagicHeader;" line was lost in extraction
 * and is restored here; the _Static_assert below depends on the typedef.
 * Field sizes sum to exactly 64: 2*4 + 3*8 + 6*4 + 8 = 64.
 */
typedef struct __attribute__((packed)) {
    uint32_t magic;            /* 0x434B454E ("CKEN") */
    uint32_t version;          /* IR version */
    uint64_t total_bytes;
    uint64_t weight_bytes;
    uint64_t activation_bytes;
    uint32_t num_layers;
    uint32_t embed_dim;
    uint32_t num_heads;
    uint32_t vocab_size;
    uint32_t max_seq_len;
    uint32_t canary_count;
    uint8_t reserved[8];       /* Pad to 64 bytes */
} MagicHeader;

_Static_assert(sizeof(MagicHeader) == 64, "MagicHeader must be 64 bytes");
119 
120 /* ============================================================================
121  * ALLOCATION
122  * ============================================================================ */
123 
125  size_t total = QWEN2_0_5B_DECODE_TOTAL_BYTES;
126 
127 #ifdef __linux__
128  model->base = mmap(NULL, total,
129  PROT_READ | PROT_WRITE,
130  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
131  -1, 0);
132  if (model->base == MAP_FAILED) {
133  model->base = mmap(NULL, total,
134  PROT_READ | PROT_WRITE,
135  MAP_PRIVATE | MAP_ANONYMOUS,
136  -1, 0);
137  }
138  if (model->base == MAP_FAILED) {
139  perror("mmap failed");
140  return -1;
141  }
142 #else
143  model->base = aligned_alloc(64, total);
144  if (!model->base) {
145  perror("aligned_alloc failed");
146  return -1;
147  }
148 #endif
149 
150  model->total_bytes = total;
151 
152  /* Initialize magic header */
153  MagicHeader *header = (MagicHeader *)model->base;
154  header->magic = QWEN2_0_5B_DECODE_MAGIC;
155  header->version = 5;
156  header->total_bytes = QWEN2_0_5B_DECODE_TOTAL_BYTES;
157  header->weight_bytes = QWEN2_0_5B_DECODE_WEIGHT_BYTES;
158  header->activation_bytes = QWEN2_0_5B_DECODE_ACTIVATION_BYTES;
159  header->num_layers = QWEN2_0_5B_DECODE_NUM_LAYERS;
160  header->embed_dim = QWEN2_0_5B_DECODE_EMBED_DIM;
161  header->num_heads = QWEN2_0_5B_DECODE_NUM_HEADS;
162  header->vocab_size = QWEN2_0_5B_DECODE_VOCAB_SIZE;
163  header->max_seq_len = QWEN2_0_5B_DECODE_MAX_SEQ_LEN;
164  header->canary_count = QWEN2_0_5B_DECODE_CANARY_COUNT;
165 
166  /* Initialize canary guards */
167  for (int i = 0; i < QWEN2_0_5B_DECODE_CANARY_COUNT; i++) {
168  uint32_t *ptr = (uint32_t*)((char*)model->base + QWEN2_0_5B_DECODE_CANARIES[i].offset);
169  for (int j = 0; j < (QWEN2_0_5B_DECODE_CANARY_SIZE / 4); j++) {
171  }
172  }
173 
174  return 0;
175 }
176 
178  if (!model || !model->base) return;
179 #ifdef __linux__
180  munmap(model->base, model->total_bytes);
181 #else
182  free(model->base);
183 #endif
184  model->base = NULL;
185  model->total_bytes = 0;
186 }
187 
189  int errors = 0;
190  uint32_t *ptr;
191 
192  for (int i = 0; i < QWEN2_0_5B_DECODE_CANARY_COUNT; i++) {
193  ptr = (uint32_t*)((char*)model->base + QWEN2_0_5B_DECODE_CANARIES[i].offset);
194  for (int j = 0; j < 4; j++) {
195  if (ptr[j] != QWEN2_0_5B_DECODE_CANARY_VALUE) {
196  fprintf(stderr, "CANARY CORRUPTION: %s at offset 0x%lX\n",
198  QWEN2_0_5B_DECODE_CANARIES[i].offset);
199  errors++;
200  break;
201  }
202  }
203  }
204 
205  return errors;
206 }
207 
208 /* ============================================================================
209  * ALIGNMENT HELPERS
210  * ============================================================================ */
211 
/*
 * Round an element count up so that its byte footprint is a multiple of
 * align_bytes, and return the padded element count.
 *
 * The byte product is computed in 64 bits: the original `int bytes =
 * elems * elem_bytes` could overflow (undefined behavior) for large
 * tensors, e.g. a 131072-row KV cache times a multi-byte element size.
 * The result is exact when align_bytes is a multiple of elem_bytes,
 * which holds at the generated call sites (4-byte floats, 64-byte
 * alignment).
 */
static int qwen2_0_5b_decode_align_elems(int elems, int elem_bytes, int align_bytes) {
    int64_t bytes = (int64_t)elems * (int64_t)elem_bytes;
    int64_t aligned = (bytes + align_bytes - 1) / align_bytes * align_bytes;
    return (int)(aligned / elem_bytes);
}
217 
218 /* ============================================================================
219  * ROPE PRECOMPUTE
220  * ============================================================================ */
221 
223  const int T = QWEN2_0_5B_DECODE_MAX_SEQ_LEN;
224  const int D = QWEN2_0_5B_DECODE_HEAD_DIM / 2;
225  const float theta = 1000000.0f;
226 
229 
230  for (int pos = 0; pos < T; pos++) {
231  for (int i = 0; i < D; i++) {
232  float freq = 1.0f / powf(theta, (float)(2 * i) / (float)(D * 2));
233  float angle = (float)pos * freq;
234  cos_ptr[pos * D + i] = cosf(angle);
235  sin_ptr[pos * D + i] = sinf(angle);
236  }
237  }
238 }
239 
240 /* ============================================================================
241  * EXPLICIT PER-LAYER PREFILL FUNCTIONS (v6 unrolled)
242  * ============================================================================ */
243 
244 /*
245  * Layer 0: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
246  */
247 
249  QWEN2_0_5B_DECODEModel *model,
250  const int *tokens,
251  int num_tokens
252 ) {
253  if (!model || !tokens || num_tokens <= 0) {
254  return;
255  }
256 
257  const int elem_bytes = QWEN2_0_5B_DECODE_DTYPE_BYTES;
258  const int aligned_embed_dim = 896;
259  const int aligned_head_dim = 64;
260  const int aligned_intermediate_dim = 4864;
261  const int aligned_context_window = 131072;
262 
264  const void *embed_weight = (const void *)QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_HEADER.token_emb);
265  embedding_forward_q8_0((const int32_t *)tokens,
266  num_tokens,
268  embed_weight,
269  NULL,
270  embed_out,
272  aligned_embed_dim,
273  num_tokens,
274  0);
275 
277  model,
278  num_tokens,
279  aligned_embed_dim,
280  aligned_head_dim,
281  aligned_intermediate_dim,
282  aligned_context_window);
283 
287  num_tokens,
288  aligned_context_window,
289  aligned_head_dim);
293  num_tokens,
294  aligned_context_window,
295  aligned_head_dim);
296 
298  model,
299  num_tokens,
300  aligned_embed_dim,
301  aligned_head_dim,
302  aligned_intermediate_dim,
303  aligned_context_window);
304 
308  num_tokens,
309  aligned_context_window,
310  aligned_head_dim);
314  num_tokens,
315  aligned_context_window,
316  aligned_head_dim);
317 
319  model,
320  num_tokens,
321  aligned_embed_dim,
322  aligned_head_dim,
323  aligned_intermediate_dim,
324  aligned_context_window);
325 
329  num_tokens,
330  aligned_context_window,
331  aligned_head_dim);
335  num_tokens,
336  aligned_context_window,
337  aligned_head_dim);
338 
340  model,
341  num_tokens,
342  aligned_embed_dim,
343  aligned_head_dim,
344  aligned_intermediate_dim,
345  aligned_context_window);
346 
350  num_tokens,
351  aligned_context_window,
352  aligned_head_dim);
356  num_tokens,
357  aligned_context_window,
358  aligned_head_dim);
359 
361  model,
362  num_tokens,
363  aligned_embed_dim,
364  aligned_head_dim,
365  aligned_intermediate_dim,
366  aligned_context_window);
367 
371  num_tokens,
372  aligned_context_window,
373  aligned_head_dim);
377  num_tokens,
378  aligned_context_window,
379  aligned_head_dim);
380 
382  model,
383  num_tokens,
384  aligned_embed_dim,
385  aligned_head_dim,
386  aligned_intermediate_dim,
387  aligned_context_window);
388 
392  num_tokens,
393  aligned_context_window,
394  aligned_head_dim);
398  num_tokens,
399  aligned_context_window,
400  aligned_head_dim);
401 
403  model,
404  num_tokens,
405  aligned_embed_dim,
406  aligned_head_dim,
407  aligned_intermediate_dim,
408  aligned_context_window);
409 
413  num_tokens,
414  aligned_context_window,
415  aligned_head_dim);
419  num_tokens,
420  aligned_context_window,
421  aligned_head_dim);
422 
424  model,
425  num_tokens,
426  aligned_embed_dim,
427  aligned_head_dim,
428  aligned_intermediate_dim,
429  aligned_context_window);
430 
434  num_tokens,
435  aligned_context_window,
436  aligned_head_dim);
440  num_tokens,
441  aligned_context_window,
442  aligned_head_dim);
443 
445  model,
446  num_tokens,
447  aligned_embed_dim,
448  aligned_head_dim,
449  aligned_intermediate_dim,
450  aligned_context_window);
451 
455  num_tokens,
456  aligned_context_window,
457  aligned_head_dim);
461  num_tokens,
462  aligned_context_window,
463  aligned_head_dim);
464 
466  model,
467  num_tokens,
468  aligned_embed_dim,
469  aligned_head_dim,
470  aligned_intermediate_dim,
471  aligned_context_window);
472 
476  num_tokens,
477  aligned_context_window,
478  aligned_head_dim);
482  num_tokens,
483  aligned_context_window,
484  aligned_head_dim);
485 
487  model,
488  num_tokens,
489  aligned_embed_dim,
490  aligned_head_dim,
491  aligned_intermediate_dim,
492  aligned_context_window);
493 
497  num_tokens,
498  aligned_context_window,
499  aligned_head_dim);
503  num_tokens,
504  aligned_context_window,
505  aligned_head_dim);
506 
508  model,
509  num_tokens,
510  aligned_embed_dim,
511  aligned_head_dim,
512  aligned_intermediate_dim,
513  aligned_context_window);
514 
518  num_tokens,
519  aligned_context_window,
520  aligned_head_dim);
524  num_tokens,
525  aligned_context_window,
526  aligned_head_dim);
527 
529  model,
530  num_tokens,
531  aligned_embed_dim,
532  aligned_head_dim,
533  aligned_intermediate_dim,
534  aligned_context_window);
535 
539  num_tokens,
540  aligned_context_window,
541  aligned_head_dim);
545  num_tokens,
546  aligned_context_window,
547  aligned_head_dim);
548 
550  model,
551  num_tokens,
552  aligned_embed_dim,
553  aligned_head_dim,
554  aligned_intermediate_dim,
555  aligned_context_window);
556 
560  num_tokens,
561  aligned_context_window,
562  aligned_head_dim);
566  num_tokens,
567  aligned_context_window,
568  aligned_head_dim);
569 
571  model,
572  num_tokens,
573  aligned_embed_dim,
574  aligned_head_dim,
575  aligned_intermediate_dim,
576  aligned_context_window);
577 
581  num_tokens,
582  aligned_context_window,
583  aligned_head_dim);
587  num_tokens,
588  aligned_context_window,
589  aligned_head_dim);
590 
592  model,
593  num_tokens,
594  aligned_embed_dim,
595  aligned_head_dim,
596  aligned_intermediate_dim,
597  aligned_context_window);
598 
602  num_tokens,
603  aligned_context_window,
604  aligned_head_dim);
608  num_tokens,
609  aligned_context_window,
610  aligned_head_dim);
611 
613  model,
614  num_tokens,
615  aligned_embed_dim,
616  aligned_head_dim,
617  aligned_intermediate_dim,
618  aligned_context_window);
619 
623  num_tokens,
624  aligned_context_window,
625  aligned_head_dim);
629  num_tokens,
630  aligned_context_window,
631  aligned_head_dim);
632 
634  model,
635  num_tokens,
636  aligned_embed_dim,
637  aligned_head_dim,
638  aligned_intermediate_dim,
639  aligned_context_window);
640 
644  num_tokens,
645  aligned_context_window,
646  aligned_head_dim);
650  num_tokens,
651  aligned_context_window,
652  aligned_head_dim);
653 
655  model,
656  num_tokens,
657  aligned_embed_dim,
658  aligned_head_dim,
659  aligned_intermediate_dim,
660  aligned_context_window);
661 
665  num_tokens,
666  aligned_context_window,
667  aligned_head_dim);
671  num_tokens,
672  aligned_context_window,
673  aligned_head_dim);
674 
676  model,
677  num_tokens,
678  aligned_embed_dim,
679  aligned_head_dim,
680  aligned_intermediate_dim,
681  aligned_context_window);
682 
686  num_tokens,
687  aligned_context_window,
688  aligned_head_dim);
692  num_tokens,
693  aligned_context_window,
694  aligned_head_dim);
695 
697  model,
698  num_tokens,
699  aligned_embed_dim,
700  aligned_head_dim,
701  aligned_intermediate_dim,
702  aligned_context_window);
703 
707  num_tokens,
708  aligned_context_window,
709  aligned_head_dim);
713  num_tokens,
714  aligned_context_window,
715  aligned_head_dim);
716 
718  model,
719  num_tokens,
720  aligned_embed_dim,
721  aligned_head_dim,
722  aligned_intermediate_dim,
723  aligned_context_window);
724 
728  num_tokens,
729  aligned_context_window,
730  aligned_head_dim);
734  num_tokens,
735  aligned_context_window,
736  aligned_head_dim);
737 
739  model,
740  num_tokens,
741  aligned_embed_dim,
742  aligned_head_dim,
743  aligned_intermediate_dim,
744  aligned_context_window);
745 
749  num_tokens,
750  aligned_context_window,
751  aligned_head_dim);
755  num_tokens,
756  aligned_context_window,
757  aligned_head_dim);
758 
760  model,
761  num_tokens,
762  aligned_embed_dim,
763  aligned_head_dim,
764  aligned_intermediate_dim,
765  aligned_context_window);
766 
770  num_tokens,
771  aligned_context_window,
772  aligned_head_dim);
776  num_tokens,
777  aligned_context_window,
778  aligned_head_dim);
779 
780  float *last_hidden = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[QWEN2_0_5B_DECODE_NUM_LAYERS - 1].output);
781  float *final_ln_weight = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_FOOTER.final_ln_weight);
783  rmsnorm_forward(last_hidden,
784  final_ln_weight,
785  final_out,
786  NULL,
787  num_tokens,
789  aligned_embed_dim,
790  1e-06f);
791 
792  float *logits = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_FOOTER.logits);
793  const void *lm_head = (const void *)QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_FOOTER.lm_head_weight);
794  for (int t = 0; t < num_tokens; ++t) {
795  const float *row = final_out + (size_t)t * (size_t)aligned_embed_dim;
796  float *logits_row = logits + (size_t)t * (size_t)QWEN2_0_5B_DECODE_VOCAB_SIZE;
797  gemm_nt_q8_0(row,
798  lm_head,
799  NULL,
800  logits_row,
801  1,
803  aligned_embed_dim);
804  }
805 }
806 
807 /* ============================================================================
808  * EXPLICIT PER-LAYER DECODE FUNCTIONS (v6 unrolled)
809  * ============================================================================ */
810 
811 /*
812  * Layer 0: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
813  */
815  QWEN2_0_5B_DECODEModel *model,
816  int token_index,
817  int aligned_embed_dim,
818  int aligned_head_dim,
819  int aligned_intermediate_dim,
820  int aligned_context_window
821 ) {
823 
825 
826  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
827  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
828  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
829  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
830  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
831  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
832  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
833  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
834  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
835  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
836  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
837 
838  /* Weights (explicit types for layer 0) */
839  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
840  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
841  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
842  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
843  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
844  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
845 
846  /* Attention biases (Qwen2-style) */
847  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
848  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
849  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
850 
853 
854  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
855  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
856  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
857 
858  float q_token[H * aligned_head_dim];
859  float k_token[H_kv * aligned_head_dim];
860  float v_token[H_kv * aligned_head_dim];
861  float attn_token[H * aligned_head_dim];
862 
863  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
864  float fc1_out[2 * aligned_intermediate_dim];
865  float swiglu_out[aligned_intermediate_dim];
866 
867  /* Step 1: RMSNorm before attention */
868  rmsnorm_forward(input,
869  ln1_gamma,
870  ln1_out,
871  NULL,
872  1,
874  aligned_embed_dim,
875  1e-06f);
876 
877  /* Step 2: QKV projection */
878  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
879  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
880 
881  /* K projection: Q5_0 -> gemm_nt_q5_0 */
882  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
883 
884  /* V projection: Q8_0 -> gemm_nt_q8_0 */
885  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
886 
887  /* Step 3: RoPE */
888  rope_forward_qk(q_token,
889  k_token,
890  rope_cos,
891  rope_sin,
892  H,
893  H_kv,
894  1,
895  head_dim,
896  aligned_head_dim,
897  token_index);
898 
899  /* Step 4: KV cache write */
901  v_token,
902  k_cache,
903  v_cache,
904  H_kv,
905  token_index,
906  aligned_context_window,
907  head_dim,
908  aligned_head_dim);
909 
910  /* Step 5: Attention (decode) */
912  k_cache,
913  v_cache,
914  attn_token,
915  H,
916  H_kv,
917  token_index + 1,
918  aligned_context_window,
919  head_dim,
920  aligned_head_dim);
921 
922  /* Step 6: Output projection */
923  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
924  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
925 
926  /* Step 7: Residual add */
927  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
928 
929  /* Step 8: RMSNorm before MLP */
930  rmsnorm_forward(residual1,
931  ln2_gamma,
932  ln2_out,
933  NULL,
934  1,
936  aligned_embed_dim,
937  1e-06f);
938 
939  /* Step 9: MLP (SwiGLU) */
940  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
941  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
942 
943  /* SwiGLU activation */
944  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
945 
946  /* Down projection: Q6_K -> gemm_nt_q6_k */
947  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
948 
949  /* Step 10: Final residual add */
950  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
951 }
952 
953 /*
954  * Layer 1: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
955  */
957  QWEN2_0_5B_DECODEModel *model,
958  int token_index,
959  int aligned_embed_dim,
960  int aligned_head_dim,
961  int aligned_intermediate_dim,
962  int aligned_context_window
963 ) {
965 
966  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[0].output);
967 
968  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
969  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
970  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
971  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
972  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
973  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
974  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
975  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
976  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
977  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
978  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
979 
980  /* Weights (explicit types for layer 1) */
981  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
982  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
983  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
984  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
985  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
986  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
987 
988  /* Attention biases (Qwen2-style) */
989  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
990  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
991  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
992 
995 
996  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
997  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
998  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
999 
1000  float q_token[H * aligned_head_dim];
1001  float k_token[H_kv * aligned_head_dim];
1002  float v_token[H_kv * aligned_head_dim];
1003  float attn_token[H * aligned_head_dim];
1004 
1005  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
1006  float fc1_out[2 * aligned_intermediate_dim];
1007  float swiglu_out[aligned_intermediate_dim];
1008 
1009  /* Step 1: RMSNorm before attention */
1010  rmsnorm_forward(input,
1011  ln1_gamma,
1012  ln1_out,
1013  NULL,
1014  1,
1016  aligned_embed_dim,
1017  1e-06f);
1018 
1019  /* Step 2: QKV projection */
1020  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
1021  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1022 
1023  /* K projection: Q5_0 -> gemm_nt_q5_0 */
1024  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1025 
1026  /* V projection: Q8_0 -> gemm_nt_q8_0 */
1027  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1028 
1029  /* Step 3: RoPE */
1030  rope_forward_qk(q_token,
1031  k_token,
1032  rope_cos,
1033  rope_sin,
1034  H,
1035  H_kv,
1036  1,
1037  head_dim,
1038  aligned_head_dim,
1039  token_index);
1040 
1041  /* Step 4: KV cache write */
1042  kv_cache_write_head_major(k_token,
1043  v_token,
1044  k_cache,
1045  v_cache,
1046  H_kv,
1047  token_index,
1048  aligned_context_window,
1049  head_dim,
1050  aligned_head_dim);
1051 
1052  /* Step 5: Attention (decode) */
1054  k_cache,
1055  v_cache,
1056  attn_token,
1057  H,
1058  H_kv,
1059  token_index + 1,
1060  aligned_context_window,
1061  head_dim,
1062  aligned_head_dim);
1063 
1064  /* Step 6: Output projection */
1065  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
1066  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1067 
1068  /* Step 7: Residual add */
1069  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
1070 
1071  /* Step 8: RMSNorm before MLP */
1072  rmsnorm_forward(residual1,
1073  ln2_gamma,
1074  ln2_out,
1075  NULL,
1076  1,
1078  aligned_embed_dim,
1079  1e-06f);
1080 
1081  /* Step 9: MLP (SwiGLU) */
1082  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
1083  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1084 
1085  /* SwiGLU activation */
1086  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1087 
1088  /* Down projection: Q6_K -> gemm_nt_q6_k */
1089  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1090 
1091  /* Step 10: Final residual add */
1092  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
1093 }
1094 
1095 /*
1096  * Layer 2: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
1097  */
1099  QWEN2_0_5B_DECODEModel *model,
1100  int token_index,
1101  int aligned_embed_dim,
1102  int aligned_head_dim,
1103  int aligned_intermediate_dim,
1104  int aligned_context_window
1105 ) {
1107 
1108  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[1].output);
1109 
1110  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
1111  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
1112  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
1113  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
1114  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
1115  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
1116  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
1117  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
1118  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
1119  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
1120  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
1121 
1122  /* Weights (explicit types for layer 2) */
1123  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
1124  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
1125  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
1126  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
1127  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
1128  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
1129 
1130  /* Attention biases (Qwen2-style) */
1131  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
1132  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
1133  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
1134 
1137 
1138  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
1139  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
1140  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
1141 
1142  float q_token[H * aligned_head_dim];
1143  float k_token[H_kv * aligned_head_dim];
1144  float v_token[H_kv * aligned_head_dim];
1145  float attn_token[H * aligned_head_dim];
1146 
1147  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
1148  float fc1_out[2 * aligned_intermediate_dim];
1149  float swiglu_out[aligned_intermediate_dim];
1150 
1151  /* Step 1: RMSNorm before attention */
1152  rmsnorm_forward(input,
1153  ln1_gamma,
1154  ln1_out,
1155  NULL,
1156  1,
1158  aligned_embed_dim,
1159  1e-06f);
1160 
1161  /* Step 2: QKV projection */
1162  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
1163  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1164 
1165  /* K projection: Q5_0 -> gemm_nt_q5_0 */
1166  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1167 
1168  /* V projection: Q5_0 -> gemm_nt_q5_0 */
1169  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1170 
1171  /* Step 3: RoPE */
1172  rope_forward_qk(q_token,
1173  k_token,
1174  rope_cos,
1175  rope_sin,
1176  H,
1177  H_kv,
1178  1,
1179  head_dim,
1180  aligned_head_dim,
1181  token_index);
1182 
1183  /* Step 4: KV cache write */
1184  kv_cache_write_head_major(k_token,
1185  v_token,
1186  k_cache,
1187  v_cache,
1188  H_kv,
1189  token_index,
1190  aligned_context_window,
1191  head_dim,
1192  aligned_head_dim);
1193 
1194  /* Step 5: Attention (decode) */
1196  k_cache,
1197  v_cache,
1198  attn_token,
1199  H,
1200  H_kv,
1201  token_index + 1,
1202  aligned_context_window,
1203  head_dim,
1204  aligned_head_dim);
1205 
1206  /* Step 6: Output projection */
1207  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
1208  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1209 
1210  /* Step 7: Residual add */
1211  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
1212 
1213  /* Step 8: RMSNorm before MLP */
1214  rmsnorm_forward(residual1,
1215  ln2_gamma,
1216  ln2_out,
1217  NULL,
1218  1,
1220  aligned_embed_dim,
1221  1e-06f);
1222 
1223  /* Step 9: MLP (SwiGLU) */
1224  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
1225  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1226 
1227  /* SwiGLU activation */
1228  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1229 
1230  /* Down projection: Q4_K -> gemm_nt_q4_k */
1231  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1232 
1233  /* Step 10: Final residual add */
1234  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
1235 }
1236 
1237 /*
1238  * Layer 3: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
1239  */
/*
 * NOTE(review): this is a Doxygen-rendered extract; lines that contained
 * hyperlinked identifiers were dropped by the extractor and are flagged
 * inline below. Visible flow (decode step, single token, for layer 3):
 *   RMSNorm -> quantized Q/K/V projections (with Qwen2-style biases) ->
 *   RoPE on Q/K -> KV-cache append at token_index -> attention over the
 *   token_index+1 cached positions -> WO projection -> residual add ->
 *   RMSNorm -> SwiGLU MLP (gate+up fused in W1, down in W2) -> residual add.
 * Input is the previous layer's output (QWEN2_0_5B_DECODE_LAYERS[2].output).
 */
/* NOTE(review): rendered line 1240 missing -- the function name / signature
 * opener (presumably a static layer-3 forward function); confirm against
 * the generator output. */
1241  QWEN2_0_5B_DECODEModel *model,
1242  int token_index,
1243  int aligned_embed_dim,
1244  int aligned_head_dim,
1245  int aligned_intermediate_dim,
1246  int aligned_context_window
1247 ) {
/* NOTE(review): rendered line 1248 missing -- presumably declares the layer
 * descriptor `L` used below (likely &QWEN2_0_5B_DECODE_LAYERS[3]); confirm. */
1249 
1250  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[2].output);
1251 
1252  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
1253  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
1254  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
1255  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
1256  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
1257  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
1258  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
1259  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
1260  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
1261  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
1262  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
1263 
1264  /* Weights (explicit types for layer 3) */
1265  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
1266  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
1267  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
1268  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
1269  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
1270  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
1271 
1272  /* Attention biases (Qwen2-style) */
1273  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
1274  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
1275  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
1276 
/* NOTE(review): rendered lines 1277-1278 missing -- presumably the rope_cos /
 * rope_sin table pointers consumed by rope_forward_qk below; confirm. */
1279 
1280  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
1281  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
1282  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
1283 
1284  float q_token[H * aligned_head_dim];
1285  float k_token[H_kv * aligned_head_dim];
1286  float v_token[H_kv * aligned_head_dim];
1287  float attn_token[H * aligned_head_dim];
1288 
1289  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
1290  float fc1_out[2 * aligned_intermediate_dim];
1291  float swiglu_out[aligned_intermediate_dim];
1292 
1293  /* Step 1: RMSNorm before attention */
1294  rmsnorm_forward(input,
1295  ln1_gamma,
1296  ln1_out,
1297  NULL,
1298  1,
/* NOTE(review): rendered line 1299 missing -- one hyperlinked argument of
 * this call (position suggests a dim/stride constant); confirm call arity. */
1300  aligned_embed_dim,
1301  1e-06f);
1302 
1303  /* Step 2: QKV projection */
1304  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
1305  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1306 
1307  /* K projection: Q5_0 -> gemm_nt_q5_0 */
1308  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1309 
1310  /* V projection: Q8_0 -> gemm_nt_q8_0 */
1311  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1312 
1313  /* Step 3: RoPE */
1314  rope_forward_qk(q_token,
1315  k_token,
1316  rope_cos,
1317  rope_sin,
1318  H,
1319  H_kv,
1320  1,
1321  head_dim,
1322  aligned_head_dim,
1323  token_index);
1324 
1325  /* Step 4: KV cache write */
1326  kv_cache_write_head_major(k_token,
1327  v_token,
1328  k_cache,
1329  v_cache,
1330  H_kv,
1331  token_index,
1332  aligned_context_window,
1333  head_dim,
1334  aligned_head_dim);
1335 
1336  /* Step 5: Attention (decode) */
/* NOTE(review): rendered line 1337 missing -- the attention kernel's name and
 * first argument (presumably q_token); confirm against the generator. */
1338  k_cache,
1339  v_cache,
1340  attn_token,
1341  H,
1342  H_kv,
1343  token_index + 1,
1344  aligned_context_window,
1345  head_dim,
1346  aligned_head_dim);
1347 
1348  /* Step 6: Output projection */
1349  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
1350  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1351 
1352  /* Step 7: Residual add */
1353  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
1354 
1355  /* Step 8: RMSNorm before MLP */
1356  rmsnorm_forward(residual1,
1357  ln2_gamma,
1358  ln2_out,
1359  NULL,
1360  1,
/* NOTE(review): rendered line 1361 missing -- same dropped argument as in the
 * Step 1 rmsnorm_forward call; confirm call arity. */
1362  aligned_embed_dim,
1363  1e-06f);
1364 
1365  /* Step 9: MLP (SwiGLU) */
1366  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
1367  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1368 
1369  /* SwiGLU activation */
1370  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1371 
1372  /* Down projection: Q6_K -> gemm_nt_q6_k */
1373  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1374 
1375  /* Step 10: Final residual add */
1376  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
1377 }
1378 
1379 /*
1380  * Layer 4: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
1381  */
/*
 * NOTE(review): this is a Doxygen-rendered extract; lines that contained
 * hyperlinked identifiers were dropped by the extractor and are flagged
 * inline below. Visible flow (decode step, single token, for layer 4):
 *   RMSNorm -> quantized Q/K/V projections (with Qwen2-style biases) ->
 *   RoPE on Q/K -> KV-cache append at token_index -> attention over the
 *   token_index+1 cached positions -> WO projection -> residual add ->
 *   RMSNorm -> SwiGLU MLP (gate+up fused in W1, down in W2) -> residual add.
 * Input is the previous layer's output (QWEN2_0_5B_DECODE_LAYERS[3].output).
 */
/* NOTE(review): rendered line 1382 missing -- the function name / signature
 * opener (presumably a static layer-4 forward function); confirm against
 * the generator output. */
1383  QWEN2_0_5B_DECODEModel *model,
1384  int token_index,
1385  int aligned_embed_dim,
1386  int aligned_head_dim,
1387  int aligned_intermediate_dim,
1388  int aligned_context_window
1389 ) {
/* NOTE(review): rendered line 1390 missing -- presumably declares the layer
 * descriptor `L` used below (likely &QWEN2_0_5B_DECODE_LAYERS[4]); confirm. */
1391 
1392  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[3].output);
1393 
1394  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
1395  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
1396  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
1397  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
1398  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
1399  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
1400  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
1401  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
1402  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
1403  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
1404  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
1405 
1406  /* Weights (explicit types for layer 4) */
1407  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
1408  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
1409  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
1410  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
1411  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
1412  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
1413 
1414  /* Attention biases (Qwen2-style) */
1415  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
1416  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
1417  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
1418 
/* NOTE(review): rendered lines 1419-1420 missing -- presumably the rope_cos /
 * rope_sin table pointers consumed by rope_forward_qk below; confirm. */
1421 
1422  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
1423  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
1424  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
1425 
1426  float q_token[H * aligned_head_dim];
1427  float k_token[H_kv * aligned_head_dim];
1428  float v_token[H_kv * aligned_head_dim];
1429  float attn_token[H * aligned_head_dim];
1430 
1431  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
1432  float fc1_out[2 * aligned_intermediate_dim];
1433  float swiglu_out[aligned_intermediate_dim];
1434 
1435  /* Step 1: RMSNorm before attention */
1436  rmsnorm_forward(input,
1437  ln1_gamma,
1438  ln1_out,
1439  NULL,
1440  1,
/* NOTE(review): rendered line 1441 missing -- one hyperlinked argument of
 * this call (position suggests a dim/stride constant); confirm call arity. */
1442  aligned_embed_dim,
1443  1e-06f);
1444 
1445  /* Step 2: QKV projection */
1446  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
1447  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1448 
1449  /* K projection: Q5_0 -> gemm_nt_q5_0 */
1450  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1451 
1452  /* V projection: Q5_0 -> gemm_nt_q5_0 */
1453  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1454 
1455  /* Step 3: RoPE */
1456  rope_forward_qk(q_token,
1457  k_token,
1458  rope_cos,
1459  rope_sin,
1460  H,
1461  H_kv,
1462  1,
1463  head_dim,
1464  aligned_head_dim,
1465  token_index);
1466 
1467  /* Step 4: KV cache write */
1468  kv_cache_write_head_major(k_token,
1469  v_token,
1470  k_cache,
1471  v_cache,
1472  H_kv,
1473  token_index,
1474  aligned_context_window,
1475  head_dim,
1476  aligned_head_dim);
1477 
1478  /* Step 5: Attention (decode) */
/* NOTE(review): rendered line 1479 missing -- the attention kernel's name and
 * first argument (presumably q_token); confirm against the generator. */
1480  k_cache,
1481  v_cache,
1482  attn_token,
1483  H,
1484  H_kv,
1485  token_index + 1,
1486  aligned_context_window,
1487  head_dim,
1488  aligned_head_dim);
1489 
1490  /* Step 6: Output projection */
1491  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
1492  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1493 
1494  /* Step 7: Residual add */
1495  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
1496 
1497  /* Step 8: RMSNorm before MLP */
1498  rmsnorm_forward(residual1,
1499  ln2_gamma,
1500  ln2_out,
1501  NULL,
1502  1,
/* NOTE(review): rendered line 1503 missing -- same dropped argument as in the
 * Step 1 rmsnorm_forward call; confirm call arity. */
1504  aligned_embed_dim,
1505  1e-06f);
1506 
1507  /* Step 9: MLP (SwiGLU) */
1508  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
1509  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1510 
1511  /* SwiGLU activation */
1512  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1513 
1514  /* Down projection: Q4_K -> gemm_nt_q4_k */
1515  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1516 
1517  /* Step 10: Final residual add */
1518  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
1519 }
1520 
1521 /*
1522  * Layer 5: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
1523  */
/*
 * NOTE(review): this is a Doxygen-rendered extract; lines that contained
 * hyperlinked identifiers were dropped by the extractor and are flagged
 * inline below. Visible flow (decode step, single token, for layer 5):
 *   RMSNorm -> quantized Q/K/V projections (with Qwen2-style biases) ->
 *   RoPE on Q/K -> KV-cache append at token_index -> attention over the
 *   token_index+1 cached positions -> WO projection -> residual add ->
 *   RMSNorm -> SwiGLU MLP (gate+up fused in W1, down in W2) -> residual add.
 * Input is the previous layer's output (QWEN2_0_5B_DECODE_LAYERS[4].output).
 */
/* NOTE(review): rendered line 1524 missing -- the function name / signature
 * opener (presumably a static layer-5 forward function); confirm against
 * the generator output. */
1525  QWEN2_0_5B_DECODEModel *model,
1526  int token_index,
1527  int aligned_embed_dim,
1528  int aligned_head_dim,
1529  int aligned_intermediate_dim,
1530  int aligned_context_window
1531 ) {
/* NOTE(review): rendered line 1532 missing -- presumably declares the layer
 * descriptor `L` used below (likely &QWEN2_0_5B_DECODE_LAYERS[5]); confirm. */
1533 
1534  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[4].output);
1535 
1536  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
1537  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
1538  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
1539  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
1540  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
1541  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
1542  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
1543  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
1544  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
1545  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
1546  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
1547 
1548  /* Weights (explicit types for layer 5) */
1549  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
1550  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
1551  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
1552  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
1553  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
1554  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
1555 
1556  /* Attention biases (Qwen2-style) */
1557  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
1558  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
1559  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
1560 
/* NOTE(review): rendered lines 1561-1562 missing -- presumably the rope_cos /
 * rope_sin table pointers consumed by rope_forward_qk below; confirm. */
1563 
1564  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
1565  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
1566  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
1567 
1568  float q_token[H * aligned_head_dim];
1569  float k_token[H_kv * aligned_head_dim];
1570  float v_token[H_kv * aligned_head_dim];
1571  float attn_token[H * aligned_head_dim];
1572 
1573  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
1574  float fc1_out[2 * aligned_intermediate_dim];
1575  float swiglu_out[aligned_intermediate_dim];
1576 
1577  /* Step 1: RMSNorm before attention */
1578  rmsnorm_forward(input,
1579  ln1_gamma,
1580  ln1_out,
1581  NULL,
1582  1,
/* NOTE(review): rendered line 1583 missing -- one hyperlinked argument of
 * this call (position suggests a dim/stride constant); confirm call arity. */
1584  aligned_embed_dim,
1585  1e-06f);
1586 
1587  /* Step 2: QKV projection */
1588  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
1589  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1590 
1591  /* K projection: Q5_0 -> gemm_nt_q5_0 */
1592  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1593 
1594  /* V projection: Q5_0 -> gemm_nt_q5_0 */
1595  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1596 
1597  /* Step 3: RoPE */
1598  rope_forward_qk(q_token,
1599  k_token,
1600  rope_cos,
1601  rope_sin,
1602  H,
1603  H_kv,
1604  1,
1605  head_dim,
1606  aligned_head_dim,
1607  token_index);
1608 
1609  /* Step 4: KV cache write */
1610  kv_cache_write_head_major(k_token,
1611  v_token,
1612  k_cache,
1613  v_cache,
1614  H_kv,
1615  token_index,
1616  aligned_context_window,
1617  head_dim,
1618  aligned_head_dim);
1619 
1620  /* Step 5: Attention (decode) */
/* NOTE(review): rendered line 1621 missing -- the attention kernel's name and
 * first argument (presumably q_token); confirm against the generator. */
1622  k_cache,
1623  v_cache,
1624  attn_token,
1625  H,
1626  H_kv,
1627  token_index + 1,
1628  aligned_context_window,
1629  head_dim,
1630  aligned_head_dim);
1631 
1632  /* Step 6: Output projection */
1633  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
1634  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1635 
1636  /* Step 7: Residual add */
1637  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
1638 
1639  /* Step 8: RMSNorm before MLP */
1640  rmsnorm_forward(residual1,
1641  ln2_gamma,
1642  ln2_out,
1643  NULL,
1644  1,
/* NOTE(review): rendered line 1645 missing -- same dropped argument as in the
 * Step 1 rmsnorm_forward call; confirm call arity. */
1646  aligned_embed_dim,
1647  1e-06f);
1648 
1649  /* Step 9: MLP (SwiGLU) */
1650  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
1651  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1652 
1653  /* SwiGLU activation */
1654  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1655 
1656  /* Down projection: Q4_K -> gemm_nt_q4_k */
1657  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1658 
1659  /* Step 10: Final residual add */
1660  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
1661 }
1662 
1663 /*
1664  * Layer 6: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
1665  */
/*
 * NOTE(review): this is a Doxygen-rendered extract; lines that contained
 * hyperlinked identifiers were dropped by the extractor and are flagged
 * inline below. Visible flow (decode step, single token, for layer 6):
 *   RMSNorm -> quantized Q/K/V projections (with Qwen2-style biases) ->
 *   RoPE on Q/K -> KV-cache append at token_index -> attention over the
 *   token_index+1 cached positions -> WO projection -> residual add ->
 *   RMSNorm -> SwiGLU MLP (gate+up fused in W1, down in W2) -> residual add.
 * Input is the previous layer's output (QWEN2_0_5B_DECODE_LAYERS[5].output).
 */
/* NOTE(review): rendered line 1666 missing -- the function name / signature
 * opener (presumably a static layer-6 forward function); confirm against
 * the generator output. */
1667  QWEN2_0_5B_DECODEModel *model,
1668  int token_index,
1669  int aligned_embed_dim,
1670  int aligned_head_dim,
1671  int aligned_intermediate_dim,
1672  int aligned_context_window
1673 ) {
/* NOTE(review): rendered line 1674 missing -- presumably declares the layer
 * descriptor `L` used below (likely &QWEN2_0_5B_DECODE_LAYERS[6]); confirm. */
1675 
1676  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[5].output);
1677 
1678  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
1679  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
1680  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
1681  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
1682  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
1683  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
1684  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
1685  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
1686  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
1687  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
1688  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
1689 
1690  /* Weights (explicit types for layer 6) */
1691  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
1692  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
1693  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
1694  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
1695  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
1696  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
1697 
1698  /* Attention biases (Qwen2-style) */
1699  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
1700  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
1701  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
1702 
/* NOTE(review): rendered lines 1703-1704 missing -- presumably the rope_cos /
 * rope_sin table pointers consumed by rope_forward_qk below; confirm. */
1705 
1706  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
1707  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
1708  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
1709 
1710  float q_token[H * aligned_head_dim];
1711  float k_token[H_kv * aligned_head_dim];
1712  float v_token[H_kv * aligned_head_dim];
1713  float attn_token[H * aligned_head_dim];
1714 
1715  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
1716  float fc1_out[2 * aligned_intermediate_dim];
1717  float swiglu_out[aligned_intermediate_dim];
1718 
1719  /* Step 1: RMSNorm before attention */
1720  rmsnorm_forward(input,
1721  ln1_gamma,
1722  ln1_out,
1723  NULL,
1724  1,
/* NOTE(review): rendered line 1725 missing -- one hyperlinked argument of
 * this call (position suggests a dim/stride constant); confirm call arity. */
1726  aligned_embed_dim,
1727  1e-06f);
1728 
1729  /* Step 2: QKV projection */
1730  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
1731  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1732 
1733  /* K projection: Q5_0 -> gemm_nt_q5_0 */
1734  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1735 
1736  /* V projection: Q8_0 -> gemm_nt_q8_0 */
1737  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1738 
1739  /* Step 3: RoPE */
1740  rope_forward_qk(q_token,
1741  k_token,
1742  rope_cos,
1743  rope_sin,
1744  H,
1745  H_kv,
1746  1,
1747  head_dim,
1748  aligned_head_dim,
1749  token_index);
1750 
1751  /* Step 4: KV cache write */
1752  kv_cache_write_head_major(k_token,
1753  v_token,
1754  k_cache,
1755  v_cache,
1756  H_kv,
1757  token_index,
1758  aligned_context_window,
1759  head_dim,
1760  aligned_head_dim);
1761 
1762  /* Step 5: Attention (decode) */
/* NOTE(review): rendered line 1763 missing -- the attention kernel's name and
 * first argument (presumably q_token); confirm against the generator. */
1764  k_cache,
1765  v_cache,
1766  attn_token,
1767  H,
1768  H_kv,
1769  token_index + 1,
1770  aligned_context_window,
1771  head_dim,
1772  aligned_head_dim);
1773 
1774  /* Step 6: Output projection */
1775  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
1776  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1777 
1778  /* Step 7: Residual add */
1779  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
1780 
1781  /* Step 8: RMSNorm before MLP */
1782  rmsnorm_forward(residual1,
1783  ln2_gamma,
1784  ln2_out,
1785  NULL,
1786  1,
/* NOTE(review): rendered line 1787 missing -- same dropped argument as in the
 * Step 1 rmsnorm_forward call; confirm call arity. */
1788  aligned_embed_dim,
1789  1e-06f);
1790 
1791  /* Step 9: MLP (SwiGLU) */
1792  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
1793  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1794 
1795  /* SwiGLU activation */
1796  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1797 
1798  /* Down projection: Q6_K -> gemm_nt_q6_k */
1799  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1800 
1801  /* Step 10: Final residual add */
1802  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
1803 }
1804 
1805 /*
1806  * Layer 7: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
1807  */
/*
 * NOTE(review): this is a Doxygen-rendered extract; lines that contained
 * hyperlinked identifiers were dropped by the extractor and are flagged
 * inline below. Visible flow (decode step, single token, for layer 7):
 *   RMSNorm -> quantized Q/K/V projections (with Qwen2-style biases) ->
 *   RoPE on Q/K -> KV-cache append at token_index -> attention over the
 *   token_index+1 cached positions -> WO projection -> residual add ->
 *   RMSNorm -> SwiGLU MLP (gate+up fused in W1, down in W2) -> residual add.
 * Input is the previous layer's output (QWEN2_0_5B_DECODE_LAYERS[6].output).
 */
/* NOTE(review): rendered line 1808 missing -- the function name / signature
 * opener (presumably a static layer-7 forward function); confirm against
 * the generator output. */
1809  QWEN2_0_5B_DECODEModel *model,
1810  int token_index,
1811  int aligned_embed_dim,
1812  int aligned_head_dim,
1813  int aligned_intermediate_dim,
1814  int aligned_context_window
1815 ) {
/* NOTE(review): rendered line 1816 missing -- presumably declares the layer
 * descriptor `L` used below (likely &QWEN2_0_5B_DECODE_LAYERS[7]); confirm. */
1817 
1818  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[6].output);
1819 
1820  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
1821  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
1822  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
1823  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
1824  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
1825  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
1826  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
1827  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
1828  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
1829  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
1830  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
1831 
1832  /* Weights (explicit types for layer 7) */
1833  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
1834  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
1835  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
1836  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
1837  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
1838  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
1839 
1840  /* Attention biases (Qwen2-style) */
1841  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
1842  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
1843  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
1844 
/* NOTE(review): rendered lines 1845-1846 missing -- presumably the rope_cos /
 * rope_sin table pointers consumed by rope_forward_qk below; confirm. */
1847 
1848  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
1849  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
1850  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
1851 
1852  float q_token[H * aligned_head_dim];
1853  float k_token[H_kv * aligned_head_dim];
1854  float v_token[H_kv * aligned_head_dim];
1855  float attn_token[H * aligned_head_dim];
1856 
1857  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
1858  float fc1_out[2 * aligned_intermediate_dim];
1859  float swiglu_out[aligned_intermediate_dim];
1860 
1861  /* Step 1: RMSNorm before attention */
1862  rmsnorm_forward(input,
1863  ln1_gamma,
1864  ln1_out,
1865  NULL,
1866  1,
/* NOTE(review): rendered line 1867 missing -- one hyperlinked argument of
 * this call (position suggests a dim/stride constant); confirm call arity. */
1868  aligned_embed_dim,
1869  1e-06f);
1870 
1871  /* Step 2: QKV projection */
1872  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
1873  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1874 
1875  /* K projection: Q5_0 -> gemm_nt_q5_0 */
1876  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1877 
1878  /* V projection: Q8_0 -> gemm_nt_q8_0 */
1879  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1880 
1881  /* Step 3: RoPE */
1882  rope_forward_qk(q_token,
1883  k_token,
1884  rope_cos,
1885  rope_sin,
1886  H,
1887  H_kv,
1888  1,
1889  head_dim,
1890  aligned_head_dim,
1891  token_index);
1892 
1893  /* Step 4: KV cache write */
1894  kv_cache_write_head_major(k_token,
1895  v_token,
1896  k_cache,
1897  v_cache,
1898  H_kv,
1899  token_index,
1900  aligned_context_window,
1901  head_dim,
1902  aligned_head_dim);
1903 
1904  /* Step 5: Attention (decode) */
/* NOTE(review): rendered line 1905 missing -- the attention kernel's name and
 * first argument (presumably q_token); confirm against the generator. */
1906  k_cache,
1907  v_cache,
1908  attn_token,
1909  H,
1910  H_kv,
1911  token_index + 1,
1912  aligned_context_window,
1913  head_dim,
1914  aligned_head_dim);
1915 
1916  /* Step 6: Output projection */
1917  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
1918  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1919 
1920  /* Step 7: Residual add */
1921  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
1922 
1923  /* Step 8: RMSNorm before MLP */
1924  rmsnorm_forward(residual1,
1925  ln2_gamma,
1926  ln2_out,
1927  NULL,
1928  1,
/* NOTE(review): rendered line 1929 missing -- same dropped argument as in the
 * Step 1 rmsnorm_forward call; confirm call arity. */
1930  aligned_embed_dim,
1931  1e-06f);
1932 
1933  /* Step 9: MLP (SwiGLU) */
1934  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
1935  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1936 
1937  /* SwiGLU activation */
1938  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1939 
1940  /* Down projection: Q6_K -> gemm_nt_q6_k */
1941  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1942 
1943  /* Step 10: Final residual add */
1944  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
1945 }
1946 
1947 /*
1948  * Layer 8: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
1949  */
/*
 * NOTE(review): this is a Doxygen-rendered extract; lines that contained
 * hyperlinked identifiers were dropped by the extractor and are flagged
 * inline below. Visible flow (decode step, single token, for layer 8):
 *   RMSNorm -> quantized Q/K/V projections (with Qwen2-style biases) ->
 *   RoPE on Q/K -> KV-cache append at token_index -> attention over the
 *   token_index+1 cached positions -> WO projection -> residual add ->
 *   RMSNorm -> SwiGLU MLP (gate+up fused in W1, down in W2) -> residual add.
 * Input is the previous layer's output (QWEN2_0_5B_DECODE_LAYERS[7].output).
 */
/* NOTE(review): rendered line 1950 missing -- the function name / signature
 * opener (presumably a static layer-8 forward function); confirm against
 * the generator output. */
1951  QWEN2_0_5B_DECODEModel *model,
1952  int token_index,
1953  int aligned_embed_dim,
1954  int aligned_head_dim,
1955  int aligned_intermediate_dim,
1956  int aligned_context_window
1957 ) {
/* NOTE(review): rendered line 1958 missing -- presumably declares the layer
 * descriptor `L` used below (likely &QWEN2_0_5B_DECODE_LAYERS[8]); confirm. */
1959 
1960  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[7].output);
1961 
1962  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
1963  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
1964  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
1965  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
1966  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
1967  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
1968  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
1969  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
1970  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
1971  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
1972  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
1973 
1974  /* Weights (explicit types for layer 8) */
1975  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
1976  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
1977  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
1978  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
1979  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
1980  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
1981 
1982  /* Attention biases (Qwen2-style) */
1983  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
1984  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
1985  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
1986 
/* NOTE(review): rendered lines 1987-1988 missing -- presumably the rope_cos /
 * rope_sin table pointers consumed by rope_forward_qk below; confirm. */
1989 
1990  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
1991  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
1992  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
1993 
1994  float q_token[H * aligned_head_dim];
1995  float k_token[H_kv * aligned_head_dim];
1996  float v_token[H_kv * aligned_head_dim];
1997  float attn_token[H * aligned_head_dim];
1998 
1999  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
2000  float fc1_out[2 * aligned_intermediate_dim];
2001  float swiglu_out[aligned_intermediate_dim];
2002 
2003  /* Step 1: RMSNorm before attention */
2004  rmsnorm_forward(input,
2005  ln1_gamma,
2006  ln1_out,
2007  NULL,
2008  1,
/* NOTE(review): rendered line 2009 missing -- one hyperlinked argument of
 * this call (position suggests a dim/stride constant); confirm call arity. */
2010  aligned_embed_dim,
2011  1e-06f);
2012 
2013  /* Step 2: QKV projection */
2014  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
2015  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2016 
2017  /* K projection: Q5_0 -> gemm_nt_q5_0 */
2018  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2019 
2020  /* V projection: Q8_0 -> gemm_nt_q8_0 */
2021  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2022 
2023  /* Step 3: RoPE */
2024  rope_forward_qk(q_token,
2025  k_token,
2026  rope_cos,
2027  rope_sin,
2028  H,
2029  H_kv,
2030  1,
2031  head_dim,
2032  aligned_head_dim,
2033  token_index);
2034 
2035  /* Step 4: KV cache write */
2036  kv_cache_write_head_major(k_token,
2037  v_token,
2038  k_cache,
2039  v_cache,
2040  H_kv,
2041  token_index,
2042  aligned_context_window,
2043  head_dim,
2044  aligned_head_dim);
2045 
2046  /* Step 5: Attention (decode) */
/* NOTE(review): rendered line 2047 missing -- the attention kernel's name and
 * first argument (presumably q_token); confirm against the generator. */
2048  k_cache,
2049  v_cache,
2050  attn_token,
2051  H,
2052  H_kv,
2053  token_index + 1,
2054  aligned_context_window,
2055  head_dim,
2056  aligned_head_dim);
2057 
2058  /* Step 6: Output projection */
2059  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
2060  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2061 
2062  /* Step 7: Residual add */
2063  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
2064 
2065  /* Step 8: RMSNorm before MLP */
2066  rmsnorm_forward(residual1,
2067  ln2_gamma,
2068  ln2_out,
2069  NULL,
2070  1,
/* NOTE(review): rendered line 2071 missing -- same dropped argument as in the
 * Step 1 rmsnorm_forward call; confirm call arity. */
2072  aligned_embed_dim,
2073  1e-06f);
2074 
2075  /* Step 9: MLP (SwiGLU) */
2076  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
2077  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2078 
2079  /* SwiGLU activation */
2080  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2081 
2082  /* Down projection: Q6_K -> gemm_nt_q6_k */
2083  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2084 
2085  /* Step 10: Final residual add */
2086  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
2087 }
2088 
2089 /*
2090  * Layer 9: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
2091  */
/*
 * NOTE(review): this is a Doxygen-rendered extract; lines that contained
 * hyperlinked identifiers were dropped by the extractor and are flagged
 * inline below. Visible flow (decode step, single token, for layer 9):
 *   RMSNorm -> quantized Q/K/V projections (with Qwen2-style biases) ->
 *   RoPE on Q/K -> KV-cache append at token_index -> attention over the
 *   token_index+1 cached positions -> WO projection -> residual add ->
 *   RMSNorm -> SwiGLU MLP (gate+up fused in W1, down in W2) -> residual add.
 * Input is the previous layer's output (QWEN2_0_5B_DECODE_LAYERS[8].output).
 */
/* NOTE(review): rendered line 2092 missing -- the function name / signature
 * opener (presumably a static layer-9 forward function); confirm against
 * the generator output. */
2093  QWEN2_0_5B_DECODEModel *model,
2094  int token_index,
2095  int aligned_embed_dim,
2096  int aligned_head_dim,
2097  int aligned_intermediate_dim,
2098  int aligned_context_window
2099 ) {
/* NOTE(review): rendered line 2100 missing -- presumably declares the layer
 * descriptor `L` used below (likely &QWEN2_0_5B_DECODE_LAYERS[9]); confirm. */
2101 
2102  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[8].output);
2103 
2104  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
2105  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
2106  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
2107  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
2108  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
2109  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
2110  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
2111  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
2112  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
2113  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
2114  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
2115 
2116  /* Weights (explicit types for layer 9) */
2117  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
2118  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
2119  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
2120  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
2121  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
2122  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
2123 
2124  /* Attention biases (Qwen2-style) */
2125  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
2126  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
2127  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
2128 
/* NOTE(review): rendered lines 2129-2130 missing -- presumably the rope_cos /
 * rope_sin table pointers consumed by rope_forward_qk below; confirm. */
2131 
2132  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
2133  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
2134  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
2135 
2136  float q_token[H * aligned_head_dim];
2137  float k_token[H_kv * aligned_head_dim];
2138  float v_token[H_kv * aligned_head_dim];
2139  float attn_token[H * aligned_head_dim];
2140 
2141  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
2142  float fc1_out[2 * aligned_intermediate_dim];
2143  float swiglu_out[aligned_intermediate_dim];
2144 
2145  /* Step 1: RMSNorm before attention */
2146  rmsnorm_forward(input,
2147  ln1_gamma,
2148  ln1_out,
2149  NULL,
2150  1,
/* NOTE(review): rendered line 2151 missing -- one hyperlinked argument of
 * this call (position suggests a dim/stride constant); confirm call arity. */
2152  aligned_embed_dim,
2153  1e-06f);
2154 
2155  /* Step 2: QKV projection */
2156  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
2157  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2158 
2159  /* K projection: Q5_0 -> gemm_nt_q5_0 */
2160  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2161 
2162  /* V projection: Q8_0 -> gemm_nt_q8_0 */
2163  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2164 
2165  /* Step 3: RoPE */
2166  rope_forward_qk(q_token,
2167  k_token,
2168  rope_cos,
2169  rope_sin,
2170  H,
2171  H_kv,
2172  1,
2173  head_dim,
2174  aligned_head_dim,
2175  token_index);
2176 
2177  /* Step 4: KV cache write */
2178  kv_cache_write_head_major(k_token,
2179  v_token,
2180  k_cache,
2181  v_cache,
2182  H_kv,
2183  token_index,
2184  aligned_context_window,
2185  head_dim,
2186  aligned_head_dim);
2187 
2188  /* Step 5: Attention (decode) */
/* NOTE(review): rendered line 2189 missing -- the attention kernel's name and
 * first argument (presumably q_token); confirm against the generator. */
2190  k_cache,
2191  v_cache,
2192  attn_token,
2193  H,
2194  H_kv,
2195  token_index + 1,
2196  aligned_context_window,
2197  head_dim,
2198  aligned_head_dim);
2199 
2200  /* Step 6: Output projection */
2201  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
2202  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2203 
2204  /* Step 7: Residual add */
2205  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
2206 
2207  /* Step 8: RMSNorm before MLP */
2208  rmsnorm_forward(residual1,
2209  ln2_gamma,
2210  ln2_out,
2211  NULL,
2212  1,
/* NOTE(review): rendered line 2213 missing -- same dropped argument as in the
 * Step 1 rmsnorm_forward call; confirm call arity. */
2214  aligned_embed_dim,
2215  1e-06f);
2216 
2217  /* Step 9: MLP (SwiGLU) */
2218  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
2219  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2220 
2221  /* SwiGLU activation */
2222  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2223 
2224  /* Down projection: Q6_K -> gemm_nt_q6_k */
2225  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2226 
2227  /* Step 10: Final residual add */
2228  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
2229 }
2230 
2231 /*
2232  * Layer 10: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
2233  */
2235  QWEN2_0_5B_DECODEModel *model,
2236  int token_index,
2237  int aligned_embed_dim,
2238  int aligned_head_dim,
2239  int aligned_intermediate_dim,
2240  int aligned_context_window
2241 ) {
 /* NOTE(review): this Doxygen listing drops several generated lines in this
  * function (the signature's name line, the per-layer descriptor binding that
  * defines 'L', the rope_cos/rope_sin table bindings, one rmsnorm_forward
  * argument, and the attention kernel's call line). Verify against the actual
  * generated ck-kernel-inference.c before relying on this page. */
2243 
 /* Input is the previous layer's (layer 9) output buffer. */
2244  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[9].output);
2245 
2246  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
2247  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
2248  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
2249  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
2250  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
2251  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
2252  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
 /* NOTE(review): proj_scratch is bound but never used in the visible body. */
2253  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
2254  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
2255  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
2256  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
2257 
2258  /* Weights (explicit types for layer 10) */
2259  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
2260  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
2261  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
2262  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
2263  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
2264  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
2265 
2266  /* Attention biases (Qwen2-style) */
2267  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
2268  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
2269  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
 /* No BO: per the manifest table in the file header, wo carries no bias,
  * so the WO projection below passes NULL for its bias argument. */
2270 
2273 
2274  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
2275  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
2276  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
2277 
 /* NOTE(review): VLAs sized by runtime dims; stack usage scales with the
  * aligned head/intermediate sizes (VLAs are an optional feature since C11). */
2278  float q_token[H * aligned_head_dim];
2279  float k_token[H_kv * aligned_head_dim];
2280  float v_token[H_kv * aligned_head_dim];
2281  float attn_token[H * aligned_head_dim];
2282 
2283  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
2284  float fc1_out[2 * aligned_intermediate_dim];
2285  float swiglu_out[aligned_intermediate_dim];
2286 
2287  /* Step 1: RMSNorm before attention */
2288  rmsnorm_forward(input,
2289  ln1_gamma,
2290  ln1_out,
2291  NULL,
2292  1,
2294  aligned_embed_dim,
2295  1e-06f);
2296 
2297  /* Step 2: QKV projection */
2298  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
2299  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2300 
2301  /* K projection: Q5_0 -> gemm_nt_q5_0 */
2302  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2303 
2304  /* V projection: Q8_0 -> gemm_nt_q8_0 */
2305  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2306 
2307  /* Step 3: RoPE */
2308  rope_forward_qk(q_token,
2309  k_token,
2310  rope_cos,
2311  rope_sin,
2312  H,
2313  H_kv,
2314  1,
2315  head_dim,
2316  aligned_head_dim,
2317  token_index);
2318 
2319  /* Step 4: KV cache write */
2320  kv_cache_write_head_major(k_token,
2321  v_token,
2322  k_cache,
2323  v_cache,
2324  H_kv,
2325  token_index,
2326  aligned_context_window,
2327  head_dim,
2328  aligned_head_dim);
2329 
2330  /* Step 5: Attention (decode) */
 /* NOTE(review): the attention kernel's call line is missing from this
  * listing. token_index + 1 is presumably the KV length including the
  * current token — confirm against the kernel's contract. */
2332  k_cache,
2333  v_cache,
2334  attn_token,
2335  H,
2336  H_kv,
2337  token_index + 1,
2338  aligned_context_window,
2339  head_dim,
2340  aligned_head_dim);
2341 
2342  /* Step 6: Output projection */
2343  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
2344  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2345 
2346  /* Step 7: Residual add */
2347  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
2348 
2349  /* Step 8: RMSNorm before MLP */
2350  rmsnorm_forward(residual1,
2351  ln2_gamma,
2352  ln2_out,
2353  NULL,
2354  1,
2356  aligned_embed_dim,
2357  1e-06f);
2358 
2359  /* Step 9: MLP (SwiGLU) */
2360  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
2361  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2362 
2363  /* SwiGLU activation */
2364  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2365 
2366  /* Down projection: Q6_K -> gemm_nt_q6_k */
2367  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2368 
2369  /* Step 10: Final residual add */
2370  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
2371 }
2372 
2373 /*
2374  * Layer 11: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
2375  */
2377  QWEN2_0_5B_DECODEModel *model,
2378  int token_index,
2379  int aligned_embed_dim,
2380  int aligned_head_dim,
2381  int aligned_intermediate_dim,
2382  int aligned_context_window
2383 ) {
 /* NOTE(review): this Doxygen listing drops several generated lines in this
  * function (the signature's name line, the per-layer descriptor binding that
  * defines 'L', the rope_cos/rope_sin table bindings, one rmsnorm_forward
  * argument, and the attention kernel's call line). Verify against the actual
  * generated ck-kernel-inference.c before relying on this page. */
2385 
 /* Input is the previous layer's (layer 10) output buffer. */
2386  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[10].output);
2387 
2388  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
2389  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
2390  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
2391  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
2392  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
2393  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
2394  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
 /* NOTE(review): proj_scratch is bound but never used in the visible body. */
2395  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
2396  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
2397  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
2398  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
2399 
2400  /* Weights (explicit types for layer 11) */
2401  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
2402  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
2403  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
2404  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
2405  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
2406  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
2407 
2408  /* Attention biases (Qwen2-style) */
2409  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
2410  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
2411  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
 /* No BO: per the manifest table in the file header, wo carries no bias,
  * so the WO projection below passes NULL for its bias argument. */
2412 
2415 
2416  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
2417  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
2418  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
2419 
 /* NOTE(review): VLAs sized by runtime dims; stack usage scales with the
  * aligned head/intermediate sizes (VLAs are an optional feature since C11). */
2420  float q_token[H * aligned_head_dim];
2421  float k_token[H_kv * aligned_head_dim];
2422  float v_token[H_kv * aligned_head_dim];
2423  float attn_token[H * aligned_head_dim];
2424 
2425  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
2426  float fc1_out[2 * aligned_intermediate_dim];
2427  float swiglu_out[aligned_intermediate_dim];
2428 
2429  /* Step 1: RMSNorm before attention */
2430  rmsnorm_forward(input,
2431  ln1_gamma,
2432  ln1_out,
2433  NULL,
2434  1,
2436  aligned_embed_dim,
2437  1e-06f);
2438 
2439  /* Step 2: QKV projection */
2440  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
2441  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2442 
2443  /* K projection: Q5_0 -> gemm_nt_q5_0 */
2444  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2445 
2446  /* V projection: Q5_0 -> gemm_nt_q5_0 */
2447  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2448 
2449  /* Step 3: RoPE */
2450  rope_forward_qk(q_token,
2451  k_token,
2452  rope_cos,
2453  rope_sin,
2454  H,
2455  H_kv,
2456  1,
2457  head_dim,
2458  aligned_head_dim,
2459  token_index);
2460 
2461  /* Step 4: KV cache write */
2462  kv_cache_write_head_major(k_token,
2463  v_token,
2464  k_cache,
2465  v_cache,
2466  H_kv,
2467  token_index,
2468  aligned_context_window,
2469  head_dim,
2470  aligned_head_dim);
2471 
2472  /* Step 5: Attention (decode) */
 /* NOTE(review): the attention kernel's call line is missing from this
  * listing. token_index + 1 is presumably the KV length including the
  * current token — confirm against the kernel's contract. */
2474  k_cache,
2475  v_cache,
2476  attn_token,
2477  H,
2478  H_kv,
2479  token_index + 1,
2480  aligned_context_window,
2481  head_dim,
2482  aligned_head_dim);
2483 
2484  /* Step 6: Output projection */
2485  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
2486  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2487 
2488  /* Step 7: Residual add */
2489  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
2490 
2491  /* Step 8: RMSNorm before MLP */
2492  rmsnorm_forward(residual1,
2493  ln2_gamma,
2494  ln2_out,
2495  NULL,
2496  1,
2498  aligned_embed_dim,
2499  1e-06f);
2500 
2501  /* Step 9: MLP (SwiGLU) */
2502  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
2503  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2504 
2505  /* SwiGLU activation */
2506  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2507 
2508  /* Down projection: Q4_K -> gemm_nt_q4_k */
2509  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2510 
2511  /* Step 10: Final residual add */
2512  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
2513 }
2514 
2515 /*
2516  * Layer 12: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
2517  */
2519  QWEN2_0_5B_DECODEModel *model,
2520  int token_index,
2521  int aligned_embed_dim,
2522  int aligned_head_dim,
2523  int aligned_intermediate_dim,
2524  int aligned_context_window
2525 ) {
 /* NOTE(review): this Doxygen listing drops several generated lines in this
  * function (the signature's name line, the per-layer descriptor binding that
  * defines 'L', the rope_cos/rope_sin table bindings, one rmsnorm_forward
  * argument, and the attention kernel's call line). Verify against the actual
  * generated ck-kernel-inference.c before relying on this page. */
2527 
 /* Input is the previous layer's (layer 11) output buffer. */
2528  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[11].output);
2529 
2530  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
2531  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
2532  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
2533  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
2534  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
2535  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
2536  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
 /* NOTE(review): proj_scratch is bound but never used in the visible body. */
2537  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
2538  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
2539  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
2540  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
2541 
2542  /* Weights (explicit types for layer 12) */
2543  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
2544  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
2545  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
2546  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
2547  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
2548  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
2549 
2550  /* Attention biases (Qwen2-style) */
2551  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
2552  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
2553  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
 /* No BO: per the manifest table in the file header, wo carries no bias,
  * so the WO projection below passes NULL for its bias argument. */
2554 
2557 
2558  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
2559  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
2560  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
2561 
 /* NOTE(review): VLAs sized by runtime dims; stack usage scales with the
  * aligned head/intermediate sizes (VLAs are an optional feature since C11). */
2562  float q_token[H * aligned_head_dim];
2563  float k_token[H_kv * aligned_head_dim];
2564  float v_token[H_kv * aligned_head_dim];
2565  float attn_token[H * aligned_head_dim];
2566 
2567  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
2568  float fc1_out[2 * aligned_intermediate_dim];
2569  float swiglu_out[aligned_intermediate_dim];
2570 
2571  /* Step 1: RMSNorm before attention */
2572  rmsnorm_forward(input,
2573  ln1_gamma,
2574  ln1_out,
2575  NULL,
2576  1,
2578  aligned_embed_dim,
2579  1e-06f);
2580 
2581  /* Step 2: QKV projection */
2582  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
2583  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2584 
2585  /* K projection: Q5_0 -> gemm_nt_q5_0 */
2586  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2587 
2588  /* V projection: Q5_0 -> gemm_nt_q5_0 */
2589  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2590 
2591  /* Step 3: RoPE */
2592  rope_forward_qk(q_token,
2593  k_token,
2594  rope_cos,
2595  rope_sin,
2596  H,
2597  H_kv,
2598  1,
2599  head_dim,
2600  aligned_head_dim,
2601  token_index);
2602 
2603  /* Step 4: KV cache write */
2604  kv_cache_write_head_major(k_token,
2605  v_token,
2606  k_cache,
2607  v_cache,
2608  H_kv,
2609  token_index,
2610  aligned_context_window,
2611  head_dim,
2612  aligned_head_dim);
2613 
2614  /* Step 5: Attention (decode) */
 /* NOTE(review): the attention kernel's call line is missing from this
  * listing. token_index + 1 is presumably the KV length including the
  * current token — confirm against the kernel's contract. */
2616  k_cache,
2617  v_cache,
2618  attn_token,
2619  H,
2620  H_kv,
2621  token_index + 1,
2622  aligned_context_window,
2623  head_dim,
2624  aligned_head_dim);
2625 
2626  /* Step 6: Output projection */
2627  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
2628  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2629 
2630  /* Step 7: Residual add */
2631  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
2632 
2633  /* Step 8: RMSNorm before MLP */
2634  rmsnorm_forward(residual1,
2635  ln2_gamma,
2636  ln2_out,
2637  NULL,
2638  1,
2640  aligned_embed_dim,
2641  1e-06f);
2642 
2643  /* Step 9: MLP (SwiGLU) */
2644  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
2645  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2646 
2647  /* SwiGLU activation */
2648  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2649 
2650  /* Down projection: Q4_K -> gemm_nt_q4_k */
2651  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2652 
2653  /* Step 10: Final residual add */
2654  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
2655 }
2656 
2657 /*
2658  * Layer 13: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
2659  */
2661  QWEN2_0_5B_DECODEModel *model,
2662  int token_index,
2663  int aligned_embed_dim,
2664  int aligned_head_dim,
2665  int aligned_intermediate_dim,
2666  int aligned_context_window
2667 ) {
 /* NOTE(review): this Doxygen listing drops several generated lines in this
  * function (the signature's name line, the per-layer descriptor binding that
  * defines 'L', the rope_cos/rope_sin table bindings, one rmsnorm_forward
  * argument, and the attention kernel's call line). Verify against the actual
  * generated ck-kernel-inference.c before relying on this page. */
2669 
 /* Input is the previous layer's (layer 12) output buffer. */
2670  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[12].output);
2671 
2672  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
2673  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
2674  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
2675  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
2676  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
2677  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
2678  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
 /* NOTE(review): proj_scratch is bound but never used in the visible body. */
2679  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
2680  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
2681  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
2682  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
2683 
2684  /* Weights (explicit types for layer 13) */
2685  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
2686  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
2687  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
2688  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
2689  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
2690  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
2691 
2692  /* Attention biases (Qwen2-style) */
2693  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
2694  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
2695  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
 /* No BO: per the manifest table in the file header, wo carries no bias,
  * so the WO projection below passes NULL for its bias argument. */
2696 
2699 
2700  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
2701  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
2702  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
2703 
 /* NOTE(review): VLAs sized by runtime dims; stack usage scales with the
  * aligned head/intermediate sizes (VLAs are an optional feature since C11). */
2704  float q_token[H * aligned_head_dim];
2705  float k_token[H_kv * aligned_head_dim];
2706  float v_token[H_kv * aligned_head_dim];
2707  float attn_token[H * aligned_head_dim];
2708 
2709  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
2710  float fc1_out[2 * aligned_intermediate_dim];
2711  float swiglu_out[aligned_intermediate_dim];
2712 
2713  /* Step 1: RMSNorm before attention */
2714  rmsnorm_forward(input,
2715  ln1_gamma,
2716  ln1_out,
2717  NULL,
2718  1,
2720  aligned_embed_dim,
2721  1e-06f);
2722 
2723  /* Step 2: QKV projection */
2724  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
2725  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2726 
2727  /* K projection: Q5_0 -> gemm_nt_q5_0 */
2728  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2729 
2730  /* V projection: Q8_0 -> gemm_nt_q8_0 */
2731  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2732 
2733  /* Step 3: RoPE */
2734  rope_forward_qk(q_token,
2735  k_token,
2736  rope_cos,
2737  rope_sin,
2738  H,
2739  H_kv,
2740  1,
2741  head_dim,
2742  aligned_head_dim,
2743  token_index);
2744 
2745  /* Step 4: KV cache write */
2746  kv_cache_write_head_major(k_token,
2747  v_token,
2748  k_cache,
2749  v_cache,
2750  H_kv,
2751  token_index,
2752  aligned_context_window,
2753  head_dim,
2754  aligned_head_dim);
2755 
2756  /* Step 5: Attention (decode) */
 /* NOTE(review): the attention kernel's call line is missing from this
  * listing. token_index + 1 is presumably the KV length including the
  * current token — confirm against the kernel's contract. */
2758  k_cache,
2759  v_cache,
2760  attn_token,
2761  H,
2762  H_kv,
2763  token_index + 1,
2764  aligned_context_window,
2765  head_dim,
2766  aligned_head_dim);
2767 
2768  /* Step 6: Output projection */
2769  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
2770  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2771 
2772  /* Step 7: Residual add */
2773  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
2774 
2775  /* Step 8: RMSNorm before MLP */
2776  rmsnorm_forward(residual1,
2777  ln2_gamma,
2778  ln2_out,
2779  NULL,
2780  1,
2782  aligned_embed_dim,
2783  1e-06f);
2784 
2785  /* Step 9: MLP (SwiGLU) */
2786  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
2787  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2788 
2789  /* SwiGLU activation */
2790  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2791 
2792  /* Down projection: Q6_K -> gemm_nt_q6_k */
2793  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2794 
2795  /* Step 10: Final residual add */
2796  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
2797 }
2798 
2799 /*
2800  * Layer 14: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
2801  */
2803  QWEN2_0_5B_DECODEModel *model,
2804  int token_index,
2805  int aligned_embed_dim,
2806  int aligned_head_dim,
2807  int aligned_intermediate_dim,
2808  int aligned_context_window
2809 ) {
 /* NOTE(review): this Doxygen listing drops several generated lines in this
  * function (the signature's name line, the per-layer descriptor binding that
  * defines 'L', the rope_cos/rope_sin table bindings, one rmsnorm_forward
  * argument, and the attention kernel's call line). Verify against the actual
  * generated ck-kernel-inference.c before relying on this page. */
2811 
 /* Input is the previous layer's (layer 13) output buffer. */
2812  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[13].output);
2813 
2814  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
2815  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
2816  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
2817  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
2818  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
2819  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
2820  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
 /* NOTE(review): proj_scratch is bound but never used in the visible body. */
2821  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
2822  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
2823  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
2824  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
2825 
2826  /* Weights (explicit types for layer 14) */
2827  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
2828  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
2829  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
2830  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
2831  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
2832  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
2833 
2834  /* Attention biases (Qwen2-style) */
2835  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
2836  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
2837  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
 /* No BO: per the manifest table in the file header, wo carries no bias,
  * so the WO projection below passes NULL for its bias argument. */
2838 
2841 
2842  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
2843  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
2844  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
2845 
 /* NOTE(review): VLAs sized by runtime dims; stack usage scales with the
  * aligned head/intermediate sizes (VLAs are an optional feature since C11). */
2846  float q_token[H * aligned_head_dim];
2847  float k_token[H_kv * aligned_head_dim];
2848  float v_token[H_kv * aligned_head_dim];
2849  float attn_token[H * aligned_head_dim];
2850 
2851  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
2852  float fc1_out[2 * aligned_intermediate_dim];
2853  float swiglu_out[aligned_intermediate_dim];
2854 
2855  /* Step 1: RMSNorm before attention */
2856  rmsnorm_forward(input,
2857  ln1_gamma,
2858  ln1_out,
2859  NULL,
2860  1,
2862  aligned_embed_dim,
2863  1e-06f);
2864 
2865  /* Step 2: QKV projection */
2866  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
2867  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2868 
2869  /* K projection: Q5_0 -> gemm_nt_q5_0 */
2870  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2871 
2872  /* V projection: Q5_0 -> gemm_nt_q5_0 */
2873  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2874 
2875  /* Step 3: RoPE */
2876  rope_forward_qk(q_token,
2877  k_token,
2878  rope_cos,
2879  rope_sin,
2880  H,
2881  H_kv,
2882  1,
2883  head_dim,
2884  aligned_head_dim,
2885  token_index);
2886 
2887  /* Step 4: KV cache write */
2888  kv_cache_write_head_major(k_token,
2889  v_token,
2890  k_cache,
2891  v_cache,
2892  H_kv,
2893  token_index,
2894  aligned_context_window,
2895  head_dim,
2896  aligned_head_dim);
2897 
2898  /* Step 5: Attention (decode) */
 /* NOTE(review): the attention kernel's call line is missing from this
  * listing. token_index + 1 is presumably the KV length including the
  * current token — confirm against the kernel's contract. */
2900  k_cache,
2901  v_cache,
2902  attn_token,
2903  H,
2904  H_kv,
2905  token_index + 1,
2906  aligned_context_window,
2907  head_dim,
2908  aligned_head_dim);
2909 
2910  /* Step 6: Output projection */
2911  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
2912  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2913 
2914  /* Step 7: Residual add */
2915  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
2916 
2917  /* Step 8: RMSNorm before MLP */
2918  rmsnorm_forward(residual1,
2919  ln2_gamma,
2920  ln2_out,
2921  NULL,
2922  1,
2924  aligned_embed_dim,
2925  1e-06f);
2926 
2927  /* Step 9: MLP (SwiGLU) */
2928  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
2929  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2930 
2931  /* SwiGLU activation */
2932  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2933 
2934  /* Down projection: Q4_K -> gemm_nt_q4_k */
2935  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2936 
2937  /* Step 10: Final residual add */
2938  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
2939 }
2940 
2941 /*
2942  * Layer 15: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
2943  */
2945  QWEN2_0_5B_DECODEModel *model,
2946  int token_index,
2947  int aligned_embed_dim,
2948  int aligned_head_dim,
2949  int aligned_intermediate_dim,
2950  int aligned_context_window
2951 ) {
 /* NOTE(review): this Doxygen listing drops several generated lines in this
  * function (the signature's name line, the per-layer descriptor binding that
  * defines 'L', the rope_cos/rope_sin table bindings, one rmsnorm_forward
  * argument, and the attention kernel's call line). Verify against the actual
  * generated ck-kernel-inference.c before relying on this page. */
2953 
 /* Input is the previous layer's (layer 14) output buffer. */
2954  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[14].output);
2955 
2956  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
2957  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
2958  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
2959  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
2960  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
2961  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
2962  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
 /* NOTE(review): proj_scratch is bound but never used in the visible body. */
2963  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
2964  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
2965  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
2966  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
2967 
2968  /* Weights (explicit types for layer 15) */
2969  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
2970  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
2971  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
2972  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
2973  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
2974  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
2975 
2976  /* Attention biases (Qwen2-style) */
2977  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
2978  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
2979  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
 /* No BO: per the manifest table in the file header, wo carries no bias,
  * so the WO projection below passes NULL for its bias argument. */
2980 
2983 
2984  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
2985  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
2986  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
2987 
 /* NOTE(review): VLAs sized by runtime dims; stack usage scales with the
  * aligned head/intermediate sizes (VLAs are an optional feature since C11). */
2988  float q_token[H * aligned_head_dim];
2989  float k_token[H_kv * aligned_head_dim];
2990  float v_token[H_kv * aligned_head_dim];
2991  float attn_token[H * aligned_head_dim];
2992 
2993  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
2994  float fc1_out[2 * aligned_intermediate_dim];
2995  float swiglu_out[aligned_intermediate_dim];
2996 
2997  /* Step 1: RMSNorm before attention */
2998  rmsnorm_forward(input,
2999  ln1_gamma,
3000  ln1_out,
3001  NULL,
3002  1,
3004  aligned_embed_dim,
3005  1e-06f);
3006 
3007  /* Step 2: QKV projection */
3008  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
3009  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3010 
3011  /* K projection: Q5_0 -> gemm_nt_q5_0 */
3012  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3013 
3014  /* V projection: Q5_0 -> gemm_nt_q5_0 */
3015  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3016 
3017  /* Step 3: RoPE */
3018  rope_forward_qk(q_token,
3019  k_token,
3020  rope_cos,
3021  rope_sin,
3022  H,
3023  H_kv,
3024  1,
3025  head_dim,
3026  aligned_head_dim,
3027  token_index);
3028 
3029  /* Step 4: KV cache write */
3030  kv_cache_write_head_major(k_token,
3031  v_token,
3032  k_cache,
3033  v_cache,
3034  H_kv,
3035  token_index,
3036  aligned_context_window,
3037  head_dim,
3038  aligned_head_dim);
3039 
3040  /* Step 5: Attention (decode) */
 /* NOTE(review): the attention kernel's call line is missing from this
  * listing. token_index + 1 is presumably the KV length including the
  * current token — confirm against the kernel's contract. */
3042  k_cache,
3043  v_cache,
3044  attn_token,
3045  H,
3046  H_kv,
3047  token_index + 1,
3048  aligned_context_window,
3049  head_dim,
3050  aligned_head_dim);
3051 
3052  /* Step 6: Output projection */
3053  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
3054  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3055 
3056  /* Step 7: Residual add */
3057  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
3058 
3059  /* Step 8: RMSNorm before MLP */
3060  rmsnorm_forward(residual1,
3061  ln2_gamma,
3062  ln2_out,
3063  NULL,
3064  1,
3066  aligned_embed_dim,
3067  1e-06f);
3068 
3069  /* Step 9: MLP (SwiGLU) */
3070  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
3071  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3072 
3073  /* SwiGLU activation */
3074  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3075 
3076  /* Down projection: Q4_K -> gemm_nt_q4_k */
3077  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3078 
3079  /* Step 10: Final residual add */
3080  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
3081 }
3082 
3083 /*
3084  * Layer 16: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
3085  */
3087  QWEN2_0_5B_DECODEModel *model,
3088  int token_index,
3089  int aligned_embed_dim,
3090  int aligned_head_dim,
3091  int aligned_intermediate_dim,
3092  int aligned_context_window
3093 ) {
3095 
3096  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[15].output);
3097 
3098  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
3099  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
3100  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
3101  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
3102  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
3103  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
3104  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
3105  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
3106  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
3107  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
3108  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
3109 
3110  /* Weights (explicit types for layer 16) */
3111  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
3112  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
3113  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
3114  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
3115  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
3116  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
3117 
3118  /* Attention biases (Qwen2-style) */
3119  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
3120  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
3121  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
3122 
3125 
3126  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
3127  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
3128  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
3129 
3130  float q_token[H * aligned_head_dim];
3131  float k_token[H_kv * aligned_head_dim];
3132  float v_token[H_kv * aligned_head_dim];
3133  float attn_token[H * aligned_head_dim];
3134 
3135  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
3136  float fc1_out[2 * aligned_intermediate_dim];
3137  float swiglu_out[aligned_intermediate_dim];
3138 
3139  /* Step 1: RMSNorm before attention */
3140  rmsnorm_forward(input,
3141  ln1_gamma,
3142  ln1_out,
3143  NULL,
3144  1,
3146  aligned_embed_dim,
3147  1e-06f);
3148 
3149  /* Step 2: QKV projection */
3150  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
3151  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3152 
3153  /* K projection: Q5_0 -> gemm_nt_q5_0 */
3154  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3155 
3156  /* V projection: Q8_0 -> gemm_nt_q8_0 */
3157  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3158 
3159  /* Step 3: RoPE */
3160  rope_forward_qk(q_token,
3161  k_token,
3162  rope_cos,
3163  rope_sin,
3164  H,
3165  H_kv,
3166  1,
3167  head_dim,
3168  aligned_head_dim,
3169  token_index);
3170 
3171  /* Step 4: KV cache write */
3172  kv_cache_write_head_major(k_token,
3173  v_token,
3174  k_cache,
3175  v_cache,
3176  H_kv,
3177  token_index,
3178  aligned_context_window,
3179  head_dim,
3180  aligned_head_dim);
3181 
3182  /* Step 5: Attention (decode) */
3184  k_cache,
3185  v_cache,
3186  attn_token,
3187  H,
3188  H_kv,
3189  token_index + 1,
3190  aligned_context_window,
3191  head_dim,
3192  aligned_head_dim);
3193 
3194  /* Step 6: Output projection */
3195  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
3196  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3197 
3198  /* Step 7: Residual add */
3199  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
3200 
3201  /* Step 8: RMSNorm before MLP */
3202  rmsnorm_forward(residual1,
3203  ln2_gamma,
3204  ln2_out,
3205  NULL,
3206  1,
3208  aligned_embed_dim,
3209  1e-06f);
3210 
3211  /* Step 9: MLP (SwiGLU) */
3212  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
3213  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3214 
3215  /* SwiGLU activation */
3216  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3217 
3218  /* Down projection: Q6_K -> gemm_nt_q6_k */
3219  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3220 
3221  /* Step 10: Final residual add */
3222  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
3223 }
3224 
3225 /*
3226  * Layer 17: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
3227  */
3229  QWEN2_0_5B_DECODEModel *model,
3230  int token_index,
3231  int aligned_embed_dim,
3232  int aligned_head_dim,
3233  int aligned_intermediate_dim,
3234  int aligned_context_window
3235 ) {
3237 
3238  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[16].output);
3239 
3240  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
3241  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
3242  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
3243  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
3244  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
3245  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
3246  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
3247  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
3248  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
3249  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
3250  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
3251 
3252  /* Weights (explicit types for layer 17) */
3253  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
3254  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
3255  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
3256  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
3257  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
3258  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
3259 
3260  /* Attention biases (Qwen2-style) */
3261  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
3262  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
3263  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
3264 
3267 
3268  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
3269  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
3270  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
3271 
3272  float q_token[H * aligned_head_dim];
3273  float k_token[H_kv * aligned_head_dim];
3274  float v_token[H_kv * aligned_head_dim];
3275  float attn_token[H * aligned_head_dim];
3276 
3277  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
3278  float fc1_out[2 * aligned_intermediate_dim];
3279  float swiglu_out[aligned_intermediate_dim];
3280 
3281  /* Step 1: RMSNorm before attention */
3282  rmsnorm_forward(input,
3283  ln1_gamma,
3284  ln1_out,
3285  NULL,
3286  1,
3288  aligned_embed_dim,
3289  1e-06f);
3290 
3291  /* Step 2: QKV projection */
3292  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
3293  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3294 
3295  /* K projection: Q5_0 -> gemm_nt_q5_0 */
3296  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3297 
3298  /* V projection: Q5_0 -> gemm_nt_q5_0 */
3299  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3300 
3301  /* Step 3: RoPE */
3302  rope_forward_qk(q_token,
3303  k_token,
3304  rope_cos,
3305  rope_sin,
3306  H,
3307  H_kv,
3308  1,
3309  head_dim,
3310  aligned_head_dim,
3311  token_index);
3312 
3313  /* Step 4: KV cache write */
3314  kv_cache_write_head_major(k_token,
3315  v_token,
3316  k_cache,
3317  v_cache,
3318  H_kv,
3319  token_index,
3320  aligned_context_window,
3321  head_dim,
3322  aligned_head_dim);
3323 
3324  /* Step 5: Attention (decode) */
3326  k_cache,
3327  v_cache,
3328  attn_token,
3329  H,
3330  H_kv,
3331  token_index + 1,
3332  aligned_context_window,
3333  head_dim,
3334  aligned_head_dim);
3335 
3336  /* Step 6: Output projection */
3337  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
3338  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3339 
3340  /* Step 7: Residual add */
3341  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
3342 
3343  /* Step 8: RMSNorm before MLP */
3344  rmsnorm_forward(residual1,
3345  ln2_gamma,
3346  ln2_out,
3347  NULL,
3348  1,
3350  aligned_embed_dim,
3351  1e-06f);
3352 
3353  /* Step 9: MLP (SwiGLU) */
3354  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
3355  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3356 
3357  /* SwiGLU activation */
3358  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3359 
3360  /* Down projection: Q4_K -> gemm_nt_q4_k */
3361  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3362 
3363  /* Step 10: Final residual add */
3364  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
3365 }
3366 
3367 /*
3368  * Layer 18: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
3369  */
3371  QWEN2_0_5B_DECODEModel *model,
3372  int token_index,
3373  int aligned_embed_dim,
3374  int aligned_head_dim,
3375  int aligned_intermediate_dim,
3376  int aligned_context_window
3377 ) {
3379 
3380  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[17].output);
3381 
3382  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
3383  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
3384  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
3385  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
3386  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
3387  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
3388  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
3389  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
3390  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
3391  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
3392  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
3393 
3394  /* Weights (explicit types for layer 18) */
3395  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
3396  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
3397  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
3398  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
3399  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
3400  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
3401 
3402  /* Attention biases (Qwen2-style) */
3403  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
3404  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
3405  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
3406 
3409 
3410  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
3411  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
3412  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
3413 
3414  float q_token[H * aligned_head_dim];
3415  float k_token[H_kv * aligned_head_dim];
3416  float v_token[H_kv * aligned_head_dim];
3417  float attn_token[H * aligned_head_dim];
3418 
3419  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
3420  float fc1_out[2 * aligned_intermediate_dim];
3421  float swiglu_out[aligned_intermediate_dim];
3422 
3423  /* Step 1: RMSNorm before attention */
3424  rmsnorm_forward(input,
3425  ln1_gamma,
3426  ln1_out,
3427  NULL,
3428  1,
3430  aligned_embed_dim,
3431  1e-06f);
3432 
3433  /* Step 2: QKV projection */
3434  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
3435  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3436 
3437  /* K projection: Q5_0 -> gemm_nt_q5_0 */
3438  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3439 
3440  /* V projection: Q5_0 -> gemm_nt_q5_0 */
3441  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3442 
3443  /* Step 3: RoPE */
3444  rope_forward_qk(q_token,
3445  k_token,
3446  rope_cos,
3447  rope_sin,
3448  H,
3449  H_kv,
3450  1,
3451  head_dim,
3452  aligned_head_dim,
3453  token_index);
3454 
3455  /* Step 4: KV cache write */
3456  kv_cache_write_head_major(k_token,
3457  v_token,
3458  k_cache,
3459  v_cache,
3460  H_kv,
3461  token_index,
3462  aligned_context_window,
3463  head_dim,
3464  aligned_head_dim);
3465 
3466  /* Step 5: Attention (decode) */
3468  k_cache,
3469  v_cache,
3470  attn_token,
3471  H,
3472  H_kv,
3473  token_index + 1,
3474  aligned_context_window,
3475  head_dim,
3476  aligned_head_dim);
3477 
3478  /* Step 6: Output projection */
3479  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
3480  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3481 
3482  /* Step 7: Residual add */
3483  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
3484 
3485  /* Step 8: RMSNorm before MLP */
3486  rmsnorm_forward(residual1,
3487  ln2_gamma,
3488  ln2_out,
3489  NULL,
3490  1,
3492  aligned_embed_dim,
3493  1e-06f);
3494 
3495  /* Step 9: MLP (SwiGLU) */
3496  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
3497  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3498 
3499  /* SwiGLU activation */
3500  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3501 
3502  /* Down projection: Q4_K -> gemm_nt_q4_k */
3503  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3504 
3505  /* Step 10: Final residual add */
3506  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
3507 }
3508 
3509 /*
3510  * Layer 19: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
3511  */
3513  QWEN2_0_5B_DECODEModel *model,
3514  int token_index,
3515  int aligned_embed_dim,
3516  int aligned_head_dim,
3517  int aligned_intermediate_dim,
3518  int aligned_context_window
3519 ) {
3521 
3522  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[18].output);
3523 
3524  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
3525  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
3526  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
3527  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
3528  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
3529  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
3530  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
3531  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
3532  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
3533  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
3534  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
3535 
3536  /* Weights (explicit types for layer 19) */
3537  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
3538  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
3539  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
3540  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
3541  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
3542  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
3543 
3544  /* Attention biases (Qwen2-style) */
3545  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
3546  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
3547  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
3548 
3551 
3552  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
3553  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
3554  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
3555 
3556  float q_token[H * aligned_head_dim];
3557  float k_token[H_kv * aligned_head_dim];
3558  float v_token[H_kv * aligned_head_dim];
3559  float attn_token[H * aligned_head_dim];
3560 
3561  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
3562  float fc1_out[2 * aligned_intermediate_dim];
3563  float swiglu_out[aligned_intermediate_dim];
3564 
3565  /* Step 1: RMSNorm before attention */
3566  rmsnorm_forward(input,
3567  ln1_gamma,
3568  ln1_out,
3569  NULL,
3570  1,
3572  aligned_embed_dim,
3573  1e-06f);
3574 
3575  /* Step 2: QKV projection */
3576  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
3577  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3578 
3579  /* K projection: Q5_0 -> gemm_nt_q5_0 */
3580  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3581 
3582  /* V projection: Q8_0 -> gemm_nt_q8_0 */
3583  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3584 
3585  /* Step 3: RoPE */
3586  rope_forward_qk(q_token,
3587  k_token,
3588  rope_cos,
3589  rope_sin,
3590  H,
3591  H_kv,
3592  1,
3593  head_dim,
3594  aligned_head_dim,
3595  token_index);
3596 
3597  /* Step 4: KV cache write */
3598  kv_cache_write_head_major(k_token,
3599  v_token,
3600  k_cache,
3601  v_cache,
3602  H_kv,
3603  token_index,
3604  aligned_context_window,
3605  head_dim,
3606  aligned_head_dim);
3607 
3608  /* Step 5: Attention (decode) */
3610  k_cache,
3611  v_cache,
3612  attn_token,
3613  H,
3614  H_kv,
3615  token_index + 1,
3616  aligned_context_window,
3617  head_dim,
3618  aligned_head_dim);
3619 
3620  /* Step 6: Output projection */
3621  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
3622  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3623 
3624  /* Step 7: Residual add */
3625  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
3626 
3627  /* Step 8: RMSNorm before MLP */
3628  rmsnorm_forward(residual1,
3629  ln2_gamma,
3630  ln2_out,
3631  NULL,
3632  1,
3634  aligned_embed_dim,
3635  1e-06f);
3636 
3637  /* Step 9: MLP (SwiGLU) */
3638  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
3639  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3640 
3641  /* SwiGLU activation */
3642  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3643 
3644  /* Down projection: Q6_K -> gemm_nt_q6_k */
3645  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3646 
3647  /* Step 10: Final residual add */
3648  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
3649 }
3650 
3651 /*
3652  * Layer 20: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
3653  */
3655  QWEN2_0_5B_DECODEModel *model,
3656  int token_index,
3657  int aligned_embed_dim,
3658  int aligned_head_dim,
3659  int aligned_intermediate_dim,
3660  int aligned_context_window
3661 ) {
3663 
3664  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[19].output);
3665 
3666  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
3667  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
3668  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
3669  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
3670  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
3671  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
3672  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
3673  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
3674  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
3675  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
3676  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
3677 
3678  /* Weights (explicit types for layer 20) */
3679  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
3680  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
3681  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
3682  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
3683  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
3684  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
3685 
3686  /* Attention biases (Qwen2-style) */
3687  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
3688  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
3689  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
3690 
3693 
3694  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
3695  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
3696  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
3697 
3698  float q_token[H * aligned_head_dim];
3699  float k_token[H_kv * aligned_head_dim];
3700  float v_token[H_kv * aligned_head_dim];
3701  float attn_token[H * aligned_head_dim];
3702 
3703  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
3704  float fc1_out[2 * aligned_intermediate_dim];
3705  float swiglu_out[aligned_intermediate_dim];
3706 
3707  /* Step 1: RMSNorm before attention */
3708  rmsnorm_forward(input,
3709  ln1_gamma,
3710  ln1_out,
3711  NULL,
3712  1,
3714  aligned_embed_dim,
3715  1e-06f);
3716 
3717  /* Step 2: QKV projection */
3718  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
3719  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3720 
3721  /* K projection: Q5_0 -> gemm_nt_q5_0 */
3722  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3723 
3724  /* V projection: Q5_0 -> gemm_nt_q5_0 */
3725  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3726 
3727  /* Step 3: RoPE */
3728  rope_forward_qk(q_token,
3729  k_token,
3730  rope_cos,
3731  rope_sin,
3732  H,
3733  H_kv,
3734  1,
3735  head_dim,
3736  aligned_head_dim,
3737  token_index);
3738 
3739  /* Step 4: KV cache write */
3740  kv_cache_write_head_major(k_token,
3741  v_token,
3742  k_cache,
3743  v_cache,
3744  H_kv,
3745  token_index,
3746  aligned_context_window,
3747  head_dim,
3748  aligned_head_dim);
3749 
3750  /* Step 5: Attention (decode) */
3752  k_cache,
3753  v_cache,
3754  attn_token,
3755  H,
3756  H_kv,
3757  token_index + 1,
3758  aligned_context_window,
3759  head_dim,
3760  aligned_head_dim);
3761 
3762  /* Step 6: Output projection */
3763  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
3764  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3765 
3766  /* Step 7: Residual add */
3767  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
3768 
3769  /* Step 8: RMSNorm before MLP */
3770  rmsnorm_forward(residual1,
3771  ln2_gamma,
3772  ln2_out,
3773  NULL,
3774  1,
3776  aligned_embed_dim,
3777  1e-06f);
3778 
3779  /* Step 9: MLP (SwiGLU) */
3780  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
3781  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3782 
3783  /* SwiGLU activation */
3784  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3785 
3786  /* Down projection: Q4_K -> gemm_nt_q4_k */
3787  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3788 
3789  /* Step 10: Final residual add */
3790  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
3791 }
3792 
3793 /*
3794  * Layer 21: wq=q5_0 wk=q5_0 wv=q8_0 wo=q5_0 w1=q5_0 w2=q6_k
3795  */
3797  QWEN2_0_5B_DECODEModel *model,
3798  int token_index,
3799  int aligned_embed_dim,
3800  int aligned_head_dim,
3801  int aligned_intermediate_dim,
3802  int aligned_context_window
3803 ) {
3805 
3806  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[20].output);
3807 
3808  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
3809  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
3810  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
3811  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
3812  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
3813  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
3814  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
3815  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
3816  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
3817  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
3818  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
3819 
3820  /* Weights (explicit types for layer 21) */
3821  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
3822  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
3823  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q8_0 */
3824  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
3825  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
3826  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q6_K (down) */
3827 
3828  /* Attention biases (Qwen2-style) */
3829  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
3830  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
3831  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
3832 
3835 
3836  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
3837  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
3838  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
3839 
3840  float q_token[H * aligned_head_dim];
3841  float k_token[H_kv * aligned_head_dim];
3842  float v_token[H_kv * aligned_head_dim];
3843  float attn_token[H * aligned_head_dim];
3844 
3845  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
3846  float fc1_out[2 * aligned_intermediate_dim];
3847  float swiglu_out[aligned_intermediate_dim];
3848 
3849  /* Step 1: RMSNorm before attention */
3850  rmsnorm_forward(input,
3851  ln1_gamma,
3852  ln1_out,
3853  NULL,
3854  1,
3856  aligned_embed_dim,
3857  1e-06f);
3858 
3859  /* Step 2: QKV projection */
3860  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
3861  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3862 
3863  /* K projection: Q5_0 -> gemm_nt_q5_0 */
3864  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3865 
3866  /* V projection: Q8_0 -> gemm_nt_q8_0 */
3867  gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3868 
3869  /* Step 3: RoPE */
3870  rope_forward_qk(q_token,
3871  k_token,
3872  rope_cos,
3873  rope_sin,
3874  H,
3875  H_kv,
3876  1,
3877  head_dim,
3878  aligned_head_dim,
3879  token_index);
3880 
3881  /* Step 4: KV cache write */
3882  kv_cache_write_head_major(k_token,
3883  v_token,
3884  k_cache,
3885  v_cache,
3886  H_kv,
3887  token_index,
3888  aligned_context_window,
3889  head_dim,
3890  aligned_head_dim);
3891 
3892  /* Step 5: Attention (decode) */
3894  k_cache,
3895  v_cache,
3896  attn_token,
3897  H,
3898  H_kv,
3899  token_index + 1,
3900  aligned_context_window,
3901  head_dim,
3902  aligned_head_dim);
3903 
3904  /* Step 6: Output projection */
3905  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
3906  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3907 
3908  /* Step 7: Residual add */
3909  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
3910 
3911  /* Step 8: RMSNorm before MLP */
3912  rmsnorm_forward(residual1,
3913  ln2_gamma,
3914  ln2_out,
3915  NULL,
3916  1,
3918  aligned_embed_dim,
3919  1e-06f);
3920 
3921  /* Step 9: MLP (SwiGLU) */
3922  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
3923  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3924 
3925  /* SwiGLU activation */
3926  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3927 
3928  /* Down projection: Q6_K -> gemm_nt_q6_k */
3929  gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3930 
3931  /* Step 10: Final residual add */
3932  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
3933 }
3934 
3935 /*
3936  * Layer 22: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
3937  */
3939  QWEN2_0_5B_DECODEModel *model,
3940  int token_index,
3941  int aligned_embed_dim,
3942  int aligned_head_dim,
3943  int aligned_intermediate_dim,
3944  int aligned_context_window
3945 ) {
3947 
3948  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[21].output);
3949 
3950  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
3951  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
3952  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
3953  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
3954  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
3955  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
3956  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
3957  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
3958  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
3959  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
3960  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
3961 
3962  /* Weights (explicit types for layer 22) */
3963  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
3964  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
3965  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
3966  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
3967  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
3968  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
3969 
3970  /* Attention biases (Qwen2-style) */
3971  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
3972  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
3973  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
3974 
3977 
3978  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
3979  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
3980  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
3981 
3982  float q_token[H * aligned_head_dim];
3983  float k_token[H_kv * aligned_head_dim];
3984  float v_token[H_kv * aligned_head_dim];
3985  float attn_token[H * aligned_head_dim];
3986 
3987  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
3988  float fc1_out[2 * aligned_intermediate_dim];
3989  float swiglu_out[aligned_intermediate_dim];
3990 
3991  /* Step 1: RMSNorm before attention */
3992  rmsnorm_forward(input,
3993  ln1_gamma,
3994  ln1_out,
3995  NULL,
3996  1,
3998  aligned_embed_dim,
3999  1e-06f);
4000 
4001  /* Step 2: QKV projection */
4002  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
4003  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
4004 
4005  /* K projection: Q5_0 -> gemm_nt_q5_0 */
4006  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
4007 
4008  /* V projection: Q5_0 -> gemm_nt_q5_0 */
4009  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
4010 
4011  /* Step 3: RoPE */
4012  rope_forward_qk(q_token,
4013  k_token,
4014  rope_cos,
4015  rope_sin,
4016  H,
4017  H_kv,
4018  1,
4019  head_dim,
4020  aligned_head_dim,
4021  token_index);
4022 
4023  /* Step 4: KV cache write */
4024  kv_cache_write_head_major(k_token,
4025  v_token,
4026  k_cache,
4027  v_cache,
4028  H_kv,
4029  token_index,
4030  aligned_context_window,
4031  head_dim,
4032  aligned_head_dim);
4033 
4034  /* Step 5: Attention (decode) */
4036  k_cache,
4037  v_cache,
4038  attn_token,
4039  H,
4040  H_kv,
4041  token_index + 1,
4042  aligned_context_window,
4043  head_dim,
4044  aligned_head_dim);
4045 
4046  /* Step 6: Output projection */
4047  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
4048  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
4049 
4050  /* Step 7: Residual add */
4051  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
4052 
4053  /* Step 8: RMSNorm before MLP */
4054  rmsnorm_forward(residual1,
4055  ln2_gamma,
4056  ln2_out,
4057  NULL,
4058  1,
4060  aligned_embed_dim,
4061  1e-06f);
4062 
4063  /* Step 9: MLP (SwiGLU) */
4064  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
4065  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
4066 
4067  /* SwiGLU activation */
4068  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
4069 
4070  /* Down projection: Q4_K -> gemm_nt_q4_k */
4071  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
4072 
4073  /* Step 10: Final residual add */
4074  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
4075 }
4076 
4077 /*
4078  * Layer 23: wq=q5_0 wk=q5_0 wv=q5_0 wo=q5_0 w1=q5_0 w2=q4_k
4079  */
/*
 * Decode-mode forward pass for layer 23 (the final transformer layer).
 * Reads the previous layer's output (QWEN2_0_5B_DECODE_LAYERS[22].output)
 * and runs: RMSNorm -> QKV projections -> RoPE -> KV-cache write ->
 * GQA attention over the cache -> output projection -> residual add ->
 * RMSNorm -> SwiGLU MLP -> final residual add, writing into L->output.
 *
 * NOTE(review): this Doxygen listing dropped several original source
 * lines from this function (4080 = function signature, 4088 = the
 * layer-offset binding `L`, 4117-4118 = rope_cos/rope_sin bindings,
 * 4139 and 4201 = one argument each to rmsnorm_forward, 4177 = the
 * attention call head). The code below is reproduced verbatim; the
 * gaps are flagged inline. Confirm against the generator output.
 */
4081  QWEN2_0_5B_DECODEModel *model,
4082  int token_index,
4083  int aligned_embed_dim,
4084  int aligned_head_dim,
4085  int aligned_intermediate_dim,
4086  int aligned_context_window
4087 ) {
 /* NOTE(review): original line 4088 dropped here — presumably
  * `const ... *L = &QWEN2_0_5B_DECODE_LAYERS[23];` given the L->
  * accesses below — TODO confirm. */
4089 
4090  float *input = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[22].output);
4091 
 /* Per-layer activation buffers resolved from the packed memory layout. */
4092  float *ln1_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln1_gamma);
4093  float *ln1_out = QWEN2_0_5B_DECODE_PTR(model, L->ln1_out);
4094  float *ln2_gamma = QWEN2_0_5B_DECODE_PTR(model, L->ln2_gamma);
4095  float *ln2_out = QWEN2_0_5B_DECODE_PTR(model, L->ln2_out);
4096  float *k_cache = QWEN2_0_5B_DECODE_PTR(model, L->k);
4097  float *v_cache = QWEN2_0_5B_DECODE_PTR(model, L->v);
4098  float *proj_tmp = QWEN2_0_5B_DECODE_PTR(model, L->proj_tmp);
 /* NOTE(review): proj_scratch is bound but never used in the visible
  * body — possibly consumed by a dropped line, or dead in decode mode. */
4099  float *proj_scratch = QWEN2_0_5B_DECODE_PTR(model, L->proj_scratch);
4100  float *residual1 = QWEN2_0_5B_DECODE_PTR(model, L->residual1);
4101  float *mlp_out = QWEN2_0_5B_DECODE_PTR(model, L->mlp_out);
4102  float *output = QWEN2_0_5B_DECODE_PTR(model, L->output);
4103 
4104  /* Weights (explicit types for layer 23) */
4105  const void *WQ = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wq); /* Q5_0 */
4106  const void *WK = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wk); /* Q5_0 */
4107  const void *WV = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wv); /* Q5_0 */
4108  const void *WO = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->wo); /* Q5_0 */
4109  const void *W1 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w1); /* Q5_0 (gate+up) */
4110  const void *W2 = (const void *)QWEN2_0_5B_DECODE_PTR(model, L->w2); /* Q4_K (down) */
4111 
4112  /* Attention biases (Qwen2-style) */
4113  const float *BQ = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bq);
4114  const float *BK = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bk);
4115  const float *BV = (const float *)QWEN2_0_5B_DECODE_PTR(model, L->bv);
4116 
 /* NOTE(review): original lines 4117-4118 dropped here — presumably the
  * rope_cos/rope_sin bindings used by rope_forward_qk below — confirm. */
4119 
4120  const int H = QWEN2_0_5B_DECODE_NUM_HEADS;
4121  const int H_kv = QWEN2_0_5B_DECODE_NUM_KV_HEADS;
4122  const int head_dim = QWEN2_0_5B_DECODE_HEAD_DIM;
4123 
 /* Single-token working buffers. These are VLAs sized from run-time
  * arguments; with aligned_head_dim = 64 they are small, but stack
  * usage should be re-checked if dims ever grow. */
4124  float q_token[H * aligned_head_dim];
4125  float k_token[H_kv * aligned_head_dim];
4126  float v_token[H_kv * aligned_head_dim];
4127  float attn_token[H * aligned_head_dim];
4128 
4129  /* Local MLP buffers (avoid layout dependencies for intermediate values) */
4130  float fc1_out[2 * aligned_intermediate_dim];
4131  float swiglu_out[aligned_intermediate_dim];
4132 
4133  /* Step 1: RMSNorm before attention */
4134  rmsnorm_forward(input,
4135  ln1_gamma,
4136  ln1_out,
4137  NULL,
4138  1,
 /* NOTE(review): original line 4139 dropped here — rmsnorm_forward takes
  * 8 args (..., tokens, d_model, aligned_embed_dim, eps) per its
  * declaration, so the missing arg is presumably d_model — confirm. */
4140  aligned_embed_dim,
4141  1e-06f);
4142 
4143  /* Step 2: QKV projection */
4144  /* Q projection: Q5_0 -> gemm_nt_q5_0 */
4145  gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
4146 
4147  /* K projection: Q5_0 -> gemm_nt_q5_0 */
4148  gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
4149 
4150  /* V projection: Q5_0 -> gemm_nt_q5_0 */
4151  gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
4152 
4153  /* Step 3: RoPE */
 /* Rotates Q and K in-place at position token_index (pos_offset arg). */
4154  rope_forward_qk(q_token,
4155  k_token,
4156  rope_cos,
4157  rope_sin,
4158  H,
4159  H_kv,
4160  1,
4161  head_dim,
4162  aligned_head_dim,
4163  token_index);
4164 
4165  /* Step 4: KV cache write */
4166  kv_cache_write_head_major(k_token,
4167  v_token,
4168  k_cache,
4169  v_cache,
4170  H_kv,
4171  token_index,
4172  aligned_context_window,
4173  head_dim,
4174  aligned_head_dim);
4175 
4176  /* Step 5: Attention (decode) */
 /* NOTE(review): original line 4177 dropped here — presumably
  * `attention_forward_decode_head_major_gqa_regular(q_token,` given the
  * declared parameter list and the trailing arguments below — confirm.
  * kv_tokens = token_index + 1 attends over all cached tokens so far. */
4178  k_cache,
4179  v_cache,
4180  attn_token,
4181  H,
4182  H_kv,
4183  token_index + 1,
4184  aligned_context_window,
4185  head_dim,
4186  aligned_head_dim);
4187 
4188  /* Step 6: Output projection */
4189  /* WO projection: Q5_0 -> gemm_nt_q5_0 */
4190  gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
4191 
4192  /* Step 7: Residual add */
4193  qwen2_0_5b_decode_residual_add_token_major(input, proj_tmp, residual1, 1, aligned_embed_dim);
4194 
4195  /* Step 8: RMSNorm before MLP */
4196  rmsnorm_forward(residual1,
4197  ln2_gamma,
4198  ln2_out,
4199  NULL,
4200  1,
 /* NOTE(review): original line 4201 dropped here — same missing
  * rmsnorm_forward argument as at line 4139 above — confirm. */
4202  aligned_embed_dim,
4203  1e-06f);
4204 
4205  /* Step 9: MLP (SwiGLU) */
4206  /* Gate+Up projection: Q5_0 -> gemm_nt_q5_0 */
4207  gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
4208 
4209  /* SwiGLU activation */
4210  swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
4211 
4212  /* Down projection: Q4_K -> gemm_nt_q4_k */
4213  gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
4214 
4215  /* Step 10: Final residual add */
4216  qwen2_0_5b_decode_residual_add_token_major(residual1, mlp_out, output, 1, aligned_embed_dim);
4217 }
4218 
4219 /* ============================================================================
4220  * DECODE TOKEN (calls each layer explicitly)
4221  * ============================================================================ */
4222 
/*
 * Single-token decode step: embedding lookup, all 24 unrolled layer
 * calls, final RMSNorm, and LM-head projection into the logits buffer.
 *
 * NOTE(review): the Doxygen listing dropped the original signature line
 * 4223 — per the declaration it is
 * `static void qwen2_0_5b_decode_decode_token(QWEN2_0_5B_DECODEModel *model,
 *  const int *token, int token_index)`. Further dropped lines (4238,
 * 4243, 4247, 4287) are flagged inline below — confirm against the
 * generator output.
 */
4224  QWEN2_0_5B_DECODEModel *model,
4225  const int *token,
4226  int token_index
4227 ) {
4228  if (!model || !token) return;
4229 
 /* Hard-coded aligned dims; match the generated layout for this model. */
4230  const int aligned_embed_dim = 896;
4231  const int aligned_head_dim = 64;
4232  const int aligned_intermediate_dim = 4864;
4233  const int aligned_context_window = 131072;
4234 
 /* Reject positions outside the KV-cache capacity. */
4235  if (token_index < 0 || token_index >= aligned_context_window) return;
4236 
4237  /* Embedding lookup */
 /* NOTE(review): original line 4238 dropped here — presumably the
  * `embed_out` destination-buffer binding used below — confirm. */
4239  const void *embed_weight = (const void *)QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_HEADER.token_emb);
4240  /* Embedding: Q8_0 -> embedding_forward_q8_0 */
4241  embedding_forward_q8_0((const int32_t *)token,
4242  1,
 /* NOTE(review): original line 4243 dropped — the declared parameter at
  * this position is vocab_size (likely QWEN2_0_5B_DECODE_VOCAB_SIZE). */
4244  embed_weight,
4245  NULL,
4246  embed_out,
 /* NOTE(review): original line 4247 dropped — the declared parameter at
  * this position is embed_dim — confirm. */
4248  aligned_embed_dim,
4249  1,
4250  0);
4251 
4252  /* Process each layer explicitly */
 /* Fully unrolled: each layer reads the previous layer's output buffer. */
4253  qwen2_0_5b_decode_layer_0_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4254  qwen2_0_5b_decode_layer_1_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4255  qwen2_0_5b_decode_layer_2_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4256  qwen2_0_5b_decode_layer_3_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4257  qwen2_0_5b_decode_layer_4_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4258  qwen2_0_5b_decode_layer_5_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4259  qwen2_0_5b_decode_layer_6_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4260  qwen2_0_5b_decode_layer_7_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4261  qwen2_0_5b_decode_layer_8_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4262  qwen2_0_5b_decode_layer_9_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4263  qwen2_0_5b_decode_layer_10_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4264  qwen2_0_5b_decode_layer_11_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4265  qwen2_0_5b_decode_layer_12_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4266  qwen2_0_5b_decode_layer_13_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4267  qwen2_0_5b_decode_layer_14_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4268  qwen2_0_5b_decode_layer_15_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4269  qwen2_0_5b_decode_layer_16_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4270  qwen2_0_5b_decode_layer_17_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4271  qwen2_0_5b_decode_layer_18_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4272  qwen2_0_5b_decode_layer_19_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4273  qwen2_0_5b_decode_layer_20_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4274  qwen2_0_5b_decode_layer_21_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4275  qwen2_0_5b_decode_layer_22_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4276  qwen2_0_5b_decode_layer_23_decode(model, token_index, aligned_embed_dim, aligned_head_dim, aligned_intermediate_dim, aligned_context_window);
4277 
4278  /* Final RMSNorm */
4279  float *last_hidden = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_LAYERS[23].output);
4280  float *final_ln_weight = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_FOOTER.final_ln_weight);
4281  float *final_out = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_FOOTER.final_output);
4282  rmsnorm_forward(last_hidden,
4283  final_ln_weight,
4284  final_out,
4285  NULL,
4286  1,
 /* NOTE(review): original line 4287 dropped — rmsnorm_forward takes 8
  * args per its declaration; the missing one here is presumably
  * d_model — confirm. */
4288  aligned_embed_dim,
4289  1e-06f);
4290 
4291  /* LM head projection */
4292  float *logits = QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_FOOTER.logits);
4293  const void *lm_head = (const void *)QWEN2_0_5B_DECODE_PTR(model, QWEN2_0_5B_DECODE_FOOTER.lm_head_weight);
4294  /* LM head: Q8_0 -> gemm_nt_q8_0 */
4295  gemm_nt_q8_0(final_out, lm_head, NULL, logits, 1, QWEN2_0_5B_DECODE_VOCAB_SIZE, aligned_embed_dim);
4296 }
4297 
4298 /* ============================================================================
4299  * PUBLIC API
4300  * ============================================================================ */
4301 
4303  QWEN2_0_5B_DECODEModel *model,
4304  const int *tokens,
4305  int num_tokens
4306 ) {
4307  if (!model || !tokens || num_tokens <= 0) return;
4308  qwen2_0_5b_decode_forward_prefill_impl(model, tokens, num_tokens);
4309 }
4310 
/**
 * @brief Public single-token decode entry point.
 *
 * Thin wrapper over the generated decode-token routine. Argument
 * validation (NULL model/token, token_index range against the cache
 * capacity) is performed inside qwen2_0_5b_decode_decode_token itself,
 * so no checks are duplicated here.
 */
4311 void qwen2_0_5b_decode_decode(QWEN2_0_5B_DECODEModel *model, const int *token, int token_index) {
4312  qwen2_0_5b_decode_decode_token(model, token, token_index);
4313 }
void swiglu_forward(const float *input, float *output, int tokens, int dim)
void gemm_nt_q4_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void kv_cache_repack_head_major_inplace(float *buf, int num_heads, int tokens, int cache_capacity, int aligned_head_dim)
void kv_cache_write_head_major(const float *__restrict k_token, const float *__restrict v_token, float *__restrict k_cache, float *__restrict v_cache, int num_kv_heads, int token_index, int cache_capacity, int head_dim, int aligned_head_dim)
void gemm_nt_q5_0(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void gemm_nt_q6_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void gemm_nt_q8_0(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
Matrix-matrix multiply: C[M,N] = A[M,K] @ B[N,K]^T + bias.
void attention_forward_decode_head_major_gqa_regular(const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
WARNING: This is NOT true flash attention!
void rmsnorm_forward(const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
void rope_forward_qk(float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:448
void embedding_forward_q8_0(const int32_t *token_ids, int token_count, int vocab_size, const void *token_embeddings, const float *pos_embeddings, float *output, int embed_dim, int aligned_embed_dim, int context_window, int add_pos)
const char * token
Definition: tokenizer.h:306
int vocab_size
Definition: true_bpe.h:185
static void qwen2_0_5b_decode_layer_18_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_2_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_1_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_8_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_20_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_0_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_7_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_6_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_19_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_15_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_17_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_22_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_3_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_13_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_21_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_4_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_12_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_16_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_5_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_11_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_14_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_10_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_9_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_23_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
#define QWEN2_0_5B_DECODE_TOTAL_BYTES
#define QWEN2_0_5B_DECODE_PTR(model, offset)
#define QWEN2_0_5B_DECODE_ACTIVATION_BYTES
static const QWEN2_0_5B_DECODEFooterOffsets QWEN2_0_5B_DECODE_FOOTER
static const QWEN2_0_5B_DECODELayerOffsets QWEN2_0_5B_DECODE_LAYERS[24]
#define QWEN2_0_5B_DECODE_DTYPE_BYTES
#define QWEN2_0_5B_DECODE_MAX_SEQ_LEN
#define QWEN2_0_5B_DECODE_NUM_LAYERS
#define QWEN2_0_5B_DECODE_WEIGHT_BYTES
#define QWEN2_0_5B_DECODE_CANARY_VALUE
static const QWEN2_0_5B_DECODEGlobalOffsets QWEN2_0_5B_DECODE_GLOBALS
#define QWEN2_0_5B_DECODE_NUM_KV_HEADS
#define QWEN2_0_5B_DECODE_CANARY_SIZE
#define QWEN2_0_5B_DECODE_VOCAB_SIZE
static const QWEN2_0_5B_DECODEHeaderOffsets QWEN2_0_5B_DECODE_HEADER
static const QWEN2_0_5B_DECODECanary QWEN2_0_5B_DECODE_CANARIES[]
static void qwen2_0_5b_decode_layer_3_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_18_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_16_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
void qwen2_0_5b_decode_precompute_rope(QWEN2_0_5B_DECODEModel *model)
int qwen2_0_5b_decode_verify_canaries(QWEN2_0_5B_DECODEModel *model)
static void qwen2_0_5b_decode_residual_add_token_major(const float *a, const float *b, float *out, int tokens, int aligned_embed_dim)
static void qwen2_0_5b_decode_layer_4_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static int qwen2_0_5b_decode_align_elems(int elems, int elem_bytes, int align_bytes)
static void qwen2_0_5b_decode_layer_8_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_5_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_13_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
int qwen2_0_5b_decode_model_allocate(QWEN2_0_5B_DECODEModel *model)
static void qwen2_0_5b_decode_layer_0_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_19_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_6_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_7_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
void qwen2_0_5b_decode_decode(QWEN2_0_5B_DECODEModel *model, const int *token, int token_index)
static void qwen2_0_5b_decode_layer_2_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_forward_prefill_impl(QWEN2_0_5B_DECODEModel *model, const int *tokens, int num_tokens)
static void qwen2_0_5b_decode_layer_9_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_10_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_decode_token(QWEN2_0_5B_DECODEModel *model, const int *token, int token_index)
static void qwen2_0_5b_decode_layer_21_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_22_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_11_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_23_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
struct __attribute__((packed))
static void qwen2_0_5b_decode_layer_1_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
void qwen2_0_5b_decode_forward(QWEN2_0_5B_DECODEModel *model, const int *tokens, int num_tokens)
static void qwen2_0_5b_decode_layer_14_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_15_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_12_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
void qwen2_0_5b_decode_model_free(QWEN2_0_5B_DECODEModel *model)
static void qwen2_0_5b_decode_layer_20_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
_Static_assert(sizeof(MagicHeader)==64, "MagicHeader must be 64 bytes")
static void qwen2_0_5b_decode_layer_17_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
AUTO-GENERATED: qwen2_0.5b_decode Memory Layout.