71 #if QWEN2_0_5B_DECODE_DTYPE_BYTES != 4
72 #error "qwen2_0.5b_decode: v6 codegen currently supports fp32 only. Use --dtype=fp32."
86 if (!a || !b || !out) {
89 for (
int t = 0; t < tokens; ++t) {
90 const float *pa = a + (size_t)t * (
size_t)aligned_embed_dim;
91 const float *pb = b + (size_t)t * (
size_t)aligned_embed_dim;
92 float *pc = out + (size_t)t * (
size_t)aligned_embed_dim;
93 for (
int d = 0; d < aligned_embed_dim; ++d) {
94 pc[d] = pa[d] + pb[d];
106 uint64_t total_bytes;
107 uint64_t weight_bytes;
108 uint64_t activation_bytes;
113 uint32_t max_seq_len;
114 uint32_t canary_count;
128 model->
base = mmap(NULL, total,
129 PROT_READ | PROT_WRITE,
130 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
132 if (model->
base == MAP_FAILED) {
133 model->
base = mmap(NULL, total,
134 PROT_READ | PROT_WRITE,
135 MAP_PRIVATE | MAP_ANONYMOUS,
138 if (model->
base == MAP_FAILED) {
139 perror(
"mmap failed");
143 model->
base = aligned_alloc(64, total);
145 perror(
"aligned_alloc failed");
178 if (!model || !model->
base)
return;
194 for (
int j = 0; j < 4; j++) {
196 fprintf(stderr,
"CANARY CORRUPTION: %s at offset 0x%lX\n",
213 int bytes = elems * elem_bytes;
214 int aligned = (bytes + align_bytes - 1) / align_bytes * align_bytes;
215 return aligned / elem_bytes;
225 const float theta = 1000000.0f;
230 for (
int pos = 0; pos < T; pos++) {
231 for (
int i = 0; i < D; i++) {
232 float freq = 1.0f / powf(theta, (
float)(2 * i) / (
float)(D * 2));
233 float angle = (float)pos * freq;
234 cos_ptr[pos * D + i] = cosf(angle);
235 sin_ptr[pos * D + i] = sinf(angle);
253 if (!model || !tokens || num_tokens <= 0) {
258 const int aligned_embed_dim = 896;
259 const int aligned_head_dim = 64;
260 const int aligned_intermediate_dim = 4864;
261 const int aligned_context_window = 131072;
281 aligned_intermediate_dim,
282 aligned_context_window);
288 aligned_context_window,
294 aligned_context_window,
302 aligned_intermediate_dim,
303 aligned_context_window);
309 aligned_context_window,
315 aligned_context_window,
323 aligned_intermediate_dim,
324 aligned_context_window);
330 aligned_context_window,
336 aligned_context_window,
344 aligned_intermediate_dim,
345 aligned_context_window);
351 aligned_context_window,
357 aligned_context_window,
365 aligned_intermediate_dim,
366 aligned_context_window);
372 aligned_context_window,
378 aligned_context_window,
386 aligned_intermediate_dim,
387 aligned_context_window);
393 aligned_context_window,
399 aligned_context_window,
407 aligned_intermediate_dim,
408 aligned_context_window);
414 aligned_context_window,
420 aligned_context_window,
428 aligned_intermediate_dim,
429 aligned_context_window);
435 aligned_context_window,
441 aligned_context_window,
449 aligned_intermediate_dim,
450 aligned_context_window);
456 aligned_context_window,
462 aligned_context_window,
470 aligned_intermediate_dim,
471 aligned_context_window);
477 aligned_context_window,
483 aligned_context_window,
491 aligned_intermediate_dim,
492 aligned_context_window);
498 aligned_context_window,
504 aligned_context_window,
512 aligned_intermediate_dim,
513 aligned_context_window);
519 aligned_context_window,
525 aligned_context_window,
533 aligned_intermediate_dim,
534 aligned_context_window);
540 aligned_context_window,
546 aligned_context_window,
554 aligned_intermediate_dim,
555 aligned_context_window);
561 aligned_context_window,
567 aligned_context_window,
575 aligned_intermediate_dim,
576 aligned_context_window);
582 aligned_context_window,
588 aligned_context_window,
596 aligned_intermediate_dim,
597 aligned_context_window);
603 aligned_context_window,
609 aligned_context_window,
617 aligned_intermediate_dim,
618 aligned_context_window);
624 aligned_context_window,
630 aligned_context_window,
638 aligned_intermediate_dim,
639 aligned_context_window);
645 aligned_context_window,
651 aligned_context_window,
659 aligned_intermediate_dim,
660 aligned_context_window);
666 aligned_context_window,
672 aligned_context_window,
680 aligned_intermediate_dim,
681 aligned_context_window);
687 aligned_context_window,
693 aligned_context_window,
701 aligned_intermediate_dim,
702 aligned_context_window);
708 aligned_context_window,
714 aligned_context_window,
722 aligned_intermediate_dim,
723 aligned_context_window);
729 aligned_context_window,
735 aligned_context_window,
743 aligned_intermediate_dim,
744 aligned_context_window);
750 aligned_context_window,
756 aligned_context_window,
764 aligned_intermediate_dim,
765 aligned_context_window);
771 aligned_context_window,
777 aligned_context_window,
794 for (
int t = 0; t < num_tokens; ++t) {
795 const float *row = final_out + (size_t)t * (
size_t)aligned_embed_dim;
817 int aligned_embed_dim,
818 int aligned_head_dim,
819 int aligned_intermediate_dim,
820 int aligned_context_window
858 float q_token[H * aligned_head_dim];
859 float k_token[H_kv * aligned_head_dim];
860 float v_token[H_kv * aligned_head_dim];
861 float attn_token[H * aligned_head_dim];
864 float fc1_out[2 * aligned_intermediate_dim];
865 float swiglu_out[aligned_intermediate_dim];
879 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
882 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
885 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
906 aligned_context_window,
918 aligned_context_window,
924 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
941 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
947 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
959 int aligned_embed_dim,
960 int aligned_head_dim,
961 int aligned_intermediate_dim,
962 int aligned_context_window
1000 float q_token[H * aligned_head_dim];
1001 float k_token[H_kv * aligned_head_dim];
1002 float v_token[H_kv * aligned_head_dim];
1003 float attn_token[H * aligned_head_dim];
1006 float fc1_out[2 * aligned_intermediate_dim];
1007 float swiglu_out[aligned_intermediate_dim];
1021 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1024 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1027 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1048 aligned_context_window,
1060 aligned_context_window,
1066 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1083 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1086 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1089 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1101 int aligned_embed_dim,
1102 int aligned_head_dim,
1103 int aligned_intermediate_dim,
1104 int aligned_context_window
1142 float q_token[H * aligned_head_dim];
1143 float k_token[H_kv * aligned_head_dim];
1144 float v_token[H_kv * aligned_head_dim];
1145 float attn_token[H * aligned_head_dim];
1148 float fc1_out[2 * aligned_intermediate_dim];
1149 float swiglu_out[aligned_intermediate_dim];
1163 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1166 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1169 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1190 aligned_context_window,
1202 aligned_context_window,
1208 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1225 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1228 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1231 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1243 int aligned_embed_dim,
1244 int aligned_head_dim,
1245 int aligned_intermediate_dim,
1246 int aligned_context_window
1284 float q_token[H * aligned_head_dim];
1285 float k_token[H_kv * aligned_head_dim];
1286 float v_token[H_kv * aligned_head_dim];
1287 float attn_token[H * aligned_head_dim];
1290 float fc1_out[2 * aligned_intermediate_dim];
1291 float swiglu_out[aligned_intermediate_dim];
1305 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1308 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1311 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1332 aligned_context_window,
1344 aligned_context_window,
1350 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1367 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1370 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1373 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1385 int aligned_embed_dim,
1386 int aligned_head_dim,
1387 int aligned_intermediate_dim,
1388 int aligned_context_window
1426 float q_token[H * aligned_head_dim];
1427 float k_token[H_kv * aligned_head_dim];
1428 float v_token[H_kv * aligned_head_dim];
1429 float attn_token[H * aligned_head_dim];
1432 float fc1_out[2 * aligned_intermediate_dim];
1433 float swiglu_out[aligned_intermediate_dim];
1447 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1450 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1453 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1474 aligned_context_window,
1486 aligned_context_window,
1492 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1509 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1512 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1515 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1527 int aligned_embed_dim,
1528 int aligned_head_dim,
1529 int aligned_intermediate_dim,
1530 int aligned_context_window
1568 float q_token[H * aligned_head_dim];
1569 float k_token[H_kv * aligned_head_dim];
1570 float v_token[H_kv * aligned_head_dim];
1571 float attn_token[H * aligned_head_dim];
1574 float fc1_out[2 * aligned_intermediate_dim];
1575 float swiglu_out[aligned_intermediate_dim];
1589 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1592 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1595 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1616 aligned_context_window,
1628 aligned_context_window,
1634 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1651 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1654 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1657 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1669 int aligned_embed_dim,
1670 int aligned_head_dim,
1671 int aligned_intermediate_dim,
1672 int aligned_context_window
1710 float q_token[H * aligned_head_dim];
1711 float k_token[H_kv * aligned_head_dim];
1712 float v_token[H_kv * aligned_head_dim];
1713 float attn_token[H * aligned_head_dim];
1716 float fc1_out[2 * aligned_intermediate_dim];
1717 float swiglu_out[aligned_intermediate_dim];
1731 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1734 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1737 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1758 aligned_context_window,
1770 aligned_context_window,
1776 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1793 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1796 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1799 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1811 int aligned_embed_dim,
1812 int aligned_head_dim,
1813 int aligned_intermediate_dim,
1814 int aligned_context_window
1852 float q_token[H * aligned_head_dim];
1853 float k_token[H_kv * aligned_head_dim];
1854 float v_token[H_kv * aligned_head_dim];
1855 float attn_token[H * aligned_head_dim];
1858 float fc1_out[2 * aligned_intermediate_dim];
1859 float swiglu_out[aligned_intermediate_dim];
1873 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
1876 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
1879 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
1900 aligned_context_window,
1912 aligned_context_window,
1918 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
1935 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
1938 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
1941 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
1953 int aligned_embed_dim,
1954 int aligned_head_dim,
1955 int aligned_intermediate_dim,
1956 int aligned_context_window
1994 float q_token[H * aligned_head_dim];
1995 float k_token[H_kv * aligned_head_dim];
1996 float v_token[H_kv * aligned_head_dim];
1997 float attn_token[H * aligned_head_dim];
2000 float fc1_out[2 * aligned_intermediate_dim];
2001 float swiglu_out[aligned_intermediate_dim];
2015 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2018 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2021 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2042 aligned_context_window,
2054 aligned_context_window,
2060 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2077 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2080 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2083 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2095 int aligned_embed_dim,
2096 int aligned_head_dim,
2097 int aligned_intermediate_dim,
2098 int aligned_context_window
2136 float q_token[H * aligned_head_dim];
2137 float k_token[H_kv * aligned_head_dim];
2138 float v_token[H_kv * aligned_head_dim];
2139 float attn_token[H * aligned_head_dim];
2142 float fc1_out[2 * aligned_intermediate_dim];
2143 float swiglu_out[aligned_intermediate_dim];
2157 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2160 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2163 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2184 aligned_context_window,
2196 aligned_context_window,
2202 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2219 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2222 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2225 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2237 int aligned_embed_dim,
2238 int aligned_head_dim,
2239 int aligned_intermediate_dim,
2240 int aligned_context_window
2278 float q_token[H * aligned_head_dim];
2279 float k_token[H_kv * aligned_head_dim];
2280 float v_token[H_kv * aligned_head_dim];
2281 float attn_token[H * aligned_head_dim];
2284 float fc1_out[2 * aligned_intermediate_dim];
2285 float swiglu_out[aligned_intermediate_dim];
2299 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2302 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2305 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2326 aligned_context_window,
2338 aligned_context_window,
2344 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2361 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2364 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2367 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2379 int aligned_embed_dim,
2380 int aligned_head_dim,
2381 int aligned_intermediate_dim,
2382 int aligned_context_window
2420 float q_token[H * aligned_head_dim];
2421 float k_token[H_kv * aligned_head_dim];
2422 float v_token[H_kv * aligned_head_dim];
2423 float attn_token[H * aligned_head_dim];
2426 float fc1_out[2 * aligned_intermediate_dim];
2427 float swiglu_out[aligned_intermediate_dim];
2441 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2444 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2447 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2468 aligned_context_window,
2480 aligned_context_window,
2486 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2503 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2506 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2509 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2521 int aligned_embed_dim,
2522 int aligned_head_dim,
2523 int aligned_intermediate_dim,
2524 int aligned_context_window
2562 float q_token[H * aligned_head_dim];
2563 float k_token[H_kv * aligned_head_dim];
2564 float v_token[H_kv * aligned_head_dim];
2565 float attn_token[H * aligned_head_dim];
2568 float fc1_out[2 * aligned_intermediate_dim];
2569 float swiglu_out[aligned_intermediate_dim];
2583 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2586 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2589 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2610 aligned_context_window,
2622 aligned_context_window,
2628 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2645 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2648 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2651 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2663 int aligned_embed_dim,
2664 int aligned_head_dim,
2665 int aligned_intermediate_dim,
2666 int aligned_context_window
2704 float q_token[H * aligned_head_dim];
2705 float k_token[H_kv * aligned_head_dim];
2706 float v_token[H_kv * aligned_head_dim];
2707 float attn_token[H * aligned_head_dim];
2710 float fc1_out[2 * aligned_intermediate_dim];
2711 float swiglu_out[aligned_intermediate_dim];
2725 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2728 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2731 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2752 aligned_context_window,
2764 aligned_context_window,
2770 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2787 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2790 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2793 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2805 int aligned_embed_dim,
2806 int aligned_head_dim,
2807 int aligned_intermediate_dim,
2808 int aligned_context_window
2846 float q_token[H * aligned_head_dim];
2847 float k_token[H_kv * aligned_head_dim];
2848 float v_token[H_kv * aligned_head_dim];
2849 float attn_token[H * aligned_head_dim];
2852 float fc1_out[2 * aligned_intermediate_dim];
2853 float swiglu_out[aligned_intermediate_dim];
2867 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
2870 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
2873 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
2894 aligned_context_window,
2906 aligned_context_window,
2912 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
2929 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
2932 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
2935 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
2947 int aligned_embed_dim,
2948 int aligned_head_dim,
2949 int aligned_intermediate_dim,
2950 int aligned_context_window
2988 float q_token[H * aligned_head_dim];
2989 float k_token[H_kv * aligned_head_dim];
2990 float v_token[H_kv * aligned_head_dim];
2991 float attn_token[H * aligned_head_dim];
2994 float fc1_out[2 * aligned_intermediate_dim];
2995 float swiglu_out[aligned_intermediate_dim];
3009 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3012 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3015 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3036 aligned_context_window,
3048 aligned_context_window,
3054 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3071 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3074 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3077 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3089 int aligned_embed_dim,
3090 int aligned_head_dim,
3091 int aligned_intermediate_dim,
3092 int aligned_context_window
3130 float q_token[H * aligned_head_dim];
3131 float k_token[H_kv * aligned_head_dim];
3132 float v_token[H_kv * aligned_head_dim];
3133 float attn_token[H * aligned_head_dim];
3136 float fc1_out[2 * aligned_intermediate_dim];
3137 float swiglu_out[aligned_intermediate_dim];
3151 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3154 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3157 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3178 aligned_context_window,
3190 aligned_context_window,
3196 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3213 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3216 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3219 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3231 int aligned_embed_dim,
3232 int aligned_head_dim,
3233 int aligned_intermediate_dim,
3234 int aligned_context_window
3272 float q_token[H * aligned_head_dim];
3273 float k_token[H_kv * aligned_head_dim];
3274 float v_token[H_kv * aligned_head_dim];
3275 float attn_token[H * aligned_head_dim];
3278 float fc1_out[2 * aligned_intermediate_dim];
3279 float swiglu_out[aligned_intermediate_dim];
3293 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3296 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3299 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3320 aligned_context_window,
3332 aligned_context_window,
3338 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3355 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3358 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3361 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3373 int aligned_embed_dim,
3374 int aligned_head_dim,
3375 int aligned_intermediate_dim,
3376 int aligned_context_window
3414 float q_token[H * aligned_head_dim];
3415 float k_token[H_kv * aligned_head_dim];
3416 float v_token[H_kv * aligned_head_dim];
3417 float attn_token[H * aligned_head_dim];
3420 float fc1_out[2 * aligned_intermediate_dim];
3421 float swiglu_out[aligned_intermediate_dim];
3435 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3438 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3441 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3462 aligned_context_window,
3474 aligned_context_window,
3480 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3497 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3500 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3503 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3515 int aligned_embed_dim,
3516 int aligned_head_dim,
3517 int aligned_intermediate_dim,
3518 int aligned_context_window
3556 float q_token[H * aligned_head_dim];
3557 float k_token[H_kv * aligned_head_dim];
3558 float v_token[H_kv * aligned_head_dim];
3559 float attn_token[H * aligned_head_dim];
3562 float fc1_out[2 * aligned_intermediate_dim];
3563 float swiglu_out[aligned_intermediate_dim];
3577 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3580 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3583 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3604 aligned_context_window,
3616 aligned_context_window,
3622 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3639 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3642 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3645 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3657 int aligned_embed_dim,
3658 int aligned_head_dim,
3659 int aligned_intermediate_dim,
3660 int aligned_context_window
3698 float q_token[H * aligned_head_dim];
3699 float k_token[H_kv * aligned_head_dim];
3700 float v_token[H_kv * aligned_head_dim];
3701 float attn_token[H * aligned_head_dim];
3704 float fc1_out[2 * aligned_intermediate_dim];
3705 float swiglu_out[aligned_intermediate_dim];
3719 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3722 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3725 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3746 aligned_context_window,
3758 aligned_context_window,
3764 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3781 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3784 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3787 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3799 int aligned_embed_dim,
3800 int aligned_head_dim,
3801 int aligned_intermediate_dim,
3802 int aligned_context_window
3840 float q_token[H * aligned_head_dim];
3841 float k_token[H_kv * aligned_head_dim];
3842 float v_token[H_kv * aligned_head_dim];
3843 float attn_token[H * aligned_head_dim];
3846 float fc1_out[2 * aligned_intermediate_dim];
3847 float swiglu_out[aligned_intermediate_dim];
3861 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
3864 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
3867 gemm_nt_q8_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
3888 aligned_context_window,
3900 aligned_context_window,
3906 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
3923 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
3926 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
3929 gemm_nt_q6_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
3941 int aligned_embed_dim,
3942 int aligned_head_dim,
3943 int aligned_intermediate_dim,
3944 int aligned_context_window
3982 float q_token[H * aligned_head_dim];
3983 float k_token[H_kv * aligned_head_dim];
3984 float v_token[H_kv * aligned_head_dim];
3985 float attn_token[H * aligned_head_dim];
3988 float fc1_out[2 * aligned_intermediate_dim];
3989 float swiglu_out[aligned_intermediate_dim];
4003 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
4006 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
4009 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
4030 aligned_context_window,
4042 aligned_context_window,
4048 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
4065 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
4068 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
4071 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
4083 int aligned_embed_dim,
4084 int aligned_head_dim,
4085 int aligned_intermediate_dim,
4086 int aligned_context_window
4124 float q_token[H * aligned_head_dim];
4125 float k_token[H_kv * aligned_head_dim];
4126 float v_token[H_kv * aligned_head_dim];
4127 float attn_token[H * aligned_head_dim];
4130 float fc1_out[2 * aligned_intermediate_dim];
4131 float swiglu_out[aligned_intermediate_dim];
4145 gemm_nt_q5_0(ln1_out, WQ, BQ, q_token, 1, H * head_dim, aligned_embed_dim);
4148 gemm_nt_q5_0(ln1_out, WK, BK, k_token, 1, H_kv * head_dim, aligned_embed_dim);
4151 gemm_nt_q5_0(ln1_out, WV, BV, v_token, 1, H_kv * head_dim, aligned_embed_dim);
4172 aligned_context_window,
4184 aligned_context_window,
4190 gemm_nt_q5_0(attn_token, WO, NULL, proj_tmp, 1, aligned_embed_dim, H * head_dim);
4207 gemm_nt_q5_0(ln2_out, W1, NULL, fc1_out, 1, 2 * aligned_intermediate_dim, aligned_embed_dim);
4210 swiglu_forward(fc1_out, swiglu_out, 1, aligned_intermediate_dim);
4213 gemm_nt_q4_k(swiglu_out, W2, NULL, mlp_out, 1, aligned_embed_dim, aligned_intermediate_dim);
4228 if (!model || !
token)
return;
4230 const int aligned_embed_dim = 896;
4231 const int aligned_head_dim = 64;
4232 const int aligned_intermediate_dim = 4864;
4233 const int aligned_context_window = 131072;
4235 if (token_index < 0 || token_index >= aligned_context_window)
return;
4307 if (!model || !tokens || num_tokens <= 0)
return;
void swiglu_forward(const float *input, float *output, int tokens, int dim)
void gemm_nt_q4_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void kv_cache_repack_head_major_inplace(float *buf, int num_heads, int tokens, int cache_capacity, int aligned_head_dim)
void kv_cache_write_head_major(const float *__restrict k_token, const float *__restrict v_token, float *__restrict k_cache, float *__restrict v_cache, int num_kv_heads, int token_index, int cache_capacity, int head_dim, int aligned_head_dim)
void gemm_nt_q5_0(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void gemm_nt_q6_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void gemm_nt_q8_0(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
Matrix-matrix multiply: C[M,N] = A[M,K] @ B[N,K]^T + bias. The bias pointer may be NULL, in which case no bias is added.
void attention_forward_decode_head_major_gqa_regular(const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
WARNING: this attention kernel is NOT a true flash-attention implementation!
void rmsnorm_forward(const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
void rope_forward_qk(float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
void embedding_forward_q8_0(const int32_t *token_ids, int token_count, int vocab_size, const void *token_embeddings, const float *pos_embeddings, float *output, int embed_dim, int aligned_embed_dim, int context_window, int add_pos)
static void qwen2_0_5b_decode_layer_18_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_2_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_1_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_8_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_20_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_0_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_7_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_6_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_19_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_15_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_17_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_22_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_3_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_13_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_21_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_4_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_12_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_16_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_5_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_11_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_14_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_10_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_9_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_23_prefill(QWEN2_0_5B_DECODEModel *model, int num_tokens, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
#define QWEN2_0_5B_DECODE_TOTAL_BYTES
#define QWEN2_0_5B_DECODE_HEAD_DIM
#define QWEN2_0_5B_DECODE_PTR(model, offset)
#define QWEN2_0_5B_DECODE_ACTIVATION_BYTES
static const QWEN2_0_5B_DECODEFooterOffsets QWEN2_0_5B_DECODE_FOOTER
static const QWEN2_0_5B_DECODELayerOffsets QWEN2_0_5B_DECODE_LAYERS[24]
#define QWEN2_0_5B_DECODE_DTYPE_BYTES
#define QWEN2_0_5B_DECODE_EMBED_DIM
#define QWEN2_0_5B_DECODE_MAGIC
#define QWEN2_0_5B_DECODE_CANARY_COUNT
#define QWEN2_0_5B_DECODE_MAX_SEQ_LEN
#define QWEN2_0_5B_DECODE_NUM_LAYERS
#define QWEN2_0_5B_DECODE_WEIGHT_BYTES
#define QWEN2_0_5B_DECODE_CANARY_VALUE
static const QWEN2_0_5B_DECODEGlobalOffsets QWEN2_0_5B_DECODE_GLOBALS
#define QWEN2_0_5B_DECODE_NUM_KV_HEADS
#define QWEN2_0_5B_DECODE_NUM_HEADS
#define QWEN2_0_5B_DECODE_CANARY_SIZE
#define QWEN2_0_5B_DECODE_VOCAB_SIZE
static const QWEN2_0_5B_DECODEHeaderOffsets QWEN2_0_5B_DECODE_HEADER
static const QWEN2_0_5B_DECODECanary QWEN2_0_5B_DECODE_CANARIES[]
static void qwen2_0_5b_decode_layer_3_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_18_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_16_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
void qwen2_0_5b_decode_precompute_rope(QWEN2_0_5B_DECODEModel *model)
int qwen2_0_5b_decode_verify_canaries(QWEN2_0_5B_DECODEModel *model)
static void qwen2_0_5b_decode_residual_add_token_major(const float *a, const float *b, float *out, int tokens, int aligned_embed_dim)
static void qwen2_0_5b_decode_layer_4_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static int qwen2_0_5b_decode_align_elems(int elems, int elem_bytes, int align_bytes)
static void qwen2_0_5b_decode_layer_8_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_5_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_13_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
int qwen2_0_5b_decode_model_allocate(QWEN2_0_5B_DECODEModel *model)
static void qwen2_0_5b_decode_layer_0_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_19_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_6_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_7_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
void qwen2_0_5b_decode_decode(QWEN2_0_5B_DECODEModel *model, const int *token, int token_index)
static void qwen2_0_5b_decode_layer_2_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_forward_prefill_impl(QWEN2_0_5B_DECODEModel *model, const int *tokens, int num_tokens)
static void qwen2_0_5b_decode_layer_9_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_10_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_decode_token(QWEN2_0_5B_DECODEModel *model, const int *token, int token_index)
static void qwen2_0_5b_decode_layer_21_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_22_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_11_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_23_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
struct __attribute__((packed))
static void qwen2_0_5b_decode_layer_1_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
void qwen2_0_5b_decode_forward(QWEN2_0_5B_DECODEModel *model, const int *tokens, int num_tokens)
static void qwen2_0_5b_decode_layer_14_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_15_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
static void qwen2_0_5b_decode_layer_12_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
void qwen2_0_5b_decode_model_free(QWEN2_0_5B_DECODEModel *model)
static void qwen2_0_5b_decode_layer_20_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
_Static_assert(sizeof(MagicHeader)==64, "MagicHeader must be 64 bytes")
static void qwen2_0_5b_decode_layer_17_decode(QWEN2_0_5B_DECODEModel *model, int token_index, int aligned_embed_dim, int aligned_head_dim, int aligned_intermediate_dim, int aligned_context_window)
AUTO-GENERATED: qwen2_0.5b_decode Memory Layout.