4 {
"token_emb",
CK_SCOPE_GLOBAL,
CK_ROLE_WEIGHT, { {
CK_DIM_VOCAB, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
5 {
"pos_emb",
CK_SCOPE_GLOBAL,
CK_ROLE_WEIGHT, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
6 {
"embedded_input",
CK_SCOPE_GLOBAL,
CK_ROLE_ACTIVATION, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
7 {
"rope_cos_cache",
CK_SCOPE_GLOBAL,
CK_ROLE_ACTIVATION, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_HEAD_DIM, 1, 2 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL,
"rope_theta",
CK_DT_FP32},
8 {
"rope_sin_cache",
CK_SCOPE_GLOBAL,
CK_ROLE_ACTIVATION, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_HEAD_DIM, 1, 2 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL,
"rope_theta",
CK_DT_FP32},
9 {
"final_ln_weight",
CK_SCOPE_GLOBAL,
CK_ROLE_WEIGHT, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
10 {
"final_ln_bias",
CK_SCOPE_GLOBAL,
CK_ROLE_WEIGHT, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
11 {
"final_ln_mean",
CK_SCOPE_GLOBAL,
CK_ROLE_ACTIVATION, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
12 {
"final_ln_rstd",
CK_SCOPE_GLOBAL,
CK_ROLE_ACTIVATION, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
13 {
"final_output",
CK_SCOPE_GLOBAL,
CK_ROLE_ACTIVATION, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
14 {
"lm_head_weight",
CK_SCOPE_GLOBAL,
CK_ROLE_WEIGHT, { {
CK_DIM_VOCAB, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0,
"token_emb", NULL,
CK_DT_FP32},
15 {
"logits",
CK_SCOPE_GLOBAL,
CK_ROLE_ACTIVATION, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_VOCAB, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
16 {
"d_token_emb",
CK_SCOPE_GLOBAL,
CK_ROLE_GRAD, { {
CK_DIM_VOCAB, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
17 {
"d_pos_emb",
CK_SCOPE_GLOBAL,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
18 {
"d_final_output",
CK_SCOPE_GLOBAL,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
19 {
"d_final_input",
CK_SCOPE_GLOBAL,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
20 {
"d_final_ln_weight",
CK_SCOPE_GLOBAL,
CK_ROLE_GRAD, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
21 {
"d_logits",
CK_SCOPE_GLOBAL,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_VOCAB, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
22 {
"input",
CK_SCOPE_LAYER,
CK_ROLE_INPUT, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
23 {
"ln1_gamma",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
24 {
"ln1_out",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
25 {
"ln1_rstd",
CK_SCOPE_LAYER,
CK_ROLE_ACTIVATION, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 1, NULL, NULL,
CK_DT_FP32},
26 {
"wq",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
27 {
"bq",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
28 {
"wk",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
29 {
"bk",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
30 {
"wv",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
31 {
"bv",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
32 {
"q",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
33 {
"k",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
34 {
"v",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
35 {
"scores",
CK_SCOPE_LAYER,
CK_ROLE_ACTIVATION, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_CTX, 1, 1 }, {
CK_DIM_ALIGNED_CTX, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL,
"training_enabled",
CK_DT_FP32},
36 {
"attn_out",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
37 {
"wo",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
38 {
"bo",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
39 {
"proj_tmp",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
40 {
"proj_scratch",
CK_SCOPE_LAYER,
CK_ROLE_SCRATCH, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
41 {
"residual1",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
42 {
"ln2_gamma",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
43 {
"ln2_out",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
44 {
"ln2_rstd",
CK_SCOPE_LAYER,
CK_ROLE_ACTIVATION, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 1, NULL, NULL,
CK_DT_FP32},
45 {
"w1",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_ALIGNED_INTERMEDIATE, 2, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
46 {
"b1",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_ALIGNED_INTERMEDIATE, 2, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
47 {
"fc1_out",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_INTERMEDIATE, 2, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
48 {
"swiglu_out",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_INTERMEDIATE, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
49 {
"w2",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_ALIGNED_INTERMEDIATE, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
50 {
"b2",
CK_SCOPE_LAYER,
CK_ROLE_WEIGHT, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
51 {
"mlp_out",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
52 {
"output",
CK_SCOPE_LAYER,
CK_ROLE_OUTPUT, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
53 {
"d_output",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
54 {
"d_residual1",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
55 {
"d_mlp_out",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
56 {
"d_swiglu_out",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_INTERMEDIATE, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
57 {
"d_w2",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_ALIGNED_INTERMEDIATE, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
58 {
"d_b2",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
59 {
"d_fc1_out",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_INTERMEDIATE, 2, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
60 {
"d_ln2_out",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
61 {
"d_w1",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_ALIGNED_INTERMEDIATE, 2, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
62 {
"d_b1",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_ALIGNED_INTERMEDIATE, 2, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
63 {
"d_ln2_gamma",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
64 {
"d_input",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
65 {
"d_proj_tmp",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
66 {
"d_attn_out",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
67 {
"d_wo",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
68 {
"d_bo",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
69 {
"d_q",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
70 {
"d_k",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
71 {
"d_v",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
72 {
"d_scores",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_CTX, 1, 1 }, {
CK_DIM_ALIGNED_CTX, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
73 {
"d_ln1_out",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_TOKENS, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
74 {
"d_wq",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
75 {
"d_bq",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
76 {
"d_wk",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
77 {
"d_bk",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
78 {
"d_wv",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
79 {
"d_bv",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_NUM_KV_HEADS, 1, 1 }, {
CK_DIM_ALIGNED_HEAD, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
80 {
"d_ln1_gamma",
CK_SCOPE_LAYER,
CK_ROLE_GRAD, { {
CK_DIM_ALIGNED_EMBED, 1, 1 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 }, {
CK_DIM_END, 0, 0 } }, 0, NULL, NULL,
CK_DT_FP32},
86 {
"attention", {
"attention_forward_causal_head_major_gqa",
"attention_forward_causal_head_major_gqa_bf16", NULL, NULL, NULL }, {
"attention_backward_causal_head_major_gqa",
"attention_backward_causal_head_major_gqa_bf16", NULL, NULL, NULL },
CK_DT_MASK(
CK_DT_FP32) |
CK_DT_MASK(
CK_DT_BF16),
CK_DT_FP32, {
"src/kernels/attention_kernels.c",
"src/kernels/softmax_kernels.c", NULL, NULL, NULL, NULL, NULL, NULL }},
87 {
"attn_proj", {
"ck_attention_project_head_major", NULL, NULL, NULL, NULL }, {
"ck_attention_project_head_major_backward", NULL, NULL, NULL, NULL },
CK_DT_MASK(
CK_DT_FP32),
CK_DT_FP32, {
"src/ckernel_orchestration.c",
"src/kernels/gemm_kernels.c",
"src/kernels/mlp_kernels.c",
"src/kernels/gelu_kernels.c", NULL, NULL, NULL, NULL }},
88 {
"mlp_down", {
"gemm_blocked_serial", NULL, NULL, NULL, NULL }, {
"fc2_backward_kernel", NULL, NULL, NULL, NULL },
CK_DT_MASK(
CK_DT_FP32),
CK_DT_FP32, {
"src/kernels/gemm_kernels.c",
"src/kernels/mlp_kernels.c",
"src/kernels/gelu_kernels.c", NULL, NULL, NULL, NULL, NULL }},
89 {
"mlp_up", {
"gemm_blocked_serial", NULL, NULL, NULL, NULL }, {
"fc1_backward_kernel", NULL, NULL, NULL, NULL },
CK_DT_MASK(
CK_DT_FP32),
CK_DT_FP32, {
"src/kernels/gemm_kernels.c",
"src/kernels/mlp_kernels.c",
"src/kernels/gelu_kernels.c", NULL, NULL, NULL, NULL, NULL }},
90 {
"qkv_project", {
"ck_qkv_project_head_major", NULL, NULL, NULL, NULL }, {
"ck_qkv_project_head_major_backward", NULL, NULL, NULL, NULL },
CK_DT_MASK(
CK_DT_FP32),
CK_DT_FP32, {
"src/ckernel_orchestration.c",
"src/kernels/gemm_kernels.c",
"src/kernels/mlp_kernels.c",
"src/kernels/gelu_kernels.c", NULL, NULL, NULL, NULL }},
91 {
"residual_add", {
"ck_residual_add_token_major", NULL, NULL, NULL, NULL }, {
"ck_residual_add_backward", NULL, NULL, NULL, NULL },
CK_DT_MASK(
CK_DT_FP32),
CK_DT_FP32, {
"src/ckernel_orchestration.c", NULL, NULL, NULL, NULL, NULL, NULL, NULL }},
92 {
"rmsnorm", {
"rmsnorm_forward",
"rmsnorm_forward_bf16", NULL,
"rmsnorm_forward_int8",
"rmsnorm_forward_int4" }, {
"rmsnorm_backward",
"rmsnorm_backward_bf16", NULL,
"rmsnorm_backward_int8",
"rmsnorm_backward_int4" },
CK_DT_MASK(
CK_DT_FP32) |
CK_DT_MASK(
CK_DT_BF16) |
CK_DT_MASK(
CK_DT_INT8) |
CK_DT_MASK(
CK_DT_INT4),
CK_DT_FP32, {
"src/kernels/rmsnorm_kernels.c",
"src/kernels/rmsnorm_kernels_bf16.c",
"src/kernels/rmsnorm_kernels_int8.c",
"src/kernels/rmsnorm_kernels_int4.c", NULL, NULL, NULL, NULL }},
93 {
"rope", {
"rope_forward_qk",
"rope_forward_qk_bf16", NULL, NULL, NULL }, {
"rope_backward_qk",
"rope_backward_qk_bf16", NULL, NULL, NULL },
CK_DT_MASK(
CK_DT_FP32) |
CK_DT_MASK(
CK_DT_BF16),
CK_DT_FP32, {
"src/kernels/rope_kernels.c",
"src/kernels/rope_kernels_bf16.c", NULL, NULL, NULL, NULL, NULL, NULL }},
94 {
"swiglu", {
"swiglu_forward",
"swiglu_forward_bf16", NULL, NULL, NULL }, {
"swiglu_backward",
"swiglu_backward_bf16", NULL, NULL, NULL },
CK_DT_MASK(
CK_DT_FP32) |
CK_DT_MASK(
CK_DT_BF16),
CK_DT_FP32, {
"src/kernels/swiglu_kernels.c",
"src/kernels/swiglu_kernels_bf16.c",
"src/kernels/sigmoid_kernels.c", NULL, NULL, NULL, NULL, NULL }},
101 {
"qkv_project", NULL},
102 {
"rope",
"rope_theta>0"},
105 {
"residual_add", NULL},
110 {
"residual_add", NULL},
116 {
"residual_add", NULL},
121 {
"residual_add", NULL},
124 {
"rope",
"rope_theta>0"},
125 {
"qkv_project", NULL},
133 {
"gamma",
"ln1_gamma"},
135 {
"rstd",
"ln1_rstd"},
139 {
"input",
"ln1_out"},
154 {
"cos_cache",
"rope_cos_cache"},
155 {
"sin_cache",
"rope_sin_cache"},
162 {
"scores",
"scores"},
163 {
"attn_out",
"attn_out"},
167 {
"attn_out",
"attn_out"},
170 {
"proj_tmp",
"proj_tmp"},
171 {
"proj_scratch",
"proj_scratch"},
177 {
"out",
"residual1"},
181 {
"input",
"residual1"},
182 {
"gamma",
"ln2_gamma"},
184 {
"rstd",
"ln2_rstd"},
188 {
"input",
"ln2_out"},
191 {
"fc1_out",
"fc1_out"},
195 {
"fc1_out",
"fc1_out"},
196 {
"swiglu_out",
"swiglu_out"},
200 {
"swiglu_out",
"swiglu_out"},
203 {
"mlp_out",
"mlp_out"},
213 {
"d_out",
"d_output"},
214 {
"d_a",
"d_residual1"},
215 {
"d_b",
"d_mlp_out"},
219 {
"d_out",
"d_mlp_out"},
220 {
"swiglu_out",
"swiglu_out"},
222 {
"d_input",
"d_swiglu_out"},
228 {
"fc1_out",
"fc1_out"},
229 {
"d_out",
"d_swiglu_out"},
230 {
"d_input",
"d_fc1_out"},
234 {
"d_out",
"d_fc1_out"},
235 {
"input",
"ln2_out"},
237 {
"d_input",
"d_ln2_out"},
243 {
"d_out",
"d_ln2_out"},
244 {
"input",
"residual1"},
245 {
"gamma",
"ln2_gamma"},
246 {
"rstd",
"ln2_rstd"},
247 {
"d_input",
"d_residual1"},
248 {
"d_gamma",
"d_ln2_gamma"},
252 {
"d_out",
"d_residual1"},
254 {
"d_b",
"d_proj_tmp"},
258 {
"d_out",
"d_proj_tmp"},
259 {
"attn_out",
"attn_out"},
261 {
"d_attn_out",
"d_attn_out"},
267 {
"d_out",
"d_attn_out"},
271 {
"scores",
"scores"},
275 {
"d_scores",
"d_scores"},
283 {
"cos_cache",
"rope_cos_cache"},
284 {
"sin_cache",
"rope_sin_cache"},
291 {
"input",
"ln1_out"},
295 {
"d_input",
"d_ln1_out"},
305 {
"d_out",
"d_ln1_out"},
307 {
"gamma",
"ln1_gamma"},
308 {
"rstd",
"ln1_rstd"},
309 {
"d_input",
"d_input"},
310 {
"d_gamma",
"d_ln1_gamma"},
static const CKPlanBinding ck_decoder_forward_bindings_7[]
static const CKPlanBinding ck_decoder_forward_bindings_9[]
static const CKPlanBinding ck_decoder_backward_bindings_9[]
static const CKPlanBinding ck_decoder_backward_bindings_2[]
const CKPlanStepV2 ck_decoder_forward_plan_v2[]
static const CKPlanBinding ck_decoder_forward_bindings_1[]
static const CKPlanBinding ck_decoder_backward_bindings_6[]
const CKPlanStep ck_decoder_forward_plan[]
static const CKPlanBinding ck_decoder_forward_bindings_6[]
static const CKPlanBinding ck_decoder_forward_bindings_0[]
const size_t ck_decoder_backward_plan_count
const size_t ck_decoder_forward_plan_v2_count
static const CKPlanBinding ck_decoder_backward_bindings_0[]
static const CKPlanBinding ck_decoder_backward_bindings_10[]
static const CKPlanBinding ck_decoder_forward_bindings_8[]
static const CKPlanBinding ck_decoder_forward_bindings_3[]
static const CKPlanBinding ck_decoder_forward_bindings_4[]
static const CKPlanBinding ck_decoder_backward_bindings_7[]
const size_t ck_decoder_backward_plan_v2_count
static const CKPlanBinding ck_decoder_forward_bindings_5[]
const size_t ck_decoder_forward_plan_count
static const CKPlanBinding ck_decoder_backward_bindings_5[]
const CKPlanStep ck_decoder_backward_plan[]
static const CKPlanBinding ck_decoder_forward_bindings_10[]
static const CKPlanBinding ck_decoder_forward_bindings_2[]
const CKKernelSpec ck_kernel_specs[]
static const CKPlanBinding ck_decoder_backward_bindings_8[]
static const CKPlanBinding ck_decoder_backward_bindings_1[]
static const CKPlanBinding ck_decoder_backward_bindings_3[]
const CKPlanStepV2 ck_decoder_backward_plan_v2[]
const CKBufferSpec ck_decoder_buffers[]
const size_t ck_kernel_spec_count
const size_t ck_decoder_buffer_count
static const CKPlanBinding ck_decoder_backward_bindings_4[]
@ CK_DIM_ALIGNED_INTERMEDIATE