← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ckernel_orchestration.c
Go to the documentation of this file.
1 /**
2  * @file ckernel_orchestration.c
3  *
4  * ===========================================================================
5  * LEGACY CODE - NOT USED IN v6.6
6  * ===========================================================================
7  *
8  * This file contains v6.5 orchestration code that is NO LONGER USED.
9  * It is kept for reference and potential future use but is NOT compiled
10  * into the v6.6 engine.
11  *
12  * v6.6 Architecture:
13  * - IR Lower 3 handles all orchestration via dataflow graph
14  * - Kernel dispatch via ckernel_codegen.c (for dynamically loaded kernels)
15  * - Memory planning via memory_planner_v6_6.py
16  *
17  * Contents of this file (NOT used):
18  * - ck_attention_flash_decode_wrapper: Flash attention wrapper (use
19  * mega_fused_attention_prefill/avx instead)
20  * - ck_quantized_gemm: Dispatcher for Q4_K, Q5_0, Q5_1, Q6_K, Q8_0
21  * (use kernel_maps/KERNEL_REGISTRY.json + codegen instead)
22  *
23  * To remove completely:
24  * 1. Delete this file
25  * 2. Remove from Makefile SRCS list
26  * 3. Remove ckernel_orchestration.h
27  *
28  * Last used: v6.5
29  * Deprecated: v6.6 (2026-02)
30  * ===========================================================================
31  */
32 
33 #include "ckernel_orchestration.h"
34 
35 #include "ckernel_engine.h"
36 #include "ckernel_dtype.h"
37 #include "ckernel_quant.h"
38 
39 #include <stddef.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <math.h>
44 
45 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
46 #include <immintrin.h>
47 #endif
48 
49 /* ============================================================================
50  * TRUE FLASH ATTENTION (O(1) for decode)
51  *
52  * New implementation based on Tri Dao's Flash Attention algorithm.
53  * Provides O(1) complexity for decode instead of O(n).
54  *
55  * Reference: attention_flash_decode() in src/kernels/attention_flash_true.c
56  * ============================================================================ */
57 
58 /**
59  * @brief Wrapper to call TRUE flash attention from orchestration layer
60  *
61  * @param q_token Query token [H, D_h]
62  * @param k_cache Cached keys [T_k, H, D_h]
63  * @param v_cache Cached values [T_k, H, D_h]
64  * @param out_token Output [H, D_h]
65  * @param num_heads Number of heads
66  * @param num_kv_heads Number of KV heads (for GQA)
67  * @param kv_tokens Number of tokens in KV cache
68  * @param cache_capacity Cache capacity
69  * @param head_dim Head dimension
70  * @param aligned_head_dim Aligned head dimension
71  */
73  const float *q_token,
74  const float *k_cache,
75  const float *v_cache,
76  float *out_token,
77  int num_heads,
78  int num_kv_heads,
79  int kv_tokens,
80  int cache_capacity,
81  int head_dim,
82  int aligned_head_dim)
83 {
84  if (!q_token || !k_cache || !v_cache || !out_token) {
85  return;
86  }
87  if (num_heads <= 0 || num_kv_heads <= 0 || kv_tokens <= 0 || cache_capacity <= 0) {
88  return;
89  }
90  if (kv_tokens > cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) {
91  return;
92  }
93 
94  static int use_strict = -1;
95  if (use_strict < 0) {
96  const char *env = getenv("CK_FLASH_ATTN_STRICT");
97  use_strict = (env && env[0] && env[0] != '0') ? 1 : 0;
98  }
99 
100  if (use_strict) {
102  k_cache,
103  v_cache,
104  out_token,
105  num_heads,
106  num_kv_heads,
107  kv_tokens,
108  cache_capacity,
109  head_dim,
110  aligned_head_dim);
111  return;
112  }
113 
114  // Scale factor: 1/sqrt(head_dim)
115  const float scale = 1.0f / sqrtf((float)head_dim);
116  const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
117 
118 #pragma omp parallel for schedule(static) if(num_heads > 1)
119  for (int h = 0; h < num_heads; ++h) {
120  const int kv_head = (int)((long long)h * (long long)num_kv_heads / (long long)num_heads);
121  const float *q_head = q_token + (size_t)h * (size_t)aligned_head_dim;
122  const float *k_head = k_cache + (size_t)kv_head * head_stride;
123  const float *v_head = v_cache + (size_t)kv_head * head_stride;
124  float *out_head = out_token + (size_t)h * (size_t)aligned_head_dim;
125 
126  // Use aligned_head_dim as D_h so per-token stride matches the cache layout.
127  attention_flash_decode(out_head,
128  q_head,
129  k_head,
130  v_head,
131  1,
132  kv_tokens,
133  1,
134  aligned_head_dim,
135  scale);
136  }
137 }
138 
void ck_residual_add_token_major(const float *a,
                                 const float *b,
                                 float *out,
                                 int tokens,
                                 int aligned_embed_dim)
{
    /**
     * Element-wise residual add over token-major [tokens, aligned_embed_dim]
     * buffers: out[i] = a[i] + b[i].
     *
     * Fix: guard against NULL pointers and non-positive dimensions for
     * consistency with ck_residual_add_backward(), which already validates
     * its inputs before touching memory.
     */
    if (!a || !b || !out || tokens <= 0 || aligned_embed_dim <= 0) {
        return;
    }
    size_t total = (size_t)tokens * (size_t)aligned_embed_dim;
    for (size_t i = 0; i < total; ++i) {
        out[i] = a[i] + b[i];
    }
}
150 
void ck_residual_add_backward(const float *d_out,
                              float *d_a,
                              float *d_b,
                              int tokens,
                              int aligned_embed_dim)
{
    /* Backward pass of a residual add: the gradient flows unchanged into
     * both branches, so each input gradient is a plain copy of d_out. */
    if (d_out == NULL || d_a == NULL || d_b == NULL) {
        return;
    }
    const size_t count = (size_t)tokens * (size_t)aligned_embed_dim;
    for (size_t idx = 0; idx < count; ++idx) {
        const float g = d_out[idx];
        d_a[idx] = g;
        d_b[idx] = g;
    }
}
167 
void ck_qkv_project_head_major(const float *input,
                               const float *wq, const float *bq,
                               const float *wk, const float *bk,
                               const float *wv, const float *bv,
                               float *q, float *k, float *v,
                               int tokens,
                               int kv_stride_tokens,
                               int aligned_embed_dim,
                               int num_heads,
                               int num_kv_heads,
                               int aligned_head_dim)
{
    /* Per-head Q/K/V projections producing head-major outputs:
     *   q: [num_heads,    tokens,           aligned_head_dim]
     *   k: [num_kv_heads, kv_stride_tokens, aligned_head_dim]
     *   v: [num_kv_heads, kv_stride_tokens, aligned_head_dim]
     * Weights are one [aligned_head_dim, aligned_embed_dim] matrix per head;
     * the optional biases are one aligned_head_dim vector per head. */
    if (input == NULL || wq == NULL || wk == NULL || wv == NULL ||
        q == NULL || k == NULL || v == NULL) {
        return;
    }
    /* K/V buffers must hold at least `tokens` rows per head. */
    if (kv_stride_tokens < tokens) {
        return;
    }

    const size_t w_per_head = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
    const size_t q_per_head = (size_t)tokens * (size_t)aligned_head_dim;
    const size_t kv_per_head = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;

    for (int head = 0; head < num_heads; ++head) {
        const size_t b_off = (size_t)head * (size_t)aligned_head_dim;
        gemm_blocked_serial(input,
                            wq + (size_t)head * w_per_head,
                            (bq != NULL) ? bq + b_off : NULL,
                            q + (size_t)head * q_per_head,
                            tokens, aligned_head_dim, aligned_embed_dim);
    }

    for (int head = 0; head < num_kv_heads; ++head) {
        const size_t b_off = (size_t)head * (size_t)aligned_head_dim;
        gemm_blocked_serial(input,
                            wk + (size_t)head * w_per_head,
                            (bk != NULL) ? bk + b_off : NULL,
                            k + (size_t)head * kv_per_head,
                            tokens, aligned_head_dim, aligned_embed_dim);
        gemm_blocked_serial(input,
                            wv + (size_t)head * w_per_head,
                            (bv != NULL) ? bv + b_off : NULL,
                            v + (size_t)head * kv_per_head,
                            tokens, aligned_head_dim, aligned_embed_dim);
    }
}
216 
static int ck_layer_debug_enabled(void)
{
    /* Returns 1 when CK_LAYER_DEBUG starts with '1', 'y', or 'Y', else 0.
     * The getenv() lookup runs once; the result is memoized (-2 = unset). */
    static int cached = -2;
    if (cached == -2) {
        const char *flag = getenv("CK_LAYER_DEBUG");
        cached = (flag != NULL &&
                  (flag[0] == '1' || flag[0] == 'y' || flag[0] == 'Y')) ? 1 : 0;
    }
    return cached;
}
231 
static void ck_debug_check_buffer(const char *stage, const float *buf, int size)
{
    /* Debug-only scan of an fp32 buffer: reports NaN/Inf counts, or the
     * finite min/max range, to stderr. No-op unless CK_LAYER_DEBUG is on. */
    if (!ck_layer_debug_enabled() || buf == NULL) {
        return;
    }
    int n_nan = 0;
    int n_inf = 0;
    float lo = 1e38f;
    float hi = -1e38f;
    for (int i = 0; i < size; ++i) {
        const float x = buf[i];
        if (isnan(x)) {
            ++n_nan;
        } else if (isinf(x)) {
            ++n_inf;
        } else {
            lo = (x < lo) ? x : lo;
            hi = (x > hi) ? x : hi;
        }
    }
    if (n_nan || n_inf) {
        fprintf(stderr, "[LAYER_DEBUG] %-30s size=%5d nan=%d inf=%d\n",
                stage, size, n_nan, n_inf);
    } else {
        fprintf(stderr, "[LAYER_DEBUG] %-30s size=%5d range=[%.3e, %.3e]\n",
                stage, size, lo, hi);
    }
}
258 
259 static void ck_debug_check_q8k(const char *stage, const void *q8_buf, int num_blocks)
260 {
261  if (!ck_layer_debug_enabled() || !q8_buf) {
262  return;
263  }
264  const block_q8_K *blocks = (const block_q8_K *)q8_buf;
265  int nan_scale = 0, inf_scale = 0;
266  float min_d = 1e38f, max_d = -1e38f;
267  for (int i = 0; i < num_blocks; ++i) {
268  float d = blocks[i].d;
269  if (isnan(d)) {
270  nan_scale++;
271  } else if (isinf(d)) {
272  inf_scale++;
273  } else {
274  if (d < min_d) min_d = d;
275  if (d > max_d) max_d = d;
276  }
277  }
278  if (nan_scale > 0 || inf_scale > 0) {
279  fprintf(stderr, "[LAYER_DEBUG] %-30s blocks=%d nan_scale=%d inf_scale=%d\n",
280  stage, num_blocks, nan_scale, inf_scale);
281  } else {
282  fprintf(stderr, "[LAYER_DEBUG] %-30s blocks=%d scale_range=[%.3e, %.3e]\n",
283  stage, num_blocks, min_d, max_d);
284  }
285 }
286 
287 static void ck_debug_check_q4k_weights(const char *stage, const void *q4_buf, int num_blocks)
288 {
289  if (!ck_layer_debug_enabled() || !q4_buf) {
290  return;
291  }
292  const block_q4_K *blocks = (const block_q4_K *)q4_buf;
293  int nan_d = 0, nan_dmin = 0;
294  float min_d = 1e38f, max_d = -1e38f;
295  for (int i = 0; i < num_blocks; ++i) {
296  float d = CK_FP16_TO_FP32(blocks[i].d);
297  float dm = CK_FP16_TO_FP32(blocks[i].dmin);
298  if (isnan(d)) nan_d++;
299  if (isnan(dm)) nan_dmin++;
300  if (!isnan(d) && !isinf(d)) {
301  if (d < min_d) min_d = d;
302  if (d > max_d) max_d = d;
303  }
304  }
305  if (nan_d > 0 || nan_dmin > 0) {
306  fprintf(stderr, "[LAYER_DEBUG] %-30s blocks=%d nan_d=%d nan_dmin=%d\n",
307  stage, num_blocks, nan_d, nan_dmin);
308  } else {
309  fprintf(stderr, "[LAYER_DEBUG] %-30s blocks=%d d_range=[%.3e, %.3e]\n",
310  stage, num_blocks, min_d, max_d);
311  }
312 }
313 
315 {
316  static int cached = -2;
317  if (cached != -2) {
318  return cached;
319  }
320 
321  const char *env = getenv("CK_Q8K_ACTIVATIONS");
322  if (!env || !env[0]) {
323  cached = ck_strict_parity_enabled() ? 0 : 1;
324  return cached;
325  }
326  if (env[0] == '0' || env[0] == 'n' || env[0] == 'N' ||
327  env[0] == 'f' || env[0] == 'F') {
328  cached = 0;
329  } else {
330  cached = 1;
331  }
332  return cached;
333 }
334 
335 void ck_gemm_nt_quant(const float *A,
336  const void *B,
337  const float *bias,
338  float *C,
339  int M, int N, int K,
340  CKDataType dtype)
341 {
342  switch (dtype) {
343  case CK_DT_FP32:
344  gemm_blocked_serial(A, (const float *)B, bias, C, M, N, K);
345  break;
346  case CK_DT_Q4_K:
347  gemm_nt_q4_k(A, B, bias, C, M, N, K);
348  break;
349  case CK_DT_Q6_K:
350  gemm_nt_q6_k(A, B, bias, C, M, N, K);
351  break;
352  case CK_DT_Q4_0:
353  gemm_nt_q4_0(A, B, bias, C, M, N, K);
354  break;
355  case CK_DT_Q4_1:
356  gemm_nt_q4_1(A, B, bias, C, M, N, K);
357  break;
358  case CK_DT_Q5_0:
359  gemm_nt_q5_0(A, B, bias, C, M, N, K);
360  break;
361  case CK_DT_Q5_1:
362  gemm_nt_q5_1(A, B, bias, C, M, N, K);
363  break;
364  case CK_DT_Q8_0:
365  gemm_nt_q8_0(A, B, bias, C, M, N, K);
366  break;
367  default:
368  break;
369  }
370 }
371 
372 /* ============================================================================
373  * Q4_K (Q4_K_M) forward-only paths
374  * ============================================================================
375  *
376  * These helpers keep the same activation layouts as the fp32 code paths, but
377  * accept weight matrices stored as GGML-compatible Q4_K blocks. This is meant
378  * for weight-only quantized inference: activations remain fp32 by default, but
379  * decode can switch to Q8_K activations via CK_Q8K_ACTIVATIONS=1.
380  *
381  * Important constraints:
382  * - Q4_K kernels require K (the input dimension) to be a multiple of 256.
383  * - For attention output projection we assume the concatenated head vector
384  * has length aligned_embed_dim (i.e., num_heads * aligned_head_dim matches).
385  */
386 
387 static void ck_qkv_project_head_major_q4_k(const float *input,
388  const void *wq, const float *bq,
389  const void *wk, const float *bk,
390  const void *wv, const float *bv,
391  float *q, float *k, float *v,
392  int tokens,
393  int kv_stride_tokens,
394  int aligned_embed_dim,
395  int num_heads,
396  int num_kv_heads,
397  int aligned_head_dim)
398 {
399  if (!input || !wq || !wk || !wv || !q || !k || !v) {
400  return;
401  }
402  if (kv_stride_tokens < tokens) {
403  return;
404  }
405 
406  const size_t head_w_elems = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
407  const size_t head_w_bytes = ck_dtype_row_bytes(CK_DT_Q4_K, head_w_elems);
408  const size_t q_head_stride = (size_t)tokens * (size_t)aligned_head_dim;
409  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
410 
411  const uint8_t *wq_bytes = (const uint8_t *)wq;
412  const uint8_t *wk_bytes = (const uint8_t *)wk;
413  const uint8_t *wv_bytes = (const uint8_t *)wv;
414 
415  for (int h = 0; h < num_heads; ++h) {
416  const void *wq_h = wq_bytes + (size_t)h * head_w_bytes;
417  const float *bq_h = bq ? (bq + (size_t)h * (size_t)aligned_head_dim) : NULL;
418  float *q_h = q + (size_t)h * q_head_stride;
419 
420  gemm_nt_q4_k(input, wq_h, bq_h, q_h,
421  tokens, aligned_head_dim, aligned_embed_dim);
422  }
423 
424  for (int h = 0; h < num_kv_heads; ++h) {
425  const void *wk_h = wk_bytes + (size_t)h * head_w_bytes;
426  const void *wv_h = wv_bytes + (size_t)h * head_w_bytes;
427 
428  const float *bk_h = bk ? (bk + (size_t)h * (size_t)aligned_head_dim) : NULL;
429  const float *bv_h = bv ? (bv + (size_t)h * (size_t)aligned_head_dim) : NULL;
430 
431  float *k_h = k + (size_t)h * kv_head_stride;
432  float *v_h = v + (size_t)h * kv_head_stride;
433 
434  gemm_nt_q4_k(input, wk_h, bk_h, k_h,
435  tokens, aligned_head_dim, aligned_embed_dim);
436  gemm_nt_q4_k(input, wv_h, bv_h, v_h,
437  tokens, aligned_head_dim, aligned_embed_dim);
438  }
439 }
440 
441 static void ck_qkv_project_head_major_quant(const float *input,
442  const void *wq, const float *bq, CKDataType wq_dtype,
443  const void *wk, const float *bk, CKDataType wk_dtype,
444  const void *wv, const float *bv, CKDataType wv_dtype,
445  float *q, float *k, float *v,
446  int tokens,
447  int kv_stride_tokens,
448  int aligned_embed_dim,
449  int num_heads,
450  int num_kv_heads,
451  int aligned_head_dim)
452 {
453  if (!input || !wq || !wk || !wv || !q || !k || !v) {
454  return;
455  }
456  if (kv_stride_tokens < tokens) {
457  return;
458  }
459 
460  const size_t head_w_elems = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
461  const size_t q_head_stride = (size_t)tokens * (size_t)aligned_head_dim;
462  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
463 
464  const size_t wq_head_bytes = ck_dtype_row_bytes(wq_dtype, head_w_elems);
465  const size_t wk_head_bytes = ck_dtype_row_bytes(wk_dtype, head_w_elems);
466  const size_t wv_head_bytes = ck_dtype_row_bytes(wv_dtype, head_w_elems);
467 
468  const uint8_t *wq_bytes = (const uint8_t *)wq;
469  const uint8_t *wk_bytes = (const uint8_t *)wk;
470  const uint8_t *wv_bytes = (const uint8_t *)wv;
471 
472  for (int h = 0; h < num_heads; ++h) {
473  const void *wq_h = (wq_dtype == CK_DT_FP32)
474  ? (const void *)((const float *)wq + (size_t)h * head_w_elems)
475  : (const void *)(wq_bytes + (size_t)h * wq_head_bytes);
476  const float *bq_h = bq ? (bq + (size_t)h * (size_t)aligned_head_dim) : NULL;
477  float *q_h = q + (size_t)h * q_head_stride;
478 
479  ck_gemm_nt_quant(input, wq_h, bq_h, q_h,
480  tokens, aligned_head_dim, aligned_embed_dim, wq_dtype);
481  }
482 
483  for (int h = 0; h < num_kv_heads; ++h) {
484  const void *wk_h = (wk_dtype == CK_DT_FP32)
485  ? (const void *)((const float *)wk + (size_t)h * head_w_elems)
486  : (const void *)(wk_bytes + (size_t)h * wk_head_bytes);
487  const void *wv_h = (wv_dtype == CK_DT_FP32)
488  ? (const void *)((const float *)wv + (size_t)h * head_w_elems)
489  : (const void *)(wv_bytes + (size_t)h * wv_head_bytes);
490 
491  const float *bk_h = bk ? (bk + (size_t)h * (size_t)aligned_head_dim) : NULL;
492  const float *bv_h = bv ? (bv + (size_t)h * (size_t)aligned_head_dim) : NULL;
493 
494  float *k_h = k + (size_t)h * kv_head_stride;
495  float *v_h = v + (size_t)h * kv_head_stride;
496 
497  ck_gemm_nt_quant(input, wk_h, bk_h, k_h,
498  tokens, aligned_head_dim, aligned_embed_dim, wk_dtype);
499  ck_gemm_nt_quant(input, wv_h, bv_h, v_h,
500  tokens, aligned_head_dim, aligned_embed_dim, wv_dtype);
501  }
502 }
503 
static void ck_attention_project_head_major_q4_k(const float *attn_out,
                                                 const void *wo,
                                                 const float *bo,
                                                 float *out,
                                                 float *scratch,
                                                 int tokens,
                                                 int aligned_embed_dim,
                                                 int num_heads,
                                                 int aligned_head_dim)
{
    /* Attention output projection with a Q4_K weight matrix. The head-major
     * attention output [H, T, ad] is first flattened into token-major
     * [T, H*ad] inside `scratch`, then projected in a single quantized
     * GEMM. Requires num_heads * aligned_head_dim == aligned_embed_dim. */
    if (attn_out == NULL || wo == NULL || out == NULL || scratch == NULL) {
        return;
    }

    const int flat_dim = num_heads * aligned_head_dim;
    if (flat_dim != aligned_embed_dim) {
        return;
    }

    const size_t head_stride = (size_t)tokens * (size_t)aligned_head_dim;
    const size_t head_bytes = (size_t)aligned_head_dim * sizeof(float);

    for (int t = 0; t < tokens; ++t) {
        float *row = scratch + (size_t)t * (size_t)aligned_embed_dim;
        for (int h = 0; h < num_heads; ++h) {
            const float *src =
                attn_out + (size_t)h * head_stride + (size_t)t * (size_t)aligned_head_dim;
            memcpy(row + (size_t)h * (size_t)aligned_head_dim, src, head_bytes);
        }
    }

    gemm_nt_q4_k(scratch, wo, bo, out,
                 tokens, aligned_embed_dim, aligned_embed_dim);
}
539 
540 static void ck_attention_project_head_major_quant(const float *attn_out,
541  const void *wo,
542  const float *bo,
543  float *out,
544  float *scratch,
545  int tokens,
546  int aligned_embed_dim,
547  int num_heads,
548  int aligned_head_dim,
549  CKDataType wo_dtype)
550 {
551  if (!attn_out || !wo || !out || !scratch) {
552  return;
553  }
554 
555  if (wo_dtype == CK_DT_FP32) {
557  (const float *)wo,
558  bo,
559  out,
560  scratch,
561  tokens,
562  aligned_embed_dim,
563  num_heads,
564  aligned_head_dim);
565  return;
566  }
567 
568  const int K = num_heads * aligned_head_dim;
569  if (K != aligned_embed_dim) {
570  return;
571  }
572 
573  const size_t head_in_stride = (size_t)tokens * (size_t)aligned_head_dim;
574 
575  for (int t = 0; t < tokens; ++t) {
576  float *dst = scratch + (size_t)t * (size_t)aligned_embed_dim;
577  for (int h = 0; h < num_heads; ++h) {
578  const float *src = attn_out + (size_t)h * head_in_stride + (size_t)t * (size_t)aligned_head_dim;
579  memcpy(dst + (size_t)h * (size_t)aligned_head_dim,
580  src,
581  (size_t)aligned_head_dim * sizeof(float));
582  }
583  }
584 
585  ck_gemm_nt_quant(scratch, wo, bo, out,
586  tokens, aligned_embed_dim, aligned_embed_dim, wo_dtype);
587 }
588 
static void ck_mlp_swiglu_forward_q4_k(const float *input,
                                       const void *w1,
                                       const float *b1,
                                       const void *w2,
                                       const float *b2,
                                       float *fc1_out,
                                       float *swiglu_out,
                                       float *output,
                                       int tokens,
                                       int aligned_embed_dim,
                                       int aligned_intermediate_dim)
{
    /* SwiGLU MLP with Q4_K weight matrices:
     *   fc1_out    = input * W1^T + b1       (fused gate+up, 2*inter wide)
     *   swiglu_out = swiglu(fc1_out)         (inter wide)
     *   output     = swiglu_out * W2^T + b2  (embed wide)
     */
    const int gate_up_dim = 2 * aligned_intermediate_dim;

    gemm_nt_q4_k(input, w1, b1, fc1_out,
                 tokens, gate_up_dim, aligned_embed_dim);
    swiglu_forward(fc1_out, swiglu_out, tokens, aligned_intermediate_dim);
    gemm_nt_q4_k(swiglu_out, w2, b2, output,
                 tokens, aligned_embed_dim, aligned_intermediate_dim);
}
610 
611 static void ck_mlp_swiglu_forward_quant(const float *input,
612  const void *w1,
613  const float *b1,
614  CKDataType w1_dtype,
615  const void *w2,
616  const float *b2,
617  CKDataType w2_dtype,
618  float *fc1_out,
619  float *swiglu_out,
620  float *output,
621  int tokens,
622  int aligned_embed_dim,
623  int aligned_intermediate_dim)
624 {
625  int up_dim = 2 * aligned_intermediate_dim;
626  ck_gemm_nt_quant(input, w1, b1, fc1_out,
627  tokens, up_dim, aligned_embed_dim, w1_dtype);
628 
629  swiglu_forward(fc1_out, swiglu_out, tokens, aligned_intermediate_dim);
630 
631  ck_gemm_nt_quant(swiglu_out, w2, b2, output,
632  tokens, aligned_embed_dim, aligned_intermediate_dim, w2_dtype);
633 }
634 
635 static void ck_mlp_swiglu_forward_q4_k_q8_k(const float *input,
636  const void *w1,
637  const float *b1,
638  const void *w2,
639  const float *b2,
640  float *fc1_out,
641  float *swiglu_out,
642  float *output,
643  int aligned_embed_dim,
644  int aligned_intermediate_dim)
645 {
646  if (!input || !w1 || !w2 || !fc1_out || !swiglu_out || !output) {
647  return;
648  }
649  if ((aligned_embed_dim % QK_K) != 0 || (aligned_intermediate_dim % QK_K) != 0) {
650  return;
651  }
652 
653  const int up_dim = 2 * aligned_intermediate_dim;
654  const int q8_blocks_embed = aligned_embed_dim / QK_K;
655  const int q8_blocks_inter = aligned_intermediate_dim / QK_K;
656  const int q8_blocks_max = (q8_blocks_embed > q8_blocks_inter) ? q8_blocks_embed : q8_blocks_inter;
657  block_q8_K q8_buf[q8_blocks_max];
658 
659  quantize_row_q8_k(input, q8_buf, aligned_embed_dim);
660  gemm_nt_q4_k_q8_k(q8_buf, w1, b1, fc1_out,
661  /*M=*/1, /*N=*/up_dim, /*K=*/aligned_embed_dim);
662 
663  swiglu_forward(fc1_out, swiglu_out, /*tokens=*/1, aligned_intermediate_dim);
664 
665  quantize_row_q8_k(swiglu_out, q8_buf, aligned_intermediate_dim);
666  gemm_nt_q4_k_q8_k(q8_buf, w2, b2, output,
667  /*M=*/1, /*N=*/aligned_embed_dim, /*K=*/aligned_intermediate_dim);
668 }
669 
static void ck_qkv_project_head_major_ref(const float *input,
                                          const float *wq, const float *bq,
                                          const float *wk, const float *bk,
                                          const float *wv, const float *bv,
                                          float *q, float *k, float *v,
                                          int tokens,
                                          int kv_stride_tokens,
                                          int aligned_embed_dim,
                                          int num_heads,
                                          int num_kv_heads,
                                          int aligned_head_dim)
{
    /* Reference variant of ck_qkv_project_head_major: identical layouts and
     * validation, but every projection goes through gemm_naive_parallel
     * instead of the blocked serial kernel. */
    if (input == NULL || wq == NULL || wk == NULL || wv == NULL ||
        q == NULL || k == NULL || v == NULL) {
        return;
    }
    if (kv_stride_tokens < tokens) {
        return;
    }

    const size_t w_per_head = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
    const size_t q_per_head = (size_t)tokens * (size_t)aligned_head_dim;
    const size_t kv_per_head = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;

    for (int head = 0; head < num_heads; ++head) {
        const size_t b_off = (size_t)head * (size_t)aligned_head_dim;
        gemm_naive_parallel(input,
                            wq + (size_t)head * w_per_head,
                            (bq != NULL) ? bq + b_off : NULL,
                            q + (size_t)head * q_per_head,
                            tokens, aligned_head_dim, aligned_embed_dim);
    }

    for (int head = 0; head < num_kv_heads; ++head) {
        const size_t b_off = (size_t)head * (size_t)aligned_head_dim;
        gemm_naive_parallel(input,
                            wk + (size_t)head * w_per_head,
                            (bk != NULL) ? bk + b_off : NULL,
                            k + (size_t)head * kv_per_head,
                            tokens, aligned_head_dim, aligned_embed_dim);
        gemm_naive_parallel(input,
                            wv + (size_t)head * w_per_head,
                            (bv != NULL) ? bv + b_off : NULL,
                            v + (size_t)head * kv_per_head,
                            tokens, aligned_head_dim, aligned_embed_dim);
    }
}
718 
static void ck_add_inplace(float *dst,
                           const float *src,
                           int tokens,
                           int aligned_embed_dim)
{
    /* Accumulate src into dst over a [tokens, aligned_embed_dim] buffer. */
    const size_t n = (size_t)tokens * (size_t)aligned_embed_dim;
    for (size_t i = 0; i < n; ++i) {
        dst[i] += src[i];
    }
}
729 
void ck_attention_project_head_major(const float *attn_out,
                                     const float *wo,
                                     const float *bo,
                                     float *out,
                                     float *scratch,
                                     int tokens,
                                     int aligned_embed_dim,
                                     int num_heads,
                                     int aligned_head_dim)
{
    /* Output projection over head-major attention results. Head 0 projects
     * straight into `out` (applying the bias exactly once); every further
     * head projects into `scratch` and is accumulated into `out`.
     * `scratch` is required only when num_heads > 1. */
    if (attn_out == NULL || wo == NULL || out == NULL) {
        return;
    }
    if (num_heads > 1 && scratch == NULL) {
        return;
    }

    const size_t in_stride = (size_t)tokens * (size_t)aligned_head_dim;
    const size_t w_stride = (size_t)aligned_embed_dim * (size_t)aligned_head_dim;

    for (int h = 0; h < num_heads; ++h) {
        const float *in_h = attn_out + (size_t)h * in_stride;
        const float *w_h = wo + (size_t)h * w_stride;

        if (h == 0) {
            gemm_blocked_serial(in_h, w_h, bo, out,
                                tokens, aligned_embed_dim, aligned_head_dim);
        } else {
            gemm_blocked_serial(in_h, w_h, NULL, scratch,
                                tokens, aligned_embed_dim, aligned_head_dim);
            ck_add_inplace(out, scratch, tokens, aligned_embed_dim);
        }
    }
}
764 
static void ck_attention_project_head_major_ref(const float *attn_out,
                                                const float *wo,
                                                const float *bo,
                                                float *out,
                                                float *scratch,
                                                int tokens,
                                                int aligned_embed_dim,
                                                int num_heads,
                                                int aligned_head_dim)
{
    /* Reference variant of ck_attention_project_head_major: same
     * project-then-accumulate scheme, but routed through
     * gemm_naive_parallel instead of the blocked serial kernel. */
    if (attn_out == NULL || wo == NULL || out == NULL) {
        return;
    }
    if (num_heads > 1 && scratch == NULL) {
        return;
    }

    const size_t in_stride = (size_t)tokens * (size_t)aligned_head_dim;
    const size_t w_stride = (size_t)aligned_embed_dim * (size_t)aligned_head_dim;

    for (int h = 0; h < num_heads; ++h) {
        const float *in_h = attn_out + (size_t)h * in_stride;
        const float *w_h = wo + (size_t)h * w_stride;

        if (h == 0) {
            gemm_naive_parallel(in_h, w_h, bo, out,
                                tokens, aligned_embed_dim, aligned_head_dim);
        } else {
            gemm_naive_parallel(in_h, w_h, NULL, scratch,
                                tokens, aligned_embed_dim, aligned_head_dim);
            ck_add_inplace(out, scratch, tokens, aligned_embed_dim);
        }
    }
}
799 
801  const float *attn_out,
802  const float *wo,
803  float *d_attn_out,
804  float *d_wo,
805  float *d_bo,
806  int tokens,
807  int aligned_embed_dim,
808  int num_heads,
809  int aligned_head_dim)
810 {
811  if (!d_out || !attn_out || !wo || !d_attn_out || !d_wo || !d_bo) {
812  return;
813  }
814 
815  // Bias gradient: sum over tokens once (bias is applied once in forward).
816  for (int d = 0; d < aligned_embed_dim; ++d) {
817  d_bo[d] = 0.0f;
818  }
819  for (int t = 0; t < tokens; ++t) {
820  const float *row = d_out + (size_t)t * (size_t)aligned_embed_dim;
821  for (int d = 0; d < aligned_embed_dim; ++d) {
822  d_bo[d] += row[d];
823  }
824  }
825 
826  size_t head_in_stride = (size_t)tokens * (size_t)aligned_head_dim;
827  size_t head_weight_stride = (size_t)aligned_embed_dim * (size_t)aligned_head_dim;
828 
829  float *tmp_b = (float *)calloc((size_t)aligned_embed_dim, sizeof(float));
830  if (!tmp_b) {
831  return;
832  }
833 
834  for (int h = 0; h < num_heads; ++h) {
835  const float *head_in = attn_out + (size_t)h * head_in_stride;
836  const float *wo_h = wo + (size_t)h * head_weight_stride;
837  float *d_head_in = d_attn_out + (size_t)h * head_in_stride;
838  float *d_wo_h = d_wo + (size_t)h * head_weight_stride;
839 
840  memset(tmp_b, 0, (size_t)aligned_embed_dim * sizeof(float));
841  fc2_backward_kernel(d_out,
842  head_in,
843  wo_h,
844  d_head_in,
845  d_wo_h,
846  tmp_b,
847  tokens,
848  aligned_head_dim,
849  aligned_embed_dim,
850  1);
851  }
852 
853  free(tmp_b);
854 }
855 
857  const float *d_k,
858  const float *d_v,
859  const float *input,
860  const float *wq,
861  const float *bq,
862  const float *wk,
863  const float *bk,
864  const float *wv,
865  const float *bv,
866  float *d_input,
867  float *d_wq,
868  float *d_bq,
869  float *d_wk,
870  float *d_bk,
871  float *d_wv,
872  float *d_bv,
873  float *scratch,
874  int tokens,
875  int aligned_embed_dim,
876  int num_heads,
877  int num_kv_heads,
878  int aligned_head_dim,
879  int num_threads)
880 {
881  if (!d_q || !d_k || !d_v || !input || !wq || !wk || !wv ||
882  !d_input || !d_wq || !d_bq || !d_wk || !d_bk || !d_wv || !d_bv || !scratch) {
883  return;
884  }
885 
886  size_t total_in = (size_t)tokens * (size_t)aligned_embed_dim;
887  for (size_t i = 0; i < total_in; ++i) {
888  d_input[i] = 0.0f;
889  }
890 
891  size_t head_weight_stride = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
892  size_t head_out_stride = (size_t)tokens * (size_t)aligned_head_dim;
893 
894  for (int h = 0; h < num_heads; ++h) {
895  const float *d_q_h = d_q + (size_t)h * head_out_stride;
896  const float *wq_h = wq + (size_t)h * head_weight_stride;
897  float *d_wq_h = d_wq + (size_t)h * head_weight_stride;
898  float *d_bq_h = d_bq + (size_t)h * (size_t)aligned_head_dim;
899 
900  fc2_backward_kernel(d_q_h,
901  input,
902  wq_h,
903  scratch,
904  d_wq_h,
905  d_bq_h,
906  tokens,
907  aligned_embed_dim,
908  aligned_head_dim,
909  num_threads);
910  ck_add_inplace(d_input, scratch, tokens, aligned_embed_dim);
911  }
912 
913  for (int h = 0; h < num_kv_heads; ++h) {
914  const float *d_k_h = d_k + (size_t)h * head_out_stride;
915  const float *d_v_h = d_v + (size_t)h * head_out_stride;
916 
917  const float *wk_h = wk + (size_t)h * head_weight_stride;
918  const float *wv_h = wv + (size_t)h * head_weight_stride;
919 
920  float *d_wk_h = d_wk + (size_t)h * head_weight_stride;
921  float *d_wv_h = d_wv + (size_t)h * head_weight_stride;
922 
923  float *d_bk_h = d_bk + (size_t)h * (size_t)aligned_head_dim;
924  float *d_bv_h = d_bv + (size_t)h * (size_t)aligned_head_dim;
925 
926  fc2_backward_kernel(d_k_h,
927  input,
928  wk_h,
929  scratch,
930  d_wk_h,
931  d_bk_h,
932  tokens,
933  aligned_embed_dim,
934  aligned_head_dim,
935  num_threads);
936  ck_add_inplace(d_input, scratch, tokens, aligned_embed_dim);
937 
938  fc2_backward_kernel(d_v_h,
939  input,
940  wv_h,
941  scratch,
942  d_wv_h,
943  d_bv_h,
944  tokens,
945  aligned_embed_dim,
946  aligned_head_dim,
947  num_threads);
948  ck_add_inplace(d_input, scratch, tokens, aligned_embed_dim);
949  }
950 }
951 
void ck_mlp_swiglu_forward(const float *input,
                           const float *w1,
                           const float *b1,
                           const float *w2,
                           const float *b2,
                           float *fc1_out,
                           float *swiglu_out,
                           float *output,
                           int tokens,
                           int aligned_embed_dim,
                           int aligned_intermediate_dim)
{
    /* fp32 SwiGLU MLP:
     *   fc1_out    = input * W1^T + b1       (fused gate+up, 2*inter wide)
     *   swiglu_out = swiglu(fc1_out)         (inter wide)
     *   output     = swiglu_out * W2^T + b2  (embed wide)
     */
    const int gate_up_dim = 2 * aligned_intermediate_dim;

    gemm_blocked_serial(input, w1, b1, fc1_out,
                        tokens, gate_up_dim, aligned_embed_dim);
    swiglu_forward(fc1_out, swiglu_out, tokens, aligned_intermediate_dim);
    gemm_blocked_serial(swiglu_out, w2, b2, output,
                        tokens, aligned_embed_dim, aligned_intermediate_dim);
}
973 
static void ck_mlp_swiglu_forward_ref(const float *input,
                                      const float *w1,
                                      const float *b1,
                                      const float *w2,
                                      const float *b2,
                                      float *fc1_out,
                                      float *swiglu_out,
                                      float *output,
                                      int tokens,
                                      int aligned_embed_dim,
                                      int aligned_intermediate_dim)
{
    /* Reference fp32 SwiGLU MLP: identical dataflow to
     * ck_mlp_swiglu_forward, but both GEMMs use gemm_naive_parallel. */
    const int gate_up_dim = 2 * aligned_intermediate_dim;

    gemm_naive_parallel(input, w1, b1, fc1_out,
                        tokens, gate_up_dim, aligned_embed_dim);
    swiglu_forward(fc1_out, swiglu_out, tokens, aligned_intermediate_dim);
    gemm_naive_parallel(swiglu_out, w2, b2, output,
                        tokens, aligned_embed_dim, aligned_intermediate_dim);
}
995 
/*
 * NOTE(review): this Doxygen listing dropped every hyperlinked line, so the
 * function signature (listing line 996) and the names of most callees below
 * are missing; only their argument lists survive.  Judging from the visible
 * arguments this is a full transformer-layer prefill forward over a layer
 * parameter struct `p`:
 *   RMSNorm(ln1) -> QKV projection -> optional RoPE -> attention (explicit
 *   `scores`-buffer variant or fused variant) -> output projection (Wo) ->
 *   residual add -> RMSNorm(ln2) -> SwiGLU MLP -> final residual into
 *   p->output.  TODO: recover the exact signature/callee names from the
 *   original ckernel_orchestration.c before reusing this code.
 */
997 {
998  if (!p) {
999  return;
1000  }
1001 
/* LN1 / RMSNorm over p->input (callee name lost at listing line 1002). */
1003  p->ln1_gamma,
1004  p->ln1_out,
1005  p->ln1_rstd,
1006  p->tokens,
1007  p->embed_dim,
1008  p->aligned_embed_dim,
1009  p->eps);
1010 
/* QKV projection for all tokens (callee lost at line 1011). */
1012  p->wq, p->bq,
1013  p->wk, p->bk,
1014  p->wv, p->bv,
1015  p->q, p->k, p->v,
1016  p->tokens,
1017  p->tokens,
1018  p->aligned_embed_dim,
1019  p->num_heads,
1020  p->num_kv_heads,
1021  p->aligned_head_dim);
1022 
1023  if (p->rope_cos && p->rope_sin) {
1024  rope_forward_qk(p->q,
1025  p->k,
1026  p->rope_cos,
1027  p->rope_sin,
1028  p->num_heads,
1029  p->num_kv_heads,
1030  p->tokens,
1031  p->head_dim,
1032  p->aligned_head_dim,
1033  p->rope_pos_offset);
1034  }
1035 
/* Attention: scores-buffer path (callee lost at 1037, trailing arg at 1047)
 * vs. fused path without a scores buffer (callee lost at 1049). */
1036  if (p->scores) {
1038  p->k,
1039  p->v,
1040  p->scores,
1041  p->attn_out,
1042  p->num_heads,
1043  p->num_kv_heads,
1044  p->tokens,
1045  p->head_dim,
1046  p->aligned_head_dim,
1048  } else {
1050  p->k,
1051  p->v,
1052  p->attn_out,
1053  p->num_heads,
1054  p->num_kv_heads,
1055  p->tokens,
1056  p->head_dim,
1057  p->aligned_head_dim);
1058  }
1059 
/* Output projection Wo into p->proj_tmp using p->proj_scratch (callee lost
 * at line 1060). */
1061  p->wo,
1062  p->bo,
1063  p->proj_tmp,
1064  p->proj_scratch,
1065  p->tokens,
1066  p->aligned_embed_dim,
1067  p->num_heads,
1068  p->aligned_head_dim);
1069 
/* Residual add (presumably input + proj_tmp -> residual1; callee lost). */
1071  p->proj_tmp,
1072  p->residual1,
1073  p->tokens,
1074  p->aligned_embed_dim);
1075 
/* LN2 / RMSNorm over residual1 (callee lost at line 1076). */
1077  p->ln2_gamma,
1078  p->ln2_out,
1079  p->ln2_rstd,
1080  p->tokens,
1081  p->embed_dim,
1082  p->aligned_embed_dim,
1083  p->eps);
1084 
/* SwiGLU MLP (callee lost at 1085; trailing intermediate-dim arg at 1095). */
1086  p->w1,
1087  p->b1,
1088  p->w2,
1089  p->b2,
1090  p->fc1_out,
1091  p->swiglu_out,
1092  p->mlp_out,
1093  p->tokens,
1094  p->aligned_embed_dim,
1096 
/* Final residual add into p->output (callee lost at line 1097). */
1098  p->mlp_out,
1099  p->output,
1100  p->tokens,
1101  p->aligned_embed_dim);
1102 }
1103 
/*
 * NOTE(review): signature lost at listing line 1104 (hyperlinked line dropped
 * by the Doxygen extraction).  Body mirrors the previous prefill forward
 * stage-for-stage with identical argument lists, so this is presumably the
 * same pipeline routed through different kernel variants (e.g. a reference or
 * differently-parallelized path) — confirm against the original source.
 * Pipeline: RMSNorm -> QKV -> optional RoPE -> attention -> Wo projection ->
 * residual -> RMSNorm -> SwiGLU MLP -> final residual.
 */
1105 {
1106  if (!p) {
1107  return;
1108  }
1109 
/* LN1 / RMSNorm (callee name lost at listing line 1110). */
1111  p->ln1_gamma,
1112  p->ln1_out,
1113  p->ln1_rstd,
1114  p->tokens,
1115  p->embed_dim,
1116  p->aligned_embed_dim,
1117  p->eps);
1118 
/* QKV projection (callee lost at line 1119). */
1120  p->wq, p->bq,
1121  p->wk, p->bk,
1122  p->wv, p->bv,
1123  p->q, p->k, p->v,
1124  p->tokens,
1125  p->tokens,
1126  p->aligned_embed_dim,
1127  p->num_heads,
1128  p->num_kv_heads,
1129  p->aligned_head_dim);
1130 
1131  if (p->rope_cos && p->rope_sin) {
1132  rope_forward_qk(p->q,
1133  p->k,
1134  p->rope_cos,
1135  p->rope_sin,
1136  p->num_heads,
1137  p->num_kv_heads,
1138  p->tokens,
1139  p->head_dim,
1140  p->aligned_head_dim,
1141  p->rope_pos_offset);
1142  }
1143 
/* Attention (callees lost at 1145 / 1157; trailing arg at 1155). */
1144  if (p->scores) {
1146  p->k,
1147  p->v,
1148  p->scores,
1149  p->attn_out,
1150  p->num_heads,
1151  p->num_kv_heads,
1152  p->tokens,
1153  p->head_dim,
1154  p->aligned_head_dim,
1156  } else {
1158  p->k,
1159  p->v,
1160  p->attn_out,
1161  p->num_heads,
1162  p->num_kv_heads,
1163  p->tokens,
1164  p->head_dim,
1165  p->aligned_head_dim);
1166  }
1167 
/* Wo projection with scratch buffer (callee lost at line 1168). */
1169  p->wo,
1170  p->bo,
1171  p->proj_tmp,
1172  p->proj_scratch,
1173  p->tokens,
1174  p->aligned_embed_dim,
1175  p->num_heads,
1176  p->aligned_head_dim);
1177 
/* Residual add (callee lost at line 1178). */
1179  p->proj_tmp,
1180  p->residual1,
1181  p->tokens,
1182  p->aligned_embed_dim);
1183 
/* LN2 / RMSNorm (callee lost at line 1184). */
1185  p->ln2_gamma,
1186  p->ln2_out,
1187  p->ln2_rstd,
1188  p->tokens,
1189  p->embed_dim,
1190  p->aligned_embed_dim,
1191  p->eps);
1192 
/* SwiGLU MLP (callee lost at 1193; trailing intermediate-dim arg at 1203). */
1194  p->w1,
1195  p->b1,
1196  p->w2,
1197  p->b2,
1198  p->fc1_out,
1199  p->swiglu_out,
1200  p->mlp_out,
1201  p->tokens,
1202  p->aligned_embed_dim,
1204 
/* Final residual add into p->output (callee lost at line 1205). */
1206  p->mlp_out,
1207  p->output,
1208  p->tokens,
1209  p->aligned_embed_dim);
1210 }
1211 
/*
 * Single-token SwiGLU MLP using the fused gate/up kernel.
 *
 * w1 packs W_gate followed by W_up (each [aligned_intermediate_dim,
 * aligned_embed_dim]); b1, when present, packs the gate bias followed by the
 * up bias.  The fused GEMM writes the activated intermediate into swiglu_row,
 * then a blocked-serial GEMM applies the down projection (w2/b2) into
 * output_row.  Biases are optional (NULL allowed); missing mandatory
 * pointers make the call a no-op.
 */
void ck_mlp_swiglu_forward_fused_token(const float *input_row,
                                       const float *w1,
                                       const float *b1,
                                       const float *w2,
                                       const float *b2,
                                       float *swiglu_row,
                                       float *output_row,
                                       int aligned_embed_dim,
                                       int aligned_intermediate_dim)
{
    if (input_row == NULL || w1 == NULL || w2 == NULL ||
        swiglu_row == NULL || output_row == NULL) {
        return;
    }

    /* Locate the gate/up halves inside the packed w1 / b1 buffers. */
    const size_t gate_span = (size_t)aligned_intermediate_dim * (size_t)aligned_embed_dim;
    const float *gate_w = w1;
    const float *up_w   = w1 + gate_span;
    const float *gate_b = b1;
    const float *up_b   = (b1 != NULL) ? b1 + aligned_intermediate_dim : NULL;

    /* Fused gate+up GEMM with SwiGLU activation, one row (M = 1). */
    gemm_swiglu_fused(input_row,
                      gate_w,
                      up_w,
                      gate_b,
                      up_b,
                      swiglu_row,
                      /*M=*/1,
                      /*N=*/aligned_intermediate_dim,
                      /*K=*/aligned_embed_dim);

    /* Down projection back to the embedding dimension. */
    gemm_blocked_serial(swiglu_row, w2, b2, output_row,
                        /*M=*/1,
                        /*N=*/aligned_embed_dim,
                        /*K=*/aligned_intermediate_dim);
}
1246 
/*
 * Single-token SwiGLU MLP with all three projections fused into one kernel.
 *
 * Unlike ck_mlp_swiglu_forward_fused_token, the activated intermediate never
 * leaves the kernel: fused_mlp_swiglu_decode_v2 computes gate, up, SwiGLU and
 * the down projection in one pass, avoiding the DRAM round-trip for the
 * intermediate row.
 *
 * Layouts (aligned dimensions — weights are stored with alignment padding):
 *   w1 = [W_gate ; W_up], each [aligned_intermediate_dim, aligned_embed_dim]
 *   b1 = [b_gate ; b_up] when non-NULL
 *   w2 = W_down, [aligned_embed_dim, aligned_intermediate_dim]
 * Missing mandatory pointers make the call a no-op; biases may be NULL.
 */
void ck_mlp_swiglu_forward_fully_fused_token(const float *input_row,
                                             const float *w1,
                                             const float *b1,
                                             const float *w2,
                                             const float *b2,
                                             float *output_row,
                                             int aligned_embed_dim,
                                             int aligned_intermediate_dim)
{
    if (input_row == NULL || w1 == NULL || w2 == NULL || output_row == NULL) {
        return;
    }

    /* Split the packed w1/b1 buffers into their gate and up halves. */
    const size_t gate_span = (size_t)aligned_intermediate_dim * (size_t)aligned_embed_dim;
    const float *gate_w = w1;
    const float *up_w   = w1 + gate_span;
    const float *gate_b = b1;
    const float *up_b   = (b1 != NULL) ? b1 + aligned_intermediate_dim : NULL;

    fused_mlp_swiglu_decode_v2(input_row,
                               gate_w,
                               up_w,
                               /*w_down=*/w2,
                               gate_b,
                               up_b,
                               /*b_down=*/b2,
                               output_row,
                               aligned_embed_dim,
                               aligned_intermediate_dim);
}
1288 
/*
 * NOTE(review): the function-name line (listing 1289) and several callee-name
 * lines were dropped by the Doxygen extraction.  From the surviving body this
 * is the single-token decode forward: normalize the new token, project Q/K/V,
 * apply RoPE at absolute position rope_pos_offset, append K/V to the cache at
 * token_index, run decode attention over the first token_index+1 cached
 * entries, project through Wo, then residual + LN2 + SwiGLU MLP + residual.
 *
 * NOTE(review): q_token/k_token/v_token/attn_token are runtime-sized VLAs
 * (H * aligned_head_dim floats each) — stack usage grows with head count;
 * fine for typical model sizes but worth confirming against large configs.
 */
1290  int token_index,
1291  int cache_capacity)
1292 {
1293  if (!p) {
1294  return;
1295  }
1296  if (!p->input || !p->ln1_gamma || !p->ln2_gamma || !p->ln1_out || !p->ln2_out ||
1297  !p->wq || !p->wk || !p->wv || !p->wo || !p->w1 || !p->w2 ||
1298  !p->k || !p->v ||
1299  !p->proj_tmp || !p->residual1 || !p->fc1_out || !p->swiglu_out || !p->mlp_out || !p->output) {
1300  return;
1301  }
1302  if (token_index < 0 || cache_capacity <= 0 || token_index >= cache_capacity) {
1303  return;
1304  }
1305  if (p->num_heads <= 0 || p->num_kv_heads <= 0 || p->aligned_head_dim <= 0) {
1306  return;
1307  }
1308 
1309  const int D = p->embed_dim;
1310  const int aligned_D = p->aligned_embed_dim;
1311  const int H = p->num_heads;
1312  const int H_kv = p->num_kv_heads;
1313  const int hd = p->head_dim;
1314  const int ad = p->aligned_head_dim;
1315  const int aligned_intermediate = p->aligned_intermediate_dim;
1316 
1317  /* Decode buffers are single-token; token_index only applies to KV cache. */
1318  const size_t token_slot = 0;
1319  const float *input_row = p->input + token_slot * (size_t)aligned_D;
1320  float *ln1_row = p->ln1_out + token_slot * (size_t)aligned_D;
1321  float *ln2_row = p->ln2_out + token_slot * (size_t)aligned_D;
1322  float *proj_row = p->proj_tmp + token_slot * (size_t)aligned_D;
1323  float *residual_row = p->residual1 + token_slot * (size_t)aligned_D;
1324  float *mlp_row = p->mlp_out + token_slot * (size_t)aligned_D;
1325  float *out_row = p->output + token_slot * (size_t)aligned_D;
1326 
1327  float ln1_rstd_tmp = 0.0f;
1328  float ln2_rstd_tmp = 0.0f;
1329  float *ln1_rstd = p->ln1_rstd ? (p->ln1_rstd + token_slot) : &ln1_rstd_tmp;
1330  float *ln2_rstd = p->ln2_rstd ? (p->ln2_rstd + token_slot) : &ln2_rstd_tmp;
1331 
1332  // Scratch for a single token in head-major layout: [head, aligned_head_dim].
1333  size_t q_elems = (size_t)H * (size_t)ad;
1334  size_t kv_elems = (size_t)H_kv * (size_t)ad;
1335  float q_token[q_elems];
1336  float k_token[kv_elems];
1337  float v_token[kv_elems];
1338  float attn_token[q_elems];
1339 
1340  // LN1 / RMSNorm.
1341  rmsnorm_forward(input_row,
1342  p->ln1_gamma,
1343  ln1_row,
1344  ln1_rstd,
1345  /*tokens=*/1,
1346  D,
1347  aligned_D,
1348  p->eps);
1349 
1350  // Project Q/K/V for the new token.
/* (Callee name lost at listing line 1351 — single-token QKV projection
 * taking ln1_row; confirm against the original source.) */
1352  p->wq, p->bq,
1353  p->wk, p->bk,
1354  p->wv, p->bv,
1355  q_token, k_token, v_token,
1356  aligned_D,
1357  H,
1358  H_kv,
1359  ad);
1360 
1361  // RoPE for the new token at absolute position `p->rope_pos_offset`.
1362  if (p->rope_cos && p->rope_sin) {
1363  rope_forward_qk(q_token,
1364  k_token,
1365  p->rope_cos,
1366  p->rope_sin,
1367  H,
1368  H_kv,
1369  /*num_tokens=*/1,
1370  hd,
1371  ad,
1372  p->rope_pos_offset);
1373  }
1374 
1375  // Update KV cache (stores k/v for this token and clears padded lanes).
1376  kv_cache_write_head_major(k_token,
1377  v_token,
1378  p->k,
1379  p->v,
1380  H_kv,
1381  token_index,
1382  cache_capacity,
1383  hd,
1384  ad);
1385 
1386  // Decode attention for this token using the KV cache.
/* (Callee name lost at listing line 1387 — decode attention over q_token and
 * the cache.) */
1388  p->k,
1389  p->v,
1390  attn_token,
1391  H,
1392  H_kv,
1393  /*kv_tokens=*/token_index + 1,
1394  cache_capacity,
1395  hd,
1396  ad);
1397 
1398  // Output projection (Wo) into token-major buffer (decode-specialized).
/* (Callee name lost at listing line 1399 — takes attn_token.) */
1400  p->wo,
1401  p->bo,
1402  proj_row,
1403  D,
1404  aligned_D,
1405  H,
1406  ad);
1407 
1408  // Residual + LN2 / RMSNorm.
1409  ck_residual_add_token_major(input_row,
1410  proj_row,
1411  residual_row,
1412  /*tokens=*/1,
1413  aligned_D);
1414 
1415  rmsnorm_forward(residual_row,
1416  p->ln2_gamma,
1417  ln2_row,
1418  ln2_rstd,
1419  /*tokens=*/1,
1420  D,
1421  aligned_D,
1422  p->eps);
1423 
1424  // MLP block for this token.
1425  int up_dim = 2 * aligned_intermediate;
1426  float *fc1_row = p->fc1_out + token_slot * (size_t)up_dim;
1427  float *swiglu_row = p->swiglu_out + token_slot * (size_t)aligned_intermediate;
1428 
1429  ck_mlp_swiglu_forward(ln2_row,
1430  p->w1,
1431  p->b1,
1432  p->w2,
1433  p->b2,
1434  fc1_row,
1435  swiglu_row,
1436  mlp_row,
1437  /*tokens=*/1,
1438  aligned_D,
1439  aligned_intermediate);
1440 
1441  // Final residual.
1442  ck_residual_add_token_major(residual_row,
1443  mlp_row,
1444  out_row,
1445  /*tokens=*/1,
1446  aligned_D);
1447 }
1448 
/*
 * NOTE(review): function-name line (listing 1449) and several callee-name
 * lines were dropped by the Doxygen extraction.  Same decode pipeline as the
 * previous function, except the MLP uses the fully fused single-token path
 * (note the fc1_out buffer is not required here — only swiglu_out — and the
 * fused MLP callee at listing line 1587 writes straight to mlp_row).
 *
 * NOTE(review): q_token/k_token/v_token/attn_token are runtime-sized VLAs —
 * same stack-usage caveat as the non-fused decode path.
 */
1450  int token_index,
1451  int cache_capacity)
1452 {
1453  if (!p) {
1454  return;
1455  }
1456  if (!p->input || !p->ln1_gamma || !p->ln2_gamma || !p->ln1_out || !p->ln2_out ||
1457  !p->wq || !p->wk || !p->wv || !p->wo || !p->w1 || !p->w2 ||
1458  !p->k || !p->v || !p->swiglu_out ||
1459  !p->proj_tmp || !p->residual1 || !p->mlp_out || !p->output) {
1460  return;
1461  }
1462  if (token_index < 0 || cache_capacity <= 0 || token_index >= cache_capacity) {
1463  return;
1464  }
1465  if (p->num_heads <= 0 || p->num_kv_heads <= 0 || p->aligned_head_dim <= 0) {
1466  return;
1467  }
1468 
1469  const int D = p->embed_dim;
1470  const int aligned_D = p->aligned_embed_dim;
1471  const int H = p->num_heads;
1472  const int H_kv = p->num_kv_heads;
1473  const int hd = p->head_dim;
1474  const int ad = p->aligned_head_dim;
1475  const int aligned_intermediate = p->aligned_intermediate_dim;
1476 
1477  /* Decode buffers are single-token; token_index only applies to KV cache. */
1478  const size_t token_slot = 0;
1479  const float *input_row = p->input + token_slot * (size_t)aligned_D;
1480  float *ln1_row = p->ln1_out + token_slot * (size_t)aligned_D;
1481  float *ln2_row = p->ln2_out + token_slot * (size_t)aligned_D;
1482  float *proj_row = p->proj_tmp + token_slot * (size_t)aligned_D;
1483  float *residual_row = p->residual1 + token_slot * (size_t)aligned_D;
1484  float *swiglu_row = p->swiglu_out + token_slot * (size_t)aligned_intermediate;
1485  float *mlp_row = p->mlp_out + token_slot * (size_t)aligned_D;
1486  float *out_row = p->output + token_slot * (size_t)aligned_D;
1487 
1488  float ln1_rstd_tmp = 0.0f;
1489  float ln2_rstd_tmp = 0.0f;
1490  float *ln1_rstd = p->ln1_rstd ? (p->ln1_rstd + token_slot) : &ln1_rstd_tmp;
1491  float *ln2_rstd = p->ln2_rstd ? (p->ln2_rstd + token_slot) : &ln2_rstd_tmp;
1492 
1493  // Scratch for a single token in head-major layout: [head, aligned_head_dim].
1494  size_t q_elems = (size_t)H * (size_t)ad;
1495  size_t kv_elems = (size_t)H_kv * (size_t)ad;
1496  float q_token[q_elems];
1497  float k_token[kv_elems];
1498  float v_token[kv_elems];
1499  float attn_token[q_elems];
1500 
1501  // LN1 / RMSNorm.
1502  rmsnorm_forward(input_row,
1503  p->ln1_gamma,
1504  ln1_row,
1505  ln1_rstd,
1506  /*tokens=*/1,
1507  D,
1508  aligned_D,
1509  p->eps);
1510 
1511  // Project Q/K/V for the new token.
/* (Callee name lost at listing line 1512.) */
1513  p->wq, p->bq,
1514  p->wk, p->bk,
1515  p->wv, p->bv,
1516  q_token, k_token, v_token,
1517  aligned_D,
1518  H,
1519  H_kv,
1520  ad);
1521 
1522  // RoPE for the new token at absolute position `p->rope_pos_offset`.
1523  if (p->rope_cos && p->rope_sin) {
1524  rope_forward_qk(q_token,
1525  k_token,
1526  p->rope_cos,
1527  p->rope_sin,
1528  H,
1529  H_kv,
1530  /*num_tokens=*/1,
1531  hd,
1532  ad,
1533  p->rope_pos_offset);
1534  }
1535 
1536  // Update KV cache (stores k/v for this token and clears padded lanes).
1537  kv_cache_write_head_major(k_token,
1538  v_token,
1539  p->k,
1540  p->v,
1541  H_kv,
1542  token_index,
1543  cache_capacity,
1544  hd,
1545  ad);
1546 
1547  // Decode attention for this token using the KV cache.
/* (Callee name lost at listing line 1548.) */
1549  p->k,
1550  p->v,
1551  attn_token,
1552  H,
1553  H_kv,
1554  /*kv_tokens=*/token_index + 1,
1555  cache_capacity,
1556  hd,
1557  ad);
1558 
1559  // Output projection (Wo) into token-major buffer (decode-specialized).
/* (Callee name lost at listing line 1560.) */
1561  p->wo,
1562  p->bo,
1563  proj_row,
1564  D,
1565  aligned_D,
1566  H,
1567  ad);
1568 
1569  // Residual + LN2 / RMSNorm.
1570  ck_residual_add_token_major(input_row,
1571  proj_row,
1572  residual_row,
1573  /*tokens=*/1,
1574  aligned_D);
1575 
1576  rmsnorm_forward(residual_row,
1577  p->ln2_gamma,
1578  ln2_row,
1579  ln2_rstd,
1580  /*tokens=*/1,
1581  D,
1582  aligned_D,
1583  p->eps);
1584 
1585  // MLP block for this token (fully fused - all 3 projections in one pass).
1586  // Eliminates DRAM round-trip for swiglu intermediate values.
/* (Callee name lost at listing line 1587 — presumably
 * ck_mlp_swiglu_forward_fully_fused_token taking ln2_row; confirm.) */
1588  p->w1,
1589  p->b1,
1590  p->w2,
1591  p->b2,
1592  mlp_row,
1593  aligned_D,
1594  aligned_intermediate);
1595 
1596  // Final residual.
1597  ck_residual_add_token_major(residual_row,
1598  mlp_row,
1599  out_row,
1600  /*tokens=*/1,
1601  aligned_D);
1602 }
1603 
1604 static void ck_qkv_project_head_major_token_q4_k(const float *input_row,
1605  const void *wq, const float *bq,
1606  const void *wk, const float *bk,
1607  const void *wv, const float *bv,
1608  float *q_token,
1609  float *k_token,
1610  float *v_token,
1611  int aligned_embed_dim,
1612  int num_heads,
1613  int num_kv_heads,
1614  int aligned_head_dim)
1615 {
1616  if (!input_row || !wq || !wk || !wv || !q_token || !k_token || !v_token) {
1617  return;
1618  }
1619 
1620  const size_t head_w_elems = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
1621  const size_t head_w_bytes = ck_dtype_row_bytes(CK_DT_Q4_K, head_w_elems);
1622 
1623  const uint8_t *wq_bytes = (const uint8_t *)wq;
1624  const uint8_t *wk_bytes = (const uint8_t *)wk;
1625  const uint8_t *wv_bytes = (const uint8_t *)wv;
1626 
1627  for (int h = 0; h < num_heads; ++h) {
1628  const void *wq_h = wq_bytes + (size_t)h * head_w_bytes;
1629  const float *bq_h = bq ? (bq + (size_t)h * (size_t)aligned_head_dim) : NULL;
1630  float *q_h = q_token + (size_t)h * (size_t)aligned_head_dim;
1631  gemm_nt_q4_k(input_row, wq_h, bq_h, q_h,
1632  /*M=*/1, /*N=*/aligned_head_dim, /*K=*/aligned_embed_dim);
1633  }
1634 
1635  for (int h = 0; h < num_kv_heads; ++h) {
1636  const void *wk_h = wk_bytes + (size_t)h * head_w_bytes;
1637  const void *wv_h = wv_bytes + (size_t)h * head_w_bytes;
1638  const float *bk_h = bk ? (bk + (size_t)h * (size_t)aligned_head_dim) : NULL;
1639  const float *bv_h = bv ? (bv + (size_t)h * (size_t)aligned_head_dim) : NULL;
1640  float *k_h = k_token + (size_t)h * (size_t)aligned_head_dim;
1641  float *v_h = v_token + (size_t)h * (size_t)aligned_head_dim;
1642  gemm_nt_q4_k(input_row, wk_h, bk_h, k_h,
1643  /*M=*/1, /*N=*/aligned_head_dim, /*K=*/aligned_embed_dim);
1644  gemm_nt_q4_k(input_row, wv_h, bv_h, v_h,
1645  /*M=*/1, /*N=*/aligned_head_dim, /*K=*/aligned_embed_dim);
1646  }
1647 }
1648 
/*
 * NOTE(review): the first signature line (listing 1649, with the function
 * name and the first parameter) was dropped by the Doxygen extraction.  The
 * body references `input_q8` and calls gemm_nt_q4_k_q8_k, so this is
 * presumably the Q8_K-activation variant of the single-token head-major QKV
 * projection (Q4_K weights x pre-quantized Q8_K input row) — confirm the
 * exact name/first parameter against the original source.  Structure is
 * otherwise identical to ck_qkv_project_head_major_token_q4_k above.
 */
1650  const void *wq, const float *bq,
1651  const void *wk, const float *bk,
1652  const void *wv, const float *bv,
1653  float *q_token,
1654  float *k_token,
1655  float *v_token,
1656  int aligned_embed_dim,
1657  int num_heads,
1658  int num_kv_heads,
1659  int aligned_head_dim)
1660 {
1661  if (!input_q8 || !wq || !wk || !wv || !q_token || !k_token || !v_token) {
1662  return;
1663  }
1664 
/* Per-head Q4_K weight slab size in elements and bytes. */
1665  const size_t head_w_elems = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
1666  const size_t head_w_bytes = ck_dtype_row_bytes(CK_DT_Q4_K, head_w_elems);
1667 
1668  const uint8_t *wq_bytes = (const uint8_t *)wq;
1669  const uint8_t *wk_bytes = (const uint8_t *)wk;
1670  const uint8_t *wv_bytes = (const uint8_t *)wv;
1671 
/* Q heads: one quantized GEMV per head. */
1672  for (int h = 0; h < num_heads; ++h) {
1673  const void *wq_h = wq_bytes + (size_t)h * head_w_bytes;
1674  const float *bq_h = bq ? (bq + (size_t)h * (size_t)aligned_head_dim) : NULL;
1675  float *q_h = q_token + (size_t)h * (size_t)aligned_head_dim;
1676  gemm_nt_q4_k_q8_k(input_q8, wq_h, bq_h, q_h,
1677  /*M=*/1, /*N=*/aligned_head_dim, /*K=*/aligned_embed_dim);
1678  }
1679 
/* K/V heads (num_kv_heads may differ from num_heads). */
1680  for (int h = 0; h < num_kv_heads; ++h) {
1681  const void *wk_h = wk_bytes + (size_t)h * head_w_bytes;
1682  const void *wv_h = wv_bytes + (size_t)h * head_w_bytes;
1683  const float *bk_h = bk ? (bk + (size_t)h * (size_t)aligned_head_dim) : NULL;
1684  const float *bv_h = bv ? (bv + (size_t)h * (size_t)aligned_head_dim) : NULL;
1685  float *k_h = k_token + (size_t)h * (size_t)aligned_head_dim;
1686  float *v_h = v_token + (size_t)h * (size_t)aligned_head_dim;
1687  gemm_nt_q4_k_q8_k(input_q8, wk_h, bk_h, k_h,
1688  /*M=*/1, /*N=*/aligned_head_dim, /*K=*/aligned_embed_dim);
1689  gemm_nt_q4_k_q8_k(input_q8, wv_h, bv_h, v_h,
1690  /*M=*/1, /*N=*/aligned_head_dim, /*K=*/aligned_embed_dim);
1691  }
1692 }
1693 
/*
 * Multi-token head-major Q/K/V projection with Q4_K weights and Q8_K
 * activations.
 *
 * For each token: quantize the FP32 input row to Q8_K, run the per-token
 * projection (callee name lost — see note below), then scatter the
 * head-major token results into the [head, token, aligned_head_dim] output
 * layouts (Q strided by `tokens`, K/V strided by `kv_stride_tokens`).
 * Guards: positive tokens/dims, kv stride >= tokens, and aligned_embed_dim
 * divisible by the QK_K super-block size (required by quantize_row_q8_k).
 *
 * NOTE(review): q8_buf and the q/k/v token scratch arrays are runtime-sized
 * VLAs — stack usage scales with embed dim and head counts.
 */
1694 static void ck_qkv_project_head_major_q4_k_q8_k(const float *input,
1695  const void *wq, const float *bq,
1696  const void *wk, const float *bk,
1697  const void *wv, const float *bv,
1698  float *q, float *k, float *v,
1699  int tokens,
1700  int kv_stride_tokens,
1701  int aligned_embed_dim,
1702  int num_heads,
1703  int num_kv_heads,
1704  int aligned_head_dim)
1705 {
1706  if (!input || !wq || !wk || !wv || !q || !k || !v) {
1707  return;
1708  }
1709  if (tokens <= 0 || aligned_embed_dim <= 0) {
1710  return;
1711  }
1712  if (kv_stride_tokens < tokens) {
1713  return;
1714  }
1715  if ((aligned_embed_dim % QK_K) != 0) {
1716  return;
1717  }
1718 
1719  const int q8_blocks = aligned_embed_dim / QK_K;
1720  block_q8_K q8_buf[q8_blocks];
1721  const size_t q_head_stride = (size_t)tokens * (size_t)aligned_head_dim;
1722  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
1723 
1724  float q_token[num_heads * aligned_head_dim];
1725  float k_token[num_kv_heads * aligned_head_dim];
1726  float v_token[num_kv_heads * aligned_head_dim];
1727 
1728  for (int t = 0; t < tokens; ++t) {
1729  const float *input_row = input + (size_t)t * (size_t)aligned_embed_dim;
1730  quantize_row_q8_k(input_row, q8_buf, aligned_embed_dim);
1731 
/* (Callee name lost at listing line 1732 — presumably the Q8_K single-token
 * QKV projection, taking q8_buf; confirm against the original source.) */
1733  wq, bq,
1734  wk, bk,
1735  wv, bv,
1736  q_token,
1737  k_token,
1738  v_token,
1739  aligned_embed_dim,
1740  num_heads,
1741  num_kv_heads,
1742  aligned_head_dim);
1743 
/* Scatter this token's Q heads into the head-major output. */
1744  for (int h = 0; h < num_heads; ++h) {
1745  float *q_dst = q + (size_t)h * q_head_stride + (size_t)t * (size_t)aligned_head_dim;
1746  memcpy(q_dst,
1747  q_token + (size_t)h * (size_t)aligned_head_dim,
1748  (size_t)aligned_head_dim * sizeof(float));
1749  }
1750 
/* Scatter K/V heads (kv stride allows a larger cache allocation). */
1751  for (int h = 0; h < num_kv_heads; ++h) {
1752  float *k_dst = k + (size_t)h * kv_head_stride + (size_t)t * (size_t)aligned_head_dim;
1753  float *v_dst = v + (size_t)h * kv_head_stride + (size_t)t * (size_t)aligned_head_dim;
1754  memcpy(k_dst,
1755  k_token + (size_t)h * (size_t)aligned_head_dim,
1756  (size_t)aligned_head_dim * sizeof(float));
1757  memcpy(v_dst,
1758  v_token + (size_t)h * (size_t)aligned_head_dim,
1759  (size_t)aligned_head_dim * sizeof(float));
1760  }
1761  }
1762 }
1763 
1764 static void ck_attention_project_head_major_q4_k_q8_k(const float *attn_out,
1765  const void *wo,
1766  const float *bo,
1767  float *out,
1768  int tokens,
1769  int aligned_embed_dim,
1770  int num_heads,
1771  int aligned_head_dim)
1772 {
1773  if (!attn_out || !wo || !out) {
1774  return;
1775  }
1776  if (tokens <= 0 || aligned_embed_dim <= 0) {
1777  return;
1778  }
1779  if ((aligned_embed_dim % QK_K) != 0) {
1780  return;
1781  }
1782 
1783  const int K = num_heads * aligned_head_dim;
1784  if (K != aligned_embed_dim) {
1785  return;
1786  }
1787 
1788  const int q8_blocks = aligned_embed_dim / QK_K;
1789  block_q8_K q8_buf[q8_blocks];
1790  float attn_token[aligned_embed_dim];
1791  const size_t head_stride = (size_t)tokens * (size_t)aligned_head_dim;
1792 
1793  for (int t = 0; t < tokens; ++t) {
1794  for (int h = 0; h < num_heads; ++h) {
1795  const float *src = attn_out + (size_t)h * head_stride + (size_t)t * (size_t)aligned_head_dim;
1796  memcpy(attn_token + (size_t)h * (size_t)aligned_head_dim,
1797  src,
1798  (size_t)aligned_head_dim * sizeof(float));
1799  }
1800 
1801  quantize_row_q8_k(attn_token, q8_buf, aligned_embed_dim);
1802  gemm_nt_q4_k_q8_k(q8_buf, wo, bo,
1803  out + (size_t)t * (size_t)aligned_embed_dim,
1804  /*M=*/1, /*N=*/aligned_embed_dim, /*K=*/aligned_embed_dim);
1805  }
1806 }
1807 
1808 static void ck_mlp_swiglu_forward_q4_k_q8_k_prefill(const float *input,
1809  const void *w1,
1810  const float *b1,
1811  const void *w2,
1812  const float *b2,
1813  float *fc1_out,
1814  float *swiglu_out,
1815  float *output,
1816  int tokens,
1817  int aligned_embed_dim,
1818  int aligned_intermediate_dim)
1819 {
1820  if (!input || !w1 || !w2 || !fc1_out || !swiglu_out || !output) {
1821  return;
1822  }
1823  if (tokens <= 0) {
1824  return;
1825  }
1826  if ((aligned_embed_dim % QK_K) != 0 || (aligned_intermediate_dim % QK_K) != 0) {
1827  return;
1828  }
1829 
1830  const int up_dim = 2 * aligned_intermediate_dim;
1831  const int q8_blocks_embed = aligned_embed_dim / QK_K;
1832  const int q8_blocks_inter = aligned_intermediate_dim / QK_K;
1833  const int q8_blocks_max = (q8_blocks_embed > q8_blocks_inter) ? q8_blocks_embed : q8_blocks_inter;
1834  block_q8_K q8_buf[q8_blocks_max];
1835 
1836  for (int t = 0; t < tokens; ++t) {
1837  const float *input_row = input + (size_t)t * (size_t)aligned_embed_dim;
1838  float *fc1_row = fc1_out + (size_t)t * (size_t)up_dim;
1839 
1840  quantize_row_q8_k(input_row, q8_buf, aligned_embed_dim);
1841  gemm_nt_q4_k_q8_k(q8_buf, w1, b1, fc1_row,
1842  /*M=*/1, /*N=*/up_dim, /*K=*/aligned_embed_dim);
1843  }
1844 
1845  swiglu_forward(fc1_out, swiglu_out, tokens, aligned_intermediate_dim);
1846 
1847  for (int t = 0; t < tokens; ++t) {
1848  const float *swiglu_row = swiglu_out + (size_t)t * (size_t)aligned_intermediate_dim;
1849  float *out_row = output + (size_t)t * (size_t)aligned_embed_dim;
1850 
1851  quantize_row_q8_k(swiglu_row, q8_buf, aligned_intermediate_dim);
1852  gemm_nt_q4_k_q8_k(q8_buf, w2, b2, out_row,
1853  /*M=*/1, /*N=*/aligned_embed_dim, /*K=*/aligned_intermediate_dim);
1854  }
1855 }
1856 
1857 static void ck_qkv_project_head_major_token_quant(const float *input_row,
1858  const void *wq, const float *bq, CKDataType wq_dtype,
1859  const void *wk, const float *bk, CKDataType wk_dtype,
1860  const void *wv, const float *bv, CKDataType wv_dtype,
1861  float *q_token,
1862  float *k_token,
1863  float *v_token,
1864  int aligned_embed_dim,
1865  int num_heads,
1866  int num_kv_heads,
1867  int aligned_head_dim)
1868 {
1869  if (!input_row || !wq || !wk || !wv || !q_token || !k_token || !v_token) {
1870  return;
1871  }
1872 
1873  const size_t head_w_elems = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
1874  const size_t wq_head_bytes = ck_dtype_row_bytes(wq_dtype, head_w_elems);
1875  const size_t wk_head_bytes = ck_dtype_row_bytes(wk_dtype, head_w_elems);
1876  const size_t wv_head_bytes = ck_dtype_row_bytes(wv_dtype, head_w_elems);
1877 
1878  const uint8_t *wq_bytes = (const uint8_t *)wq;
1879  const uint8_t *wk_bytes = (const uint8_t *)wk;
1880  const uint8_t *wv_bytes = (const uint8_t *)wv;
1881 
1882  for (int h = 0; h < num_heads; ++h) {
1883  const void *wq_h = (wq_dtype == CK_DT_FP32)
1884  ? (const void *)((const float *)wq + (size_t)h * head_w_elems)
1885  : (const void *)(wq_bytes + (size_t)h * wq_head_bytes);
1886  const float *bq_h = bq ? (bq + (size_t)h * (size_t)aligned_head_dim) : NULL;
1887  float *q_h = q_token + (size_t)h * (size_t)aligned_head_dim;
1888  ck_gemm_nt_quant(input_row, wq_h, bq_h, q_h,
1889  /*M=*/1, /*N=*/aligned_head_dim, /*K=*/aligned_embed_dim, wq_dtype);
1890  }
1891 
1892  for (int h = 0; h < num_kv_heads; ++h) {
1893  const void *wk_h = (wk_dtype == CK_DT_FP32)
1894  ? (const void *)((const float *)wk + (size_t)h * head_w_elems)
1895  : (const void *)(wk_bytes + (size_t)h * wk_head_bytes);
1896  const void *wv_h = (wv_dtype == CK_DT_FP32)
1897  ? (const void *)((const float *)wv + (size_t)h * head_w_elems)
1898  : (const void *)(wv_bytes + (size_t)h * wv_head_bytes);
1899  const float *bk_h = bk ? (bk + (size_t)h * (size_t)aligned_head_dim) : NULL;
1900  const float *bv_h = bv ? (bv + (size_t)h * (size_t)aligned_head_dim) : NULL;
1901  float *k_h = k_token + (size_t)h * (size_t)aligned_head_dim;
1902  float *v_h = v_token + (size_t)h * (size_t)aligned_head_dim;
1903  ck_gemm_nt_quant(input_row, wk_h, bk_h, k_h,
1904  /*M=*/1, /*N=*/aligned_head_dim, /*K=*/aligned_embed_dim, wk_dtype);
1905  ck_gemm_nt_quant(input_row, wv_h, bv_h, v_h,
1906  /*M=*/1, /*N=*/aligned_head_dim, /*K=*/aligned_embed_dim, wv_dtype);
1907  }
1908 }
1909 
/*
 * NOTE(review): the signature (listing line 1910) and many hyperlinked lines
 * were dropped by the Doxygen extraction — including an outer `if` condition
 * at listing line 1928 (note the double closing brace at listing 2021/2022)
 * and most callee names.  From the surviving structure this is a quantized
 * prefill layer forward: when some dtype condition (lost line 1928) holds AND
 * both aligned dims are QK_K-multiples, it runs the Q4_K/Q8_K pipeline
 * (quantized QKV, attention, Wo projection and MLP — argument lists match
 * ck_attention_project_head_major_q4_k_q8_k and
 * ck_mlp_swiglu_forward_q4_k_q8_k_prefill) and returns; otherwise it falls
 * through to the generic/FP32 pipeline.  TODO: recover the lost lines from
 * the original ckernel_orchestration.c before reusing this code.
 */
1911 {
1912  if (!p) {
1913  return;
1914  }
1915 
1916  const int aligned_D = p->aligned_embed_dim;
1917  const int aligned_intermediate = p->aligned_intermediate_dim;
1918 
/* LN1 / RMSNorm over p->input (callee name lost at listing line 1919). */
1920  p->ln1_gamma,
1921  p->ln1_out,
1922  p->ln1_rstd,
1923  p->tokens,
1924  p->embed_dim,
1925  aligned_D,
1926  p->eps);
1927 
/* (Outer quantized-path condition lost at listing line 1928.) */
1929  if ((aligned_D % QK_K) == 0 && (aligned_intermediate % QK_K) == 0) {
/* Quantized QKV projection (callee lost at line 1930). */
1931  p->wq, p->bq,
1932  p->wk, p->bk,
1933  p->wv, p->bv,
1934  p->q, p->k, p->v,
1935  p->tokens,
1936  p->tokens,
1937  aligned_D,
1938  p->num_heads,
1939  p->num_kv_heads,
1940  p->aligned_head_dim);
1941 
1942  if (p->rope_cos && p->rope_sin) {
1943  rope_forward_qk(p->q,
1944  p->k,
1945  p->rope_cos,
1946  p->rope_sin,
1947  p->num_heads,
1948  p->num_kv_heads,
1949  p->tokens,
1950  p->head_dim,
1951  p->aligned_head_dim,
1952  p->rope_pos_offset);
1953  }
1954 
/* Attention (callees lost at 1956 / 1968; trailing arg at 1966). */
1955  if (p->scores) {
1957  p->k,
1958  p->v,
1959  p->scores,
1960  p->attn_out,
1961  p->num_heads,
1962  p->num_kv_heads,
1963  p->tokens,
1964  p->head_dim,
1965  p->aligned_head_dim,
1967  } else {
1969  p->k,
1970  p->v,
1971  p->attn_out,
1972  p->num_heads,
1973  p->num_kv_heads,
1974  p->tokens,
1975  p->head_dim,
1976  p->aligned_head_dim);
1977  }
1978 
/* Quantized Wo projection (callee lost at 1979 — argument list matches
 * ck_attention_project_head_major_q4_k_q8_k). */
1980  p->wo,
1981  p->bo,
1982  p->proj_tmp,
1983  p->tokens,
1984  aligned_D,
1985  p->num_heads,
1986  p->aligned_head_dim);
1987 
/* Residual add (callee lost at line 1988). */
1989  p->proj_tmp,
1990  p->residual1,
1991  p->tokens,
1992  aligned_D);
1993 
/* LN2 / RMSNorm (callee lost at line 1994). */
1995  p->ln2_gamma,
1996  p->ln2_out,
1997  p->ln2_rstd,
1998  p->tokens,
1999  p->embed_dim,
2000  aligned_D,
2001  p->eps);
2002 
/* Quantized MLP (callee lost at 2003 — argument list matches
 * ck_mlp_swiglu_forward_q4_k_q8_k_prefill). */
2004  p->w1,
2005  p->b1,
2006  p->w2,
2007  p->b2,
2008  p->fc1_out,
2009  p->swiglu_out,
2010  p->mlp_out,
2011  p->tokens,
2012  aligned_D,
2013  aligned_intermediate);
2014 
/* Final residual add into p->output (callee lost at line 2015). */
2016  p->mlp_out,
2017  p->output,
2018  p->tokens,
2019  aligned_D);
2020  return;
2021  }
2022  }
2023 
/* Fallback path: generic (per-dtype / FP32) QKV projection, callee lost at
 * listing line 2024. */
2025  p->wq, p->bq,
2026  p->wk, p->bk,
2027  p->wv, p->bv,
2028  p->q, p->k, p->v,
2029  p->tokens,
2030  p->tokens,
2031  aligned_D,
2032  p->num_heads,
2033  p->num_kv_heads,
2034  p->aligned_head_dim);
2035 
2036  if (p->rope_cos && p->rope_sin) {
2037  rope_forward_qk(p->q,
2038  p->k,
2039  p->rope_cos,
2040  p->rope_sin,
2041  p->num_heads,
2042  p->num_kv_heads,
2043  p->tokens,
2044  p->head_dim,
2045  p->aligned_head_dim,
2046  p->rope_pos_offset);
2047  }
2048 
/* Attention (callees lost at 2050 / 2062; trailing arg at 2060). */
2049  if (p->scores) {
2051  p->k,
2052  p->v,
2053  p->scores,
2054  p->attn_out,
2055  p->num_heads,
2056  p->num_kv_heads,
2057  p->tokens,
2058  p->head_dim,
2059  p->aligned_head_dim,
2061  } else {
2063  p->k,
2064  p->v,
2065  p->attn_out,
2066  p->num_heads,
2067  p->num_kv_heads,
2068  p->tokens,
2069  p->head_dim,
2070  p->aligned_head_dim);
2071  }
2072 
/* FP32 Wo projection with scratch buffer (callee lost at line 2073). */
2074  p->wo,
2075  p->bo,
2076  p->proj_tmp,
2077  p->proj_scratch,
2078  p->tokens,
2079  p->aligned_embed_dim,
2080  p->num_heads,
2081  p->aligned_head_dim);
2082 
/* Residual add (callee lost at line 2083). */
2084  p->proj_tmp,
2085  p->residual1,
2086  p->tokens,
2087  p->aligned_embed_dim);
2088 
/* LN2 / RMSNorm (callee lost at line 2089). */
2090  p->ln2_gamma,
2091  p->ln2_out,
2092  p->ln2_rstd,
2093  p->tokens,
2094  p->embed_dim,
2095  p->aligned_embed_dim,
2096  p->eps);
2097 
/* SwiGLU MLP (callee lost at 2098; trailing intermediate arg at 2108). */
2099  p->w1,
2100  p->b1,
2101  p->w2,
2102  p->b2,
2103  p->fc1_out,
2104  p->swiglu_out,
2105  p->mlp_out,
2106  p->tokens,
2107  p->aligned_embed_dim,
2109 
/* Final residual add into p->output (callee lost at line 2110). */
2111  p->mlp_out,
2112  p->output,
2113  p->tokens,
2114  p->aligned_embed_dim);
2115 }
2116 
2118  int token_index,
2119  int cache_capacity)
2120 {
2121  if (!p) {
2122  return;
2123  }
2124  if (!p->input || !p->ln1_gamma || !p->ln2_gamma || !p->ln1_out || !p->ln2_out ||
2125  !p->wq || !p->wk || !p->wv || !p->wo || !p->w1 || !p->w2 ||
2126  !p->k || !p->v ||
2127  !p->proj_tmp || !p->residual1 || !p->fc1_out || !p->swiglu_out || !p->mlp_out || !p->output) {
2128  return;
2129  }
2130  if (token_index < 0 || cache_capacity <= 0 || token_index >= cache_capacity) {
2131  return;
2132  }
2133 
2134  const int D = p->embed_dim;
2135  const int aligned_D = p->aligned_embed_dim;
2136  const int H = p->num_heads;
2137  const int H_kv = p->num_kv_heads;
2138  const int hd = p->head_dim;
2139  const int ad = p->aligned_head_dim;
2140  const int aligned_intermediate = p->aligned_intermediate_dim;
2141  const int K_concat = H * ad;
2142 
2143  /* Decode buffers are single-token; token_index only applies to KV cache. */
2144  const size_t token_slot = 0;
2145  const float *input_row = p->input + token_slot * (size_t)aligned_D;
2146  float *ln1_row = p->ln1_out + token_slot * (size_t)aligned_D;
2147  float *ln2_row = p->ln2_out + token_slot * (size_t)aligned_D;
2148  float *proj_row = p->proj_tmp + token_slot * (size_t)aligned_D;
2149  float *residual_row = p->residual1 + token_slot * (size_t)aligned_D;
2150  float *mlp_row = p->mlp_out + token_slot * (size_t)aligned_D;
2151  float *out_row = p->output + token_slot * (size_t)aligned_D;
2152 
2153  float ln1_rstd_tmp = 0.0f;
2154  float ln2_rstd_tmp = 0.0f;
2155  float *ln1_rstd = p->ln1_rstd ? (p->ln1_rstd + token_slot) : &ln1_rstd_tmp;
2156  float *ln2_rstd = p->ln2_rstd ? (p->ln2_rstd + token_slot) : &ln2_rstd_tmp;
2157 
2158  /* Scratch for a single token in head-major layout: [head, aligned_head_dim]. */
2159  size_t q_elems = (size_t)H * (size_t)ad;
2160  size_t kv_elems = (size_t)H_kv * (size_t)ad;
2161  float q_token[q_elems];
2162  float k_token[kv_elems];
2163  float v_token[kv_elems];
2164  float attn_token[q_elems];
2165 
2166  /* LN1 / RMSNorm. */
2167  ck_debug_check_buffer("input_row", input_row, aligned_D);
2168  rmsnorm_forward(input_row,
2169  p->ln1_gamma,
2170  ln1_row,
2171  ln1_rstd,
2172  /*tokens=*/1,
2173  D,
2174  aligned_D,
2175  p->eps);
2176  ck_debug_check_buffer("ln1_out (after rmsnorm)", ln1_row, aligned_D);
2177 
2179  if ((aligned_D % QK_K) == 0 && (aligned_intermediate % QK_K) == 0) {
2180  const int q8_blocks_embed = aligned_D / QK_K;
2181  const int q8_blocks_inter = aligned_intermediate / QK_K;
2182  const int q8_blocks_max = (q8_blocks_embed > q8_blocks_inter) ? q8_blocks_embed : q8_blocks_inter;
2183  block_q8_K q8_buf[q8_blocks_max];
2184 
2185  /* Project Q/K/V with Q8_K activations. */
2186  quantize_row_q8_k(ln1_row, q8_buf, aligned_D);
2187  ck_debug_check_q8k("q8_buf (after quantize)", q8_buf, q8_blocks_embed);
2188  ck_debug_check_q4k_weights("wq weights", p->wq, (aligned_D / QK_K) * (H * ad));
2190  p->wq, p->bq,
2191  p->wk, p->bk,
2192  p->wv, p->bv,
2193  q_token, k_token, v_token,
2194  aligned_D,
2195  H,
2196  H_kv,
2197  ad);
2198  ck_debug_check_buffer("q_token (after QKV proj)", q_token, (int)q_elems);
2199  ck_debug_check_buffer("k_token (after QKV proj)", k_token, (int)kv_elems);
2200  ck_debug_check_buffer("v_token (after QKV proj)", v_token, (int)kv_elems);
2201 
2202  /* RoPE for the new token at absolute position `p->rope_pos_offset`. */
2203  if (p->rope_cos && p->rope_sin) {
2204  rope_forward_qk(q_token,
2205  k_token,
2206  p->rope_cos,
2207  p->rope_sin,
2208  H,
2209  H_kv,
2210  /*num_tokens=*/1,
2211  hd,
2212  ad,
2213  p->rope_pos_offset);
2214  }
2215 
2216  /* Update KV cache. */
2217  kv_cache_write_head_major(k_token,
2218  v_token,
2219  p->k,
2220  p->v,
2221  H_kv,
2222  token_index,
2223  cache_capacity,
2224  hd,
2225  ad);
2226 
2227  /* Decode attention for this token using the KV cache. */
2229  p->k,
2230  p->v,
2231  attn_token,
2232  H,
2233  H_kv,
2234  /*kv_tokens=*/token_index + 1,
2235  cache_capacity,
2236  hd,
2237  ad);
2238  ck_debug_check_buffer("attn_token (after attention)", attn_token, (int)q_elems);
2239 
2240  /* Quantized output projection (Wo) with Q8_K activations. */
2241  quantize_row_q8_k(attn_token, q8_buf, aligned_D);
2242  gemm_nt_q4_k_q8_k(q8_buf,
2243  p->wo,
2244  p->bo,
2245  proj_row,
2246  /*M=*/1,
2247  aligned_D,
2248  /*K=*/K_concat);
2249  ck_debug_check_buffer("proj_row (after Wo proj)", proj_row, aligned_D);
2250 
2251  for (int j = D; j < aligned_D; ++j) {
2252  proj_row[j] = 0.0f;
2253  }
2254 
2255  /* Residual + LN2 / RMSNorm. */
2256  ck_residual_add_token_major(input_row,
2257  proj_row,
2258  residual_row,
2259  /*tokens=*/1,
2260  aligned_D);
2261 
2262  rmsnorm_forward(residual_row,
2263  p->ln2_gamma,
2264  ln2_row,
2265  ln2_rstd,
2266  /*tokens=*/1,
2267  D,
2268  aligned_D,
2269  p->eps);
2270 
2271  /* MLP block for this token (Q8_K activations). */
2272  int up_dim = 2 * aligned_intermediate;
2273  float *fc1_row = p->fc1_out + token_slot * (size_t)up_dim;
2274  float *swiglu_row = p->swiglu_out + token_slot * (size_t)aligned_intermediate;
2275 
2276  ck_mlp_swiglu_forward_q4_k_q8_k(ln2_row,
2277  p->w1,
2278  p->b1,
2279  p->w2,
2280  p->b2,
2281  fc1_row,
2282  swiglu_row,
2283  mlp_row,
2284  aligned_D,
2285  aligned_intermediate);
2286  ck_debug_check_buffer("mlp_row (after MLP)", mlp_row, aligned_D);
2287 
2288  /* Final residual. */
2289  ck_residual_add_token_major(residual_row,
2290  mlp_row,
2291  out_row,
2292  /*tokens=*/1,
2293  aligned_D);
2294  ck_debug_check_buffer("out_row (final output)", out_row, aligned_D);
2295  return;
2296  }
2297  }
2298 
2299  /* Project Q/K/V for the new token (Q4_K weights). */
2300  ck_qkv_project_head_major_token_q4_k(ln1_row,
2301  p->wq, p->bq,
2302  p->wk, p->bk,
2303  p->wv, p->bv,
2304  q_token, k_token, v_token,
2305  aligned_D,
2306  H,
2307  H_kv,
2308  ad);
2309 
2310  /* RoPE for the new token at absolute position `p->rope_pos_offset`. */
2311  if (p->rope_cos && p->rope_sin) {
2312  rope_forward_qk(q_token,
2313  k_token,
2314  p->rope_cos,
2315  p->rope_sin,
2316  H,
2317  H_kv,
2318  /*num_tokens=*/1,
2319  hd,
2320  ad,
2321  p->rope_pos_offset);
2322  }
2323 
2324  /* Update KV cache. */
2325  kv_cache_write_head_major(k_token,
2326  v_token,
2327  p->k,
2328  p->v,
2329  H_kv,
2330  token_index,
2331  cache_capacity,
2332  hd,
2333  ad);
2334 
2335  /* Decode attention for this token using the KV cache. */
2337  p->k,
2338  p->v,
2339  attn_token,
2340  H,
2341  H_kv,
2342  /*kv_tokens=*/token_index + 1,
2343  cache_capacity,
2344  hd,
2345  ad);
2346 
2347  /* Quantized output projection: Wo is stored as a flattened Q4_K matrix. */
2348  gemm_nt_q4_k(attn_token,
2349  p->wo,
2350  p->bo,
2351  proj_row,
2352  /*M=*/1,
2353  aligned_D,
2354  /*K=*/K_concat);
2355 
2356  for (int j = D; j < aligned_D; ++j) {
2357  proj_row[j] = 0.0f;
2358  }
2359 
2360  /* Residual + LN2 / RMSNorm. */
2361  ck_residual_add_token_major(input_row,
2362  proj_row,
2363  residual_row,
2364  /*tokens=*/1,
2365  aligned_D);
2366 
2367  rmsnorm_forward(residual_row,
2368  p->ln2_gamma,
2369  ln2_row,
2370  ln2_rstd,
2371  /*tokens=*/1,
2372  D,
2373  aligned_D,
2374  p->eps);
2375 
2376  /* MLP block for this token. */
2377  int up_dim = 2 * aligned_intermediate;
2378  float *fc1_row = p->fc1_out + token_slot * (size_t)up_dim;
2379  float *swiglu_row = p->swiglu_out + token_slot * (size_t)aligned_intermediate;
2380 
2381  ck_mlp_swiglu_forward_q4_k(ln2_row,
2382  p->w1,
2383  p->b1,
2384  p->w2,
2385  p->b2,
2386  fc1_row,
2387  swiglu_row,
2388  mlp_row,
2389  /*tokens=*/1,
2390  aligned_D,
2391  aligned_intermediate);
2392 
2393  /* Final residual. */
2394  ck_residual_add_token_major(residual_row,
2395  mlp_row,
2396  out_row,
2397  /*tokens=*/1,
2398  aligned_D);
2399 }
2400 
2402 {
2403  if (!p) {
2404  return;
2405  }
2406 
2408  p->ln1_gamma,
2409  p->ln1_out,
2410  p->ln1_rstd,
2411  p->tokens,
2412  p->embed_dim,
2413  p->aligned_embed_dim,
2414  p->eps);
2415 
2417  p->wq, p->bq, p->wq_dtype,
2418  p->wk, p->bk, p->wk_dtype,
2419  p->wv, p->bv, p->wv_dtype,
2420  p->q, p->k, p->v,
2421  p->tokens,
2422  p->tokens,
2423  p->aligned_embed_dim,
2424  p->num_heads,
2425  p->num_kv_heads,
2426  p->aligned_head_dim);
2427 
2428  if (p->rope_cos && p->rope_sin) {
2429  rope_forward_qk(p->q,
2430  p->k,
2431  p->rope_cos,
2432  p->rope_sin,
2433  p->num_heads,
2434  p->num_kv_heads,
2435  p->tokens,
2436  p->head_dim,
2437  p->aligned_head_dim,
2438  p->rope_pos_offset);
2439  }
2440 
2441  if (p->scores) {
2443  p->k,
2444  p->v,
2445  p->scores,
2446  p->attn_out,
2447  p->num_heads,
2448  p->num_kv_heads,
2449  p->tokens,
2450  p->head_dim,
2451  p->aligned_head_dim,
2453  } else {
2455  p->k,
2456  p->v,
2457  p->attn_out,
2458  p->num_heads,
2459  p->num_kv_heads,
2460  p->tokens,
2461  p->head_dim,
2462  p->aligned_head_dim);
2463  }
2464 
2466  p->wo,
2467  p->bo,
2468  p->proj_tmp,
2469  p->proj_scratch,
2470  p->tokens,
2471  p->aligned_embed_dim,
2472  p->num_heads,
2473  p->aligned_head_dim,
2474  p->wo_dtype);
2475 
2477  p->proj_tmp,
2478  p->residual1,
2479  p->tokens,
2480  p->aligned_embed_dim);
2481 
2483  p->ln2_gamma,
2484  p->ln2_out,
2485  p->ln2_rstd,
2486  p->tokens,
2487  p->embed_dim,
2488  p->aligned_embed_dim,
2489  p->eps);
2490 
2492  p->w1,
2493  p->b1,
2494  p->w1_dtype,
2495  p->w2,
2496  p->b2,
2497  p->w2_dtype,
2498  p->fc1_out,
2499  p->swiglu_out,
2500  p->mlp_out,
2501  p->tokens,
2502  p->aligned_embed_dim,
2504 
2506  p->mlp_out,
2507  p->output,
2508  p->tokens,
2509  p->aligned_embed_dim);
2510 }
2511 
2512 void ck_layer_forward_rmsnorm_swiglu_decode_quant(const CKLayerForwardParamsQ4K *p,
2513  int token_index,
2514  int cache_capacity)
2515 {
2516  if (!p) {
2517  return;
2518  }
2519  if (!p->input || !p->ln1_gamma || !p->ln2_gamma || !p->ln1_out || !p->ln2_out ||
2520  !p->wq || !p->wk || !p->wv || !p->wo || !p->w1 || !p->w2 ||
2521  !p->k || !p->v ||
2522  !p->proj_tmp || !p->proj_scratch || !p->residual1 || !p->fc1_out || !p->swiglu_out || !p->mlp_out || !p->output) {
2523  return;
2524  }
2525  if (token_index < 0 || cache_capacity <= 0 || token_index >= cache_capacity) {
2526  return;
2527  }
2528 
2529  const int D = p->embed_dim;
2530  const int aligned_D = p->aligned_embed_dim;
2531  const int H = p->num_heads;
2532  const int H_kv = p->num_kv_heads;
2533  const int hd = p->head_dim;
2534  const int ad = p->aligned_head_dim;
2535  const int aligned_intermediate = p->aligned_intermediate_dim;
2536  const int K_concat = H * ad;
2537 
2538  /* Decode buffers are single-token; token_index only applies to KV cache. */
2539  const size_t token_slot = 0;
2540  const float *input_row = p->input + token_slot * (size_t)aligned_D;
2541  float *ln1_row = p->ln1_out + token_slot * (size_t)aligned_D;
2542  float *ln2_row = p->ln2_out + token_slot * (size_t)aligned_D;
2543  float *proj_row = p->proj_tmp + token_slot * (size_t)aligned_D;
2544  float *residual_row = p->residual1 + token_slot * (size_t)aligned_D;
2545  float *mlp_row = p->mlp_out + token_slot * (size_t)aligned_D;
2546  float *out_row = p->output + token_slot * (size_t)aligned_D;
2547 
2548  float ln1_rstd_tmp = 0.0f;
2549  float ln2_rstd_tmp = 0.0f;
2550  float *ln1_rstd = p->ln1_rstd ? (p->ln1_rstd + token_slot) : &ln1_rstd_tmp;
2551  float *ln2_rstd = p->ln2_rstd ? (p->ln2_rstd + token_slot) : &ln2_rstd_tmp;
2552 
2553  size_t q_elems = (size_t)H * (size_t)ad;
2554  size_t kv_elems = (size_t)H_kv * (size_t)ad;
2555  float q_token[q_elems];
2556  float k_token[kv_elems];
2557  float v_token[kv_elems];
2558  float attn_token[q_elems];
2559 
2560  rmsnorm_forward(input_row,
2561  p->ln1_gamma,
2562  ln1_row,
2563  ln1_rstd,
2564  /*tokens=*/1,
2565  D,
2566  aligned_D,
2567  p->eps);
2568 
2569  ck_qkv_project_head_major_token_quant(ln1_row,
2570  p->wq, p->bq, p->wq_dtype,
2571  p->wk, p->bk, p->wk_dtype,
2572  p->wv, p->bv, p->wv_dtype,
2573  q_token, k_token, v_token,
2574  aligned_D,
2575  H,
2576  H_kv,
2577  ad);
2578 
2579  if (p->rope_cos && p->rope_sin) {
2580  rope_forward_qk(q_token,
2581  k_token,
2582  p->rope_cos,
2583  p->rope_sin,
2584  H,
2585  H_kv,
2586  /*num_tokens=*/1,
2587  hd,
2588  ad,
2589  p->rope_pos_offset);
2590  }
2591 
2592  kv_cache_write_head_major(k_token,
2593  v_token,
2594  p->k,
2595  p->v,
2596  H_kv,
2597  token_index,
2598  cache_capacity,
2599  hd,
2600  ad);
2601 
2603  p->k,
2604  p->v,
2605  attn_token,
2606  H,
2607  H_kv,
2608  /*kv_tokens=*/token_index + 1,
2609  cache_capacity,
2610  hd,
2611  ad);
2612 
2613  if (p->wo_dtype == CK_DT_FP32) {
2615  (const float *)p->wo,
2616  p->bo,
2617  proj_row,
2618  D,
2619  aligned_D,
2620  H,
2621  ad);
2622  } else {
2623  /* Quantized attention output projection - handle all quant types */
2624  ck_gemm_nt_quant(attn_token,
2625  p->wo,
2626  p->bo,
2627  proj_row,
2628  /*M=*/1,
2629  aligned_D,
2630  /*K=*/K_concat,
2631  p->wo_dtype);
2632  for (int j = D; j < aligned_D; ++j) {
2633  proj_row[j] = 0.0f;
2634  }
2635  }
2636 
2637  ck_residual_add_token_major(input_row,
2638  proj_row,
2639  residual_row,
2640  /*tokens=*/1,
2641  aligned_D);
2642 
2643  rmsnorm_forward(residual_row,
2644  p->ln2_gamma,
2645  ln2_row,
2646  ln2_rstd,
2647  /*tokens=*/1,
2648  D,
2649  aligned_D,
2650  p->eps);
2651 
2652  int up_dim = 2 * aligned_intermediate;
2653  float *fc1_row = p->fc1_out + token_slot * (size_t)up_dim;
2654  float *swiglu_row = p->swiglu_out + token_slot * (size_t)aligned_intermediate;
2655 
2656  ck_mlp_swiglu_forward_quant(ln2_row,
2657  p->w1,
2658  p->b1,
2659  p->w1_dtype,
2660  p->w2,
2661  p->b2,
2662  p->w2_dtype,
2663  fc1_row,
2664  swiglu_row,
2665  mlp_row,
2666  /*tokens=*/1,
2667  aligned_D,
2668  aligned_intermediate);
2669 
2670  ck_residual_add_token_major(residual_row,
2671  mlp_row,
2672  out_row,
2673  /*tokens=*/1,
2674  aligned_D);
2675 }
2676 
2677 void ck_layer_backward_rmsnorm_swiglu(const CKLayerBackwardParams *p)
2678 {
2679  if (!p) {
2680  return;
2681  }
2682 
2683  int T = p->tokens;
2684  int aligned_embed = p->aligned_embed_dim;
2685  int aligned_head = p->aligned_head_dim;
2686  int aligned_intermediate = p->aligned_intermediate_dim;
2687  int up_dim = 2 * aligned_intermediate;
2688  int num_threads = 1;
2689 
2690  // 1) Residual add (output = residual1 + mlp_out)
2691  ck_residual_add_backward(p->d_output, p->d_residual1, p->d_mlp_out, T, aligned_embed);
2692 
2693  // 2) MLP down proj backward
2694  fc2_backward_kernel(p->d_mlp_out,
2695  p->swiglu_out,
2696  p->w2,
2697  p->d_swiglu_out,
2698  p->d_w2,
2699  p->d_b2,
2700  T,
2701  aligned_intermediate,
2702  aligned_embed,
2703  num_threads);
2704 
2705  // 3) SwiGLU backward
2706  swiglu_backward(p->fc1_out, p->d_swiglu_out, p->d_fc1_out, T, aligned_intermediate);
2707 
2708  // 4) MLP up proj backward
2709  fc1_backward_kernel(p->d_fc1_out,
2710  p->ln2_out,
2711  p->w1,
2712  p->d_ln2_out,
2713  p->d_w1,
2714  p->d_b1,
2715  T,
2716  aligned_embed,
2717  up_dim,
2718  num_threads);
2719 
2720  // 5) RMSNorm (ln2) backward; reuse d_output as scratch for d_residual1_from_ln2
2722  p->residual1,
2723  p->ln2_gamma,
2724  p->ln2_rstd,
2725  p->d_output,
2726  p->d_ln2_gamma,
2727  T,
2728  p->embed_dim,
2729  aligned_embed);
2730  ck_add_inplace(p->d_residual1, p->d_output, T, aligned_embed);
2731 
2732  // 6) Residual add (residual1 = input + proj_tmp)
2733  ck_residual_add_backward(p->d_residual1, p->d_input, p->d_proj_tmp, T, aligned_embed);
2734 
2735  // 7) Attention projection backward
2737  p->attn_out,
2738  p->wo,
2739  p->d_attn_out,
2740  p->d_wo,
2741  p->d_bo,
2742  T,
2743  aligned_embed,
2744  p->num_heads,
2745  aligned_head);
2746 
2747  // 8) Attention backward
2749  p->q,
2750  p->k,
2751  p->v,
2752  p->scores,
2753  p->d_q,
2754  p->d_k,
2755  p->d_v,
2756  p->d_scores,
2757  p->num_heads,
2758  p->num_kv_heads,
2759  T,
2760  p->head_dim,
2761  aligned_head,
2763 
2764  // 9) RoPE backward (if enabled)
2765  if (p->rope_cos && p->rope_sin) {
2766  rope_backward_qk(p->d_q,
2767  p->d_k,
2768  p->d_q,
2769  p->d_k,
2770  p->rope_cos,
2771  p->rope_sin,
2772  p->num_heads,
2773  p->num_kv_heads,
2774  T,
2775  p->head_dim,
2776  aligned_head,
2777  p->rope_pos_offset);
2778  }
2779 
2780  // 10) QKV projection backward (scratch uses d_proj_tmp)
2782  p->d_k,
2783  p->d_v,
2784  p->ln1_out,
2785  p->wq,
2786  p->bq,
2787  p->wk,
2788  p->bk,
2789  p->wv,
2790  p->bv,
2791  p->d_ln1_out,
2792  p->d_wq,
2793  p->d_bq,
2794  p->d_wk,
2795  p->d_bk,
2796  p->d_wv,
2797  p->d_bv,
2798  p->d_proj_tmp,
2799  T,
2800  aligned_embed,
2801  p->num_heads,
2802  p->num_kv_heads,
2803  aligned_head,
2804  num_threads);
2805 
2806  // 11) RMSNorm (ln1) backward; reuse d_ln1_out as scratch for d_input_from_ln1
2808  p->input,
2809  p->ln1_gamma,
2810  p->ln1_rstd,
2811  p->d_ln1_out,
2812  p->d_ln1_gamma,
2813  T,
2814  p->embed_dim,
2815  aligned_embed);
2816  ck_add_inplace(p->d_input, p->d_ln1_out, T, aligned_embed);
2817 }
CKDataType
Supported data types in C-Kernel-Engine.
Definition: ckernel_dtype.h:27
@ CK_DT_Q4_K
Definition: ckernel_dtype.h:40
@ CK_DT_Q4_0
Definition: ckernel_dtype.h:38
@ CK_DT_Q8_0
Definition: ckernel_dtype.h:42
@ CK_DT_Q5_0
Definition: ckernel_dtype.h:44
@ CK_DT_FP32
Definition: ckernel_dtype.h:29
@ CK_DT_Q6_K
Definition: ckernel_dtype.h:41
@ CK_DT_Q4_1
Definition: ckernel_dtype.h:39
@ CK_DT_Q5_1
Definition: ckernel_dtype.h:45
static size_t ck_dtype_row_bytes(CKDataType dt, size_t n_elements)
Calculate total bytes for n_elements of given dtype.
void gemm_nt_q4_0(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
Matrix-matrix multiply: C[M,N] = A[M,K] @ B[N,K]^T + bias.
void gemm_naive_parallel(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:125
void swiglu_forward(const float *input, float *output, int tokens, int dim)
void gemm_swiglu_fused(const float *x, const float *W_gate, const float *W_up, const float *b_gate, const float *b_up, float *output, int M, int N, int K)
void swiglu_backward(const float *input, const float *d_output, float *d_input, int tokens, int dim)
void attention_flash_decode(float *out, const float *q, const float *k, const float *v, int T_q, int T_k, int H, int D_h, float scale)
Main flash attention function with SIMD dispatch.
void gemm_nt_q4_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void attention_forward_causal_head_major_gqa(const float *q, const float *k, const float *v, float *scores, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
void rope_backward_qk(const float *d_q_out, const float *d_k_out, float *d_q, float *d_k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:497
void gemm_nt_q4_1(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
GEMM with transposed Q4_1 weights: C = A @ B^T.
void gemm_nt_q4_k_q8_k(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
void kv_cache_write_head_major(const float *__restrict k_token, const float *__restrict v_token, float *__restrict k_cache, float *__restrict v_cache, int num_kv_heads, int token_index, int cache_capacity, int head_dim, int aligned_head_dim)
void gemm_nt_q5_0(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void fc1_backward_kernel(const float *d_output, const float *fc1_input, const float *W_fc1, float *d_input, float *d_W_fc1, float *d_b_fc1, int T, int aligned_in, int aligned_out, int num_threads)
Definition: mlp_kernels.c:167
void gemm_nt_q6_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void gemm_nt_q8_0(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
Matrix-matrix multiply: C[M,N] = A[M,K] @ B[N,K]^T + bias.
void fc2_backward_kernel(const float *d_output, const float *fc2_input, const float *W_fc2, float *d_input, float *d_W_fc2, float *d_b_fc2, int T, int aligned_in, int aligned_out, int num_threads)
Definition: mlp_kernels.c:118
void quantize_row_q8_k(const float *x, void *y, int k)
void attention_forward_decode_head_major_gqa_regular(const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
WARNING: This is NOT true flash attention!
void rmsnorm_forward(const float *input, const float *gamma, float *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
void gemm_nt_q5_1(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
GEMM with transposed Q5_1 weights: C = A @ B^T.
void attention_forward_causal_head_major_gqa_flash(const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim)
void rope_forward_qk(float *q, float *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:448
void fused_mlp_swiglu_decode_v2(const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *b_gate, const float *b_up, const float *b_down, float *output, int D, int Hff)
void rmsnorm_backward(const float *d_output, const float *input, const float *gamma, const float *rstd_cache, float *d_input, float *d_gamma, int tokens, int d_model, int aligned_embed_dim)
void attention_backward_causal_head_major_gqa(const float *d_output, const float *q, const float *k, const float *v, const float *attn_weights, float *d_q, float *d_k, float *d_v, float *d_scores, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int aligned_context_window)
int ck_strict_parity_enabled(void)
void gemm_blocked_serial(const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
Definition: gemm_kernels.c:661
static void ck_add_inplace(float *dst, const float *src, int tokens, int aligned_embed_dim)
static void ck_qkv_project_head_major_quant(const float *input, const void *wq, const float *bq, CKDataType wq_dtype, const void *wk, const float *bk, CKDataType wk_dtype, const void *wv, const float *bv, CKDataType wv_dtype, float *q, float *k, float *v, int tokens, int kv_stride_tokens, int aligned_embed_dim, int num_heads, int num_kv_heads, int aligned_head_dim)
static void ck_attention_project_head_major_quant(const float *attn_out, const void *wo, const float *bo, float *out, float *scratch, int tokens, int aligned_embed_dim, int num_heads, int aligned_head_dim, CKDataType wo_dtype)
void ck_layer_backward_rmsnorm_swiglu(const CKLayerBackwardParams *p)
void ck_mlp_swiglu_forward_fused_token(const float *input_row, const float *w1, const float *b1, const float *w2, const float *b2, float *swiglu_row, float *output_row, int aligned_embed_dim, int aligned_intermediate_dim)
void ck_attention_project_head_major(const float *attn_out, const float *wo, const float *bo, float *out, float *scratch, int tokens, int aligned_embed_dim, int num_heads, int aligned_head_dim)
void ck_mlp_swiglu_forward(const float *input, const float *w1, const float *b1, const float *w2, const float *b2, float *fc1_out, float *swiglu_out, float *output, int tokens, int aligned_embed_dim, int aligned_intermediate_dim)
static void ck_attention_project_head_major_ref(const float *attn_out, const float *wo, const float *bo, float *out, float *scratch, int tokens, int aligned_embed_dim, int num_heads, int aligned_head_dim)
void ck_residual_add_backward(const float *d_out, float *d_a, float *d_b, int tokens, int aligned_embed_dim)
static void ck_mlp_swiglu_forward_quant(const float *input, const void *w1, const float *b1, CKDataType w1_dtype, const void *w2, const float *b2, CKDataType w2_dtype, float *fc1_out, float *swiglu_out, float *output, int tokens, int aligned_embed_dim, int aligned_intermediate_dim)
static void ck_qkv_project_head_major_token_q4_k_q8_k(const block_q8_K *input_q8, const void *wq, const float *bq, const void *wk, const float *bk, const void *wv, const float *bv, float *q_token, float *k_token, float *v_token, int aligned_embed_dim, int num_heads, int num_kv_heads, int aligned_head_dim)
static void ck_attention_project_head_major_q4_k(const float *attn_out, const void *wo, const float *bo, float *out, float *scratch, int tokens, int aligned_embed_dim, int num_heads, int aligned_head_dim)
static void ck_qkv_project_head_major_q4_k_q8_k(const float *input, const void *wq, const float *bq, const void *wk, const float *bk, const void *wv, const float *bv, float *q, float *k, float *v, int tokens, int kv_stride_tokens, int aligned_embed_dim, int num_heads, int num_kv_heads, int aligned_head_dim)
static void ck_qkv_project_head_major_ref(const float *input, const float *wq, const float *bq, const float *wk, const float *bk, const float *wv, const float *bv, float *q, float *k, float *v, int tokens, int kv_stride_tokens, int aligned_embed_dim, int num_heads, int num_kv_heads, int aligned_head_dim)
static int ck_q8k_activations_enabled(void)
void ck_layer_forward_rmsnorm_swiglu_quant(const CKLayerForwardParamsQ4K *p)
static int ck_layer_debug_enabled(void)
void ck_mlp_swiglu_forward_fully_fused_token(const float *input_row, const float *w1, const float *b1, const float *w2, const float *b2, float *output_row, int aligned_embed_dim, int aligned_intermediate_dim)
static void ck_mlp_swiglu_forward_q4_k_q8_k(const float *input, const void *w1, const float *b1, const void *w2, const float *b2, float *fc1_out, float *swiglu_out, float *output, int aligned_embed_dim, int aligned_intermediate_dim)
void ck_layer_forward_rmsnorm_swiglu_decode(const CKLayerForwardParams *p, int token_index, int cache_capacity)
void ck_layer_forward_rmsnorm_swiglu_q4_k(const CKLayerForwardParamsQ4K *p)
void ck_qkv_project_head_major(const float *input, const float *wq, const float *bq, const float *wk, const float *bk, const float *wv, const float *bv, float *q, float *k, float *v, int tokens, int kv_stride_tokens, int aligned_embed_dim, int num_heads, int num_kv_heads, int aligned_head_dim)
static void ck_debug_check_q8k(const char *stage, const void *q8_buf, int num_blocks)
static void ck_attention_project_head_major_q4_k_q8_k(const float *attn_out, const void *wo, const float *bo, float *out, int tokens, int aligned_embed_dim, int num_heads, int aligned_head_dim)
static void ck_mlp_swiglu_forward_q4_k(const float *input, const void *w1, const float *b1, const void *w2, const float *b2, float *fc1_out, float *swiglu_out, float *output, int tokens, int aligned_embed_dim, int aligned_intermediate_dim)
void ck_gemm_nt_quant(const float *A, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dtype)
void ck_attention_flash_decode_wrapper(const float *q_token, const float *k_cache, const float *v_cache, float *out_token, int num_heads, int num_kv_heads, int kv_tokens, int cache_capacity, int head_dim, int aligned_head_dim)
Wrapper to call TRUE flash attention from orchestration layer.
void ck_qkv_project_head_major_backward(const float *d_q, const float *d_k, const float *d_v, const float *input, const float *wq, const float *bq, const float *wk, const float *bk, const float *wv, const float *bv, float *d_input, float *d_wq, float *d_bq, float *d_wk, float *d_bk, float *d_wv, float *d_bv, float *scratch, int tokens, int aligned_embed_dim, int num_heads, int num_kv_heads, int aligned_head_dim, int num_threads)
void ck_residual_add_token_major(const float *a, const float *b, float *out, int tokens, int aligned_embed_dim)
void ck_layer_forward_rmsnorm_swiglu(const CKLayerForwardParams *p)
void ck_layer_forward_rmsnorm_swiglu_decode_fused(const CKLayerForwardParams *p, int token_index, int cache_capacity)
void ck_layer_forward_rmsnorm_swiglu_decode_q4_k(const CKLayerForwardParamsQ4K *p, int token_index, int cache_capacity)
static void ck_debug_check_q4k_weights(const char *stage, const void *q4_buf, int num_blocks)
static void ck_debug_check_buffer(const char *stage, const float *buf, int size)
static void ck_mlp_swiglu_forward_ref(const float *input, const float *w1, const float *b1, const float *w2, const float *b2, float *fc1_out, float *swiglu_out, float *output, int tokens, int aligned_embed_dim, int aligned_intermediate_dim)
void ck_layer_forward_rmsnorm_swiglu_ref(const CKLayerForwardParams *p)
static void ck_qkv_project_head_major_token_q4_k(const float *input_row, const void *wq, const float *bq, const void *wk, const float *bk, const void *wv, const float *bv, float *q_token, float *k_token, float *v_token, int aligned_embed_dim, int num_heads, int num_kv_heads, int aligned_head_dim)
static void ck_qkv_project_head_major_q4_k(const float *input, const void *wq, const float *bq, const void *wk, const float *bk, const void *wv, const float *bv, float *q, float *k, float *v, int tokens, int kv_stride_tokens, int aligned_embed_dim, int num_heads, int num_kv_heads, int aligned_head_dim)
static void ck_mlp_swiglu_forward_q4_k_q8_k_prefill(const float *input, const void *w1, const float *b1, const void *w2, const float *b2, float *fc1_out, float *swiglu_out, float *output, int tokens, int aligned_embed_dim, int aligned_intermediate_dim)
static void ck_qkv_project_head_major_token_quant(const float *input_row, const void *wq, const float *bq, CKDataType wq_dtype, const void *wk, const float *bk, CKDataType wk_dtype, const void *wv, const float *bv, CKDataType wv_dtype, float *q_token, float *k_token, float *v_token, int aligned_embed_dim, int num_heads, int num_kv_heads, int aligned_head_dim)
void ck_attention_project_head_major_backward(const float *d_out, const float *attn_out, const float *wo, float *d_attn_out, float *d_wo, float *d_bo, int tokens, int aligned_embed_dim, int num_heads, int aligned_head_dim)
void ck_layer_forward_rmsnorm_swiglu_decode_quant(const CKLayerForwardParamsQ4K *p, int token_index, int cache_capacity)
void ck_qkv_project_head_major_token(const float *input_row, const float *wq, const float *bq, const float *wk, const float *bk, const float *wv, const float *bv, float *q_token, float *k_token, float *v_token, int aligned_embed_dim, int num_heads, int num_kv_heads, int aligned_head_dim)
void ck_attention_project_head_major_decode_token(const float *attn_token, const float *wo, const float *bo, float *out_token, int embed_dim, int aligned_embed_dim, int num_heads, int aligned_head_dim)
Quantization block structures for weight-only quantization.
#define CK_FP16_TO_FP32(x)
#define QK_K
#define C(color)
Definition: show_config.c:39