← Back to C-Kernel-Engine Docs Doxygen Source Documentation
prefill_fused_gemm.c File Reference

Fused kernels for prefill phase with proper 2D tiling. More...

#include "ckernel_engine.h"
#include "ckernel_quant.h"
#include <math.h>
#include <string.h>
#include <stddef.h>
#include <stdio.h>

Go to the source code of this file.

Macros

#define PREFILL_TILE_M   64
 
#define PREFILL_TILE_N   256
 

Functions

static void add_bias_tile (float *out, const float *bias, int tile_m, int out_dim)
 
static size_t align_up_size (size_t value, size_t align)
 
void fused_mlp_swiglu_prefill (const float *x, const float *W_gate, const float *W_up, const float *W_down, float *output, int seq_len, int hidden, int intermediate, float *scratch)
 Fused MLP (Gate + Up + SwiGLU + Down) for prefill. More...
 
void fused_mlp_swiglu_prefill_bias (const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *B_gate, const float *B_up, const float *B_down, float *output, int seq_len, int hidden, int intermediate, float *scratch)
 Fused MLP for prefill with proper tiling. More...
 
void fused_mlp_swiglu_prefill_w1w2_quant (const float *x, const void *W1, const float *B1, CKDataType w1_dt, const void *W2, const float *B2, CKDataType w2_dt, float *output, int seq_len, int embed_dim, int aligned_embed_dim, int intermediate_dim, int aligned_intermediate_dim, void *scratch)
 Quantized fused MLP for prefill (W1=gate+up, W2=down) More...
 
size_t fused_mlp_swiglu_prefill_w1w2_quant_scratch_size (int aligned_embed_dim, int aligned_intermediate_dim)
 Get scratch buffer size for fused_mlp_swiglu_prefill_w1w2_quant. More...
 
size_t fused_mlp_swiglu_scratch_size (int intermediate)
 Get scratch size for fused MLP. More...
 
static void fused_rmsnorm_gemm_2d_tiled (const float *x, const float *gamma, const float *W, float *output, int seq_len, int hidden, int out_dim, float eps, float *x_norm_scratch)
 Fused RMSNorm + single GEMM with 2D tiling (weight reuse) More...
 
void fused_rmsnorm_qkv_prefill (const float *x, const float *gamma, const float *Wq, const float *Wk, const float *Wv, float *Q, float *K, float *V, int seq_len, int hidden, int q_dim, int kv_dim, float eps, float *scratch)
 Fused RMSNorm + QKV projection for prefill (v3 optimized) More...
 
void fused_rmsnorm_qkv_prefill_head_major (const float *x, const float *gamma, const float *Wq, const float *Bq, const float *Wk, const float *Bk, const float *Wv, const float *Bv, float *Q, float *K, float *V, int seq_len, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, int kv_stride_tokens, float eps, float *scratch)
 Fused RMSNorm + QKV projection for prefill (head-major outputs) More...
 
void fused_rmsnorm_qkv_prefill_head_major_quant (const float *x, const float *gamma, const void *Wq, const float *Bq, CKDataType wq_dt, const void *Wk, const float *Bk, CKDataType wk_dt, const void *Wv, const float *Bv, CKDataType wv_dt, float *Q, float *K, float *V, int seq_len, int embed_dim, int aligned_embed_dim, int num_heads, int num_kv_heads, int head_dim, int aligned_head_dim, int kv_stride_tokens, float eps, void *scratch)
 Fused RMSNorm + QKV projection for prefill (head-major, Q8 activations) More...
 
size_t fused_rmsnorm_qkv_prefill_head_major_quant_scratch_size (int aligned_embed_dim)
 Get scratch buffer size for fused_rmsnorm_qkv_prefill_head_major_quant. More...
 
size_t fused_rmsnorm_qkv_scratch_size (int hidden)
 Get scratch size for fused prefill. More...
 
static void gemm_nt_q8_0_dispatch (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)
 
static void gemm_nt_q8_0_mlp_dispatch (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)
 
static void gemm_nt_q8_k_mlp_dispatch (const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)
 
static void gemm_nt_q8_k_qkv_dispatch (const void *A_q8k, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)
 
static void gemm_tile_nt_strided (const float *A, const float *B_tile, float *C, int tile_m, int tile_n, int K, int C_stride)
 GEMM tile with N-dimension tiling (weight reuse) More...
 
static int mlp_q8_0_dtype_supported (CKDataType dt)
 
static int mlp_q8_k_dtype_supported (CKDataType dt)
 
static int qkv_q8_0_dtype_supported (CKDataType dt)
 
static int qkv_q8_k_dtype_supported (CKDataType dt)
 
static void rmsnorm_tile (const float *input, const float *gamma, float *output, int tile_m, int embed_dim, int aligned_embed_dim, float eps)
 Compute RMSNorm for a tile of tokens. More...
 
static float silu_prefill (float x)
 
void unfused_rmsnorm_qkv_prefill (const float *x, const float *gamma, const float *Wq, const float *Wk, const float *Wv, float *x_norm, float *Q, float *K, float *V, int seq_len, int hidden, int q_dim, int kv_dim, float eps)
 Unfused version for comparison. More...
 

Detailed Description

Fused kernels for prefill phase with proper 2D tiling.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

KEY INSIGHT:

Naive M-dimension tiling (token tiles) causes weight reloading:

  • 32 token tiles × 4MB weights = 128MB DRAM reads!

Correct approach: Tile along N (output/weight) dimension OUTER, M (token) dimension INNER. This way:

  • Load weight tile once
  • Process ALL tokens against that weight tile
  • Weight tile stays in cache while streaming through tokens

TILING STRATEGY:

For C[M,N] = RMSNorm(A[M,K]) × B[N,K]^T:

for n_tile in [0, N, TILE_N]:            # Outer: weight tiles
    load B[n_tile:n_tile+TILE_N, :] into L3
    for m_tile in [0, M, TILE_M]:        # Inner: token tiles
        x_norm = rmsnorm(A[m_tile])      # x_norm in L2
        C[m_tile, n_tile] = x_norm × B_tile   # Consumes B from L3

Cache behavior:

  • Weight tile (TILE_N × K × 4 bytes) fits in L3
  • x_norm tile (TILE_M × K × 4 bytes) fits in L2
  • Weights loaded once per tile, reused across all token tiles

Definition in file prefill_fused_gemm.c.

Macro Definition Documentation

◆ PREFILL_TILE_M

#define PREFILL_TILE_M   64

Definition at line 64 of file prefill_fused_gemm.c.

◆ PREFILL_TILE_N

#define PREFILL_TILE_N   256

Definition at line 65 of file prefill_fused_gemm.c.

Function Documentation

◆ add_bias_tile()

static void add_bias_tile ( float *  out,
const float *  bias,
int  tile_m,
int  out_dim 
)
static

Definition at line 310 of file prefill_fused_gemm.c.

314 {
315  if (!out || !bias) {
316  return;
317  }
318  for (int i = 0; i < tile_m; ++i) {
319  float *row = out + (size_t)i * (size_t)out_dim;
320  for (int j = 0; j < out_dim; ++j) {
321  row[j] += bias[j];
322  }
323  }
324 }

Referenced by fused_mlp_swiglu_prefill_bias(), and fused_rmsnorm_qkv_prefill_head_major().

◆ align_up_size()

static size_t align_up_size ( size_t  value,
size_t  align 
)
static

◆ fused_mlp_swiglu_prefill()

void fused_mlp_swiglu_prefill ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  W_down,
float *  output,
int  seq_len,
int  hidden,
int  intermediate,
float *  scratch 
)

Fused MLP (Gate + Up + SwiGLU + Down) for prefill.

Tiles along token dimension to keep gate/up/hidden in L3 cache.

Parameters
scratchTemporary buffer from fused_mlp_swiglu_scratch_size()

Definition at line 879 of file prefill_fused_gemm.c.

889 {
890  fused_mlp_swiglu_prefill_bias(x, W_gate, W_up, W_down,
891  NULL, NULL, NULL,
892  output, seq_len, hidden, intermediate,
893  scratch);
894 }
void fused_mlp_swiglu_prefill_bias(const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *B_gate, const float *B_up, const float *B_down, float *output, int seq_len, int hidden, int intermediate, float *scratch)
Fused MLP for prefill with proper tiling.

References fused_mlp_swiglu_prefill_bias().

◆ fused_mlp_swiglu_prefill_bias()

void fused_mlp_swiglu_prefill_bias ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  W_down,
const float *  B_gate,
const float *  B_up,
const float *  B_down,
float *  output,
int  seq_len,
int  hidden,
int  intermediate,
float *  scratch 
)

Fused MLP for prefill with proper tiling.

Fused MLP (Gate + Up + SwiGLU + Down) for prefill with biases.

Definition at line 746 of file prefill_fused_gemm.c.

759 {
760  /* MLP is more complex because we have:
761  * gate = x @ W_gate
762  * up = x @ W_up
763  * hidden = silu(gate) * up
764  * out = hidden @ W_down
765  *
766  * The intermediate (gate, up, hidden) is large: seq_len × intermediate
767  * For Qwen2-0.5B: 1024 × 4864 × 4 = 19.4MB (way bigger than L3!)
768  *
769  * Strategy: Tile along intermediate dimension for gate/up,
770  * then fuse SwiGLU, then tile down projection.
771  */
772 
773  /* scratch layout:
774  * [gate_tile: TILE_M × TILE_N_INTER]
775  * [up_tile: TILE_M × TILE_N_INTER]
776  */
777  const int TILE_N_INTER = 512; /* Intermediate tile size */
778  float *gate_tile = scratch;
779  float *up_tile = scratch + (size_t)PREFILL_TILE_M * TILE_N_INTER;
780  float *hidden_tile = gate_tile; /* Reuse gate_tile for hidden after SwiGLU */
781 
782  /* For each chunk of intermediate dimension */
783  for (int inter_start = 0; inter_start < intermediate; inter_start += TILE_N_INTER) {
784  int tile_inter = (inter_start + TILE_N_INTER <= intermediate)
785  ? TILE_N_INTER : (intermediate - inter_start);
786 
787  const float *W_gate_tile = W_gate + (size_t)inter_start * hidden;
788  const float *W_up_tile = W_up + (size_t)inter_start * hidden;
789 
790  /* For each chunk of tokens */
791  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
792  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
793  ? PREFILL_TILE_M : (seq_len - m_start);
794 
795  const float *x_tile = x + (size_t)m_start * hidden;
796 
797  /* Compute gate and up projections for this tile */
798  gemm_tile_nt_strided(x_tile, W_gate_tile, gate_tile,
799  tile_m, tile_inter, hidden, tile_inter);
800  gemm_tile_nt_strided(x_tile, W_up_tile, up_tile,
801  tile_m, tile_inter, hidden, tile_inter);
802  if (B_gate) {
803  add_bias_tile(gate_tile, B_gate + inter_start, tile_m, tile_inter);
804  }
805  if (B_up) {
806  add_bias_tile(up_tile, B_up + inter_start, tile_m, tile_inter);
807  }
808 
809  /* Fused SwiGLU: hidden = silu(gate) * up */
810  for (int i = 0; i < tile_m; ++i) {
811  float *g = gate_tile + (size_t)i * tile_inter;
812  float *u = up_tile + (size_t)i * tile_inter;
813  for (int j = 0; j < tile_inter; ++j) {
814  float gv = g[j];
815  float silu = gv / (1.0f + expf(-gv));
816  g[j] = silu * u[j]; /* hidden_tile = gate_tile */
817  }
818  }
819 
820  /* Down projection: accumulate into output
821  * out[m_start:, :] += hidden_tile @ W_down[inter_start:, :]^T
822  */
823  const float *W_down_slice = W_down + (size_t)inter_start; /* Column slice */
824  float *out_tile = output + (size_t)m_start * hidden;
825 
826  /* This is trickier - W_down is [hidden × intermediate]
827  * We have hidden_tile[tile_m × tile_inter]
828  * We want out[tile_m × hidden] += hidden_tile × W_down[:, inter_start:inter_start+tile_inter]^T
829  *
830  * For proper accumulation, need to handle this carefully.
831  * For now, use a simpler approach: accumulate partial results.
832  */
833  for (int i = 0; i < tile_m; ++i) {
834  float *h = hidden_tile + (size_t)i * tile_inter;
835  float *o = out_tile + (size_t)i * hidden;
836 
837  for (int d = 0; d < hidden; ++d) {
838  const float *w_row = W_down + (size_t)d * intermediate + inter_start;
839  float sum = (inter_start == 0)
840  ? (B_down ? B_down[d] : 0.0f)
841  : o[d];
842 
843 #if defined(__AVX512F__)
844  __m512 acc = _mm512_setzero_ps();
845  int j = 0;
846  for (; j + 16 <= tile_inter; j += 16) {
847  __m512 hv = _mm512_loadu_ps(h + j);
848  __m512 wv = _mm512_loadu_ps(w_row + j);
849  acc = _mm512_fmadd_ps(hv, wv, acc);
850  }
851  sum += _mm512_reduce_add_ps(acc);
852  for (; j < tile_inter; ++j) {
853  sum += h[j] * w_row[j];
854  }
855 #elif defined(__AVX__)
856  __m256 acc = _mm256_setzero_ps();
857  int j = 0;
858  for (; j + 8 <= tile_inter; j += 8) {
859  __m256 hv = _mm256_loadu_ps(h + j);
860  __m256 wv = _mm256_loadu_ps(w_row + j);
861  acc = _mm256_add_ps(acc, _mm256_mul_ps(hv, wv));
862  }
863  sum += hsum256_prefill(acc);
864  for (; j < tile_inter; ++j) {
865  sum += h[j] * w_row[j];
866  }
867 #else
868  for (int j = 0; j < tile_inter; ++j) {
869  sum += h[j] * w_row[j];
870  }
871 #endif
872  o[d] = sum;
873  }
874  }
875  }
876  }
877 }
#define PREFILL_TILE_M
static void add_bias_tile(float *out, const float *bias, int tile_m, int out_dim)
static void gemm_tile_nt_strided(const float *A, const float *B_tile, float *C, int tile_m, int tile_n, int K, int C_stride)
GEMM tile with N-dimension tiling (weight reuse)
static void silu(float *x, int n)
Definition: v6_simple.c:159

References add_bias_tile(), gemm_tile_nt_strided(), and PREFILL_TILE_M. (Note: SiLU is computed inline via expf() here, not via the unrelated silu() in v6_simple.c.)

Referenced by fused_mlp_swiglu_prefill().

◆ fused_mlp_swiglu_prefill_w1w2_quant()

void fused_mlp_swiglu_prefill_w1w2_quant ( const float *  x,
const void *  W1,
const float *  B1,
CKDataType  w1_dt,
const void *  W2,
const float *  B2,
CKDataType  w2_dt,
float *  output,
int  seq_len,
int  embed_dim,
int  aligned_embed_dim,
int  intermediate_dim,
int  aligned_intermediate_dim,
void *  scratch 
)

Quantized fused MLP for prefill (W1=gate+up, W2=down)

Uses Q8_0 activations for W1 (Q5_0/Q8_0 weights) and Q8_K activations for W2 (Q4_K/Q6_K weights).

Definition at line 965 of file prefill_fused_gemm.c.

980 {
981  if (!x || !W1 || !W2 || !output || !scratch) {
982  return;
983  }
984  if (seq_len <= 0 || embed_dim <= 0 || aligned_embed_dim <= 0 ||
985  intermediate_dim <= 0 || aligned_intermediate_dim <= 0) {
986  return;
987  }
988  if (aligned_embed_dim < embed_dim || aligned_intermediate_dim < intermediate_dim) {
989  return;
990  }
991  if ((aligned_embed_dim % 32) != 0 || (aligned_intermediate_dim % 256) != 0) {
992  return;
993  }
994  if (!mlp_q8_0_dtype_supported(w1_dt) || !mlp_q8_k_dtype_supported(w2_dt)) {
995  return;
996  }
997 
998  const int tile_m_max = PREFILL_TILE_M;
999  const int inter = aligned_intermediate_dim;
1000  const size_t q8_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_0, (size_t)aligned_embed_dim);
1001  const size_t q8k_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_K, (size_t)aligned_intermediate_dim);
1002  const size_t w1_row_bytes = ck_dtype_row_bytes(w1_dt, (size_t)aligned_embed_dim);
1003 
1004  uint8_t *scratch_bytes = (uint8_t *)scratch;
1005  size_t q8_bytes = (size_t)tile_m_max * q8_row_bytes;
1006  size_t gate_bytes = (size_t)tile_m_max * (size_t)inter * sizeof(float);
1007  size_t up_bytes = gate_bytes;
1008  size_t gate_offset = align_up_size(q8_bytes, 64);
1009  size_t up_offset = gate_offset + align_up_size(gate_bytes, 64);
1010  size_t q8k_offset = up_offset + align_up_size(up_bytes, 64);
1011 
1012  uint8_t *q8_tile = scratch_bytes;
1013  float *gate_tile = (float *)(scratch_bytes + gate_offset);
1014  float *up_tile = (float *)(scratch_bytes + up_offset);
1015  uint8_t *q8k_tile = scratch_bytes + q8k_offset;
1016 
1017  const uint8_t *w1_base = (const uint8_t *)W1;
1018  const uint8_t *w_gate = w1_base;
1019  const uint8_t *w_up = w1_base + (size_t)inter * w1_row_bytes;
1020 
1021  const float *b_gate = B1;
1022  const float *b_up = B1 ? (B1 + (size_t)inter) : NULL;
1023 
1024  for (int m_start = 0; m_start < seq_len; m_start += tile_m_max) {
1025  int tile_m = (m_start + tile_m_max <= seq_len)
1026  ? tile_m_max : (seq_len - m_start);
1027 
1028  const float *x_tile = x + (size_t)m_start * (size_t)aligned_embed_dim;
1029  float *out_tile = output + (size_t)m_start * (size_t)aligned_embed_dim;
1030 
1031  for (int t = 0; t < tile_m; ++t) {
1032  const float *row = x_tile + (size_t)t * (size_t)aligned_embed_dim;
1033  quantize_row_q8_0(row,
1034  q8_tile + (size_t)t * q8_row_bytes,
1035  aligned_embed_dim);
1036  }
1037 
1038  gemm_nt_q8_0_mlp_dispatch(q8_tile, w_gate, b_gate, gate_tile,
1039  tile_m, inter, aligned_embed_dim, w1_dt);
1040  gemm_nt_q8_0_mlp_dispatch(q8_tile, w_up, b_up, up_tile,
1041  tile_m, inter, aligned_embed_dim, w1_dt);
1042 
1043  for (int i = 0; i < tile_m; ++i) {
1044  float *g = gate_tile + (size_t)i * (size_t)inter;
1045  float *u = up_tile + (size_t)i * (size_t)inter;
1046  for (int j = 0; j < inter; ++j) {
1047  g[j] = silu_prefill(g[j]) * u[j];
1048  }
1049  }
1050 
1051  for (int i = 0; i < tile_m; ++i) {
1052  const float *row = gate_tile + (size_t)i * (size_t)inter;
1053  quantize_row_q8_k(row,
1054  q8k_tile + (size_t)i * q8k_row_bytes,
1055  aligned_intermediate_dim);
1056  }
1057 
1058  gemm_nt_q8_k_mlp_dispatch(q8k_tile, W2, B2, out_tile,
1059  tile_m, aligned_embed_dim, aligned_intermediate_dim, w2_dt);
1060  }
1061 }
@ CK_DT_Q8_0
Definition: ckernel_dtype.h:42
@ CK_DT_Q8_K
Definition: ckernel_dtype.h:43
static size_t ck_dtype_row_bytes(CKDataType dt, size_t n_elements)
Calculate total bytes for n_elements of given dtype.
void quantize_row_q8_k(const float *x, void *y, int k)
void quantize_row_q8_0(const float *x, void *y, int k)
Quantize FP32 to Q8_0 format (scalar reference)
static void gemm_nt_q8_0_mlp_dispatch(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)
static size_t align_up_size(size_t value, size_t align)
static int mlp_q8_k_dtype_supported(CKDataType dt)
static float silu_prefill(float x)
static int mlp_q8_0_dtype_supported(CKDataType dt)
static void gemm_nt_q8_k_mlp_dispatch(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)

References align_up_size(), CK_DT_Q8_0, CK_DT_Q8_K, ck_dtype_row_bytes(), gemm_nt_q8_0_mlp_dispatch(), gemm_nt_q8_k_mlp_dispatch(), mlp_q8_0_dtype_supported(), mlp_q8_k_dtype_supported(), PREFILL_TILE_M, quantize_row_q8_0(), quantize_row_q8_k(), and silu_prefill().

Referenced by mega_fused_outproj_mlp_prefill().

◆ fused_mlp_swiglu_prefill_w1w2_quant_scratch_size()

size_t fused_mlp_swiglu_prefill_w1w2_quant_scratch_size ( int  aligned_embed_dim,
int  aligned_intermediate_dim 
)

Get scratch buffer size for fused_mlp_swiglu_prefill_w1w2_quant.

Definition at line 1063 of file prefill_fused_gemm.c.

1065 {
1066  if (aligned_embed_dim <= 0 || aligned_intermediate_dim <= 0) {
1067  return 0;
1068  }
1069  const size_t q8_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_0, (size_t)aligned_embed_dim);
1070  const size_t q8k_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_K, (size_t)aligned_intermediate_dim);
1071  const size_t q8_bytes = (size_t)PREFILL_TILE_M * q8_row_bytes;
1072  const size_t gate_bytes = (size_t)PREFILL_TILE_M * (size_t)aligned_intermediate_dim * sizeof(float);
1073  const size_t up_bytes = gate_bytes;
1074  const size_t q8k_bytes = (size_t)PREFILL_TILE_M * q8k_row_bytes;
1075 
1076  return align_up_size(q8_bytes, 64) +
1077  align_up_size(gate_bytes, 64) +
1078  align_up_size(up_bytes, 64) +
1079  align_up_size(q8k_bytes, 64);
1080 }

References align_up_size(), CK_DT_Q8_0, CK_DT_Q8_K, ck_dtype_row_bytes(), and PREFILL_TILE_M.

Referenced by mega_fused_outproj_mlp_prefill_scratch_size().

◆ fused_mlp_swiglu_scratch_size()

size_t fused_mlp_swiglu_scratch_size ( int  intermediate)

Get scratch size for fused MLP.

Get scratch buffer size for fused_mlp_swiglu_prefill.

Definition at line 899 of file prefill_fused_gemm.c.

899  {
900  const int TILE_N_INTER = 512;
901  /* gate_tile + up_tile */
902  return 2 * (size_t)PREFILL_TILE_M * TILE_N_INTER * sizeof(float);
903 }

References PREFILL_TILE_M.

◆ fused_rmsnorm_gemm_2d_tiled()

static void fused_rmsnorm_gemm_2d_tiled ( const float *  x,
const float *  gamma,
const float *  W,
float *  output,
int  seq_len,
int  hidden,
int  out_dim,
float  eps,
float *  x_norm_scratch 
)
static

Fused RMSNorm + single GEMM with 2D tiling (weight reuse)

Tiles along N (weights) OUTER, M (tokens) INNER. Weight tiles are reused across all token tiles.

Definition at line 332 of file prefill_fused_gemm.c.

342 {
343  /* Outer loop: tile along output dimension (N) - weight tiles */
344  for (int n_start = 0; n_start < out_dim; n_start += PREFILL_TILE_N) {
345  int tile_n = (n_start + PREFILL_TILE_N <= out_dim)
346  ? PREFILL_TILE_N
347  : (out_dim - n_start);
348 
349  /* Weight tile pointer - this tile stays in L3 cache */
350  const float *W_tile = W + (size_t)n_start * hidden;
351 
352  /* Inner loop: tile along token dimension (M) */
353  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
354  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
355  ? PREFILL_TILE_M
356  : (seq_len - m_start);
357 
358  const float *x_tile = x + (size_t)m_start * hidden;
359  float *out_tile = output + (size_t)m_start * out_dim + n_start;
360 
361  /* Compute RMSNorm for this token tile (only on first weight tile) */
362  if (n_start == 0) {
363  rmsnorm_tile(x_tile, gamma, x_norm_scratch, tile_m, hidden, hidden, eps);
364  } else {
365  /* Recompute x_norm for this tile (we can't cache all of it) */
366  /* TODO: For very large N, consider caching x_norm chunks */
367  rmsnorm_tile(x_tile, gamma, x_norm_scratch, tile_m, hidden, hidden, eps);
368  }
369 
370  /* GEMM: x_norm_tile × W_tile^T → output tile */
371  gemm_tile_nt_strided(x_norm_scratch, W_tile, out_tile,
372  tile_m, tile_n, hidden, out_dim);
373  }
374  }
375 }
#define PREFILL_TILE_N
static void rmsnorm_tile(const float *input, const float *gamma, float *output, int tile_m, int embed_dim, int aligned_embed_dim, float eps)
Compute RMSNorm for a tile of tokens.

References gemm_tile_nt_strided(), PREFILL_TILE_M, PREFILL_TILE_N, and rmsnorm_tile().

◆ fused_rmsnorm_qkv_prefill()

void fused_rmsnorm_qkv_prefill ( const float *  x,
const float *  gamma,
const float *  Wq,
const float *  Wk,
const float *  Wv,
float *  Q,
float *  K,
float *  V,
int  seq_len,
int  hidden,
int  q_dim,
int  kv_dim,
float  eps,
float *  scratch 
)

Fused RMSNorm + QKV projection for prefill (v3 optimized)

Fused RMSNorm + QKV projection for prefill.

KEY INSIGHT: For Qwen2-0.5B, all QKV weights fit in L3: Wq (896×896) + Wk (128×896) + Wv (128×896) = 4.1MB < 6MB L3

So we use M-tiling (tokens) only:

  1. For each token tile:
     a. Compute RMSNorm ONCE into scratch (x_norm stays in L2)
     b. Do all three GEMMs (Q, K, V) against cached x_norm
     c. Weights stay hot in L3 across all token tiles

This avoids both:

  • Large x_norm intermediate buffer (only TILE_M × hidden in L2)
  • RMSNorm recomputation (done once per token tile, used 3×)

Definition at line 393 of file prefill_fused_gemm.c.

408 {
409  /* scratch is x_norm tile: [TILE_M × hidden] fits in L2 */
410 
411  /* Process token tiles - weights stay in L3 across all tiles */
412  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
413  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
414  ? PREFILL_TILE_M : (seq_len - m_start);
415 
416  const float *x_tile = x + (size_t)m_start * hidden;
417 
418  /* Step 1: RMSNorm for this token tile (computed ONCE, used 3×) */
419  rmsnorm_tile(x_tile, gamma, scratch, tile_m, hidden, hidden, eps);
420 
421  /* Step 2: Q projection - x_norm is hot in L2, Wq hot in L3 */
422  float *Q_tile = Q + (size_t)m_start * q_dim;
423  gemm_tile_nt_strided(scratch, Wq, Q_tile, tile_m, q_dim, hidden, q_dim);
424 
425  /* Step 3: K projection - x_norm still hot, Wk displaces some Wq */
426  float *K_tile = K + (size_t)m_start * kv_dim;
427  gemm_tile_nt_strided(scratch, Wk, K_tile, tile_m, kv_dim, hidden, kv_dim);
428 
429  /* Step 4: V projection - x_norm still hot, Wv displaces Wk */
430  float *V_tile = V + (size_t)m_start * kv_dim;
431  gemm_tile_nt_strided(scratch, Wv, V_tile, tile_m, kv_dim, hidden, kv_dim);
432  }
433 }

References gemm_tile_nt_strided(), PREFILL_TILE_M, and rmsnorm_tile().

◆ fused_rmsnorm_qkv_prefill_head_major()

void fused_rmsnorm_qkv_prefill_head_major ( const float *  x,
const float *  gamma,
const float *  Wq,
const float *  Bq,
const float *  Wk,
const float *  Bk,
const float *  Wv,
const float *  Bv,
float *  Q,
float *  K,
float *  V,
int  seq_len,
int  embed_dim,
int  aligned_embed_dim,
int  num_heads,
int  num_kv_heads,
int  head_dim,
int  aligned_head_dim,
int  kv_stride_tokens,
float  eps,
float *  scratch 
)

Fused RMSNorm + QKV projection for prefill (head-major outputs)

Q is written as [num_heads, seq_len, aligned_head_dim]. K/V are written with kv_stride_tokens for KV-cache compatibility.

Definition at line 441 of file prefill_fused_gemm.c.

460 {
461  if (!x || !gamma || !Wq || !Wk || !Wv || !Q || !K || !V || !scratch) {
462  return;
463  }
464  if (seq_len <= 0 || embed_dim <= 0 || aligned_embed_dim <= 0 ||
465  head_dim <= 0 || aligned_head_dim <= 0 ||
466  num_heads <= 0 || num_kv_heads <= 0) {
467  return;
468  }
469  if (kv_stride_tokens < seq_len) {
470  return;
471  }
472 
473  const size_t q_head_stride = (size_t)seq_len * (size_t)aligned_head_dim;
474  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
475  const size_t head_w_stride = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
476 
477  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
478  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
479  ? PREFILL_TILE_M : (seq_len - m_start);
480 
481  const float *x_tile = x + (size_t)m_start * (size_t)aligned_embed_dim;
482  rmsnorm_tile(x_tile, gamma, scratch, tile_m, embed_dim, aligned_embed_dim, eps);
483 
484  for (int h = 0; h < num_heads; ++h) {
485  const float *wq_h = Wq + (size_t)h * head_w_stride;
486  const float *bq_h = Bq ? (Bq + (size_t)h * (size_t)aligned_head_dim) : NULL;
487  float *q_h = Q + (size_t)h * q_head_stride + (size_t)m_start * (size_t)aligned_head_dim;
488 
489  gemm_tile_nt_strided(scratch, wq_h, q_h,
490  tile_m, aligned_head_dim, aligned_embed_dim, aligned_head_dim);
491  add_bias_tile(q_h, bq_h, tile_m, aligned_head_dim);
492  }
493 
494  for (int h = 0; h < num_kv_heads; ++h) {
495  const float *wk_h = Wk + (size_t)h * head_w_stride;
496  const float *wv_h = Wv + (size_t)h * head_w_stride;
497  const float *bk_h = Bk ? (Bk + (size_t)h * (size_t)aligned_head_dim) : NULL;
498  const float *bv_h = Bv ? (Bv + (size_t)h * (size_t)aligned_head_dim) : NULL;
499  float *k_h = K + (size_t)h * kv_head_stride + (size_t)m_start * (size_t)aligned_head_dim;
500  float *v_h = V + (size_t)h * kv_head_stride + (size_t)m_start * (size_t)aligned_head_dim;
501 
502  gemm_tile_nt_strided(scratch, wk_h, k_h,
503  tile_m, aligned_head_dim, aligned_embed_dim, aligned_head_dim);
504  add_bias_tile(k_h, bk_h, tile_m, aligned_head_dim);
505 
506  gemm_tile_nt_strided(scratch, wv_h, v_h,
507  tile_m, aligned_head_dim, aligned_embed_dim, aligned_head_dim);
508  add_bias_tile(v_h, bv_h, tile_m, aligned_head_dim);
509  }
510  }
511 }

References add_bias_tile(), gemm_tile_nt_strided(), PREFILL_TILE_M, and rmsnorm_tile().

Referenced by mega_fused_attention_prefill(), and mega_fused_attention_prefill_q8_0().

◆ fused_rmsnorm_qkv_prefill_head_major_quant()

void fused_rmsnorm_qkv_prefill_head_major_quant ( const float *  x,
const float *  gamma,
const void *  Wq,
const float *  Bq,
CKDataType  wq_dt,
const void *  Wk,
const float *  Bk,
CKDataType  wk_dt,
const void *  Wv,
const float *  Bv,
CKDataType  wv_dt,
float *  Q,
float *  K,
float *  V,
int  seq_len,
int  embed_dim,
int  aligned_embed_dim,
int  num_heads,
int  num_kv_heads,
int  head_dim,
int  aligned_head_dim,
int  kv_stride_tokens,
float  eps,
void *  scratch 
)

Fused RMSNorm + QKV projection for prefill (head-major, Q8 activations)

Supports Q5_0 or Q8_0 weights with Q8_0 activations. Writes K/V directly into KV cache layout (kv_stride_tokens).

Definition at line 519 of file prefill_fused_gemm.c.

538 {
539  if (!x || !gamma || !Wq || !Wk || !Wv || !Q || !K || !V || !scratch) {
540  return;
541  }
542  if (seq_len <= 0 || embed_dim <= 0 || aligned_embed_dim <= 0 ||
543  head_dim <= 0 || aligned_head_dim <= 0 ||
544  num_heads <= 0 || num_kv_heads <= 0) {
545  return;
546  }
547  if (aligned_embed_dim % 32 != 0) {
548  return;
549  }
550  if (kv_stride_tokens < seq_len) {
551  return;
552  }
553  /* Determine quantization path: Q8_0 activations for Q5_0/Q8_0 weights,
554  * Q8_K activations for Q4_K/Q6_K weights. All QKV weights must use
555  * the same quantization family. */
556  int use_q8_k_path = qkv_q8_k_dtype_supported(wq_dt);
557  int use_q8_0_path = qkv_q8_0_dtype_supported(wq_dt);
558 
559  if (!use_q8_k_path && !use_q8_0_path) {
560  /* Unsupported dtype for wq */
561  return;
562  }
563 
564  /* Verify all dtypes are from the same family */
565  if (use_q8_k_path) {
566  if (!qkv_q8_k_dtype_supported(wk_dt) || !qkv_q8_k_dtype_supported(wv_dt)) {
567  return; /* Mixed Q8_K and Q8_0 paths not supported */
568  }
569  } else {
570  if (!qkv_q8_0_dtype_supported(wk_dt) || !qkv_q8_0_dtype_supported(wv_dt)) {
571  return;
572  }
573  }
574 
575  const size_t float_bytes = (size_t)PREFILL_TILE_M * (size_t)aligned_embed_dim * sizeof(float);
576  /* Q8_K has larger blocks (256) than Q8_0 (32), so use appropriate size */
577  const CKDataType act_quant_type = use_q8_k_path ? CK_DT_Q8_K : CK_DT_Q8_0;
578  const size_t q8_row_bytes = ck_dtype_row_bytes(act_quant_type, (size_t)aligned_embed_dim);
579  const size_t q8_bytes = (size_t)PREFILL_TILE_M * q8_row_bytes;
580  const size_t q8_offset = align_up_size(float_bytes, 64);
581 
582  float *normed = (float *)scratch;
583  uint8_t *q8_tile = (uint8_t *)scratch + q8_offset;
584  (void)q8_bytes;
585 
586  const size_t q_head_stride = (size_t)seq_len * (size_t)aligned_head_dim;
587  const size_t kv_head_stride = (size_t)kv_stride_tokens * (size_t)aligned_head_dim;
588  const size_t head_w_elems = (size_t)aligned_head_dim * (size_t)aligned_embed_dim;
589  const size_t wq_head_bytes = ck_dtype_row_bytes(wq_dt, head_w_elems);
590  const size_t wk_head_bytes = ck_dtype_row_bytes(wk_dt, head_w_elems);
591  const size_t wv_head_bytes = ck_dtype_row_bytes(wv_dt, head_w_elems);
592 
593  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
594  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
595  ? PREFILL_TILE_M : (seq_len - m_start);
596 
597  const float *x_tile = x + (size_t)m_start * (size_t)aligned_embed_dim;
598  rmsnorm_tile(x_tile, gamma, normed, tile_m, embed_dim, aligned_embed_dim, eps);
599 
600  /* Quantize activations to appropriate format */
601  for (int t = 0; t < tile_m; ++t) {
602  const float *row = normed + (size_t)t * (size_t)aligned_embed_dim;
603  if (use_q8_k_path) {
604  quantize_row_q8_k(row,
605  q8_tile + (size_t)t * q8_row_bytes,
606  aligned_embed_dim);
607  } else {
608  quantize_row_q8_0(row,
609  q8_tile + (size_t)t * q8_row_bytes,
610  aligned_embed_dim);
611  }
612  }
613 
614  for (int h = 0; h < num_heads; ++h) {
615  const uint8_t *wq_h = (const uint8_t *)Wq + (size_t)h * wq_head_bytes;
616  const float *bq_h = Bq ? (Bq + (size_t)h * (size_t)aligned_head_dim) : NULL;
617  float *q_h = Q + (size_t)h * q_head_stride + (size_t)m_start * (size_t)aligned_head_dim;
618 
619  if (use_q8_k_path) {
620  gemm_nt_q8_k_qkv_dispatch(q8_tile, wq_h, bq_h, q_h,
621  tile_m, aligned_head_dim, aligned_embed_dim, wq_dt);
622  } else {
623  gemm_nt_q8_0_dispatch(q8_tile, wq_h, bq_h, q_h,
624  tile_m, aligned_head_dim, aligned_embed_dim, wq_dt);
625  }
626  }
627 
628  for (int h = 0; h < num_kv_heads; ++h) {
629  const uint8_t *wk_h = (const uint8_t *)Wk + (size_t)h * wk_head_bytes;
630  const uint8_t *wv_h = (const uint8_t *)Wv + (size_t)h * wv_head_bytes;
631  const float *bk_h = Bk ? (Bk + (size_t)h * (size_t)aligned_head_dim) : NULL;
632  const float *bv_h = Bv ? (Bv + (size_t)h * (size_t)aligned_head_dim) : NULL;
633  float *k_h = K + (size_t)h * kv_head_stride + (size_t)m_start * (size_t)aligned_head_dim;
634  float *v_h = V + (size_t)h * kv_head_stride + (size_t)m_start * (size_t)aligned_head_dim;
635 
636  if (use_q8_k_path) {
637  gemm_nt_q8_k_qkv_dispatch(q8_tile, wk_h, bk_h, k_h,
638  tile_m, aligned_head_dim, aligned_embed_dim, wk_dt);
639  gemm_nt_q8_k_qkv_dispatch(q8_tile, wv_h, bv_h, v_h,
640  tile_m, aligned_head_dim, aligned_embed_dim, wv_dt);
641  } else {
642  gemm_nt_q8_0_dispatch(q8_tile, wk_h, bk_h, k_h,
643  tile_m, aligned_head_dim, aligned_embed_dim, wk_dt);
644  gemm_nt_q8_0_dispatch(q8_tile, wv_h, bv_h, v_h,
645  tile_m, aligned_head_dim, aligned_embed_dim, wv_dt);
646  }
647  }
648  }
649 }
CKDataType
Supported data types in C-Kernel-Engine.
Definition: ckernel_dtype.h:27
static void gemm_nt_q8_0_dispatch(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)
static int qkv_q8_k_dtype_supported(CKDataType dt)
static void gemm_nt_q8_k_qkv_dispatch(const void *A_q8k, const void *B, const float *bias, float *C, int M, int N, int K, CKDataType dt)
static int qkv_q8_0_dtype_supported(CKDataType dt)

References align_up_size(), CK_DT_Q8_0, CK_DT_Q8_K, ck_dtype_row_bytes(), gemm_nt_q8_0_dispatch(), gemm_nt_q8_k_qkv_dispatch(), PREFILL_TILE_M, qkv_q8_0_dtype_supported(), qkv_q8_k_dtype_supported(), quantize_row_q8_0(), quantize_row_q8_k(), and rmsnorm_tile().

Referenced by mega_fused_attention_prefill(), and mega_fused_attention_prefill_q8_0().

◆ fused_rmsnorm_qkv_prefill_head_major_quant_scratch_size()

size_t fused_rmsnorm_qkv_prefill_head_major_quant_scratch_size ( int  aligned_embed_dim)

Get scratch buffer size for fused_rmsnorm_qkv_prefill_head_major_quant.

Definition at line 651 of file prefill_fused_gemm.c.

651  {
652  if (aligned_embed_dim <= 0) {
653  return 0;
654  }
655  const size_t float_bytes = (size_t)PREFILL_TILE_M * (size_t)aligned_embed_dim * sizeof(float);
656  /* Use max of Q8_0 and Q8_K sizes to support both paths */
657  const size_t q8_0_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_0, (size_t)aligned_embed_dim);
658  const size_t q8_k_row_bytes = ck_dtype_row_bytes(CK_DT_Q8_K, (size_t)aligned_embed_dim);
659  const size_t q8_row_bytes = (q8_k_row_bytes > q8_0_row_bytes) ? q8_k_row_bytes : q8_0_row_bytes;
660  const size_t q8_bytes = (size_t)PREFILL_TILE_M * q8_row_bytes;
661  return align_up_size(float_bytes, 64) + q8_bytes;
662 }

References align_up_size(), CK_DT_Q8_0, CK_DT_Q8_K, ck_dtype_row_bytes(), and PREFILL_TILE_M.

Referenced by mega_fused_attention_prefill(), mega_fused_attention_prefill_q8_0(), mega_fused_attention_prefill_q8_0_scratch_size(), and mega_fused_attention_prefill_scratch_size().

◆ fused_rmsnorm_qkv_scratch_size()

size_t fused_rmsnorm_qkv_scratch_size ( int  hidden)

Get scratch size for fused prefill.

Get scratch buffer size for fused_rmsnorm_qkv_prefill.

Definition at line 739 of file prefill_fused_gemm.c.

739  {
740  return (size_t)PREFILL_TILE_M * hidden * sizeof(float);
741 }

References PREFILL_TILE_M.

◆ gemm_nt_q8_0_dispatch()

static void gemm_nt_q8_0_dispatch ( const void *  A_q8,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K,
CKDataType  dt 
)
static

Definition at line 177 of file prefill_fused_gemm.c.

185 {
186  switch (dt) {
187  case CK_DT_Q5_0:
188  gemm_nt_q5_0_q8_0(A_q8, B, bias, C, M, N, K);
189  break;
190  case CK_DT_Q8_0:
191  gemm_nt_q8_0_q8_0(A_q8, B, bias, C, M, N, K);
192  break;
193  default:
194  break;
195  }
196 }
@ CK_DT_Q5_0
Definition: ckernel_dtype.h:44
void gemm_nt_q8_0_q8_0(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
gemm_nt_q8_0_q8_0 with optional bias (matches header signature)
void gemm_nt_q5_0_q8_0(const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)
Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.
#define C(color)
Definition: show_config.c:39

References C, CK_DT_Q5_0, CK_DT_Q8_0, gemm_nt_q5_0_q8_0(), and gemm_nt_q8_0_q8_0().

Referenced by fused_rmsnorm_qkv_prefill_head_major_quant().

◆ gemm_nt_q8_0_mlp_dispatch()

static void gemm_nt_q8_0_mlp_dispatch ( const void *  A_q8,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K,
CKDataType  dt 
)
static

Definition at line 917 of file prefill_fused_gemm.c.

925 {
926  switch (dt) {
927  case CK_DT_Q5_0:
928  gemm_nt_q5_0_q8_0(A_q8, B, bias, C, M, N, K);
929  break;
930  case CK_DT_Q8_0:
931  gemm_nt_q8_0_q8_0(A_q8, B, bias, C, M, N, K);
932  break;
933  default:
934  break;
935  }
936 }

References C, CK_DT_Q5_0, CK_DT_Q8_0, gemm_nt_q5_0_q8_0(), and gemm_nt_q8_0_q8_0().

Referenced by fused_mlp_swiglu_prefill_w1w2_quant().

◆ gemm_nt_q8_k_mlp_dispatch()

static void gemm_nt_q8_k_mlp_dispatch ( const void *  A_q8,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K,
CKDataType  dt 
)
static

Definition at line 938 of file prefill_fused_gemm.c.

946 {
947  switch (dt) {
948  case CK_DT_Q4_K:
949  gemm_nt_q4_k_q8_k(A_q8, B, bias, C, M, N, K);
950  break;
951  case CK_DT_Q6_K:
952  gemm_nt_q6_k_q8_k(A_q8, B, bias, C, M, N, K);
953  break;
954  default:
955  break;
956  }
957 }
@ CK_DT_Q4_K
Definition: ckernel_dtype.h:40
@ CK_DT_Q6_K
Definition: ckernel_dtype.h:41
void gemm_nt_q4_k_q8_k(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
void gemm_nt_q6_k_q8_k(const void *A_q8, const void *B, const float *bias, float *C, int M, int N, int K)
NT GEMM: C = A @ B^T where A is Q8_K and B is Q6_K.

References C, CK_DT_Q4_K, CK_DT_Q6_K, gemm_nt_q4_k_q8_k(), and gemm_nt_q6_k_q8_k().

Referenced by fused_mlp_swiglu_prefill_w1w2_quant().

◆ gemm_nt_q8_k_qkv_dispatch()

static void gemm_nt_q8_k_qkv_dispatch ( const void *  A_q8k,
const void *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K,
CKDataType  dt 
)
static

Definition at line 198 of file prefill_fused_gemm.c.

206 {
207  switch (dt) {
208  case CK_DT_Q4_K:
209  gemm_nt_q4_k_q8_k(A_q8k, B, bias, C, M, N, K);
210  break;
211  case CK_DT_Q6_K:
212  gemm_nt_q6_k_q8_k(A_q8k, B, bias, C, M, N, K);
213  break;
214  default:
215  break;
216  }
217 }

References C, CK_DT_Q4_K, CK_DT_Q6_K, gemm_nt_q4_k_q8_k(), and gemm_nt_q6_k_q8_k().

Referenced by fused_rmsnorm_qkv_prefill_head_major_quant().

◆ gemm_tile_nt_strided()

static void gemm_tile_nt_strided ( const float *  A,
const float *  B_tile,
float *  C,
int  tile_m,
int  tile_n,
int  K,
int  C_stride 
)
static

GEMM tile with N-dimension tiling (weight reuse)

Computes: C[tile_m × tile_n] = A[tile_m × K] × B[tile_n × K]^T where B_tile is a slice of rows from the weight matrix.

Uses MKL if available for optimal performance.

Parameters
A	Input tile [tile_m × K]
B_tile	Weight tile [tile_n × K] (transposed layout)
C	Output tile [tile_m × tile_n] (column slice of full output)
C_stride	Stride between rows of C (= full N dimension)

Definition at line 236 of file prefill_fused_gemm.c.

#ifndef USE_MKL
/* Dot product of two length-K float rows; vectorized when the ISA allows. */
static inline float prefill_row_dot(const float *a, const float *b, int K)
{
#if defined(__AVX512F__)
    __m512 acc = _mm512_setzero_ps();
    int k = 0;
    for (; k + 16 <= K; k += 16) {
        acc = _mm512_fmadd_ps(_mm512_loadu_ps(a + k), _mm512_loadu_ps(b + k), acc);
    }
    float sum = _mm512_reduce_add_ps(acc);
    for (; k < K; ++k) {
        sum += a[k] * b[k];
    }
    return sum;
#elif defined(__AVX__)
    __m256 acc = _mm256_setzero_ps();
    int k = 0;
    for (; k + 8 <= K; k += 8) {
        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_loadu_ps(a + k),
                                               _mm256_loadu_ps(b + k)));
    }
    float sum = hsum256_prefill(acc);
    for (; k < K; ++k) {
        sum += a[k] * b[k];
    }
    return sum;
#else
    float sum = 0.0f;
    for (int k = 0; k < K; ++k) {
        sum += a[k] * b[k];
    }
    return sum;
#endif
}
#endif /* !USE_MKL */

/**
 * @brief GEMM tile with N-dimension tiling (weight reuse).
 *
 * Computes C[tile_m x tile_n] = A[tile_m x K] * B_tile[tile_n x K]^T, where
 * B_tile is a row slice of the full weight matrix and C is written with a row
 * stride of C_stride (the full output N dimension).
 *
 * @param A        Input tile [tile_m x K]
 * @param B_tile   Weight tile [tile_n x K] (transposed layout)
 * @param C        Output tile [tile_m x tile_n], a column slice of the output
 * @param C_stride Stride between rows of C (= full N dimension)
 */
static void gemm_tile_nt_strided(const float *A, const float *B_tile, float *C,
                                 int tile_m, int tile_n, int K, int C_stride)
{
#ifdef USE_MKL
    if (C_stride == tile_n) {
        /* Contiguous output: a single SGEMM covers the whole tile. */
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                    tile_m, tile_n, K,
                    1.0f, A, K, B_tile, K,
                    0.0f, C, tile_n);
        return;
    }
    /* Strided output: MKL SGEMM needs contiguous C, so do one SGEMV per row. */
    for (int i = 0; i < tile_m; ++i) {
        cblas_sgemv(CblasRowMajor, CblasNoTrans,
                    tile_n, K,
                    1.0f, B_tile, K, A + (size_t)i * K, 1,
                    0.0f, C + (size_t)i * C_stride, 1);
    }
#else
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
    for (int i = 0; i < tile_m; ++i) {
        const float *a_row = A + (size_t)i * K;
        float *c_row = C + (size_t)i * C_stride;
        for (int j = 0; j < tile_n; ++j) {
            c_row[j] = prefill_row_dot(a_row, B_tile + (size_t)j * K, K);
        }
    }
#endif
}

References C.

Referenced by fused_mlp_swiglu_prefill_bias(), fused_rmsnorm_gemm_2d_tiled(), fused_rmsnorm_qkv_prefill(), fused_rmsnorm_qkv_prefill_head_major(), and unfused_rmsnorm_qkv_prefill().

◆ mlp_q8_0_dtype_supported()

static int mlp_q8_0_dtype_supported ( CKDataType  dt)
static

Definition at line 909 of file prefill_fused_gemm.c.

909  {
910  return (dt == CK_DT_Q5_0 || dt == CK_DT_Q8_0);
911 }

References CK_DT_Q5_0, and CK_DT_Q8_0.

Referenced by fused_mlp_swiglu_prefill_w1w2_quant().

◆ mlp_q8_k_dtype_supported()

static int mlp_q8_k_dtype_supported ( CKDataType  dt)
static

Definition at line 913 of file prefill_fused_gemm.c.

913  {
914  return (dt == CK_DT_Q4_K || dt == CK_DT_Q6_K);
915 }

References CK_DT_Q4_K, and CK_DT_Q6_K.

Referenced by fused_mlp_swiglu_prefill_w1w2_quant().

◆ qkv_q8_0_dtype_supported()

static int qkv_q8_0_dtype_supported ( CKDataType  dt)
static

Definition at line 169 of file prefill_fused_gemm.c.

169  {
170  return (dt == CK_DT_Q5_0 || dt == CK_DT_Q8_0);
171 }

References CK_DT_Q5_0, and CK_DT_Q8_0.

Referenced by fused_rmsnorm_qkv_prefill_head_major_quant().

◆ qkv_q8_k_dtype_supported()

static int qkv_q8_k_dtype_supported ( CKDataType  dt)
static

Definition at line 173 of file prefill_fused_gemm.c.

173  {
174  return (dt == CK_DT_Q4_K || dt == CK_DT_Q6_K);
175 }

References CK_DT_Q4_K, and CK_DT_Q6_K.

Referenced by fused_rmsnorm_qkv_prefill_head_major_quant().

◆ rmsnorm_tile()

static void rmsnorm_tile ( const float *  input,
const float *  gamma,
float *  output,
int  tile_m,
int  embed_dim,
int  aligned_embed_dim,
float  eps 
)
static

Compute RMSNorm for a tile of tokens.

Definition at line 86 of file prefill_fused_gemm.c.

93 {
94  for (int t = 0; t < tile_m; ++t) {
95  const float *x = input + (size_t)t * (size_t)aligned_embed_dim;
96  float *y = output + (size_t)t * (size_t)aligned_embed_dim;
97 
98 #if defined(__AVX512F__)
99  __m512 sum_sq_vec = _mm512_setzero_ps();
100  int d = 0;
101  for (; d + 16 <= embed_dim; d += 16) {
102  __m512 xv = _mm512_loadu_ps(&x[d]);
103  sum_sq_vec = _mm512_fmadd_ps(xv, xv, sum_sq_vec);
104  }
105  float sum_sq = _mm512_reduce_add_ps(sum_sq_vec);
106  for (; d < embed_dim; ++d) {
107  sum_sq += x[d] * x[d];
108  }
109 
110  float rstd = 1.0f / sqrtf(sum_sq / (float)embed_dim + eps);
111  __m512 rstd_vec = _mm512_set1_ps(rstd);
112 
113  d = 0;
114  for (; d + 16 <= embed_dim; d += 16) {
115  __m512 xv = _mm512_loadu_ps(&x[d]);
116  __m512 gv = gamma ? _mm512_loadu_ps(&gamma[d]) : _mm512_set1_ps(1.0f);
117  __m512 yv = _mm512_mul_ps(_mm512_mul_ps(xv, rstd_vec), gv);
118  _mm512_storeu_ps(&y[d], yv);
119  }
120  for (; d < embed_dim; ++d) {
121  float g = gamma ? gamma[d] : 1.0f;
122  y[d] = x[d] * rstd * g;
123  }
124 
125 #elif defined(__AVX__)
126  __m256 sum_sq_vec = _mm256_setzero_ps();
127  int d = 0;
128  for (; d + 8 <= embed_dim; d += 8) {
129  __m256 xv = _mm256_loadu_ps(&x[d]);
130  sum_sq_vec = _mm256_add_ps(sum_sq_vec, _mm256_mul_ps(xv, xv));
131  }
132  float sum_sq = hsum256_prefill(sum_sq_vec);
133  for (; d < embed_dim; ++d) {
134  sum_sq += x[d] * x[d];
135  }
136 
137  float rstd = 1.0f / sqrtf(sum_sq / (float)embed_dim + eps);
138  __m256 rstd_vec = _mm256_set1_ps(rstd);
139 
140  d = 0;
141  for (; d + 8 <= embed_dim; d += 8) {
142  __m256 xv = _mm256_loadu_ps(&x[d]);
143  __m256 gv = gamma ? _mm256_loadu_ps(&gamma[d]) : _mm256_set1_ps(1.0f);
144  __m256 yv = _mm256_mul_ps(_mm256_mul_ps(xv, rstd_vec), gv);
145  _mm256_storeu_ps(&y[d], yv);
146  }
147  for (; d < embed_dim; ++d) {
148  float g = gamma ? gamma[d] : 1.0f;
149  y[d] = x[d] * rstd * g;
150  }
151 #else
152  float sum_sq = 0.0f;
153  for (int d = 0; d < embed_dim; ++d) {
154  sum_sq += x[d] * x[d];
155  }
156  float rstd = 1.0f / sqrtf(sum_sq / (float)embed_dim + eps);
157  for (int d = 0; d < embed_dim; ++d) {
158  float g = gamma ? gamma[d] : 1.0f;
159  y[d] = x[d] * rstd * g;
160  }
161 #endif
162 
163  for (int d = embed_dim; d < aligned_embed_dim; ++d) {
164  y[d] = 0.0f;
165  }
166  }
167 }

Referenced by fused_rmsnorm_gemm_2d_tiled(), fused_rmsnorm_qkv_prefill(), fused_rmsnorm_qkv_prefill_head_major(), fused_rmsnorm_qkv_prefill_head_major_quant(), and unfused_rmsnorm_qkv_prefill().

◆ silu_prefill()

static float silu_prefill ( float  x)
inlinestatic

Definition at line 905 of file prefill_fused_gemm.c.

905  {
906  return x / (1.0f + expf(-x));
907 }

Referenced by fused_mlp_swiglu_prefill_w1w2_quant().

◆ unfused_rmsnorm_qkv_prefill()

void unfused_rmsnorm_qkv_prefill ( const float *  x,
const float *  gamma,
const float *  Wq,
const float *  Wk,
const float *  Wv,
float *  x_norm,
float *  Q,
float *  K,
float *  V,
int  seq_len,
int  hidden,
int  q_dim,
int  kv_dim,
float  eps 
)

Unfused version for comparison.

Unfused version for benchmarking comparison.

Definition at line 667 of file prefill_fused_gemm.c.

682 {
683  /* Step 1: Full RMSNorm → writes x_norm to memory */
684  rmsnorm_tile(x, gamma, x_norm, seq_len, hidden, hidden, eps);
685 
686  /* Step 2: Separate GEMMs with N-outer tiling for weight reuse */
687  /* Q projection */
688  for (int n_start = 0; n_start < q_dim; n_start += PREFILL_TILE_N) {
689  int tile_n = (n_start + PREFILL_TILE_N <= q_dim)
690  ? PREFILL_TILE_N : (q_dim - n_start);
691  const float *W_tile = Wq + (size_t)n_start * hidden;
692 
693  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
694  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
695  ? PREFILL_TILE_M : (seq_len - m_start);
696  const float *x_tile = x_norm + (size_t)m_start * hidden;
697  float *out_tile = Q + (size_t)m_start * q_dim + n_start;
698  gemm_tile_nt_strided(x_tile, W_tile, out_tile,
699  tile_m, tile_n, hidden, q_dim);
700  }
701  }
702 
703  /* K projection */
704  for (int n_start = 0; n_start < kv_dim; n_start += PREFILL_TILE_N) {
705  int tile_n = (n_start + PREFILL_TILE_N <= kv_dim)
706  ? PREFILL_TILE_N : (kv_dim - n_start);
707  const float *W_tile = Wk + (size_t)n_start * hidden;
708 
709  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
710  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
711  ? PREFILL_TILE_M : (seq_len - m_start);
712  const float *x_tile = x_norm + (size_t)m_start * hidden;
713  float *out_tile = K + (size_t)m_start * kv_dim + n_start;
714  gemm_tile_nt_strided(x_tile, W_tile, out_tile,
715  tile_m, tile_n, hidden, kv_dim);
716  }
717  }
718 
719  /* V projection */
720  for (int n_start = 0; n_start < kv_dim; n_start += PREFILL_TILE_N) {
721  int tile_n = (n_start + PREFILL_TILE_N <= kv_dim)
722  ? PREFILL_TILE_N : (kv_dim - n_start);
723  const float *W_tile = Wv + (size_t)n_start * hidden;
724 
725  for (int m_start = 0; m_start < seq_len; m_start += PREFILL_TILE_M) {
726  int tile_m = (m_start + PREFILL_TILE_M <= seq_len)
727  ? PREFILL_TILE_M : (seq_len - m_start);
728  const float *x_tile = x_norm + (size_t)m_start * hidden;
729  float *out_tile = V + (size_t)m_start * kv_dim + n_start;
730  gemm_tile_nt_strided(x_tile, W_tile, out_tile,
731  tile_m, tile_n, hidden, kv_dim);
732  }
733  }
734 }

References gemm_tile_nt_strided(), PREFILL_TILE_M, PREFILL_TILE_N, and rmsnorm_tile().