← Back to C-Kernel-Engine Docs Doxygen Source Documentation
mlp_fused_decode.c File Reference

Fully fused MLP decode kernel (T=1 token generation) More...

#include "ckernel_engine.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>

Go to the source code of this file.

Macros

#define MAX_SWIGLU_STACK   8192
 
#define MLP_TILE_SIZE   64
 
#define OUTPUT_TILE_SIZE   32
 

Functions

void fused_mlp_swiglu_decode (const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *b_gate, const float *b_up, const float *b_down, float *output, int D, int Hff)
 
void fused_mlp_swiglu_decode_tiled (const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *b_gate, const float *b_up, const float *b_down, float *output, int D, int Hff)
 
void fused_mlp_swiglu_decode_v2 (const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *b_gate, const float *b_up, const float *b_down, float *output, int D, int Hff)
 
static float silu_scalar (float x)
 

Detailed Description

Fully fused MLP decode kernel (T=1 token generation)

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

LEGACY: This file is from v6/v6.5 and kept for backward compatibility. Note that these legacy implementations still use OpenMP pragmas internally, predating rule 2 (parallelization at the orchestrator/codegen layer).

This kernel fuses the ENTIRE MLP block into a single pass: output = Down(SwiGLU(Gate(x), Up(x))). Note that none of the kernel signatures accept a residual input — any residual addition is performed by the caller.

Key optimization: The intermediate SwiGLU values (~4864 floats = 19KB for Qwen2) NEVER touch DRAM. They stay in L1/L2 cache through tiling.

Target: Intel Xeon 5th Gen (Emerald Rapids) with AVX-512 and AMX

Memory traffic comparison (Qwen2-0.5B, D=896, Hff=4864): Unfused: 76 KB activation traffic (38KB write + 38KB read) Fused: 0 KB activation traffic (tiles stay in L1)

Weight layout expected: Row-major, transposed for matvec W_gate[Hff, D], W_up[Hff, D], W_down[D, Hff]

Definition in file mlp_fused_decode.c.

Macro Definition Documentation

◆ MAX_SWIGLU_STACK

#define MAX_SWIGLU_STACK   8192

Definition at line 316 of file mlp_fused_decode.c.

◆ MLP_TILE_SIZE

#define MLP_TILE_SIZE   64

Definition at line 52 of file mlp_fused_decode.c.

◆ OUTPUT_TILE_SIZE

#define OUTPUT_TILE_SIZE   32

Definition at line 55 of file mlp_fused_decode.c.

Function Documentation

◆ fused_mlp_swiglu_decode()

void fused_mlp_swiglu_decode ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  W_down,
const float *  b_gate,
const float *  b_up,
const float *  b_down,
float *  output,
int  D,
int  Hff 
)

Definition at line 154 of file mlp_fused_decode.c.

165 {
166 #if defined(__AVX512F__)
167  // Initialize output with bias or zero
168  if (b_down) {
169  memcpy(output, b_down, D * sizeof(float));
170  } else {
171  memset(output, 0, D * sizeof(float));
172  }
173 
174  // Process intermediate dimension in tiles
175  // Each tile computes MLP_TILE_SIZE swiglu values and immediately
176  // accumulates them into the output
177 
178  /* Bounds check for stack allocation */
179  if (D > 4096) return;
180 
181  #pragma omp parallel
182  {
183  /* Thread-local accumulator on stack (no malloc!) */
184  float local_output[4096] __attribute__((aligned(64)));
185  memset(local_output, 0, D * sizeof(float));
186 
187  #pragma omp for schedule(static)
188  for (int t = 0; t < Hff; t += MLP_TILE_SIZE) {
189  int tile_end = (t + MLP_TILE_SIZE < Hff) ? t + MLP_TILE_SIZE : Hff;
190  int tile_size = tile_end - t;
191 
192  // Compute SwiGLU for this tile (stays in L1 cache)
193  float swiglu_tile[MLP_TILE_SIZE] __attribute__((aligned(64)));
194 
195  for (int j = t; j < tile_end; j++) {
196  const float *wg_row = &W_gate[j * D];
197  const float *wu_row = &W_up[j * D];
198 
199  // Compute gate = x @ W_gate[j] using AVX-512
200  __m512 gate_acc = _mm512_setzero_ps();
201  __m512 up_acc = _mm512_setzero_ps();
202 
203  int k = 0;
204  for (; k <= D - 16; k += 16) {
205  __m512 x_vec = _mm512_loadu_ps(&x[k]);
206  __m512 wg_vec = _mm512_loadu_ps(&wg_row[k]);
207  __m512 wu_vec = _mm512_loadu_ps(&wu_row[k]);
208 
209  gate_acc = _mm512_fmadd_ps(x_vec, wg_vec, gate_acc);
210  up_acc = _mm512_fmadd_ps(x_vec, wu_vec, up_acc);
211  }
212 
213  float gate = hsum512_ps(gate_acc);
214  float up = hsum512_ps(up_acc);
215 
216  // Scalar remainder
217  for (; k < D; k++) {
218  gate += x[k] * wg_row[k];
219  up += x[k] * wu_row[k];
220  }
221 
222  // Add biases
223  if (b_gate) gate += b_gate[j];
224  if (b_up) up += b_up[j];
225 
226  // SwiGLU: SiLU(gate) * up
227  swiglu_tile[j - t] = silu_scalar(gate) * up;
228  }
229 
230  // Accumulate into output via W_down
231  // output[i] += sum_j(swiglu_tile[j] * W_down[i, t+j])
232  for (int i = 0; i < D; i++) {
233  const float *wd_row = &W_down[i * Hff + t];
234 
235  __m512 acc = _mm512_setzero_ps();
236  int j = 0;
237  for (; j <= tile_size - 16; j += 16) {
238  __m512 sw_vec = _mm512_loadu_ps(&swiglu_tile[j]);
239  __m512 wd_vec = _mm512_loadu_ps(&wd_row[j]);
240  acc = _mm512_fmadd_ps(sw_vec, wd_vec, acc);
241  }
242 
243  float sum = hsum512_ps(acc);
244  for (; j < tile_size; j++) {
245  sum += swiglu_tile[j] * wd_row[j];
246  }
247 
248  local_output[i] += sum;
249  }
250  }
251 
252  // Reduce thread-local outputs
253  #pragma omp critical
254  {
255  for (int i = 0; i < D; i++) {
256  output[i] += local_output[i];
257  }
258  }
259  /* No free - stack buffer auto-deallocates */
260  }
261 
262 #else
263  // Scalar fallback (same algorithm, no SIMD)
264  if (b_down) {
265  memcpy(output, b_down, D * sizeof(float));
266  } else {
267  memset(output, 0, D * sizeof(float));
268  }
269 
270  for (int t = 0; t < Hff; t += MLP_TILE_SIZE) {
271  int tile_end = (t + MLP_TILE_SIZE < Hff) ? t + MLP_TILE_SIZE : Hff;
272  int tile_size = tile_end - t;
273 
274  float swiglu_tile[MLP_TILE_SIZE];
275 
276  for (int j = t; j < tile_end; j++) {
277  float gate = 0.0f;
278  float up = 0.0f;
279 
280  for (int k = 0; k < D; k++) {
281  gate += x[k] * W_gate[j * D + k];
282  up += x[k] * W_up[j * D + k];
283  }
284 
285  if (b_gate) gate += b_gate[j];
286  if (b_up) up += b_up[j];
287 
288  swiglu_tile[j - t] = silu_scalar(gate) * up;
289  }
290 
291  for (int i = 0; i < D; i++) {
292  for (int j = 0; j < tile_size; j++) {
293  output[i] += swiglu_tile[j] * W_down[i * Hff + t + j];
294  }
295  }
296  }
297 #endif
298 }
#define MLP_TILE_SIZE
static float silu_scalar(float x)
__attribute__((visibility("default"))) CKTokenizer *ck_tokenizer_create(CKTokenizerType type)

References __attribute__(), MLP_TILE_SIZE, and silu_scalar().

◆ fused_mlp_swiglu_decode_tiled()

void fused_mlp_swiglu_decode_tiled ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  W_down,
const float *  b_gate,
const float *  b_up,
const float *  b_down,
float *  output,
int  D,
int  Hff 
)

Definition at line 429 of file mlp_fused_decode.c.

440 {
441  // Tile size chosen to fit in L2 with W_down tile
442  // Tile of swiglu: 256 floats = 1KB
443  // Tile of W_down: 256 * D floats = 256 * 896 * 4 = 896KB
444  // Fits in 2MB L2 with room for x and prefetch
445  const int TILE = 256;
446 
447 #if defined(__AVX512F__)
448  // Initialize output
449  #pragma omp parallel for schedule(static)
450  for (int i = 0; i < D; i++) {
451  output[i] = b_down ? b_down[i] : 0.0f;
452  }
453 
454  // Process tiles of intermediate dimension
455  for (int t = 0; t < Hff; t += TILE) {
456  int tile_end = (t + TILE < Hff) ? t + TILE : Hff;
457  int tile_size = tile_end - t;
458 
459  // Compute swiglu tile
460  float swiglu_tile[256] __attribute__((aligned(64)));
461 
462  #pragma omp parallel for schedule(static)
463  for (int jj = 0; jj < tile_size; jj++) {
464  int j = t + jj;
465  const float *wg_row = &W_gate[j * D];
466  const float *wu_row = &W_up[j * D];
467 
468  __m512 gate_acc = _mm512_setzero_ps();
469  __m512 up_acc = _mm512_setzero_ps();
470 
471  int k = 0;
472  for (; k <= D - 16; k += 16) {
473  __m512 x_vec = _mm512_loadu_ps(&x[k]);
474  __m512 wg_vec = _mm512_loadu_ps(&wg_row[k]);
475  __m512 wu_vec = _mm512_loadu_ps(&wu_row[k]);
476 
477  gate_acc = _mm512_fmadd_ps(x_vec, wg_vec, gate_acc);
478  up_acc = _mm512_fmadd_ps(x_vec, wu_vec, up_acc);
479  }
480 
481  float gate = hsum512_ps(gate_acc);
482  float up = hsum512_ps(up_acc);
483 
484  for (; k < D; k++) {
485  gate += x[k] * wg_row[k];
486  up += x[k] * wu_row[k];
487  }
488 
489  if (b_gate) gate += b_gate[j];
490  if (b_up) up += b_up[j];
491 
492  swiglu_tile[jj] = silu_scalar(gate) * up;
493  }
494 
495  // Accumulate into output (parallelize over D)
496  #pragma omp parallel for schedule(static)
497  for (int i = 0; i < D; i++) {
498  const float *wd_row = &W_down[i * Hff + t];
499 
500  __m512 acc = _mm512_setzero_ps();
501  int j = 0;
502  for (; j <= tile_size - 16; j += 16) {
503  __m512 sw_vec = _mm512_loadu_ps(&swiglu_tile[j]);
504  __m512 wd_vec = _mm512_loadu_ps(&wd_row[j]);
505  acc = _mm512_fmadd_ps(sw_vec, wd_vec, acc);
506  }
507 
508  float sum = hsum512_ps(acc);
509  for (; j < tile_size; j++) {
510  sum += swiglu_tile[j] * wd_row[j];
511  }
512 
513  // Atomic add (or use thread-local buffers for better perf)
514  #pragma omp atomic
515  output[i] += sum;
516  }
517  }
518 
519 #else
520  // Scalar fallback
521  for (int i = 0; i < D; i++) {
522  output[i] = b_down ? b_down[i] : 0.0f;
523  }
524 
525  for (int t = 0; t < Hff; t += TILE) {
526  int tile_end = (t + TILE < Hff) ? t + TILE : Hff;
527 
528  float swiglu_tile[256];
529 
530  for (int j = t; j < tile_end; j++) {
531  float gate = 0.0f, up = 0.0f;
532  for (int k = 0; k < D; k++) {
533  gate += x[k] * W_gate[j * D + k];
534  up += x[k] * W_up[j * D + k];
535  }
536  if (b_gate) gate += b_gate[j];
537  if (b_up) up += b_up[j];
538  swiglu_tile[j - t] = silu_scalar(gate) * up;
539  }
540 
541  for (int i = 0; i < D; i++) {
542  for (int j = t; j < tile_end; j++) {
543  output[i] += swiglu_tile[j - t] * W_down[i * Hff + j];
544  }
545  }
546  }
547 #endif
548 }

References __attribute__(), and silu_scalar().

Referenced by fused_mlp_swiglu_decode_v2().

◆ fused_mlp_swiglu_decode_v2()

void fused_mlp_swiglu_decode_v2 ( const float *  x,
const float *  W_gate,
const float *  W_up,
const float *  W_down,
const float *  b_gate,
const float *  b_up,
const float *  b_down,
float *  output,
int  D,
int  Hff 
)

Definition at line 318 of file mlp_fused_decode.c.

329 {
330  // For large Hff, use tiled version to avoid stack overflow
331  if (Hff > MAX_SWIGLU_STACK) {
332  fused_mlp_swiglu_decode_tiled(x, W_gate, W_up, W_down,
333  b_gate, b_up, b_down, output, D, Hff);
334  return;
335  }
336 
337 #if defined(__AVX512F__)
338  // Stack-allocated swiglu buffer (max 32KB)
339  float swiglu[MAX_SWIGLU_STACK] __attribute__((aligned(64)));
340 
341  // Phase 1: Compute all swiglu values (parallelize over Hff)
342  #pragma omp parallel for schedule(static)
343  for (int j = 0; j < Hff; j++) {
344  const float *wg_row = &W_gate[j * D];
345  const float *wu_row = &W_up[j * D];
346 
347  __m512 gate_acc = _mm512_setzero_ps();
348  __m512 up_acc = _mm512_setzero_ps();
349 
350  int k = 0;
351  for (; k <= D - 16; k += 16) {
352  __m512 x_vec = _mm512_loadu_ps(&x[k]);
353  __m512 wg_vec = _mm512_loadu_ps(&wg_row[k]);
354  __m512 wu_vec = _mm512_loadu_ps(&wu_row[k]);
355 
356  gate_acc = _mm512_fmadd_ps(x_vec, wg_vec, gate_acc);
357  up_acc = _mm512_fmadd_ps(x_vec, wu_vec, up_acc);
358  }
359 
360  float gate = hsum512_ps(gate_acc);
361  float up = hsum512_ps(up_acc);
362 
363  for (; k < D; k++) {
364  gate += x[k] * wg_row[k];
365  up += x[k] * wu_row[k];
366  }
367 
368  if (b_gate) gate += b_gate[j];
369  if (b_up) up += b_up[j];
370 
371  swiglu[j] = silu_scalar(gate) * up;
372  }
373 
374  // Phase 2: Down projection (parallelize over D)
375  #pragma omp parallel for schedule(static)
376  for (int i = 0; i < D; i++) {
377  const float *wd_row = &W_down[i * Hff];
378 
379  __m512 acc = _mm512_setzero_ps();
380  int j = 0;
381  for (; j <= Hff - 16; j += 16) {
382  __m512 sw_vec = _mm512_loadu_ps(&swiglu[j]);
383  __m512 wd_vec = _mm512_loadu_ps(&wd_row[j]);
384  acc = _mm512_fmadd_ps(sw_vec, wd_vec, acc);
385  }
386 
387  float sum = hsum512_ps(acc);
388  for (; j < Hff; j++) {
389  sum += swiglu[j] * wd_row[j];
390  }
391 
392  output[i] = sum + (b_down ? b_down[i] : 0.0f);
393  }
394 
395 #else
396  // Scalar fallback with stack buffer
397  float swiglu[MAX_SWIGLU_STACK];
398 
399  for (int j = 0; j < Hff; j++) {
400  float gate = 0.0f, up = 0.0f;
401  for (int k = 0; k < D; k++) {
402  gate += x[k] * W_gate[j * D + k];
403  up += x[k] * W_up[j * D + k];
404  }
405  if (b_gate) gate += b_gate[j];
406  if (b_up) up += b_up[j];
407  swiglu[j] = silu_scalar(gate) * up;
408  }
409 
410  for (int i = 0; i < D; i++) {
411  float sum = 0.0f;
412  for (int j = 0; j < Hff; j++) {
413  sum += swiglu[j] * W_down[i * Hff + j];
414  }
415  output[i] = sum + (b_down ? b_down[i] : 0.0f);
416  }
417 #endif
418 }
void fused_mlp_swiglu_decode_tiled(const float *x, const float *W_gate, const float *W_up, const float *W_down, const float *b_gate, const float *b_up, const float *b_down, float *output, int D, int Hff)
#define MAX_SWIGLU_STACK

References __attribute__(), fused_mlp_swiglu_decode_tiled(), MAX_SWIGLU_STACK, and silu_scalar().

Referenced by ck_mlp_swiglu_forward_fully_fused_token().

◆ silu_scalar()

static float silu_scalar ( float  x)
inline, static

Definition at line 134 of file mlp_fused_decode.c.

134  {
135  return x / (1.0f + expf(-x));
136 }

Referenced by fused_mlp_swiglu_decode(), fused_mlp_swiglu_decode_tiled(), and fused_mlp_swiglu_decode_v2().