← Back to C-Kernel-Engine Docs Doxygen Source Documentation
mlp_kernels_bf16.c File Reference

Optimized BF16 MLP Kernels. More...

#include <stddef.h>
#include <stdint.h>
#include <math.h>
#include "bf16_utils.h"
#include "ckernel_engine.h"

Go to the source code of this file.

Functions

static float gelu_scalar (float x)
 
void gemm_bf16_fp32out (const uint16_t *A, const uint16_t *B, const float *bias, float *C, int M, int N, int K)
 
void mlp_token_parallel_bf16 (const uint16_t *input, const uint16_t *W_fc1, const uint16_t *b_fc1, const uint16_t *W_fc2, const uint16_t *b_fc2, float *fc1_output, float *output, int T, int aligned_dim, int num_threads, float *scratch_bias1_f, float *scratch_bias2_f, uint16_t *scratch_fc1_bf16)
 
void mlp_token_parallel_bf16_fp32act (const uint16_t *input, const uint16_t *W_fc1, const uint16_t *b_fc1, const uint16_t *W_fc2, const uint16_t *b_fc2, float *fc1_output, float *output, int T, int aligned_dim, int num_threads, float *scratch_input_f, float *scratch_bias1_f, float *scratch_bias2_f, uint16_t *scratch_fc1_bf16)
 

Detailed Description

Optimized BF16 MLP Kernels.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Uses direct BF16 GEMM instead of converting to FP32. Layout: input[T,D] -> fc1[T,4D] -> GELU -> fc2[T,D]

All functions use caller-provided scratch buffers (no internal malloc).

Definition in file mlp_kernels_bf16.c.

Function Documentation

◆ gelu_scalar()

static float gelu_scalar ( float  x)
inlinestatic

Definition at line 45 of file mlp_kernels_bf16.c.

46 {
47  const float c = 0.7978845608f; /* sqrt(2/pi) */
48  const float k = 0.044715f;
49  float x3 = x * x * x;
50  return 0.5f * x * (1.0f + tanhf(c * (x + k * x3)));
51 }

Referenced by mlp_token_parallel_bf16(), and mlp_token_parallel_bf16_fp32act().

◆ gemm_bf16_fp32out()

/**
 * BF16 GEMM with FP32 accumulation and output:
 *   C[i][j] = sum_k A[i][k] * B[j][k] (+ bias[j] if bias != NULL)
 *
 * B is accessed row-major by j, i.e. B is expected transposed relative to a
 * conventional [K, N] layout: B is [N, K].
 *
 * @param A     [M, K] BF16 matrix (row-major)
 * @param B     [N, K] BF16 matrix (row-major; logically the transposed RHS)
 * @param bias  optional [N] FP32 bias added per output column; may be NULL
 * @param C     [M, N] FP32 output (row-major)
 * @param M,N,K dimensions; all must be > 0 (otherwise the call is a no-op)
 *
 * NOTE(review): the file's stated kernel rules say "NO OpenMP", yet this
 * function uses `#pragma omp parallel for` — confirm which is authoritative.
 */
void gemm_bf16_fp32out(const uint16_t *A, const uint16_t *B, const float *bias,
                       float *C, int M, int N, int K)
{
    /* Defensive no-op on NULL pointers or non-positive dimensions. */
    if (!A || !B || !C || M <= 0 || N <= 0 || K <= 0) {
        return;
    }

#if defined(__AVX512F__)
    #pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < M; ++i) {
        const uint16_t *a_row = A + (size_t)i * K;

        for (int j = 0; j < N; ++j) {
            const uint16_t *b_row = B + (size_t)j * K;

            __m512 sum_vec = _mm512_setzero_ps();

            /* Main loop: 16 BF16 pairs per iteration via bf16_dot16. */
            int k = 0;
            for (; k <= K - 16; k += 16) {
                __m256i a_bf16 = _mm256_loadu_si256((const __m256i *)(a_row + k));
                __m256i b_bf16 = _mm256_loadu_si256((const __m256i *)(b_row + k));
                sum_vec = bf16_dot16(a_bf16, b_bf16, sum_vec);
            }

            float sum = _mm512_reduce_add_ps(sum_vec);

            /* Scalar tail for K not divisible by 16. */
            for (; k < K; ++k) {
                sum += bf16_to_float(a_row[k]) * bf16_to_float(b_row[k]);
            }

            if (bias) {
                sum += bias[j];
            }

            C[(size_t)i * N + j] = sum;
        }
    }
#else
    /* Portable scalar fallback; bias folded into the accumulator start. */
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            float sum = bias ? bias[j] : 0.0f;
            for (int k = 0; k < K; ++k) {
                sum += bf16_to_float(A[(size_t)i * K + k]) *
                       bf16_to_float(B[(size_t)j * K + k]);
            }
            C[(size_t)i * N + j] = sum;
        }
    }
#endif
}
static float bf16_to_float(uint16_t v)
Definition: bf16_utils.h:38
#define C(color)
Definition: show_config.c:39

References bf16_to_float(), and C.

Referenced by mlp_token_parallel_bf16(), and mlp_token_parallel_bf16_fp32act().

◆ mlp_token_parallel_bf16()

/**
 * Optimized MLP forward pass (BF16 weights, FP32 intermediate activations):
 *   input[T, D] -> FC1 -> [T, 4D] -> GELU -> (FP32 -> BF16) -> FC2 -> output[T, D]
 *
 * No internal allocation — all scratch buffers are caller-provided:
 *   @param scratch_bias1_f   [4*D] floats   (FC1 bias converted to FP32)
 *   @param scratch_bias2_f   [D] floats     (FC2 bias converted to FP32)
 *   @param scratch_fc1_bf16  [T * 4*D] u16  (GELU output rounded to BF16)
 *
 * @param input       [T, D] BF16 activations
 * @param W_fc1       [4D, D] BF16 weights (row-major, transposed-RHS layout)
 * @param b_fc1       [4D] BF16 bias
 * @param W_fc2       [D, 4D] BF16 weights
 * @param b_fc2       [D] BF16 bias
 * @param fc1_output  [T, 4D] FP32 workspace; holds GELU activations on return
 * @param output      [T, D] FP32 result
 * @param num_threads unused here; threading is handled by OpenMP pragmas
 *
 * NOTE(review): the file's kernel rules say "NO OpenMP", yet this function
 * uses `#pragma omp parallel for` — confirm which is authoritative.
 */
void mlp_token_parallel_bf16(const uint16_t *input, const uint16_t *W_fc1,
                             const uint16_t *b_fc1, const uint16_t *W_fc2,
                             const uint16_t *b_fc2, float *fc1_output,
                             float *output, int T, int aligned_dim,
                             int num_threads, float *scratch_bias1_f,
                             float *scratch_bias2_f, uint16_t *scratch_fc1_bf16)
{
    if (!input || !W_fc1 || !b_fc1 || !W_fc2 || !b_fc2 || !fc1_output || !output) return;
    if (!scratch_bias1_f || !scratch_bias2_f || !scratch_fc1_bf16) return;

    (void)num_threads;
    const int D = aligned_dim;
    const int fourD = 4 * D;

    /* Convert biases to FP32 once; reused across all T tokens. */
    for (int i = 0; i < fourD; ++i) {
        scratch_bias1_f[i] = bf16_to_float(b_fc1[i]);
    }
    for (int i = 0; i < D; ++i) {
        scratch_bias2_f[i] = bf16_to_float(b_fc2[i]);
    }

    /* FC1: [T, D] x [4D, D].T -> [T, 4D] */
    gemm_bf16_fp32out(input, W_fc1, scratch_bias1_f, fc1_output, T, fourD, D);

    /* GELU activation, in place on fc1_output. */
#if defined(__AVX512F__)
    #pragma omp parallel for
    for (int t = 0; t < T; ++t) {
        float *row = fc1_output + (size_t)t * fourD;
        int j = 0;
        for (; j <= fourD - 16; j += 16) {
            __m512 x = _mm512_loadu_ps(row + j);
            _mm512_storeu_ps(row + j, gelu_avx512(x));
        }
        for (; j < fourD; ++j) {
            row[j] = gelu_scalar(row[j]);
        }
    }
#else
    for (int t = 0; t < T; ++t) {
        for (int j = 0; j < fourD; ++j) {
            /* (size_t) cast avoids signed int overflow for large T * 4D. */
            fc1_output[(size_t)t * fourD + j] = gelu_scalar(fc1_output[(size_t)t * fourD + j]);
        }
    }
#endif

    /* Convert FP32 activations to BF16 for the second BF16 GEMM. */
#if defined(__AVX512F__)
    #pragma omp parallel for
    for (int t = 0; t < T; ++t) {
        float *src = fc1_output + (size_t)t * fourD;
        uint16_t *dst = scratch_fc1_bf16 + (size_t)t * fourD;
        int j = 0;
        for (; j <= fourD - 16; j += 16) {
            /* Round-to-nearest-even BF16 truncation:
               add 0x7FFF (+1 if bit 16 set) before dropping the low 16 bits. */
            __m512 fp32 = _mm512_loadu_ps(src + j);
            __m512i as_int = _mm512_castps_si512(fp32);
            __m512i lsb = _mm512_srli_epi32(as_int, 16);
            lsb = _mm512_and_si512(lsb, _mm512_set1_epi32(1));
            __m512i rounding = _mm512_add_epi32(_mm512_set1_epi32(0x7FFF), lsb);
            __m512i rounded = _mm512_add_epi32(as_int, rounding);
            __m512i shifted = _mm512_srli_epi32(rounded, 16);
            __m256i bf16 = _mm512_cvtepi32_epi16(shifted);
            _mm256_storeu_si256((__m256i *)(dst + j), bf16);
        }
        for (; j < fourD; ++j) {
            dst[j] = float_to_bf16(src[j]);
        }
    }
#else
    for (size_t i = 0; i < (size_t)T * fourD; ++i) {
        scratch_fc1_bf16[i] = float_to_bf16(fc1_output[i]);
    }
#endif

    /* FC2: [T, 4D] x [D, 4D].T -> [T, D], FP32 output. */
    gemm_bf16_fp32out(scratch_fc1_bf16, W_fc2, scratch_bias2_f, output, T, D, fourD);
}
static uint16_t float_to_bf16(float f)
Definition: bf16_utils.h:90
static float gelu_scalar(float x)
void gemm_bf16_fp32out(const uint16_t *A, const uint16_t *B, const float *bias, float *C, int M, int N, int K)

References bf16_to_float(), float_to_bf16(), gelu_scalar(), and gemm_bf16_fp32out().

◆ mlp_token_parallel_bf16_fp32act()

/**
 * MLP forward pass variant with an FP32 copy of the input available:
 *   input[T, D] -> FC1 -> [T, 4D] -> GELU -> (FP32 -> BF16) -> FC2 -> output[T, D]
 *
 * Caller-provided scratch buffers (no internal allocation):
 *   @param scratch_input_f   [T * D] floats (FP32 copy of input)
 *   @param scratch_bias1_f   [4*D] floats
 *   @param scratch_bias2_f   [D] floats
 *   @param scratch_fc1_bf16  [T * 4*D] uint16_t (BF16)
 *
 * @param fc1_output  [T, 4D] FP32 workspace; holds GELU activations on return
 * @param output      [T, D] FP32 result
 * @param num_threads unused here; threading is handled by OpenMP pragmas
 *
 * NOTE(review): scratch_input_f is filled from `input` but never read again in
 * this function — the FC1 GEMM consumes the BF16 `input` directly. Either the
 * conversion is dead work or a caller relies on the filled buffer; confirm
 * before removing it. Also: the file's kernel rules say "NO OpenMP", yet this
 * function uses `#pragma omp parallel for`.
 */
void mlp_token_parallel_bf16_fp32act(const uint16_t *input, const uint16_t *W_fc1,
                                     const uint16_t *b_fc1, const uint16_t *W_fc2,
                                     const uint16_t *b_fc2, float *fc1_output,
                                     float *output, int T, int aligned_dim,
                                     int num_threads, float *scratch_input_f,
                                     float *scratch_bias1_f, float *scratch_bias2_f,
                                     uint16_t *scratch_fc1_bf16)
{
    if (!input || !W_fc1 || !b_fc1 || !W_fc2 || !b_fc2 || !fc1_output || !output) return;
    if (!scratch_input_f || !scratch_bias1_f || !scratch_bias2_f || !scratch_fc1_bf16) return;

    (void)num_threads;
    const int D = aligned_dim;
    const int fourD = 4 * D;

    /* Convert input and biases to FP32. */
    bf16_tensor_to_float(input, scratch_input_f, (size_t)T * D);
    bf16_tensor_to_float(b_fc1, scratch_bias1_f, fourD);
    bf16_tensor_to_float(b_fc2, scratch_bias2_f, D);

    /* FC1: [T, D] x [4D, D].T -> [T, 4D] (reads the BF16 input directly). */
    gemm_bf16_fp32out(input, W_fc1, scratch_bias1_f, fc1_output, T, fourD, D);

    /* GELU activation, in place on fc1_output. */
#if defined(__AVX512F__)
    #pragma omp parallel for
    for (int t = 0; t < T; ++t) {
        float *row = fc1_output + (size_t)t * fourD;
        int j = 0;
        for (; j <= fourD - 16; j += 16) {
            __m512 x = _mm512_loadu_ps(row + j);
            _mm512_storeu_ps(row + j, gelu_avx512(x));
        }
        for (; j < fourD; ++j) {
            row[j] = gelu_scalar(row[j]);
        }
    }
#else
    for (size_t i = 0; i < (size_t)T * fourD; ++i) {
        fc1_output[i] = gelu_scalar(fc1_output[i]);
    }
#endif

    /* Round activations to BF16, then FC2: [T, 4D] x [D, 4D].T -> [T, D]. */
    float_tensor_to_bf16(fc1_output, scratch_fc1_bf16, (size_t)T * fourD);
    gemm_bf16_fp32out(scratch_fc1_bf16, W_fc2, scratch_bias2_f, output, T, D, fourD);
}
static void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
Definition: bf16_utils.h:271
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250

References bf16_tensor_to_float(), float_tensor_to_bf16(), gelu_scalar(), and gemm_bf16_fp32out().