← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_bf16.c File Reference

Optimized BF16 GEMM Kernels for AVX-512. More...

#include <stdint.h>
#include <string.h>
#include "bf16_utils.h"
#include "ckernel_engine.h"

Go to the source code of this file.

Macros

#define BLK_K   256
 
#define BLK_M   64
 
#define BLK_N   64
 

Functions

 __attribute__ ((unused))
 
static int ck_min_i (int a, int b)
 
void gemm_bf16_fp32out (const uint16_t *A, const uint16_t *B, const float *bias, float *C, int M, int N, int K)
 
void gemm_blocked_serial_bf16 (const uint16_t *A, const uint16_t *B, const uint16_t *bias, uint16_t *C, int M, int N, int K)
 
void gemm_nn_bf16 (const uint16_t *A, const uint16_t *B, const uint16_t *bias, uint16_t *C, int M, int N, int K)
 
void gemm_tn_bf16 (const uint16_t *A, const uint16_t *B, const uint16_t *bias, uint16_t *C, int M, int N, int K)
 

Detailed Description

Optimized BF16 GEMM Kernels for AVX-512.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Layout:
  A: [M x K] row-major (BF16)
  B: [N x K] row-major, stored as [out x in] (BF16)
  C: [M x N] row-major (BF16 or FP32, depending on the kernel)

Key optimizations:

  1. AVX-512 BF16 instructions (VDPBF16PS) when available
  2. Cache blocking for L1/L2 efficiency
  3. Vectorized BF16<->FP32 conversion
  4. OpenMP parallelization (NOTE: this contradicts kernel rule 2 above, "NO OpenMP" — the `#pragma omp` directives in this file are no-ops unless compiled with -fopenmp; confirm which policy is intended)

Definition in file gemm_kernels_bf16.c.

Macro Definition Documentation

◆ BLK_K

#define BLK_K   256

Definition at line 43 of file gemm_kernels_bf16.c.

◆ BLK_M

#define BLK_M   64

Definition at line 41 of file gemm_kernels_bf16.c.

◆ BLK_N

#define BLK_N   64

Definition at line 42 of file gemm_kernels_bf16.c.

Function Documentation

◆ gemm_bf16_scalar() — rendered by Doxygen as "__attribute__()"

NOTE(review): Doxygen mis-parsed this function's signature because of its
__attribute__((unused)) marker, so only the attribute is shown here. Judging
by the body below and the dispatcher's scalar fallback, this is most likely
static void gemm_bf16_scalar(const uint16_t *A, const uint16_t *B,
const uint16_t *bias, uint16_t *C, int M, int N, int K) — confirm against the
actual source file.

Definition at line 51 of file gemm_kernels_bf16.c.

57 {
58  for (int i = 0; i < M; ++i) {
59  for (int j = 0; j < N; ++j) {
60  float sum = bias ? bf16_to_float(bias[j]) : 0.0f;
61  const size_t a_row = (size_t)i * (size_t)K;
62  const size_t b_row = (size_t)j * (size_t)K;
63  for (int k = 0; k < K; ++k) {
64  sum += bf16_to_float(A[a_row + k]) * bf16_to_float(B[b_row + k]);
65  }
66  C[(size_t)i * (size_t)N + j] = float_to_bf16(sum);
67  }
68  }
69 }
static uint16_t float_to_bf16(float f)
Definition: bf16_utils.h:90
static float bf16_to_float(uint16_t v)
Definition: bf16_utils.h:38
#define C(color)
Definition: show_config.c:39

References bf16_to_float(), C, and float_to_bf16().

◆ ck_min_i()

/* Return the smaller of two ints; used to clamp cache-blocking loop bounds. */
static inline int ck_min_i(int a, int b)
{
    if (b < a) {
        return b;
    }
    return a;
}

◆ gemm_bf16_fp32out()

/**
 * C = A * B^T (+ bias), BF16 inputs, FP32 output.
 *
 * A:    [M x K] row-major BF16
 * B:    [N x K] row-major BF16 (stored as [out x in])
 * bias: [N] FP32, may be NULL
 * C:    [M x N] row-major FP32
 *
 * No-op on NULL pointers or non-positive dimensions.
 *
 * NOTE(review): the `#pragma omp` below conflicts with the file's stated
 * kernel rule 2 ("NO OpenMP — parallelization at orchestrator/codegen
 * layer"); it is inert unless built with -fopenmp. Confirm intended policy.
 */
void gemm_bf16_fp32out(const uint16_t *A, const uint16_t *B,
                       const float *bias, float *C,
                       int M, int N, int K)
{
    if (A == NULL || B == NULL || C == NULL || M <= 0 || N <= 0 || K <= 0) {
        return;
    }

#if defined(__AVX512F__)
    #pragma omp parallel for schedule(dynamic)
    for (int row = 0; row < M; ++row) {
        const uint16_t *ap = A + (size_t)row * K;

        for (int col = 0; col < N; ++col) {
            const uint16_t *bp = B + (size_t)col * K;

            /* 16-wide BF16 dot-product accumulation in FP32 lanes. */
            __m512 acc = _mm512_setzero_ps();
            int k = 0;
            for (; k + 16 <= K; k += 16) {
                __m256i av = _mm256_loadu_si256((const __m256i *)(ap + k));
                __m256i bv = _mm256_loadu_si256((const __m256i *)(bp + k));
                acc = bf16_dot16(av, bv, acc);
            }

            float dot = _mm512_reduce_add_ps(acc);

            /* Scalar tail for the K % 16 leftovers. */
            for (; k < K; ++k) {
                dot += bf16_to_float(ap[k]) * bf16_to_float(bp[k]);
            }

            if (bias != NULL) {
                dot += bias[col];
            }

            C[(size_t)row * N + col] = dot;
        }
    }
#else
    /* Portable scalar path: same math, no intrinsics. */
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float dot = (bias != NULL) ? bias[col] : 0.0f;
            for (int k = 0; k < K; ++k) {
                dot += bf16_to_float(A[(size_t)row * K + k]) *
                       bf16_to_float(B[(size_t)col * K + k]);
            }
            C[(size_t)row * N + col] = dot;
        }
    }
#endif
}

References bf16_to_float(), and C.

Referenced by mlp_token_parallel_bf16(), and mlp_token_parallel_bf16_fp32act().

◆ gemm_blocked_serial_bf16()

/**
 * Serial blocked BF16 GEMM dispatcher: C = A * B^T (+ bias), all BF16.
 *
 * A:    [M x K] row-major BF16
 * B:    [N x K] row-major BF16 (stored as [out x in])
 * bias: [N] BF16, may be NULL
 * C:    [M x N] row-major BF16
 *
 * Selects the best available kernel for the build target. No-op on NULL
 * pointers or non-positive dimensions.
 *
 * Fix: the small/large heuristic previously evaluated `M * N` in int, which
 * is signed-overflow UB for large shapes (e.g. M = N = 65536); the product
 * is now computed in size_t.
 */
void gemm_blocked_serial_bf16(const uint16_t *A, const uint16_t *B,
                              const uint16_t *bias, uint16_t *C,
                              int M, int N, int K)
{
    if (!A || !B || !C || M <= 0 || N <= 0 || K <= 0) {
        return;
    }

#if HAVE_NATIVE_BF16
    /* Native BF16 dot-product instructions (Cooper Lake / Sapphire Rapids+;
     * the earlier "Ice Lake" comment was wrong — Ice Lake lacks AVX512-BF16). */
    gemm_bf16_native(A, B, bias, C, M, N, K);
#elif defined(__AVX512F__)
    /* AVX-512F with software BF16<->FP32 conversion. Cache blocking only
     * pays off once the output matrix is reasonably large. */
    if ((size_t)M * (size_t)N > 4096) {
        gemm_bf16_blocked_avx512(A, B, bias, C, M, N, K);
    } else {
        gemm_bf16_avx512(A, B, bias, C, M, N, K);
    }
#else
    /* Portable scalar fallback. */
    gemm_bf16_scalar(A, B, bias, C, M, N, K);
#endif
}

References C.

◆ gemm_nn_bf16()

/**
 * NN-layout BF16 GEMM: C[M x N] = A[M x K] * B[K x N] (+ bias), all BF16.
 *
 * A, B, C are row-major; bias is [N] BF16 and may be NULL. No-op on NULL
 * pointers or non-positive dimensions.
 *
 * NOTE(review): the `#pragma omp` below conflicts with kernel rule 2
 * ("NO OpenMP"); it is inert unless built with -fopenmp. Confirm policy.
 */
void gemm_nn_bf16(const uint16_t *A, const uint16_t *B,
                  const uint16_t *bias, uint16_t *C,
                  int M, int N, int K)
{
    if (!A || !B || !C || M <= 0 || N <= 0 || K <= 0) {
        return;
    }

#if defined(__AVX512F__)
    #pragma omp parallel for
    for (int row = 0; row < M; ++row) {
        uint16_t *c_row = C + (size_t)row * N;

        /* Seed the output row with bias (or zero), 16 lanes at a time. */
        int col = 0;
        for (; col + 16 <= N; col += 16) {
            __m512 seed = bias
                ? bf16x16_to_fp32(
                      _mm256_loadu_si256((const __m256i *)(bias + col)))
                : _mm512_setzero_ps();
            _mm256_storeu_si256((__m256i *)(c_row + col),
                                fp32x16_to_bf16(seed));
        }
        for (; col < N; ++col) {
            float seed = bias ? bf16_to_float(bias[col]) : 0.0f;
            c_row[col] = float_to_bf16(seed);
        }

        /* Rank-1 updates: C[row,:] += A[row,k] * B[k,:].
         * NOTE(review): the row is re-rounded to BF16 after every k step, so
         * precision differs from the scalar path below, which accumulates
         * each dot product fully in FP32 — confirm this is acceptable. */
        for (int k = 0; k < K; ++k) {
            const float a_scal = bf16_to_float(A[(size_t)row * K + k]);
            const __m512 a_bcast = _mm512_set1_ps(a_scal);
            const uint16_t *b_row = B + (size_t)k * N;

            col = 0;
            for (; col + 16 <= N; col += 16) {
                __m512 b_f = bf16x16_to_fp32(
                    _mm256_loadu_si256((const __m256i *)(b_row + col)));
                __m512 c_f = bf16x16_to_fp32(
                    _mm256_loadu_si256((const __m256i *)(c_row + col)));
                c_f = _mm512_fmadd_ps(a_bcast, b_f, c_f);
                _mm256_storeu_si256((__m256i *)(c_row + col),
                                    fp32x16_to_bf16(c_f));
            }
            for (; col < N; ++col) {
                float acc = bf16_to_float(c_row[col]);
                acc += a_scal * bf16_to_float(b_row[col]);
                c_row[col] = float_to_bf16(acc);
            }
        }
    }
#else
    /* Scalar fallback: full FP32 accumulation per output element. */
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float acc = bias ? bf16_to_float(bias[col]) : 0.0f;
            for (int k = 0; k < K; ++k) {
                acc += bf16_to_float(A[(size_t)row * K + k]) *
                       bf16_to_float(B[(size_t)k * N + col]);
            }
            C[(size_t)row * N + col] = float_to_bf16(acc);
        }
    }
#endif
}

References bf16_to_float(), C, and float_to_bf16().

◆ gemm_tn_bf16()

/**
 * TN-layout BF16 GEMM: C[M x N] = A^T * B (+ bias), all BF16.
 *
 * A is stored [K x M] (so A[k*M + i] == A^T[i,k]); B is [K x N]; C is
 * [M x N]; all row-major. bias is [N] BF16 and may be NULL. No-op on NULL
 * pointers or non-positive dimensions.
 *
 * Fix: the AVX-512 path previously initialized C with the bias rounded to
 * BF16 in a first pass, then read it back and rounded again after
 * accumulation. That double rounding made its results diverge from the
 * scalar fallback. The bias is now kept in FP32 and each output element is
 * rounded to BF16 exactly once, in a single pass (also removing one whole
 * parallel sweep over C).
 *
 * NOTE(review): the `#pragma omp` below conflicts with kernel rule 2
 * ("NO OpenMP"); it is inert unless built with -fopenmp. Confirm policy.
 */
void gemm_tn_bf16(const uint16_t *A, const uint16_t *B,
                  const uint16_t *bias, uint16_t *C,
                  int M, int N, int K)
{
    if (!A || !B || !C || M <= 0 || N <= 0 || K <= 0) {
        return;
    }

#if defined(__AVX512F__)
    #pragma omp parallel for
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            __m512 sum_vec = _mm512_setzero_ps();

            int k = 0;
            for (; k <= K - 16; k += 16) {
                /* Both A (stride M) and B (stride N) are strided along k, so
                 * the 16 lanes are filled element by element.
                 * NOTE(review): this lane-by-lane fill is likely slower than
                 * a plain scalar loop; consider _mm512_i32gather_ps or a
                 * packing step if this path matters for performance. */
                __m512 a_fp32 = _mm512_setzero_ps();
                __m512 b_fp32 = _mm512_setzero_ps();
                for (int kk = 0; kk < 16; ++kk) {
                    const __mmask16 lane = (__mmask16)(1u << kk);
                    float a_val = bf16_to_float(A[(size_t)(k + kk) * M + i]);
                    float b_val = bf16_to_float(B[(size_t)(k + kk) * N + j]);
                    a_fp32 = _mm512_mask_mov_ps(a_fp32, lane,
                                                _mm512_set1_ps(a_val));
                    b_fp32 = _mm512_mask_mov_ps(b_fp32, lane,
                                                _mm512_set1_ps(b_val));
                }
                sum_vec = _mm512_fmadd_ps(a_fp32, b_fp32, sum_vec);
            }

            /* Bias stays in FP32 until the final store. */
            float sum = bias ? bf16_to_float(bias[j]) : 0.0f;
            sum += _mm512_reduce_add_ps(sum_vec);

            for (; k < K; ++k) {
                sum += bf16_to_float(A[(size_t)k * M + i]) *
                       bf16_to_float(B[(size_t)k * N + j]);
            }

            /* Exactly one BF16 rounding per output element. */
            C[(size_t)i * N + j] = float_to_bf16(sum);
        }
    }
#else
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            float sum = bias ? bf16_to_float(bias[j]) : 0.0f;
            for (int k = 0; k < K; ++k) {
                sum += bf16_to_float(A[(size_t)k * M + i]) *
                       bf16_to_float(B[(size_t)k * N + j]);
            }
            C[(size_t)i * N + j] = float_to_bf16(sum);
        }
    }
#endif
}

References bf16_to_float(), C, and float_to_bf16().