← Back to C-Kernel-Engine Docs Doxygen Source Documentation
rmsnorm_kernels_bf16.c File Reference

RMSNorm kernels for BF16 tensors. More...

#include "bf16_utils.h"
#include "ckernel_engine.h"
#include <math.h>
#include <stdint.h>

Go to the source code of this file.

Functions

void rmsnorm_backward_bf16 (const uint16_t *d_output, const uint16_t *input, const float *gamma, const float *rstd_cache, uint16_t *d_input, float *d_gamma, int tokens, int d_model, int aligned_embed_dim)
 
void rmsnorm_forward_bf16 (const uint16_t *input, const float *gamma, uint16_t *output, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
 

Detailed Description

RMSNorm kernels for BF16 tensors.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

RMSNorm: y[i] = gamma[i] * x[i] / sqrt(mean(x^2) + eps)

Definition in file rmsnorm_kernels_bf16.c.

Function Documentation

◆ rmsnorm_backward_bf16()

void rmsnorm_backward_bf16 ( const uint16_t *  d_output,
const uint16_t *  input,
const float *  gamma,
const float *  rstd_cache,
uint16_t *  d_input,
float *  d_gamma,
int  tokens,
int  d_model,
int  aligned_embed_dim 
)

Definition at line 113 of file rmsnorm_kernels_bf16.c.

122 {
123  int T = tokens;
124  int D = d_model;
125  int aligned = aligned_embed_dim;
126 
127  if (!d_output || !input || !gamma || !rstd_cache || !d_input || !d_gamma) {
128  return;
129  }
130 
131  // Zero parameter gradients
132 #if defined(__AVX512F__)
133  {
134  int d = 0;
135  for (; d + 16 <= D; d += 16) {
136  _mm512_storeu_ps(&d_gamma[d], _mm512_setzero_ps());
137  }
138  for (; d < D; ++d) {
139  d_gamma[d] = 0.0f;
140  }
141  }
142 #else
143  for (int d = 0; d < D; ++d) {
144  d_gamma[d] = 0.0f;
145  }
146 #endif
147 
148  for (int t = 0; t < T; ++t) {
149  const uint16_t *x_bf16 = input + (size_t)t * aligned;
150  const uint16_t *dY_bf16 = d_output + (size_t)t * aligned;
151  uint16_t *dX_bf16 = d_input + (size_t)t * aligned;
152  float rstd = rstd_cache[t];
153 
154 #if defined(__AVX512F__)
155  // Compute m = (1/D) * sum_j (dY_j * gamma_j * x_hat_j)
156  __m512 rstd_vec = _mm512_set1_ps(rstd);
157  __m512 sum_vec = _mm512_setzero_ps();
158  int d = 0;
159 
160  for (; d + 16 <= D; d += 16) {
161  __m512 xv = bf16_loadu_cvt_fp32(&x_bf16[d]);
162  __m512 dyv = bf16_loadu_cvt_fp32(&dY_bf16[d]);
163  __m512 gv = _mm512_loadu_ps(&gamma[d]);
164  __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);
165  // sum += dY * gamma * x_hat
166  __m512 prod = _mm512_mul_ps(dyv, gv);
167  sum_vec = _mm512_fmadd_ps(prod, x_hat, sum_vec);
168  }
169  float sum_dY_g_xhat = _mm512_reduce_add_ps(sum_vec);
170 
171  // Handle remaining elements
172  for (; d < D; ++d) {
173  float x = bf16_to_float(x_bf16[d]);
174  float x_hat = x * rstd;
175  float dy = bf16_to_float(dY_bf16[d]);
176  sum_dY_g_xhat += dy * gamma[d] * x_hat;
177  }
178  float m = sum_dY_g_xhat / (float)D;
179 
180  // Compute dX and accumulate dGamma (vectorized)
181  __m512 m_vec = _mm512_set1_ps(m);
182  d = 0;
183  for (; d + 16 <= D; d += 16) {
184  __m512 xv = bf16_loadu_cvt_fp32(&x_bf16[d]);
185  __m512 dyv = bf16_loadu_cvt_fp32(&dY_bf16[d]);
186  __m512 gv = _mm512_loadu_ps(&gamma[d]);
187  __m512 dgv = _mm512_loadu_ps(&d_gamma[d]);
188 
189  __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);
190 
191  // dX = rstd * (dY * gamma - x_hat * m)
192  __m512 dy_g = _mm512_mul_ps(dyv, gv);
193  __m512 xhat_m = _mm512_mul_ps(x_hat, m_vec);
194  __m512 diff = _mm512_sub_ps(dy_g, xhat_m);
195  __m512 dxv = _mm512_mul_ps(rstd_vec, diff);
196  fp32_cvt_storeu_bf16(&dX_bf16[d], dxv);
197 
198  // d_gamma += dY * x_hat
199  dgv = _mm512_fmadd_ps(dyv, x_hat, dgv);
200  _mm512_storeu_ps(&d_gamma[d], dgv);
201  }
202  // Handle remaining elements
203  for (; d < D; ++d) {
204  float x = bf16_to_float(x_bf16[d]);
205  float x_hat = x * rstd;
206  float dy = bf16_to_float(dY_bf16[d]);
207  float dx = rstd * (dy * gamma[d] - x_hat * m);
208  dX_bf16[d] = float_to_bf16(dx);
209  d_gamma[d] += dy * x_hat;
210  }
211 
212 #else
213  // Scalar fallback
214  double sum_dY_g_xhat = 0.0;
215  for (int d = 0; d < D; ++d) {
216  float x = bf16_to_float(x_bf16[d]);
217  float x_hat = x * rstd;
218  float dy = bf16_to_float(dY_bf16[d]);
219  sum_dY_g_xhat += (double)dy * (double)gamma[d] * (double)x_hat;
220  }
221  float m = (float)(sum_dY_g_xhat / (double)D);
222 
223  for (int d = 0; d < D; ++d) {
224  float x = bf16_to_float(x_bf16[d]);
225  float x_hat = x * rstd;
226  float dy = bf16_to_float(dY_bf16[d]);
227  float dx = rstd * (dy * gamma[d] - x_hat * m);
228  dX_bf16[d] = float_to_bf16(dx);
229  d_gamma[d] += dy * x_hat;
230  }
231 #endif
232 
233  // Zero padding gradients
234  for (int d = D; d < aligned; ++d) {
235  dX_bf16[d] = 0;
236  }
237  }
238 }
static uint16_t float_to_bf16(float f)
Definition: bf16_utils.h:90
static float bf16_to_float(uint16_t v)
Definition: bf16_utils.h:38

References bf16_to_float(), and float_to_bf16().

◆ rmsnorm_forward_bf16()

/**
 * Forward pass of RMSNorm for BF16 activations.
 *
 * Per token row t:
 *   rstd_t = 1 / sqrt(mean(x^2) + eps)
 *   y_j    = gamma[j] * x_j * rstd_t
 *
 * @param input             [tokens x aligned_embed_dim] input activations, BF16.
 * @param gamma             [d_model] scale parameters, FP32.
 * @param output            [tokens x aligned_embed_dim] normalized output, BF16;
 *                          padding columns [d_model, aligned_embed_dim) are zeroed.
 * @param rstd_cache        optional [tokens] buffer; when non-NULL, receives the
 *                          per-token reciprocal std for reuse by the backward pass.
 * @param tokens            number of token rows.
 * @param d_model           logical embedding width.
 * @param aligned_embed_dim padded row stride.
 * @param eps               numerical-stability epsilon added to mean(x^2).
 *
 * No allocation, no side effects beyond the listed output buffers
 * (per CK-ENGINE kernel rules).
 */
void rmsnorm_forward_bf16(const uint16_t *input, const float *gamma,
                          uint16_t *output, float *rstd_cache,
                          int tokens, int d_model, int aligned_embed_dim,
                          float eps)
{
    int T = tokens;
    int D = d_model;
    int aligned = aligned_embed_dim;

    /* Guard mandatory buffers, mirroring the backward kernel's checks.
       rstd_cache stays optional (NULL means "don't cache", e.g. inference). */
    if (!input || !gamma || !output) {
        return;
    }

    for (int t = 0; t < T; ++t) {
        const uint16_t *x_bf16 = input + (size_t)t * aligned;
        float *rstd_ptr = rstd_cache ? (rstd_cache + t) : NULL;
        uint16_t *out_bf16 = output + (size_t)t * aligned;

#if defined(__AVX512F__)
        /* AVX-512: process 16 lanes per iteration. */
        __m512 sum_sq_vec = _mm512_setzero_ps();
        int d = 0;

        /* Vectorized sum of squares. */
        for (; d + 16 <= D; d += 16) {
            __m512 xv = bf16_loadu_cvt_fp32(&x_bf16[d]);
            sum_sq_vec = _mm512_fmadd_ps(xv, xv, sum_sq_vec);
        }
        float sum_sq = _mm512_reduce_add_ps(sum_sq_vec);

        /* Scalar tail of the reduction. */
        for (; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            sum_sq += x * x;
        }

        float mean_sq = sum_sq / (float)D;
        float rstd = 1.0f / sqrtf(mean_sq + eps);
        if (rstd_ptr) {
            *rstd_ptr = rstd;
        }

        /* Apply normalization and scale (vectorized). */
        __m512 rstd_vec = _mm512_set1_ps(rstd);
        d = 0;
        for (; d + 16 <= D; d += 16) {
            __m512 xv = bf16_loadu_cvt_fp32(&x_bf16[d]);
            __m512 gv = _mm512_loadu_ps(&gamma[d]);
            __m512 x_hat = _mm512_mul_ps(xv, rstd_vec);
            __m512 yv = _mm512_mul_ps(x_hat, gv);
            fp32_cvt_storeu_bf16(&out_bf16[d], yv);
        }
        /* Scalar tail of the normalization. */
        for (; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            float y = x * rstd * gamma[d];
            out_bf16[d] = float_to_bf16(y);
        }

#else
        /* Scalar fallback: double accumulation for the sum of squares
           to limit rounding error on long rows. */
        double sum_sq = 0.0;
        for (int d = 0; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            sum_sq += (double)x * (double)x;
        }
        double mean_sq = sum_sq / (double)D;
        double r = sqrt(mean_sq + (double)eps);
        float rstd = (float)(1.0 / r);
        if (rstd_ptr) {
            *rstd_ptr = rstd;
        }

        for (int d = 0; d < D; ++d) {
            float x = bf16_to_float(x_bf16[d]);
            float x_hat = x * rstd;
            float y = x_hat * gamma[d];
            out_bf16[d] = float_to_bf16(y);
        }
#endif

        /* Keep the padded tail of each output row at exactly zero. */
        for (int d = D; d < aligned; ++d) {
            out_bf16[d] = 0;
        }
    }
}

References bf16_to_float(), and float_to_bf16().