← Back to C-Kernel-Engine Docs | Doxygen Source Documentation
layernorm_kernels.c File Reference

LayerNorm forward/backward kernels with SIMD (SSE/AVX/AVX512). See the Detailed Description below.

#include <math.h>

Go to the source code of this file.

Functions

void layernorm_backward_kernel (const float *d_output, const float *input, const float *gamma, const float *mean, const float *rstd, float *d_input, float *d_gamma, float *d_beta, int tokens, int d_model, int aligned_embed_dim)
 
void layernorm_forward_rolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_forward_unrolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)
 
static void layernorm_forward_unrolled_slice_scalar (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)
 
void layernorm_naive_serial (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_naive_serial_matched_precision (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, float eps)
 
static void zero_layernorm_padding (float *out_ptr, int d_model, int aligned_embed_dim)
 

Detailed Description

LayerNorm forward/backward kernels with SIMD (SSE/AVX/AVX512)

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

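LayerNorm: y = gamma * (x - mean) / sqrt(var + eps) + beta, where mean and var are computed per token over its d_model features (var is the population variance, i.e. the sum of squared deviations divided by d_model).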

Definition in file layernorm_kernels.c.

Function Documentation

◆ layernorm_backward_kernel()

void layernorm_backward_kernel ( const float *  d_output,
const float *  input,
const float *  gamma,
const float *  mean,
const float *  rstd,
float *  d_input,
float *  d_gamma,
float *  d_beta,
int  tokens,
int  d_model,
int  aligned_embed_dim 
)

Definition at line 668 of file layernorm_kernels.c.

677 {
678  int T = tokens;
679  int D = d_model;
680  int aligned_D = aligned_embed_dim;
681 
682  // Per-token input gradients
683  for (int t = 0; t < T; ++t) {
684  float mean_t = mean[t];
685  float rstd_t = rstd[t];
686 
687  float d_y_gamma_sum = 0.0f;
688  float d_y_gamma_xhat_sum = 0.0f;
689 
690  // First pass: compute sums
691  for (int d = 0; d < D; ++d) {
692  float x = input[t * aligned_D + d];
693  float x_hat = (x - mean_t) * rstd_t;
694  float d_y = d_output[t * aligned_D + d];
695  float d_y_gamma = d_y * gamma[d];
696 
697  d_y_gamma_sum += d_y_gamma;
698  d_y_gamma_xhat_sum += d_y_gamma * x_hat;
699  }
700 
701  // Second pass: compute input gradients
702  float scale = rstd_t / (float)D;
703  for (int d = 0; d < D; ++d) {
704  float x = input[t * aligned_D + d];
705  float x_hat = (x - mean_t) * rstd_t;
706  float d_y = d_output[t * aligned_D + d];
707 
708  d_input[t * aligned_D + d] =
709  scale * ((float)D * d_y * gamma[d] - d_y_gamma_sum - x_hat * d_y_gamma_xhat_sum);
710  }
711 
712  // Zero padding for aligned dimension beyond D
713  for (int d = D; d < aligned_D; ++d) {
714  d_input[t * aligned_D + d] = 0.0f;
715  }
716  }
717 
718  // Parameter gradients (gamma, beta)
719  for (int d = 0; d < D; ++d) {
720  float gamma_grad = 0.0f;
721  float beta_grad = 0.0f;
722 
723  for (int t = 0; t < T; ++t) {
724  float x = input[t * aligned_D + d];
725  float x_hat = (x - mean[t]) * rstd[t];
726  float d_y = d_output[t * aligned_D + d];
727 
728  gamma_grad += d_y * x_hat;
729  beta_grad += d_y;
730  }
731 
732  d_gamma[d] += gamma_grad;
733  d_beta[d] += beta_grad;
734  }
735 }

Referenced by layernorm_backward_kernel_bf16().

◆ layernorm_forward_rolled_slice()

void layernorm_forward_rolled_slice ( const float *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
float *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

Definition at line 274 of file layernorm_kernels.c.

284 {
285 #if defined(__AVX512F__)
286  layernorm_forward_rolled_slice_avx512(input_slice_base, gamma, beta,
287  output_slice_base, mean_cache_slice, rstd_cache_slice,
288  num_tokens_in_slice, d_model, aligned_embed_dim, eps);
289 #elif defined(__AVX2__) || defined(__AVX__)
290  layernorm_forward_rolled_slice_avx256(input_slice_base, gamma, beta,
291  output_slice_base, mean_cache_slice, rstd_cache_slice,
292  num_tokens_in_slice, d_model, aligned_embed_dim, eps);
293 #else
294  layernorm_naive_serial(input_slice_base, gamma, beta,
295  output_slice_base, mean_cache_slice, rstd_cache_slice,
296  num_tokens_in_slice, d_model, aligned_embed_dim, eps);
297 #endif
298 }
void layernorm_naive_serial(const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)

References layernorm_naive_serial().

Referenced by layernorm_forward_rolled_slice_bf16().

◆ layernorm_forward_unrolled_slice()

void layernorm_forward_unrolled_slice ( const float *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
float *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
float  eps 
)

Definition at line 598 of file layernorm_kernels.c.

607 {
608 #if defined(__AVX512F__)
609  layernorm_forward_unrolled_slice_avx512(input_slice_base, gamma, beta,
610  output_slice_base, mean_cache_slice, rstd_cache_slice,
611  num_tokens_in_slice, d_model, eps);
612 #elif defined(__AVX2__) || defined(__AVX__)
613  layernorm_forward_unrolled_slice_avx256(input_slice_base, gamma, beta,
614  output_slice_base, mean_cache_slice, rstd_cache_slice,
615  num_tokens_in_slice, d_model, eps);
616 #else
617  layernorm_forward_unrolled_slice_scalar(input_slice_base, gamma, beta,
618  output_slice_base, mean_cache_slice, rstd_cache_slice,
619  num_tokens_in_slice, d_model, eps);
620 #endif
621 }
static void layernorm_forward_unrolled_slice_scalar(const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)

References layernorm_forward_unrolled_slice_scalar().

Referenced by layernorm_forward_unrolled_slice_bf16().

◆ layernorm_forward_unrolled_slice_scalar()

static void layernorm_forward_unrolled_slice_scalar ( const float *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
float *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
float  eps 
)
static

Definition at line 582 of file layernorm_kernels.c.

591 {
592  layernorm_naive_serial_matched_precision(input_slice_base, gamma, beta,
593  output_slice_base, mean_cache_slice, rstd_cache_slice,
594  num_tokens_in_slice, d_model, eps);
595 }
void layernorm_naive_serial_matched_precision(const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, float eps)

References layernorm_naive_serial_matched_precision().

Referenced by layernorm_forward_unrolled_slice().

◆ layernorm_naive_serial()

void layernorm_naive_serial ( const float *  input,
const float *  gamma,
const float *  beta,
float *  output,
float *  mean_cache,
float *  rstd_cache,
int  tokens,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

Definition at line 51 of file layernorm_kernels.c.

59 {
60  for (int t = 0; t < tokens; ++t) {
61  const float *in_ptr = input + t * aligned_embed_dim;
62  float *out_ptr = output + t * aligned_embed_dim;
63 
64  float sum_val = 0.0f;
65  for (int i = 0; i < d_model; ++i) {
66  sum_val += in_ptr[i];
67  }
68  float mean = sum_val / (float)d_model;
69 
70  float sum_sq_diff = 0.0f;
71  for (int i = 0; i < d_model; ++i) {
72  float diff = in_ptr[i] - mean;
73  sum_sq_diff += diff * diff;
74  }
75  float variance = sum_sq_diff / (float)d_model + eps;
76 
77  double var_double = (double)variance;
78  float inv_std = (float)(1.0 / sqrt(var_double));
79 
80  for (int i = 0; i < d_model; ++i) {
81  float normalized_val = (in_ptr[i] - mean) * inv_std;
82  out_ptr[i] = normalized_val * gamma[i] + beta[i];
83  }
84 
85  if (mean_cache) {
86  mean_cache[t] = mean;
87  }
88  if (rstd_cache) {
89  rstd_cache[t] = inv_std;
90  }
91  /* Keep aligned padding quiet so future GEMMs see deterministic memory. */
92  if (aligned_embed_dim > d_model) {
93  /* Keep padded lanes zeroed so subsequent GEMMs never read stale data. */
94  for (int i = d_model; i < aligned_embed_dim; ++i) {
95  out_ptr[i] = 0.0f;
96  }
97  }
98  }
99 }

Referenced by layernorm_forward_rolled_slice().

◆ layernorm_naive_serial_matched_precision()

void layernorm_naive_serial_matched_precision ( const float *  input,
const float *  gamma,
const float *  beta,
float *  output,
float *  mean_cache,
float *  rstd_cache,
int  tokens,
int  d_model,
float  eps 
)

Definition at line 624 of file layernorm_kernels.c.

631 {
632  for (int t = 0; t < tokens; ++t) {
633  const float *in_ptr = input + t * d_model;
634  float *out_ptr = output + t * d_model;
635 
636  float sum_val = 0.0f;
637  for (int i = 0; i < d_model; ++i) {
638  sum_val += in_ptr[i];
639  }
640  float mean = sum_val / (float)d_model;
641 
642  float sum_sq_diff = 0.0f;
643  for (int i = 0; i < d_model; ++i) {
644  float diff = in_ptr[i] - mean;
645  sum_sq_diff += diff * diff;
646  }
647  float variance = sum_sq_diff / (float)d_model + eps;
648 
649  double var_double = (double)variance;
650  float inv_std = (float)(1.0 / sqrt(var_double));
651 
652  for (int i = 0; i < d_model; ++i) {
653  float normalized_val = (in_ptr[i] - mean) * inv_std;
654  out_ptr[i] = normalized_val * gamma[i] + beta[i];
655  }
656 
657  if (mean_cache) {
658  mean_cache[t] = mean;
659  }
660  if (rstd_cache) {
661  rstd_cache[t] = inv_std;
662  }
663  }
664 }

Referenced by layernorm_forward_unrolled_slice_scalar().

◆ zero_layernorm_padding()

static void zero_layernorm_padding ( float *  out_ptr,
int  d_model,
int  aligned_embed_dim 
)
inline static

Definition at line 22 of file layernorm_kernels.c.

25 {
26  for (int idx = d_model; idx < aligned_embed_dim; ++idx) {
27  out_ptr[idx] = 0.0f;
28  }
29 }