← Back to C-Kernel-Engine Docs Doxygen Source Documentation
layernorm_kernels_bf16.c File Reference

LayerNorm kernels for BF16 tensors. More...

#include <stdint.h>
#include "bf16_utils.h"
#include "ckernel_engine.h"

Go to the source code of this file.

Functions

void layernorm_backward_kernel_bf16 (const uint16_t *d_output, const uint16_t *input, const float *gamma, const float *mean, const float *rstd, uint16_t *d_input, float *d_gamma, float *d_beta, int tokens, int d_model, int aligned_embed_dim, float *scratch_d_output, float *scratch_input, float *scratch_d_input)
 
void layernorm_forward_rolled_slice_bf16 (const uint16_t *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, uint16_t *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps, float *scratch_input, float *scratch_output)
 
void layernorm_forward_unrolled_slice_bf16 (const uint16_t *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, uint16_t *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps, float *scratch_input, float *scratch_output)
 

Detailed Description

LayerNorm kernels for BF16 tensors.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

LayerNorm: y = gamma * (x - mean) / sqrt(var + eps) + beta

Definition in file layernorm_kernels_bf16.c.

Function Documentation

◆ layernorm_backward_kernel_bf16()

/**
 * @brief BF16 wrapper for the float LayerNorm backward kernel.
 *
 * Widens the BF16 upstream gradient and the BF16 forward input into the
 * caller-provided float scratch buffers, delegates the actual math to
 * layernorm_backward_kernel(), then narrows the resulting input gradient
 * back to BF16. d_gamma and d_beta are float tensors and are written
 * directly by the float kernel.
 *
 * @param d_output          Upstream gradient, BF16, tokens x aligned_embed_dim.
 * @param input             Forward-pass input, BF16, tokens x aligned_embed_dim.
 * @param gamma             LayerNorm scale, float.
 * @param mean              Cached per-token means from the forward pass.
 * @param rstd              Cached per-token reciprocal std-devs from forward.
 * @param d_input           Output: input gradient, BF16.
 * @param d_gamma           Output: gamma gradient, float.
 * @param d_beta            Output: beta gradient, float.
 * @param tokens            Number of token rows.
 * @param d_model           Logical embedding width.
 * @param aligned_embed_dim Per-token row stride (assumed >= d_model —
 *                          NOTE(review): padded-row layout, confirm with callers).
 * @param scratch_d_output  Float workspace, tokens x aligned_embed_dim.
 * @param scratch_input     Float workspace, tokens x aligned_embed_dim.
 * @param scratch_d_input   Float workspace, tokens x aligned_embed_dim.
 */
void layernorm_backward_kernel_bf16(const uint16_t *d_output,
                                    const uint16_t *input,
                                    const float *gamma,
                                    const float *mean,
                                    const float *rstd,
                                    uint16_t *d_input,
                                    float *d_gamma,
                                    float *d_beta,
                                    int tokens,
                                    int d_model,
                                    int aligned_embed_dim,
                                    float *scratch_d_output,
                                    float *scratch_input,
                                    float *scratch_d_input)
{
    /* Kernel rule 1 forbids allocation here: without caller-provided
       scratch there is nothing we can do, so return silently. */
    if (scratch_d_output == NULL || scratch_input == NULL ||
        scratch_d_input == NULL) {
        return;
    }

    const size_t elem_count = (size_t)tokens * (size_t)aligned_embed_dim;

    /* Widen the BF16 operands into float workspace. */
    bf16_tensor_to_float(d_output, scratch_d_output, elem_count);
    bf16_tensor_to_float(input, scratch_input, elem_count);

    /* All arithmetic happens in the float backward kernel. */
    layernorm_backward_kernel(scratch_d_output, scratch_input, gamma, mean,
                              rstd, scratch_d_input, d_gamma, d_beta,
                              tokens, d_model, aligned_embed_dim);

    /* Narrow only the input gradient back to BF16 for the caller. */
    float_tensor_to_bf16(scratch_d_input, d_input, elem_count);
}
static void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
Definition: bf16_utils.h:271
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250
void layernorm_backward_kernel(const float *d_output, const float *input, const float *gamma, const float *mean, const float *rstd, float *d_input, float *d_gamma, float *d_beta, int tokens, int d_model, int aligned_embed_dim)

References bf16_tensor_to_float(), float_tensor_to_bf16(), and layernorm_backward_kernel().

◆ layernorm_forward_rolled_slice_bf16()

/**
 * @brief BF16 wrapper for the rolled (loop-based) LayerNorm forward slice.
 *
 * Converts the BF16 input slice to float in the caller-provided scratch
 * buffer, runs layernorm_forward_rolled_slice() in float, and converts the
 * float result back to BF16. Mean/rstd caches are float and filled directly
 * by the inner kernel.
 *
 * @param input_slice_base   BF16 input, num_tokens_in_slice x aligned_embed_dim.
 * @param gamma              LayerNorm scale, float.
 * @param beta               LayerNorm shift, float.
 * @param output_slice_base  Output: normalized activations, BF16.
 * @param mean_cache_slice   Output: per-token means (for backward).
 * @param rstd_cache_slice   Output: per-token reciprocal std-devs.
 * @param num_tokens_in_slice Token rows in this slice.
 * @param d_model            Logical embedding width.
 * @param aligned_embed_dim  Per-token row stride used for sizing the
 *                           conversions (assumed >= d_model — TODO confirm).
 * @param eps                Variance epsilon.
 * @param scratch_input      Float workspace, num_tokens_in_slice x aligned_embed_dim.
 * @param scratch_output     Float workspace, same size as scratch_input.
 */
void layernorm_forward_rolled_slice_bf16(const uint16_t *__restrict input_slice_base,
                                         const float *__restrict gamma,
                                         const float *__restrict beta,
                                         uint16_t *__restrict output_slice_base,
                                         float *__restrict mean_cache_slice,
                                         float *__restrict rstd_cache_slice,
                                         int num_tokens_in_slice,
                                         int d_model,
                                         int aligned_embed_dim,
                                         float eps,
                                         float *scratch_input,
                                         float *scratch_output)
{
    /* No allocation allowed (kernel rule 1); missing workspace is a no-op. */
    if (scratch_input == NULL || scratch_output == NULL) {
        return;
    }

    const size_t elem_count =
        (size_t)num_tokens_in_slice * (size_t)aligned_embed_dim;

    /* BF16 -> float, compute in float, float -> BF16. */
    bf16_tensor_to_float(input_slice_base, scratch_input, elem_count);
    layernorm_forward_rolled_slice(scratch_input, gamma, beta, scratch_output,
                                   mean_cache_slice, rstd_cache_slice,
                                   num_tokens_in_slice, d_model,
                                   aligned_embed_dim, eps);
    float_tensor_to_bf16(scratch_output, output_slice_base, elem_count);
}
void layernorm_forward_rolled_slice(const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps)

References bf16_tensor_to_float(), float_tensor_to_bf16(), and layernorm_forward_rolled_slice().

◆ layernorm_forward_unrolled_slice_bf16()

/**
 * @brief BF16 wrapper for the unrolled LayerNorm forward slice.
 *
 * Same convert/compute/convert pattern as the rolled variant, but sized by
 * d_model alone: this entry point takes no aligned_embed_dim, so rows are
 * treated as densely packed at d_model elements per token.
 *
 * @param input_slice_base   BF16 input, num_tokens_in_slice x d_model.
 * @param gamma              LayerNorm scale, float.
 * @param beta               LayerNorm shift, float.
 * @param output_slice_base  Output: normalized activations, BF16.
 * @param mean_cache_slice   Output: per-token means (for backward).
 * @param rstd_cache_slice   Output: per-token reciprocal std-devs.
 * @param num_tokens_in_slice Token rows in this slice.
 * @param d_model            Embedding width (also the row stride here).
 * @param eps                Variance epsilon.
 * @param scratch_input      Float workspace, num_tokens_in_slice x d_model.
 * @param scratch_output     Float workspace, same size as scratch_input.
 */
void layernorm_forward_unrolled_slice_bf16(const uint16_t *__restrict input_slice_base,
                                           const float *__restrict gamma,
                                           const float *__restrict beta,
                                           uint16_t *__restrict output_slice_base,
                                           float *__restrict mean_cache_slice,
                                           float *__restrict rstd_cache_slice,
                                           int num_tokens_in_slice,
                                           int d_model,
                                           float eps,
                                           float *scratch_input,
                                           float *scratch_output)
{
    /* No allocation allowed (kernel rule 1); missing workspace is a no-op. */
    if (scratch_input == NULL || scratch_output == NULL) {
        return;
    }

    const size_t elem_count = (size_t)num_tokens_in_slice * (size_t)d_model;

    /* BF16 -> float, compute in float, float -> BF16. */
    bf16_tensor_to_float(input_slice_base, scratch_input, elem_count);
    layernorm_forward_unrolled_slice(scratch_input, gamma, beta,
                                     scratch_output, mean_cache_slice,
                                     rstd_cache_slice, num_tokens_in_slice,
                                     d_model, eps);
    float_tensor_to_bf16(scratch_output, output_slice_base, elem_count);
}
void layernorm_forward_unrolled_slice(const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)

References bf16_tensor_to_float(), float_tensor_to_bf16(), and layernorm_forward_unrolled_slice().