LayerNorm kernels for BF16 tensors. More...
Go to the source code of this file.
Functions
void layernorm_backward_kernel_bf16(const uint16_t *d_output, const uint16_t *input, const float *gamma, const float *mean, const float *rstd, uint16_t *d_input, float *d_gamma, float *d_beta, int tokens, int d_model, int aligned_embed_dim, float *scratch_d_output, float *scratch_input, float *scratch_d_input)
void layernorm_forward_rolled_slice_bf16(const uint16_t *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, uint16_t *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps, float *scratch_input, float *scratch_output)
void layernorm_forward_unrolled_slice_bf16(const uint16_t *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, uint16_t *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps, float *scratch_input, float *scratch_output)
LayerNorm kernels for BF16 tensors.
After making changes to this file, run: make test && make llamacpp-parity-full
LayerNorm: y = gamma * (x - mean) / sqrt(var + eps) + beta
Definition in file layernorm_kernels_bf16.c.
void layernorm_backward_kernel_bf16(
    const uint16_t *d_output,
    const uint16_t *input,
    const float *gamma,
    const float *mean,
    const float *rstd,
    uint16_t *d_input,
    float *d_gamma,
    float *d_beta,
    int tokens,
    int d_model,
    int aligned_embed_dim,
    float *scratch_d_output,
    float *scratch_input,
    float *scratch_d_input
)
Definition at line 84 of file layernorm_kernels_bf16.c.
References bf16_tensor_to_float(), float_tensor_to_bf16(), and layernorm_backward_kernel().
void layernorm_forward_rolled_slice_bf16(
    const uint16_t *__restrict input_slice_base,
    const float *__restrict gamma,
    const float *__restrict beta,
    uint16_t *__restrict output_slice_base,
    float *__restrict mean_cache_slice,
    float *__restrict rstd_cache_slice,
    int num_tokens_in_slice,
    int d_model,
    int aligned_embed_dim,
    float eps,
    float *scratch_input,
    float *scratch_output
)
Definition at line 30 of file layernorm_kernels_bf16.c.
References bf16_tensor_to_float(), float_tensor_to_bf16(), and layernorm_forward_rolled_slice().
void layernorm_forward_unrolled_slice_bf16(
    const uint16_t *__restrict input_slice_base,
    const float *__restrict gamma,
    const float *__restrict beta,
    uint16_t *__restrict output_slice_base,
    float *__restrict mean_cache_slice,
    float *__restrict rstd_cache_slice,
    int num_tokens_in_slice,
    int d_model,
    float eps,
    float *scratch_input,
    float *scratch_output
)
Definition at line 57 of file layernorm_kernels_bf16.c.
References bf16_tensor_to_float(), float_tensor_to_bf16(), and layernorm_forward_unrolled_slice().