Element-wise addition kernels for BF16 tensors. More...
Go to the source code of this file.
Functions:

    void add_backward_bf16(const uint16_t *d_y, uint16_t *d_a, uint16_t *d_b, size_t n);
    void add_forward_2d_bf16(const uint16_t *a, const uint16_t *b, uint16_t *y, int tokens, int dim, int aligned_dim);
    void add_forward_bf16(const uint16_t *a, const uint16_t *b, uint16_t *y, size_t n);
    void add_forward_f32(const float *a, const float *b, float *y, size_t n);
    void add_inplace_bf16(uint16_t *a, const uint16_t *b, size_t n);
    void add_inplace_f32(float *a, const float *b, size_t n);
    void add_scaled_forward_bf16(const uint16_t *a, const uint16_t *b, uint16_t *y, float alpha, size_t n);
    void add_scaled_inplace_bf16(uint16_t *a, const uint16_t *b, float alpha, size_t n);
Element-wise addition kernels for BF16 tensors.
After changes: make test && make llamacpp-parity-full
Used for residual connections in transformer models: residual = x + sublayer_output
Supports: (list items lost during documentation extraction — see add_kernels_bf16.c for details)
Definition in file add_kernels_bf16.c.
void add_backward_bf16(const uint16_t *d_y, uint16_t *d_a, uint16_t *d_b, size_t n)
Definition at line 173 of file add_kernels_bf16.c.
void add_forward_2d_bf16(const uint16_t *a, const uint16_t *b, uint16_t *y, int tokens, int dim, int aligned_dim)
void add_forward_bf16(const uint16_t *a, const uint16_t *b, uint16_t *y, size_t n)
void add_forward_f32(const float *a, const float *b, float *y, size_t n)
Element-wise add: y = a + b
test_add.py::TestAddForward::test_add_forward_f32
test_add.py::TestAddForward::test_add_inplace_f32
test_multi_layer_parity.py::TestMultiLayerParity::test_residual_add
Element-wise addition of two vectors.
After changes: make test
Definition at line 270 of file add_kernels_bf16.c.
void add_inplace_bf16(uint16_t *a, const uint16_t *b, size_t n)
void add_inplace_f32(float *a, const float *b, size_t n)
void add_scaled_forward_bf16(const uint16_t *a, const uint16_t *b, uint16_t *y, float alpha, size_t n)
void add_scaled_inplace_bf16(uint16_t *a, const uint16_t *b, float alpha, size_t n)