← Back to C-Kernel-Engine Docs Doxygen Source Documentation
relu_kernels_bf16.c
Go to the documentation of this file.
1 /**
2  * @file relu_kernels_bf16.c
3  * @brief ReLU activation kernels for BF16 tensors
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * ReLU: y = max(0, x)
15  */
16 
17 #include <stddef.h>
18 #include <stdint.h>
19 
20 #include "bf16_utils.h"
21 #include "ckernel_engine.h"
22 
/**
 * @brief Element-wise ReLU over a BF16 buffer: output[i] = max(0, input[i]).
 *
 * Each element is widened with bf16_to_float(), clamped, and narrowed again
 * with float_to_bf16(). Any element for which `x > 0.0f` is false (zero,
 * negatives, and anything else that fails the comparison) is stored as the
 * BF16 encoding of 0.0f.
 *
 * @param input  Source buffer of n BF16 values (read only).
 * @param output Destination buffer of n BF16 values.
 * @param n      Number of elements to process; 0 is a no-op.
 *
 * If either pointer is NULL the function returns without writing anything.
 * Elements are read and written one index at a time in ascending order.
 */
void relu_forward_bf16(const uint16_t *input, uint16_t *output, size_t n)
{
    if (input == NULL || output == NULL) {
        return;
    }

    size_t idx = 0;
    while (idx < n) {
        const float value = bf16_to_float(input[idx]);

        /* Start from the clamped result and only let positives through. */
        float activated = 0.0f;
        if (value > 0.0f) {
            activated = value;
        }

        output[idx] = float_to_bf16(activated);
        ++idx;
    }
}
33 
/**
 * @brief In-place ReLU over a BF16 buffer: data[i] = max(0, data[i]).
 *
 * Every element is converted with bf16_to_float(), replaced by 0.0f unless
 * it compares strictly greater than zero, and written back through
 * float_to_bf16().
 *
 * @param data Buffer of n BF16 values, overwritten with the result.
 * @param n    Number of elements to process; 0 is a no-op.
 *
 * A NULL @p data pointer makes the call a no-op.
 */
void relu_forward_inplace_bf16(uint16_t *data, size_t n)
{
    if (data == NULL) {
        return;
    }

    uint16_t *cursor = data;
    for (size_t remaining = n; remaining > 0; --remaining) {
        const float value = bf16_to_float(*cursor);

        float activated = 0.0f;
        if (value > 0.0f) {
            activated = value;
        }

        *cursor = float_to_bf16(activated);
        ++cursor;
    }
}
44 
/**
 * @brief ReLU backward pass for BF16 buffers.
 *
 * For each index i, d_input[i] receives d_output[i] when input[i] compares
 * strictly greater than zero, and the BF16 encoding of 0.0f otherwise.
 * Both operands are widened with bf16_to_float() and the selected value is
 * narrowed with float_to_bf16().
 *
 * @param input    Buffer of n BF16 values used as the gate
 *                 (presumably the forward-pass input - confirm with callers).
 * @param d_output Buffer of n BF16 incoming gradient values.
 * @param d_input  Buffer of n BF16 values receiving the gated gradient.
 * @param n        Number of elements to process; 0 is a no-op.
 *
 * If any of the three pointers is NULL the function returns without
 * writing anything.
 */
void relu_backward_bf16(const uint16_t *input,
                        const uint16_t *d_output,
                        uint16_t *d_input,
                        size_t n)
{
    if (input == NULL || d_output == NULL || d_input == NULL) {
        return;
    }

    for (size_t k = 0; k != n; ++k) {
        const float gate = bf16_to_float(input[k]);
        const float upstream = bf16_to_float(d_output[k]);

        /* Gradient passes through only where the gate value is positive. */
        float grad = 0.0f;
        if (gate > 0.0f) {
            grad = upstream;
        }

        d_input[k] = float_to_bf16(grad);
    }
}
59 
static uint16_t float_to_bf16(float f)
Definition: bf16_utils.h:90
static float bf16_to_float(uint16_t v)
Definition: bf16_utils.h:38
void relu_forward_inplace_bf16(uint16_t *data, size_t n)
void relu_forward_bf16(const uint16_t *input, uint16_t *output, size_t n)
void relu_backward_bf16(const uint16_t *input, const uint16_t *d_output, uint16_t *d_input, size_t n)