← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gelu_kernels_bf16.c
Go to the documentation of this file.
1 /**
2  * @file gelu_kernels_bf16.c
3  * @brief GELU activation kernels for BF16 tensors
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * GELU: y = x * 0.5 * (1 + erf(x / sqrt(2)))
15  */
16 
17 #include <stdint.h>
18 #include <string.h>
19 
20 #include "bf16_utils.h"
21 #include "ckernel_engine.h"
22 
23 /* Suppress false positive warnings about uninitialized variables */
24 #pragma GCC diagnostic push
25 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
26 
27 /*
28  * BF16 GELU with caller-provided scratch buffer.
29  * scratch: [n] floats - caller allocates and reuses
30  */
/**
 * @brief In-place GELU over a BF16 tensor using a caller-provided scratch buffer.
 *
 * Converts BF16 -> f32, applies GELU, converts back to BF16.
 *
 * Despite the "fast" name, the exact (erf-based) GELU is used on purpose:
 * the fast tanh approximation error would compound with BF16 precision
 * loss, and the BF16<->f32 conversion overhead dominates runtime anyway.
 *
 * @param data    [in,out] n BF16 values, updated in place
 * @param n       element count
 * @param scratch [workspace] n floats, caller-allocated (reusable across calls)
 */
void gelu_fast_inplace_bf16(uint16_t *data, size_t n, float *scratch)
{
    /* Guard every pointer, not just scratch (previously data was unchecked). */
    if (!data || !scratch) return;

    bf16_tensor_to_float(data, scratch, n);
    gelu_exact_inplace(scratch, n);
    float_tensor_to_bf16(scratch, data, n);
}
41 
42 /*
43  * BF16 GELU backward with caller-provided scratch buffers.
44  * scratch_input, scratch_d_output, scratch_d_input: each [n] floats
45  */
/**
 * @brief GELU backward pass (exact erf formulation) for BF16 tensors.
 *
 * Upconverts input and upstream gradient to f32, computes the exact
 * scalar backward, and downconverts the result. The scalar exact version
 * is used deliberately: fast tanh approximation error would compound
 * with BF16 precision loss.
 *
 * @param input            [in]  n BF16 forward-pass inputs
 * @param d_output         [in]  n BF16 upstream gradients
 * @param d_input          [out] n BF16 computed input gradients
 * @param n                element count
 * @param scratch_input    [workspace] n floats, caller-allocated
 * @param scratch_d_output [workspace] n floats, caller-allocated
 * @param scratch_d_input  [workspace] n floats, caller-allocated
 */
void gelu_backward_exact_bf16(const uint16_t *input,
                              const uint16_t *d_output,
                              uint16_t *d_input,
                              size_t n,
                              float *scratch_input,
                              float *scratch_d_output,
                              float *scratch_d_input)
{
    /* Guard tensor pointers too, not only the scratch buffers. */
    if (!input || !d_output || !d_input) return;
    if (!scratch_input || !scratch_d_output || !scratch_d_input) return;

    bf16_tensor_to_float(input, scratch_input, n);
    bf16_tensor_to_float(d_output, scratch_d_output, n);

    gelu_backward_scalar(scratch_input, scratch_d_output, scratch_d_input, n);

    float_tensor_to_bf16(scratch_d_input, d_input, n);
}
65 
66 /*
67  * BF16 GELU backward (fast) with caller-provided scratch buffers.
68  */
/**
 * @brief GELU backward pass (fast tanh approximation) for BF16 tensors.
 *
 * Upconverts input and upstream gradient to f32, computes the fast
 * (tanh-approximation) backward, and downconverts the result. Use
 * gelu_backward_exact_bf16 when approximation error on top of BF16
 * precision loss is a concern.
 *
 * @param input            [in]  n BF16 forward-pass inputs
 * @param d_output         [in]  n BF16 upstream gradients
 * @param d_input          [out] n BF16 computed input gradients
 * @param n                element count
 * @param scratch_input    [workspace] n floats, caller-allocated
 * @param scratch_d_output [workspace] n floats, caller-allocated
 * @param scratch_d_input  [workspace] n floats, caller-allocated
 */
void gelu_backward_fast_bf16(const uint16_t *input,
                             const uint16_t *d_output,
                             uint16_t *d_input,
                             size_t n,
                             float *scratch_input,
                             float *scratch_d_output,
                             float *scratch_d_input)
{
    /* Guard tensor pointers too, not only the scratch buffers. */
    if (!input || !d_output || !d_input) return;
    if (!scratch_input || !scratch_d_output || !scratch_d_input) return;

    bf16_tensor_to_float(input, scratch_input, n);
    bf16_tensor_to_float(d_output, scratch_d_output, n);

    gelu_backward_fast(scratch_input, scratch_d_output, scratch_d_input, n);

    float_tensor_to_bf16(scratch_d_input, d_input, n);
}

/* Balance the `#pragma GCC diagnostic push` at the top of this file; the
 * original left the push unmatched, leaking the -Wmaybe-uninitialized
 * suppression state past the end of the translation unit. */
#pragma GCC diagnostic pop
static void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
Definition: bf16_utils.h:271
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250
void gelu_exact_inplace(float *data, size_t n)
Definition: gelu_kernels.c:446
void gelu_backward_scalar(const float *input, const float *d_output, float *d_input, size_t n)
Definition: gelu_kernels.c:462
void gelu_backward_fast(const float *input, const float *d_output, float *d_input, size_t n)
Definition: gelu_kernels.c:486
void gelu_backward_fast_bf16(const uint16_t *input, const uint16_t *d_output, uint16_t *d_input, size_t n, float *scratch_input, float *scratch_d_output, float *scratch_d_input)
void gelu_backward_exact_bf16(const uint16_t *input, const uint16_t *d_output, uint16_t *d_input, size_t n, float *scratch_input, float *scratch_d_output, float *scratch_d_input)
void gelu_fast_inplace_bf16(uint16_t *data, size_t n, float *scratch)