← Back to C-Kernel-Engine Docs | Doxygen Source Documentation
gelu_kernels_bf16.c File Reference

GELU activation kernels for BF16 tensors. More...

#include <stdint.h>
#include <string.h>
#include "bf16_utils.h"
#include "ckernel_engine.h"

Go to the source code of this file.

Functions

void gelu_backward_exact_bf16 (const uint16_t *input, const uint16_t *d_output, uint16_t *d_input, size_t n, float *scratch_input, float *scratch_d_output, float *scratch_d_input)
 
void gelu_backward_fast_bf16 (const uint16_t *input, const uint16_t *d_output, uint16_t *d_input, size_t n, float *scratch_input, float *scratch_d_output, float *scratch_d_input)
 
void gelu_fast_inplace_bf16 (uint16_t *data, size_t n, float *scratch)
 

Detailed Description

GELU activation kernels for BF16 tensors.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

GELU: y = x * 0.5 * (1 + erf(x / sqrt(2)))

Definition in file gelu_kernels_bf16.c.

Function Documentation

◆ gelu_backward_exact_bf16()

/**
 * Exact GELU backward pass for BF16 tensors.
 *
 * Widens the BF16 inputs to float32 in caller-provided scratch buffers,
 * runs the exact (erf-based) scalar backward kernel, and narrows the
 * resulting gradient back to BF16. No allocation; all workspace is passed in.
 *
 * @param input            BF16 forward-pass input, n elements.
 * @param d_output         BF16 upstream gradient, n elements.
 * @param d_input          BF16 output gradient, n elements (written).
 * @param n                Element count.
 * @param scratch_input    float32 workspace, n elements.
 * @param scratch_d_output float32 workspace, n elements.
 * @param scratch_d_input  float32 workspace, n elements.
 */
void gelu_backward_exact_bf16(const uint16_t *input, const uint16_t *d_output,
                              uint16_t *d_input, size_t n,
                              float *scratch_input, float *scratch_d_output,
                              float *scratch_d_input)
{
    /* Workspace comes from the bump allocator; refuse to run without it. */
    if (scratch_input == NULL || scratch_d_output == NULL ||
        scratch_d_input == NULL) {
        return;
    }

    /* Widen both BF16 operands to float32 (order of the two conversions
     * is irrelevant — the buffers are independent). */
    bf16_tensor_to_float(d_output, scratch_d_output, n);
    bf16_tensor_to_float(input, scratch_input, n);

    /* Use the exact scalar kernel: the fast tanh approximation's error
     * would compound with BF16 precision loss. */
    gelu_backward_scalar(scratch_input, scratch_d_output, scratch_d_input, n);

    /* Narrow the float32 gradient back down to BF16. */
    float_tensor_to_bf16(scratch_d_input, d_input, n);
}
static void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
Definition: bf16_utils.h:271
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250
void gelu_backward_scalar(const float *input, const float *d_output, float *d_input, size_t n)
Definition: gelu_kernels.c:462

References bf16_tensor_to_float(), float_tensor_to_bf16(), and gelu_backward_scalar().

◆ gelu_backward_fast_bf16()

/**
 * Fast (tanh-approximation) GELU backward pass for BF16 tensors.
 *
 * Widens the BF16 inputs to float32 in caller-provided scratch buffers,
 * runs the fast backward kernel, and narrows the gradient back to BF16.
 * No allocation; all workspace is passed in.
 *
 * @param input            BF16 forward-pass input, n elements.
 * @param d_output         BF16 upstream gradient, n elements.
 * @param d_input          BF16 output gradient, n elements (written).
 * @param n                Element count.
 * @param scratch_input    float32 workspace, n elements.
 * @param scratch_d_output float32 workspace, n elements.
 * @param scratch_d_input  float32 workspace, n elements.
 */
void gelu_backward_fast_bf16(const uint16_t *input, const uint16_t *d_output,
                             uint16_t *d_input, size_t n,
                             float *scratch_input, float *scratch_d_output,
                             float *scratch_d_input)
{
    /* Workspace comes from the bump allocator; refuse to run without it. */
    if (scratch_input == NULL || scratch_d_output == NULL ||
        scratch_d_input == NULL) {
        return;
    }

    /* Widen both BF16 operands to float32 (independent buffers). */
    bf16_tensor_to_float(d_output, scratch_d_output, n);
    bf16_tensor_to_float(input, scratch_input, n);

    /* Fast tanh-approximation backward kernel in float32. */
    gelu_backward_fast(scratch_input, scratch_d_output, scratch_d_input, n);

    /* Narrow the float32 gradient back down to BF16. */
    float_tensor_to_bf16(scratch_d_input, d_input, n);
}
void gelu_backward_fast(const float *input, const float *d_output, float *d_input, size_t n)
Definition: gelu_kernels.c:486

References bf16_tensor_to_float(), float_tensor_to_bf16(), and gelu_backward_fast().

◆ gelu_fast_inplace_bf16()

/**
 * In-place GELU activation for a BF16 tensor.
 *
 * Widens to float32 in the caller-provided scratch buffer, applies GELU,
 * and writes the result back to the BF16 buffer. No allocation.
 *
 * @param data    BF16 tensor, n elements; overwritten with GELU(data).
 * @param n       Element count.
 * @param scratch float32 workspace, n elements.
 */
void gelu_fast_inplace_bf16(uint16_t *data, size_t n, float *scratch)
{
    /* Workspace comes from the bump allocator; refuse to run without it. */
    if (scratch == NULL) {
        return;
    }

    bf16_tensor_to_float(data, scratch, n);
    /* Even this "fast" entry point uses the exact erf kernel: the tanh
     * approximation's error would stack with BF16 rounding, and the
     * BF16<->f32 conversion overhead dominates the cost anyway. */
    gelu_exact_inplace(scratch, n);
    float_tensor_to_bf16(scratch, data, n);
}
void gelu_exact_inplace(float *data, size_t n)
Definition: gelu_kernels.c:446

References bf16_tensor_to_float(), float_tensor_to_bf16(), and gelu_exact_inplace().