GELU activation kernels with SIMD (SSE/AVX/AVX512)

#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "bf16_utils.h"
Functions:
  void geglu_backward_fp32(const float *x, const float *d_out, float *d_x, int tokens, int dim)
  void geglu_forward_bf16(const uint16_t *x, uint16_t *out, int tokens, int dim, float *scratch)
  void geglu_forward_fp32(const float *x, float *out, int tokens, int dim)
  void gelu_backward_exact(const float *input, const float *d_output, float *d_input, size_t n)
  void gelu_backward_fast(const float *input, const float *d_output, float *d_input, size_t n)
  void gelu_backward_scalar(const float *input, const float *d_output, float *d_input, size_t n)
  void gelu_exact_inplace(float *data, size_t n)
  void gelu_fast_inplace(float *data, size_t n)
After changes: make test && make llamacpp-parity-full
GELU: y = x * 0.5 * (1 + erf(x / sqrt(2)))
Fast approx: y = x * sigmoid(1.702 * x)
Definition in file gelu_kernels.c.
void geglu_backward_fp32(const float *x, const float *d_out, float *d_x, int tokens, int dim)
GeGLU backward pass (fp32)
Computes dL/dx given dL/d(out), where out = GELU(a) * b.
Chain rule:
  dL/da = dL/dout * d(GELU)/da * b
  dL/db = dL/dout * GELU(a)
After changes: make test
Definition at line 843 of file gelu_kernels.c.
Referenced by ck_test_geglu_backward().
void geglu_forward_bf16(const uint16_t *x, uint16_t *out, int tokens, int dim, float *scratch)
GeGLU forward pass (bf16)
BF16 version: converts to FP32, computes, converts back. Caller provides scratch buffer of size 3 * tokens * dim * sizeof(float).
Layout:
Note: We need separate buffers for input and output to avoid overlap when tokens > 1. The input is 2*dim per token, output is dim per token.
After changes: make test
Definition at line 813 of file gelu_kernels.c.
References bf16_tensor_to_float(), float_tensor_to_bf16(), and geglu_forward_fp32().
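The exact scratch layout is not spelled out above; given the 3 * tokens * dim sizing and the note that the input is 2*dim per token while the output is dim per token, one plausible partition is the following (an assumption for illustration, not confirmed by the source):

```c
#include <stddef.h>

/* Hypothetical split of the 3 * tokens * dim float scratch buffer:
 * an fp32 copy of the bf16 input, followed by the fp32 output. */
static void partition_scratch(float *scratch, int tokens, int dim,
                              float **x_f32, float **out_f32) {
    *x_f32   = scratch;                            /* 2 * tokens * dim floats */
    *out_f32 = scratch + (size_t)2 * tokens * dim; /* tokens * dim floats     */
}
```

Keeping the regions disjoint is what avoids the input/output overlap the note warns about.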
void geglu_forward_fp32(const float *x, float *out, int tokens, int dim)
GeGLU forward pass (fp32)
Computes out = GELU(a) * b, where x = [a, b] along the last dimension.
Input shape: [tokens, 2 * dim]; output shape: [tokens, dim].
After changes: make test
Definition at line 623 of file gelu_kernels.c.
Referenced by ck_test_geglu(), and geglu_forward_bf16().
void gelu_backward_exact(const float *input, const float *d_output, float *d_input, size_t n)
void gelu_backward_fast(const float *input, const float *d_output, float *d_input, size_t n)
Definition at line 486 of file gelu_kernels.c.
Referenced by gelu_backward_fast_bf16().
void gelu_backward_scalar(const float *input, const float *d_output, float *d_input, size_t n)
void gelu_exact_inplace(float *data, size_t n)
Definition at line 446 of file gelu_kernels.c.
Referenced by gelu_fast_inplace_bf16(), and mlp_token_parallel_exact().
void gelu_fast_inplace(float *data, size_t n)
GELU activation forward (fast approximation, in-place)
test_gelu.py::TestGELUForward::test_gelu_fast_inplace
test_gelu.py::TestGELUForward::test_gelu_vs_exact
test_parity.py::test_gelu_parity
Fast GELU approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
Operates in place on a contiguous buffer.
After changes: make test && make llamacpp-parity-full
Definition at line 132 of file gelu_kernels.c.
Referenced by mlp_token_parallel().