← Back to C-Kernel-Engine Docs Doxygen Source Documentation
sigmoid_kernels.c
Go to the documentation of this file.
1 /**
2  * @file sigmoid_kernels.c
3  * @brief Sigmoid activation kernels with SIMD (AVX512)
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * Sigmoid: y = 1 / (1 + exp(-x))
15  */
16 
17 #include <math.h>
18 #include <stddef.h>
19 #include <stdlib.h>
20 
21 #if defined(__AVX512F__)
22 #include <immintrin.h>
23 #endif
24 
25 /* Core sigmoid scalar kernel */
26 float sigmoid_scalar(float x)
27 {
28  return 1.0f / (1.0f + expf(-x));
29 }
30 
31 #if defined(__AVX512F__)
32 // Fast exp approximation using polynomial (avoids SVML dependency)
33 // Based on Schraudolph's algorithm with refinement
34 static inline __m512 exp_approx_avx512(__m512 x)
35 {
36  // Clamp x to avoid overflow/underflow
37  const __m512 max_val = _mm512_set1_ps(88.0f);
38  const __m512 min_val = _mm512_set1_ps(-88.0f);
39  x = _mm512_max_ps(_mm512_min_ps(x, max_val), min_val);
40 
41  // exp(x) = 2^(x * log2(e)) = 2^(x * 1.4426950408889634)
42  const __m512 log2e = _mm512_set1_ps(1.4426950408889634f);
43  __m512 z = _mm512_mul_ps(x, log2e);
44 
45  // Split into integer and fractional parts
46  __m512 zf = _mm512_roundscale_ps(z, _MM_FROUND_TO_NEAREST_INT);
47  __m512 f = _mm512_sub_ps(z, zf); // fractional part in [-0.5, 0.5]
48 
49  // Polynomial approximation for 2^f where f in [-0.5, 0.5]
50  // 2^f ≈ 1 + f*ln(2) + f^2*ln(2)^2/2 + f^3*ln(2)^3/6 + ...
51  // Using optimized coefficients
52  const __m512 c0 = _mm512_set1_ps(1.0f);
53  const __m512 c1 = _mm512_set1_ps(0.6931471805599453f); // ln(2)
54  const __m512 c2 = _mm512_set1_ps(0.2402265069591007f); // ln(2)^2/2
55  const __m512 c3 = _mm512_set1_ps(0.05550410866482158f); // ln(2)^3/6
56  const __m512 c4 = _mm512_set1_ps(0.009618129107628478f); // ln(2)^4/24
57 
58  // Horner's method: c0 + f*(c1 + f*(c2 + f*(c3 + f*c4)))
59  __m512 poly = _mm512_fmadd_ps(f, c4, c3);
60  poly = _mm512_fmadd_ps(f, poly, c2);
61  poly = _mm512_fmadd_ps(f, poly, c1);
62  poly = _mm512_fmadd_ps(f, poly, c0);
63 
64  // Scale by 2^n using integer manipulation
65  __m512i zi = _mm512_cvtps_epi32(zf);
66  zi = _mm512_add_epi32(zi, _mm512_set1_epi32(127)); // Add IEEE754 exponent bias
67  zi = _mm512_slli_epi32(zi, 23); // Shift to exponent position
68  __m512 scale = _mm512_castsi512_ps(zi);
69 
70  return _mm512_mul_ps(poly, scale);
71 }
72 
73 static inline __m512 sigmoid_avx512_vec(__m512 x)
74 {
75  __m512 neg = _mm512_sub_ps(_mm512_setzero_ps(), x);
76  __m512 exp_neg = exp_approx_avx512(neg);
77  __m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), exp_neg);
78  return _mm512_div_ps(_mm512_set1_ps(1.0f), denom);
79 }
80 
81 static void sigmoid_forward_avx512(const float *input,
82  float *output,
83  size_t n)
84 {
85  size_t i = 0;
86  for (; i + 16 <= n; i += 16) {
87  __m512 in_vec = _mm512_loadu_ps(input + i);
88  __m512 sig = sigmoid_avx512_vec(in_vec);
89  _mm512_storeu_ps(output + i, sig);
90  }
91 
92  for (; i < n; ++i) {
93  output[i] = sigmoid_scalar(input[i]);
94  }
95 }
96 
97 static void sigmoid_backward_avx512(const float *input,
98  const float *d_output,
99  float *d_input,
100  size_t n)
101 {
102  const __m512 one = _mm512_set1_ps(1.0f);
103  size_t i = 0;
104  for (; i + 16 <= n; i += 16) {
105  __m512 in_vec = _mm512_loadu_ps(input + i);
106  __m512 s = sigmoid_avx512_vec(in_vec);
107  __m512 dout = _mm512_loadu_ps(d_output + i);
108  __m512 grad = _mm512_mul_ps(_mm512_mul_ps(s, _mm512_sub_ps(one, s)), dout);
109  _mm512_storeu_ps(d_input + i, grad);
110  }
111 
112  for (; i < n; ++i) {
113  float x = input[i];
114  float s = sigmoid_scalar(x);
115  float s_prime = s * (1.0f - s);
116  d_input[i] = d_output[i] * s_prime;
117  }
118 }
119 #endif
120 
121 // Vectorized (loop) sigmoid forward over a contiguous buffer.
122 void sigmoid_forward(const float *input,
123  float *output,
124  size_t n)
125 {
126 #if defined(__AVX512F__)
127  sigmoid_forward_avx512(input, output, n);
128 #else
129  for (size_t i = 0; i < n; ++i) {
130  output[i] = sigmoid_scalar(input[i]);
131  }
132 #endif
133 }
134 
135 // Sigmoid backward over a contiguous buffer:
136 // Given dY and X, compute dX = dY * s * (1 - s),
137 // where s = sigmoid(X).
138 void sigmoid_backward(const float *input,
139  const float *d_output,
140  float *d_input,
141  size_t n)
142 {
143 #if defined(__AVX512F__)
144  sigmoid_backward_avx512(input, d_output, d_input, n);
145 #else
146  for (size_t i = 0; i < n; ++i) {
147  float x = input[i];
148  float s = sigmoid_scalar(x);
149  float s_prime = s * (1.0f - s);
150  d_input[i] = d_output[i] * s_prime;
151  }
152 #endif
153 }
float sigmoid_scalar(float x)
void sigmoid_backward(const float *input, const float *d_output, float *d_input, size_t n)
void sigmoid_forward(const float *input, float *output, size_t n)