← Back to C-Kernel-Engine Docs Doxygen Source Documentation
axpy_kernels.c
Go to the documentation of this file.
1 /**
2  * @file axpy_kernels.c
3  * @brief AXPY kernels for FP32: y = y + alpha * x
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * Classic BLAS Level-1 operation used in MoE expert output accumulation.
15  * When gathering expert outputs: output += weight[i] * expert_output[i]
16  *
17  * Operations:
18  * - axpy_f32: y += alpha * x (in-place)
19  * - axpy_strided_f32: strided version for non-contiguous memory
20  * - weighted_sum_f32: sum multiple vectors with weights
21  */
22 
23 #include <stdint.h>
24 #include <stddef.h>
25 #include <string.h>
26 
27 #ifdef __AVX512F__
28 #include <immintrin.h>
29 #endif
30 
31 #ifdef __AVX2__
32 #include <immintrin.h>
33 #endif
34 
35 /* =============================================================================
36  * AXPY: y = y + alpha * x
37  *
38  * Core operation for MoE expert gathering:
39  * output = sum_i(weight_i * expert_output_i)
40  *
41  * Implemented as: output += weight * expert_output (called for each expert)
42  * ============================================================================= */
43 
/**
 * @brief In-place AXPY: y += alpha * x
 * @test test_axpy.py::TestAXPY::test_axpy_f32
 * @test test_axpy.py::TestAXPY::test_axpy_vs_naive
 *
 * In-place scaled vector addition: y += alpha * x
 * BLAS-like axpy operation.
 *
 * @param y     In/out vector [n], accumulated in place
 * @param x     Input vector [n]
 * @param alpha Scalar multiplier
 * @param n     Vector length
 *
 * No-op on NULL pointers or n <= 0.
 *
 * After changes: make test
 */
void axpy_f32(float *y,
              const float *x,
              float alpha,
              int n)
{
    if (!y || !x || n <= 0) {
        return;
    }

    int i = 0;

#ifdef __AVX512F__
    /* AVX-512F defines its own FMA forms, so no extra feature test is
     * needed for _mm512_fmadd_ps. */
    __m512 valpha = _mm512_set1_ps(alpha);
    for (; i + 16 <= n; i += 16) {
        __m512 vy = _mm512_loadu_ps(&y[i]);
        __m512 vx = _mm512_loadu_ps(&x[i]);
        vy = _mm512_fmadd_ps(vx, valpha, vy); /* y = y + alpha * x */
        _mm512_storeu_ps(&y[i], vy);
    }
#endif

#if defined(__AVX2__) && defined(__FMA__)
    /* _mm256_fmadd_ps is an FMA3 instruction, not part of AVX2 proper:
     * a build with -mavx2 but without -mfma would fail to compile under
     * a bare __AVX2__ guard, so require __FMA__ as well. */
    __m256 valpha256 = _mm256_set1_ps(alpha);
    for (; i + 8 <= n; i += 8) {
        __m256 vy = _mm256_loadu_ps(&y[i]);
        __m256 vx = _mm256_loadu_ps(&x[i]);
        vy = _mm256_fmadd_ps(vx, valpha256, vy);
        _mm256_storeu_ps(&y[i], vy);
    }
#endif

    /* Scalar remainder (also the full path when no SIMD is enabled) */
    for (; i < n; i++) {
        y[i] += alpha * x[i];
    }
}
90 
91 /* =============================================================================
92  * Scaled copy: y = alpha * x
93  *
94  * First step when accumulating: initialize output with first expert's result.
95  * ============================================================================= */
96 
/**
 * @brief Scaled copy: y = alpha * x
 *
 * Writes alpha * x into y without reading y's previous contents.
 * Typically used to seed an accumulation buffer with the first
 * expert's weighted output.
 *
 * @param y Output vector [n]
 * @param x Input vector [n]
 * @param alpha Scalar multiplier
 * @param n Vector length
 *
 * No-op on NULL pointers or n <= 0.
 */
void scal_copy_f32(float *y,
                   const float *x,
                   float alpha,
                   int n)
{
    if (y == NULL || x == NULL || n <= 0) {
        return;
    }

    int idx = 0;

#ifdef __AVX512F__
    const __m512 a16 = _mm512_set1_ps(alpha);
    while (idx + 16 <= n) {
        _mm512_storeu_ps(&y[idx],
                         _mm512_mul_ps(_mm512_loadu_ps(&x[idx]), a16));
        idx += 16;
    }
#endif

#ifdef __AVX2__
    const __m256 a8 = _mm256_set1_ps(alpha);
    while (idx + 8 <= n) {
        _mm256_storeu_ps(&y[idx],
                         _mm256_mul_ps(_mm256_loadu_ps(&x[idx]), a8));
        idx += 8;
    }
#endif

    /* Scalar tail */
    while (idx < n) {
        y[idx] = alpha * x[idx];
        idx++;
    }
}
138 
139 /* =============================================================================
140  * Weighted sum: y = sum_i(weights[i] * x[i])
141  *
142  * Combine multiple expert outputs with their routing weights in one pass.
143  * More efficient than multiple axpy calls when all inputs are available.
144  * ============================================================================= */
145 
/**
 * @brief Weighted sum of k vectors: y = sum_i(weights[i] * vectors[i])
 *
 * Combines multiple expert outputs with their routing weights. The
 * accumulation order matches k-1 sequential axpy calls after a scaled
 * copy, so results are bit-identical to that call sequence.
 *
 * @param y Output vector [n]
 * @param vectors Array of k input vector pointers, each [n]
 * @param weights Array of k scalar weights
 * @param k Number of vectors to combine
 * @param n Vector length
 *
 * No-op on NULL pointers or non-positive k/n.
 */
void weighted_sum_f32(float *y,
                      const float **vectors,
                      const float *weights,
                      int k,
                      int n)
{
    if (y == NULL || vectors == NULL || weights == NULL || k <= 0 || n <= 0) {
        return;
    }

    /* Seed the accumulator with the first weighted vector... */
    scal_copy_f32(y, vectors[0], weights[0], n);

    /* ...then fold in the remaining k-1 contributions in order. */
    for (int v = 1; v < k; v++) {
        axpy_f32(y, vectors[v], weights[v], n);
    }
}
173 
174 /* =============================================================================
175  * Zero-initialized AXPY accumulation
176  *
177  * Zero output first, then accumulate. Useful when output may contain garbage.
178  * ============================================================================= */
179 
/**
 * @brief Zero output then accumulate: y = 0; y += alpha * x
 *
 * Clears y first, so it is safe to call on a buffer holding garbage.
 * If x is NULL the function only zeroes y.
 *
 * @param y Output vector [n], zeroed then accumulated
 * @param x Input vector [n] (may be NULL to just zero y)
 * @param alpha Scalar multiplier
 * @param n Vector length
 */
void axpy_zero_f32(float *y,
                   const float *x,
                   float alpha,
                   int n)
{
    if (y == NULL || n <= 0) {
        return;
    }

    /* Clear the destination before accumulating. */
    memset(y, 0, (size_t)n * sizeof(float));

    if (x == NULL) {
        return; /* Nothing to accumulate; y is now all zeros. */
    }

    axpy_f32(y, x, alpha, n);
}
203 
204 /* =============================================================================
205  * 2D batched AXPY for [tokens, hidden] shaped tensors
206  *
207  * Process multiple tokens at once, common in transformer inference.
208  * ============================================================================= */
209 
/**
 * @brief Batched AXPY for 2D tensors: Y[t,:] += alpha * X[t,:]
 *
 * @param Y Output tensor [num_tokens, dim]
 * @param X Input tensor [num_tokens, dim]
 * @param alpha Scalar multiplier
 * @param num_tokens Number of tokens
 * @param dim Hidden dimension
 * @param y_stride Stride between Y rows in floats (for alignment); <= 0 means dim
 * @param x_stride Stride between X rows in floats; <= 0 means dim
 *
 * No-op on NULL pointers or non-positive num_tokens/dim.
 */
void axpy_2d_f32(float *Y,
                 const float *X,
                 float alpha,
                 int num_tokens,
                 int dim,
                 int y_stride,
                 int x_stride)
{
    if (!Y || !X || num_tokens <= 0 || dim <= 0) {
        return;
    }

    /* Default strides if not specified */
    if (y_stride <= 0) y_stride = dim;
    if (x_stride <= 0) x_stride = dim;

    for (int t = 0; t < num_tokens; t++) {
        /* Widen before multiplying: t * stride is evaluated in int
         * otherwise, which overflows (UB) once num_tokens * stride
         * exceeds INT_MAX on large tensors. */
        size_t y_off = (size_t)t * (size_t)y_stride;
        size_t x_off = (size_t)t * (size_t)x_stride;
        axpy_f32(Y + y_off, X + x_off, alpha, dim);
    }
}
241 
242 /* =============================================================================
243  * MoE-specific: Accumulate expert output with routing weight
244  *
245  * Convenience wrapper with clear semantics for MoE usage.
246  * ============================================================================= */
247 
/**
 * @brief Accumulate expert output: output += routing_weight * expert_output
 *
 * Intention-revealing MoE wrapper: delegates directly to the generic
 * AXPY kernel, inheriting its NULL/size guards.
 *
 * @param output Token output buffer [hidden_dim], accumulated in place
 * @param expert_output Expert's output for this token [hidden_dim]
 * @param routing_weight Softmax routing weight for this expert
 * @param hidden_dim Hidden dimension
 */
void moe_accumulate_expert_f32(float *output,
                               const float *expert_output,
                               float routing_weight,
                               int hidden_dim)
{
    /* Pure pass-through; all validation happens in axpy_f32. */
    axpy_f32(output, expert_output, routing_weight, hidden_dim);
}
void axpy_f32(float *y, const float *x, float alpha, int n)
In-place AXPY: y += alpha * x.
Definition: axpy_kernels.c:54
void moe_accumulate_expert_f32(float *output, const float *expert_output, float routing_weight, int hidden_dim)
Accumulate expert output: output += routing_weight * expert_output.
Definition: axpy_kernels.c:256
void axpy_zero_f32(float *y, const float *x, float alpha, int n)
Zero output then accumulate: y = 0; y += alpha * x.
Definition: axpy_kernels.c:188
void weighted_sum_f32(float *y, const float **vectors, const float *weights, int k, int n)
Weighted sum of k vectors: y = sum_i(weights[i] * vectors[i])
Definition: axpy_kernels.c:155
void axpy_2d_f32(float *Y, const float *X, float alpha, int num_tokens, int dim, int y_stride, int x_stride)
Batched AXPY for 2D tensors: Y[t,:] += alpha * X[t,:].
Definition: axpy_kernels.c:221
void scal_copy_f32(float *y, const float *x, float alpha, int n)
Scaled copy: y = alpha * x.
Definition: axpy_kernels.c:105