← Back to C-Kernel-Engine Docs Doxygen Source Documentation
axpy_kernels.c File Reference

AXPY kernels for FP32: y = y + alpha * x. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>

Go to the source code of this file.

Functions

void axpy_2d_f32 (float *Y, const float *X, float alpha, int num_tokens, int dim, int y_stride, int x_stride)
 Batched AXPY for 2D tensors: Y[t,:] += alpha * X[t,:]. More...
 
void axpy_f32 (float *y, const float *x, float alpha, int n)
 In-place AXPY: y += alpha * x. More...
 
void axpy_zero_f32 (float *y, const float *x, float alpha, int n)
 Zero output then accumulate: y = 0; y += alpha * x. More...
 
void moe_accumulate_expert_f32 (float *output, const float *expert_output, float routing_weight, int hidden_dim)
 Accumulate expert output: output += routing_weight * expert_output. More...
 
void scal_copy_f32 (float *y, const float *x, float alpha, int n)
 Scaled copy: y = alpha * x. More...
 
void weighted_sum_f32 (float *y, const float **vectors, const float *weights, int k, int n)
 Weighted sum of k vectors: y = sum_i(weights[i] * vectors[i]) More...
 

Detailed Description

AXPY kernels for FP32: y = y + alpha * x.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Classic BLAS Level-1 operation used in MoE expert output accumulation. When gathering expert outputs: output += weight[i] * expert_output[i]

Operations:

  • axpy_f32: y += alpha * x (in-place)
  • axpy_zero_f32: y = alpha * x (destination zeroed first)
  • scal_copy_f32: y = alpha * x (scaled copy)
  • axpy_2d_f32: batched row-wise AXPY with per-row strides for non-contiguous 2D tensors
  • weighted_sum_f32: sum multiple vectors with weights
  • moe_accumulate_expert_f32: MoE expert-output accumulation wrapper

Definition in file axpy_kernels.c.

Function Documentation

◆ axpy_2d_f32()

void axpy_2d_f32 ( float *  Y,
const float *  X,
float  alpha,
int  num_tokens,
int  dim,
int  y_stride,
int  x_stride 
)

Batched AXPY for 2D tensors: Y[t,:] += alpha * X[t,:].

Parameters
Y           Output tensor [num_tokens, dim]
X           Input tensor [num_tokens, dim]
alpha       Scalar multiplier
num_tokens  Number of tokens
dim         Hidden dimension
y_stride    Stride between Y rows (for alignment); values <= 0 default to dim
x_stride    Stride between X rows; values <= 0 default to dim

Definition at line 221 of file axpy_kernels.c.

/**
 * Batched AXPY over rows of 2D tensors: Y[t,:] += alpha * X[t,:].
 *
 * @param Y          Output tensor [num_tokens, dim], accumulated in place
 * @param X          Input tensor [num_tokens, dim]
 * @param alpha      Scalar multiplier
 * @param num_tokens Number of rows (tokens)
 * @param dim        Elements per row
 * @param y_stride   Element stride between Y rows; <= 0 means densely packed (dim)
 * @param x_stride   Element stride between X rows; <= 0 means densely packed (dim)
 *
 * NULL pointers or non-positive sizes make this a no-op.
 */
void axpy_2d_f32(float *Y, const float *X, float alpha,
                 int num_tokens, int dim, int y_stride, int x_stride)
{
    if (!Y || !X || num_tokens <= 0 || dim <= 0) {
        return;
    }

    /* A non-positive stride means "rows are densely packed". */
    if (y_stride <= 0) y_stride = dim;
    if (x_stride <= 0) x_stride = dim;

    for (int t = 0; t < num_tokens; t++) {
        /* Widen to size_t BEFORE multiplying: t * stride is int * int and can
         * overflow for large tensors, which is undefined behavior. */
        axpy_f32(Y + (size_t)t * (size_t)y_stride,
                 X + (size_t)t * (size_t)x_stride,
                 alpha, dim);
    }
}
void axpy_f32(float *y, const float *x, float alpha, int n)
In-place AXPY: y += alpha * x.
Definition: axpy_kernels.c:54

References axpy_f32().

◆ axpy_f32()

void axpy_f32 ( float *  y,
const float *  x,
float  alpha,
int  n 
)

In-place AXPY: y += alpha * x.

Test:

test_axpy.py::TestAXPY::test_axpy_f32

test_axpy.py::TestAXPY::test_axpy_vs_naive

In-place scaled vector addition: y += alpha * x BLAS-like axpy operation.

After changes: make test

Definition at line 54 of file axpy_kernels.c.

/**
 * In-place AXPY: y[i] += alpha * x[i] for i in [0, n).
 *
 * Classic BLAS Level-1 saxpy. Vector tiers are tried widest-first; each
 * tier's leftover elements fall through to the next, ending in a scalar
 * tail loop. NULL pointers or n <= 0 make this a no-op.
 *
 * @param y     Accumulator vector [n], updated in place
 * @param x     Input vector [n]
 * @param alpha Scalar multiplier
 * @param n     Vector length
 */
void axpy_f32(float *y, const float *x, float alpha, int n)
{
    if (y == NULL || x == NULL || n <= 0) {
        return;
    }

    int idx = 0;

#ifdef __AVX512F__
    {
        const __m512 va16 = _mm512_set1_ps(alpha);
        while (idx + 16 <= n) {
            __m512 acc = _mm512_loadu_ps(y + idx);
            acc = _mm512_fmadd_ps(_mm512_loadu_ps(x + idx), va16, acc);
            _mm512_storeu_ps(y + idx, acc);
            idx += 16;
        }
    }
#endif

#ifdef __AVX2__
    {
        const __m256 va8 = _mm256_set1_ps(alpha);
        while (idx + 8 <= n) {
            __m256 acc = _mm256_loadu_ps(y + idx);
            acc = _mm256_fmadd_ps(_mm256_loadu_ps(x + idx), va8, acc);
            _mm256_storeu_ps(y + idx, acc);
            idx += 8;
        }
    }
#endif

    /* Scalar tail: elements not covered by a full SIMD lane. */
    while (idx < n) {
        y[idx] += alpha * x[idx];
        idx++;
    }
}

Referenced by axpy_2d_f32(), axpy_zero_f32(), moe_accumulate_expert_f32(), and weighted_sum_f32().

◆ axpy_zero_f32()

void axpy_zero_f32 ( float *  y,
const float *  x,
float  alpha,
int  n 
)

Zero output then accumulate: y = 0; y += alpha * x.

Parameters
y      Output vector [n], zeroed then accumulated
x      Input vector [n]; may be NULL, leaving y all zeros
alpha  Scalar multiplier
n      Vector length

Definition at line 188 of file axpy_kernels.c.

/**
 * Overwrite-then-accumulate: y = alpha * x, with y cleared first.
 *
 * Equivalent to memset(y, 0, ...) followed by axpy_f32(). A NULL x is
 * tolerated and leaves y as the zero vector; NULL y or n <= 0 is a no-op.
 *
 * @param y     Output vector [n], zeroed then accumulated into
 * @param x     Input vector [n] (optional; may be NULL)
 * @param alpha Scalar multiplier
 * @param n     Vector length
 */
void axpy_zero_f32(float *y, const float *x, float alpha, int n)
{
    if (y == NULL || n <= 0) {
        return;
    }

    /* Clear the destination; all-zero bytes are +0.0f in IEEE-754. */
    memset(y, 0, (size_t)n * sizeof *y);

    if (x != NULL) {
        axpy_f32(y, x, alpha, n);
    }
}

References axpy_f32().

◆ moe_accumulate_expert_f32()

void moe_accumulate_expert_f32 ( float *  output,
const float *  expert_output,
float  routing_weight,
int  hidden_dim 
)

Accumulate expert output: output += routing_weight * expert_output.

Parameters
output          Token output buffer [hidden_dim], accumulated in place
expert_output   Expert's output for this token [hidden_dim]
routing_weight  Softmax routing weight for this expert
hidden_dim      Hidden dimension

Definition at line 256 of file axpy_kernels.c.

/**
 * MoE gather step: output += routing_weight * expert_output.
 *
 * Thin, intention-revealing wrapper over axpy_f32(); all argument
 * validation (NULL pointers, non-positive length) happens there.
 *
 * @param output         Token output buffer [hidden_dim], accumulated in place
 * @param expert_output  Expert's output for this token [hidden_dim]
 * @param routing_weight Softmax routing weight for this expert
 * @param hidden_dim     Hidden dimension
 */
void moe_accumulate_expert_f32(float *output, const float *expert_output,
                               float routing_weight, int hidden_dim)
{
    axpy_f32(output, expert_output, routing_weight, hidden_dim);
}

References axpy_f32().

◆ scal_copy_f32()

void scal_copy_f32 ( float *  y,
const float *  x,
float  alpha,
int  n 
)

Scaled copy: y = alpha * x.

Parameters
y      Output vector [n]
x      Input vector [n]
alpha  Scalar multiplier
n      Vector length

Definition at line 105 of file axpy_kernels.c.

/**
 * Scaled copy: y[i] = alpha * x[i] for i in [0, n).
 *
 * Unlike axpy_f32 this overwrites y rather than accumulating into it.
 * Widest SIMD tier first, remainders cascade down to a scalar tail.
 * NULL pointers or n <= 0 make this a no-op.
 *
 * @param y     Output vector [n], overwritten
 * @param x     Input vector [n]
 * @param alpha Scalar multiplier
 * @param n     Vector length
 */
void scal_copy_f32(float *y, const float *x, float alpha, int n)
{
    if (y == NULL || x == NULL || n <= 0) {
        return;
    }

    int pos = 0;

#ifdef __AVX512F__
    {
        const __m512 scale16 = _mm512_set1_ps(alpha);
        while (pos + 16 <= n) {
            _mm512_storeu_ps(y + pos,
                             _mm512_mul_ps(_mm512_loadu_ps(x + pos), scale16));
            pos += 16;
        }
    }
#endif

#ifdef __AVX2__
    {
        const __m256 scale8 = _mm256_set1_ps(alpha);
        while (pos + 8 <= n) {
            _mm256_storeu_ps(y + pos,
                             _mm256_mul_ps(_mm256_loadu_ps(x + pos), scale8));
            pos += 8;
        }
    }
#endif

    /* Scalar tail for the remaining elements. */
    while (pos < n) {
        y[pos] = alpha * x[pos];
        pos++;
    }
}

Referenced by weighted_sum_f32().

◆ weighted_sum_f32()

void weighted_sum_f32 ( float *  y,
const float **  vectors,
const float *  weights,
int  k,
int  n 
)

Weighted sum of k vectors: y = sum_i(weights[i] * vectors[i])

Parameters
y        Output vector [n]
vectors  Array of k input vector pointers, each [n]
weights  Array of k scalar weights
k        Number of vectors to combine
n        Vector length

Definition at line 155 of file axpy_kernels.c.

160 {
161  if (!y || !vectors || !weights || k <= 0 || n <= 0) {
162  return;
163  }
164 
165  /* Initialize with first vector */
166  scal_copy_f32(y, vectors[0], weights[0], n);
167 
168  /* Accumulate rest */
169  for (int i = 1; i < k; i++) {
170  axpy_f32(y, vectors[i], weights[i], n);
171  }
172 }
void scal_copy_f32(float *y, const float *x, float alpha, int n)
Scaled copy: y = alpha * x.
Definition: axpy_kernels.c:105

References axpy_f32(), and scal_copy_f32().