← Back to C-Kernel-Engine Docs Doxygen Source Documentation
topk_kernels.c File Reference

Top-K selection kernels for MoE router dispatch. More...

#include <stdint.h>
#include <stddef.h>
#include <float.h>
#include <math.h>

Go to the source code of this file.

Functions

int argmax_f32 (const float *scores, int n)
 Find index of maximum value. More...
 
void topk_batched_f32 (const float *scores, int num_tokens, int n_experts, int k, int *indices, float *weights)
 Batched top-K selection for multiple tokens. More...
 
void topk_f32 (const float *scores, int n, int k, int *indices, float *values)
 Find top-K indices and values from a score vector. More...
 
void topk_softmax_f32 (const float *scores, int n, int k, int *indices, float *weights)
 Find top-K indices with softmax-normalized weights. More...
 

Detailed Description

Top-K selection kernels for MoE router dispatch.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Provides efficient top-K selection from a score vector. Used in Mixture-of-Experts models to select which experts process each token.

Operations:

  • topk_f32: Find top-K indices and values from N scores
  • topk_softmax_f32: Top-K with softmax normalization of selected scores

Definition in file topk_kernels.c.

Function Documentation

◆ argmax_f32()

int argmax_f32 ( const float *  scores,
int  n 
)

Find index of maximum value.

Parameters
scoresInput scores [n]
nNumber of scores
Returns
Index of maximum value

Definition at line 226 of file topk_kernels.c.

227 {
228  if (!scores || n <= 0) {
229  return -1;
230  }
231 
232  int max_idx = 0;
233  float max_val = scores[0];
234 
235 #ifdef __AVX512F__
236  /* AVX-512 vectorized argmax for large arrays */
237  if (n >= 16) {
238  __m512 vmax = _mm512_set1_ps(-FLT_MAX);
239  __m512i vidx = _mm512_setzero_si512();
240  __m512i vcur_max_idx = _mm512_setzero_si512();
241 
242  int i = 0;
243  for (; i + 16 <= n; i += 16) {
244  __m512 v = _mm512_loadu_ps(&scores[i]);
245  __m512i cur_idx = _mm512_add_epi32(
246  _mm512_set1_epi32(i),
247  _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
248  );
249 
250  __mmask16 gt_mask = _mm512_cmp_ps_mask(v, vmax, _CMP_GT_OQ);
251  vmax = _mm512_mask_blend_ps(gt_mask, vmax, v);
252  vcur_max_idx = _mm512_mask_blend_epi32(gt_mask, vcur_max_idx, cur_idx);
253  }
254 
255  /* Horizontal reduction */
256  float vals[16];
257  int idxs[16];
258  _mm512_storeu_ps(vals, vmax);
259  _mm512_storeu_si512(idxs, vcur_max_idx);
260 
261  max_val = vals[0];
262  max_idx = idxs[0];
263  for (int j = 1; j < 16; j++) {
264  if (vals[j] > max_val) {
265  max_val = vals[j];
266  max_idx = idxs[j];
267  }
268  }
269 
270  /* Handle remainder */
271  for (; i < n; i++) {
272  if (scores[i] > max_val) {
273  max_val = scores[i];
274  max_idx = i;
275  }
276  }
277 
278  return max_idx;
279  }
280 #endif
281 
282  /* Scalar fallback */
283  for (int i = 1; i < n; i++) {
284  if (scores[i] > max_val) {
285  max_val = scores[i];
286  max_idx = i;
287  }
288  }
289 
290  return max_idx;
291 }

◆ topk_batched_f32()

void topk_batched_f32 ( const float *  scores,
int  num_tokens,
int  n_experts,
int  k,
int *  indices,
float *  weights 
)

Batched top-K selection for multiple tokens.

Parameters
scoresInput scores [num_tokens, n_experts]
num_tokensNumber of tokens
n_expertsNumber of experts
kNumber of experts to select per token
indicesOutput: selected expert indices [num_tokens, k]
weightsOutput: routing weights [num_tokens, k] (can be NULL for no softmax)

Definition at line 191 of file topk_kernels.c.

197 {
198  if (!scores || !indices || num_tokens <= 0 || n_experts <= 0 || k <= 0) {
199  return;
200  }
201 
202  for (int t = 0; t < num_tokens; t++) {
203  const float *token_scores = scores + t * n_experts;
204  int *token_indices = indices + t * k;
205 
206  if (weights) {
207  float *token_weights = weights + t * k;
208  topk_softmax_f32(token_scores, n_experts, k, token_indices, token_weights);
209  } else {
210  topk_f32(token_scores, n_experts, k, token_indices, NULL);
211  }
212  }
213 }
void topk_f32(const float *scores, int n, int k, int *indices, float *values)
Find top-K indices and values from a score vector.
Definition: topk_kernels.c:49
void topk_softmax_f32(const float *scores, int n, int k, int *indices, float *weights)
Find top-K indices with softmax-normalized weights.
Definition: topk_kernels.c:134

References topk_f32(), and topk_softmax_f32().

◆ topk_f32()

void topk_f32 ( const float *  scores,
int  n,
int  k,
int *  indices,
float *  values 
)

Find top-K indices and values from a score vector.

Parameters
scoresInput scores [n]
nNumber of scores (e.g., number of experts)
kNumber of top scores to select
indicesOutput: indices of top-K scores [k], sorted descending by value
valuesOutput: top-K score values [k], sorted descending (can be NULL)

Definition at line 49 of file topk_kernels.c.

54 {
55  if (!scores || !indices || n <= 0 || k <= 0) {
56  return;
57  }
58 
59  /* Clamp k to n */
60  if (k > n) {
61  k = n;
62  }
63 
64  /* Initialize with first k elements */
65  float local_values[k];
66  for (int i = 0; i < k; i++) {
67  indices[i] = i;
68  local_values[i] = scores[i];
69  }
70 
71  /* Find the minimum in our current top-k */
72  int min_idx = 0;
73  for (int i = 1; i < k; i++) {
74  if (local_values[i] < local_values[min_idx]) {
75  min_idx = i;
76  }
77  }
78 
79  /* Scan remaining elements */
80  for (int i = k; i < n; i++) {
81  if (scores[i] > local_values[min_idx]) {
82  /* Replace the minimum */
83  indices[min_idx] = i;
84  local_values[min_idx] = scores[i];
85 
86  /* Find new minimum */
87  min_idx = 0;
88  for (int j = 1; j < k; j++) {
89  if (local_values[j] < local_values[min_idx]) {
90  min_idx = j;
91  }
92  }
93  }
94  }
95 
96  /* Sort results in descending order (simple insertion sort for small k) */
97  for (int i = 1; i < k; i++) {
98  float val = local_values[i];
99  int idx = indices[i];
100  int j = i - 1;
101  while (j >= 0 && local_values[j] < val) {
102  local_values[j + 1] = local_values[j];
103  indices[j + 1] = indices[j];
104  j--;
105  }
106  local_values[j + 1] = val;
107  indices[j + 1] = idx;
108  }
109 
110  /* Copy values if output requested */
111  if (values) {
112  for (int i = 0; i < k; i++) {
113  values[i] = local_values[i];
114  }
115  }
116 }

Referenced by topk_batched_f32(), and topk_softmax_f32().

◆ topk_softmax_f32()

void topk_softmax_f32 ( const float *  scores,
int  n,
int  k,
int *  indices,
float *  weights 
)

Find top-K indices with softmax-normalized weights.

Parameters
scoresInput scores [n] (router logits)
nNumber of scores
kNumber of top scores to select
indicesOutput: indices of top-K scores [k]
weightsOutput: softmax-normalized weights for selected [k], sum to 1.0

Definition at line 134 of file topk_kernels.c.

139 {
140  if (!scores || !indices || !weights || n <= 0 || k <= 0) {
141  return;
142  }
143 
144  if (k > n) {
145  k = n;
146  }
147 
148  /* First get top-K indices and values */
149  float values[k];
150  topk_f32(scores, n, k, indices, values);
151 
152  /* Compute softmax over the selected values */
153  /* Find max for numerical stability */
154  float max_val = values[0];
155  for (int i = 1; i < k; i++) {
156  if (values[i] > max_val) {
157  max_val = values[i];
158  }
159  }
160 
161  /* Compute exp and sum */
162  float sum = 0.0f;
163  for (int i = 0; i < k; i++) {
164  weights[i] = expf(values[i] - max_val);
165  sum += weights[i];
166  }
167 
168  /* Normalize */
169  float inv_sum = 1.0f / sum;
170  for (int i = 0; i < k; i++) {
171  weights[i] *= inv_sum;
172  }
173 }

References topk_f32().

Referenced by topk_batched_f32().