← Back to C-Kernel-Engine Docs Doxygen Source Documentation
softmax_kernels_bf16.c
Go to the documentation of this file.
1 /**
2  * @file softmax_kernels_bf16.c
3  * @brief Softmax kernels for BF16 tensors
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * Softmax: y[i] = exp(x[i] - max(x)) / sum(exp(x - max(x)))
15  */
16 
17 #include <stddef.h>
18 #include <stdint.h>
19 
20 #include "bf16_utils.h"
21 #include "ckernel_engine.h"
22 
23 /* Suppress false positive warnings about uninitialized variables */
24 #pragma GCC diagnostic push
25 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
26 
/*
 * Causal softmax over a BF16 score tensor, evaluated in FP32.
 *
 * The BF16 values in `scores` are widened into `scratch`, the FP32 kernel
 * causal_softmax_head_major() is run on that buffer, and the result is
 * narrowed back into `scores` in place.
 *
 * scores:  [num_heads * aligned_context_window * aligned_context_window]
 *          BF16 values; overwritten with the converted result.
 * scratch: caller-provided workspace of at least the same number of floats.
 *
 * num_tokens is forwarded unchanged to the FP32 kernel; the conversions
 * always cover the full padded tensor, not just the first num_tokens rows.
 *
 * Returns immediately, touching no buffer, if either pointer is NULL or any
 * dimension is not positive.
 */
void causal_softmax_head_major_bf16(uint16_t *scores,
                                    int num_heads,
                                    int num_tokens,
                                    int aligned_context_window,
                                    float *scratch)
{
    const int have_buffers = (scores != NULL) && (scratch != NULL);
    const int have_shape = (num_heads > 0) && (num_tokens > 0) &&
                           (aligned_context_window > 0);

    if (!have_buffers || !have_shape) {
        return;
    }

    /* Widen before multiplying so the element count is computed in size_t. */
    const size_t window = (size_t)aligned_context_window;
    const size_t elem_count = (size_t)num_heads * window * window;

    /* BF16 -> FP32, run the float kernel, FP32 -> BF16 back into the input. */
    bf16_tensor_to_float(scores, scratch, elem_count);
    causal_softmax_head_major(scratch, num_heads, num_tokens, aligned_context_window);
    float_tensor_to_bf16(scratch, scores, elem_count);
}
48 
/*
 * Backward pass of the causal softmax for BF16 tensors, evaluated in FP32.
 *
 * Both BF16 inputs are widened into their scratch buffers, the FP32 kernel
 * backward_causal_softmax_head_major() is run on them, and the gradient is
 * narrowed back into `d_scores` in place.
 *
 * d_scores:         [num_heads * aligned_context_window * aligned_context_window]
 *                   BF16 values; overwritten with the converted result.
 * weights:          same element count, BF16; read only, never written back.
 * scratch_d_scores: caller-provided workspace of at least that many floats.
 * scratch_weights:  caller-provided workspace of at least that many floats.
 *
 * num_tokens is forwarded unchanged to the FP32 kernel; the conversions
 * always cover the full padded tensor.
 *
 * Returns immediately, touching no buffer, if any pointer is NULL or any
 * dimension is not positive.
 */
void backward_causal_softmax_head_major_bf16(uint16_t *d_scores,
                                             const uint16_t *weights,
                                             int num_heads,
                                             int num_tokens,
                                             int aligned_context_window,
                                             float *scratch_d_scores,
                                             float *scratch_weights)
{
    if (!d_scores || !weights || num_heads <= 0 || num_tokens <= 0 || aligned_context_window <= 0) {
        return;
    }
    if (!scratch_d_scores || !scratch_weights) {
        return;
    }

    /* Widen before multiplying so the element count is computed in size_t. */
    const size_t total = (size_t)num_heads *
                         (size_t)aligned_context_window *
                         (size_t)aligned_context_window;

    bf16_tensor_to_float(d_scores, scratch_d_scores, total);
    bf16_tensor_to_float(weights, scratch_weights, total);
    backward_causal_softmax_head_major(scratch_d_scores, scratch_weights,
                                       num_heads, num_tokens, aligned_context_window);
    /* Only the gradient is converted back; weights stay untouched. */
    float_tensor_to_bf16(scratch_d_scores, d_scores, total);
}
73 
74 #pragma GCC diagnostic pop
static void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
Definition: bf16_utils.h:271
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250
void backward_causal_softmax_head_major(float *d_scores, const float *weights, int num_heads, int num_tokens, int aligned_context_window)
void causal_softmax_head_major(float *scores, int num_heads, int num_tokens, int aligned_context_window)
void backward_causal_softmax_head_major_bf16(uint16_t *d_scores, const uint16_t *weights, int num_heads, int num_tokens, int aligned_context_window, float *scratch_d_scores, float *scratch_weights)
void causal_softmax_head_major_bf16(uint16_t *scores, int num_heads, int num_tokens, int aligned_context_window, float *scratch)