C-Kernel-Engine/doxygen/softmax__kernels__bf16_8c_source.html

 /**

  * @file softmax_kernels_bf16.c

  * @brief Softmax kernels for BF16 tensors

  *

  * CK-ENGINE KERNEL RULES:

  * =======================

  * 1. NO malloc/free - memory via bump allocator, pointers passed in

  * 2. NO OpenMP - parallelization at orchestrator/codegen layer

  * 3. API must define: inputs, outputs, workspace, and memory layouts

  * 4. Pure computation - deterministic, no side effects

  *

  * After changes: make test && make llamacpp-parity-full

  *

  * Softmax: y[i] = exp(x[i] - max(x)) / sum(exp(x - max(x)))

  */


 #include <stddef.h>

 #include <stdint.h>


 #include "bf16_utils.h"

 #include "ckernel_engine.h"


 /* Suppress false positive warnings about uninitialized variables */

 #pragma GCC diagnostic push

 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"


 /**
  * @brief In-place causal softmax on a BF16 score tensor, evaluated in float.
  *
  * The kernel works in three steps:
  *   1. widen every BF16 element of @p scores into @p scratch,
  *   2. run causal_softmax_head_major() on the float copy,
  *   3. narrow the float result back into @p scores.
  *
  * The number of elements converted in steps 1 and 3 is
  * num_heads * aligned_context_window * aligned_context_window; it does not
  * depend on @p num_tokens, which is only forwarded to the float kernel.
  *
  * @param scores                 BF16 scores, read and then overwritten.
  * @param num_heads              Number of heads; must be > 0.
  * @param num_tokens             Token count forwarded to the float kernel; must be > 0.
  * @param aligned_context_window Per-head row/column extent used for sizing; must be > 0.
  * @param scratch                Caller-provided float workspace holding at least
  *                               num_heads * aligned_context_window * aligned_context_window
  *                               floats. Its contents are overwritten.
  *
  * The call is a silent no-op when either pointer is NULL or any dimension
  * is not strictly positive.
  */
 void causal_softmax_head_major_bf16(uint16_t *scores,
                                     int num_heads,
                                     int num_tokens,
                                     int aligned_context_window,
                                     float *scratch)
 {
     const int buffers_ok = (scores != NULL) && (scratch != NULL);
     const int dims_ok = (num_heads > 0) && (num_tokens > 0) && (aligned_context_window > 0);

     if (!buffers_ok || !dims_ok) {
         return;
     }

     /* Widen to size_t before multiplying so the product is not computed in int. */
     const size_t window = (size_t)aligned_context_window;
     const size_t elem_count = (size_t)num_heads * window * window;

     /* BF16 -> float, compute, float -> BF16. */
     bf16_tensor_to_float(scores, scratch, elem_count);
     causal_softmax_head_major(scratch, num_heads, num_tokens, aligned_context_window);
     float_tensor_to_bf16(scratch, scores, elem_count);
 }


 /**
  * @brief Backward pass of the causal softmax for BF16 tensors, evaluated in float.
  *
  * Both BF16 inputs are widened into their float workspaces,
  * backward_causal_softmax_head_major() runs on the float copies, and the
  * float gradient is narrowed back into @p d_scores. @p weights is only read.
  *
  * Each conversion covers
  * num_heads * aligned_context_window * aligned_context_window elements; this
  * count does not depend on @p num_tokens, which is only forwarded to the
  * float kernel.
  *
  * @param d_scores               BF16 gradient buffer, read and then overwritten.
  * @param weights                BF16 weights buffer (read-only).
  * @param num_heads              Number of heads; must be > 0.
  * @param num_tokens             Token count forwarded to the float kernel; must be > 0.
  * @param aligned_context_window Per-head row/column extent used for sizing; must be > 0.
  * @param scratch_d_scores       Caller-provided float workspace for the gradient, at least
  *                               num_heads * aligned_context_window * aligned_context_window floats.
  * @param scratch_weights        Caller-provided float workspace for the weights, same size.
  *
  * The call is a silent no-op when any pointer is NULL or any dimension is
  * not strictly positive.
  */
 void backward_causal_softmax_head_major_bf16(uint16_t *d_scores,
                                              const uint16_t *weights,
                                              int num_heads,
                                              int num_tokens,
                                              int aligned_context_window,
                                              float *scratch_d_scores,
                                              float *scratch_weights)
 {
     const int tensors_ok = (d_scores != NULL) && (weights != NULL);
     const int workspaces_ok = (scratch_d_scores != NULL) && (scratch_weights != NULL);
     const int dims_ok = (num_heads > 0) && (num_tokens > 0) && (aligned_context_window > 0);

     if (!tensors_ok || !workspaces_ok || !dims_ok) {
         return;
     }

     /* Widen to size_t before multiplying so the product is not computed in int. */
     const size_t window = (size_t)aligned_context_window;
     const size_t elem_count = (size_t)num_heads * window * window;

     /* Stage both operands as float. */
     bf16_tensor_to_float(d_scores, scratch_d_scores, elem_count);
     bf16_tensor_to_float(weights, scratch_weights, elem_count);

     backward_causal_softmax_head_major(scratch_d_scores,
                                        scratch_weights,
                                        num_heads,
                                        num_tokens,
                                        aligned_context_window);

     /* Only the gradient is written back; weights stay untouched. */
     float_tensor_to_bf16(scratch_d_scores, d_scores, elem_count);
 }


 #pragma GCC diagnostic pop

bf16_utils.h

float_tensor_to_bf16
static void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
Definition: bf16_utils.h:271

bf16_tensor_to_float
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250

ckernel_engine.h

backward_causal_softmax_head_major
void backward_causal_softmax_head_major(float *d_scores, const float *weights, int num_heads, int num_tokens, int aligned_context_window)
Definition: softmax_kernels.c:382

causal_softmax_head_major
void causal_softmax_head_major(float *scores, int num_heads, int num_tokens, int aligned_context_window)
Definition: softmax_kernels.c:144

backward_causal_softmax_head_major_bf16
void backward_causal_softmax_head_major_bf16(uint16_t *d_scores, const uint16_t *weights, int num_heads, int num_tokens, int aligned_context_window, float *scratch_d_scores, float *scratch_weights)
Definition: softmax_kernels_bf16.c:53

causal_softmax_head_major_bf16
void causal_softmax_head_major_bf16(uint16_t *scores, int num_heads, int num_tokens, int aligned_context_window, float *scratch)
Definition: softmax_kernels_bf16.c:31