← Back to C-Kernel-Engine Docs Doxygen Source Documentation
rope_kernels_bf16.c File Reference

RoPE (Rotary Position Embedding) kernels for BF16. More...

#include <stdint.h>
#include "bf16_utils.h"
#include "ckernel_engine.h"

Go to the source code of this file.

Functions

void rope_backward_bf16 (const uint16_t *d_out, uint16_t *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_d_out, float *scratch_d_x)
 
void rope_backward_qk_bf16 (const uint16_t *d_q_out, const uint16_t *d_k_out, uint16_t *d_q, uint16_t *d_k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_dq_out, float *scratch_dq, float *scratch_dk_out, float *scratch_dk)
 
void rope_forward_bf16 (uint16_t *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch)
 
void rope_forward_qk_bf16 (uint16_t *q, uint16_t *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_q, float *scratch_k)
 

Detailed Description

RoPE (Rotary Position Embedding) kernels for BF16.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free — memory is provided through a bump allocator; all buffer and scratch pointers are passed in by the caller
  2. NO OpenMP — parallelization happens at the orchestrator/codegen layer, not inside kernels
  3. Every kernel API must fully specify its inputs, outputs, workspace requirements, and memory layouts
  4. Pure computation only — deterministic results, no side effects

After making changes, run: make test && make llamacpp-parity-full

Definition in file rope_kernels_bf16.c.

Function Documentation

◆ rope_backward_bf16()

void rope_backward_bf16 ( const uint16_t *  d_out,
uint16_t *  d_x,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch_d_out,
float *  scratch_d_x 
)

Definition at line 52 of file rope_kernels_bf16.c.

63 {
64  if (!scratch_d_out || !scratch_d_x) return;
65 
66  size_t total = (size_t)num_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
67 
68  bf16_tensor_to_float(d_out, scratch_d_out, total);
69  rope_backward(scratch_d_out, scratch_d_x, cos_cache, sin_cache,
70  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset);
71  float_tensor_to_bf16(scratch_d_x, d_x, total);
72 }
static void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
Definition: bf16_utils.h:271
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250
void rope_backward(const float *d_out, float *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:238

References bf16_tensor_to_float(), float_tensor_to_bf16(), and rope_backward().

Referenced by rope_backward_qk_bf16().

◆ rope_backward_qk_bf16()

void rope_backward_qk_bf16 ( const uint16_t *  d_q_out,
const uint16_t *  d_k_out,
uint16_t *  d_q,
uint16_t *  d_k,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch_dq_out,
float *  scratch_dq,
float *  scratch_dk_out,
float *  scratch_dk 
)

Definition at line 103 of file rope_kernels_bf16.c.

119 {
120  if (!d_q_out || !d_k_out || !d_q || !d_k) return;
121 
122  rope_backward_bf16(d_q_out, d_q, cos_cache, sin_cache,
123  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset,
124  scratch_dq_out, scratch_dq);
125  rope_backward_bf16(d_k_out, d_k, cos_cache, sin_cache,
126  num_kv_heads, num_tokens, head_dim, aligned_head_dim, pos_offset,
127  scratch_dk_out, scratch_dk);
128 }
void rope_backward_bf16(const uint16_t *d_out, uint16_t *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_d_out, float *scratch_d_x)

References rope_backward_bf16().

◆ rope_forward_bf16()

void rope_forward_bf16 ( uint16_t *  x,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch 
)

Definition at line 28 of file rope_kernels_bf16.c.

37 {
38  if (!scratch) return;
39 
40  size_t total = (size_t)num_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
41 
42  bf16_tensor_to_float(x, scratch, total);
43  rope_forward(scratch, cos_cache, sin_cache,
44  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset);
45  float_tensor_to_bf16(scratch, x, total);
46 }
void rope_forward(float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:180

References bf16_tensor_to_float(), float_tensor_to_bf16(), and rope_forward().

Referenced by rope_forward_qk_bf16().

◆ rope_forward_qk_bf16()

void rope_forward_qk_bf16 ( uint16_t *  q,
uint16_t *  k,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch_q,
float *  scratch_k 
)

Definition at line 79 of file rope_kernels_bf16.c.

91 {
92  if (!q || !k) return;
93 
94  rope_forward_bf16(q, cos_cache, sin_cache,
95  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset, scratch_q);
96  rope_forward_bf16(k, cos_cache, sin_cache,
97  num_kv_heads, num_tokens, head_dim, aligned_head_dim, pos_offset, scratch_k);
98 }
void rope_forward_bf16(uint16_t *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch)

References rope_forward_bf16().