← Back to C-Kernel-Engine Docs Doxygen Source Documentation
rope_kernels_bf16.c File Reference

RoPE (Rotary Position Embedding) kernels for BF16. More...

#include <stdint.h>
#include "bf16_utils.h"
#include "ckernel_engine.h"

Go to the source code of this file.

Functions

void rope_backward_bf16 (const uint16_t *d_out, uint16_t *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_d_out, float *scratch_d_x)
 
void rope_backward_qk_bf16 (const uint16_t *d_q_out, const uint16_t *d_k_out, uint16_t *d_q, uint16_t *d_k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_dq_out, float *scratch_dq, float *scratch_dk_out, float *scratch_dk)
 
void rope_forward_bf16 (uint16_t *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch)
 
void rope_forward_qk_bf16 (uint16_t *q, uint16_t *k, const float *cos_cache, const float *sin_cache, int num_heads, int num_kv_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_q, float *scratch_k)
 

Detailed Description

RoPE (Rotary Position Embedding) kernels for BF16.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free — memory is provided through a bump allocator; all buffer and scratch pointers are passed in by the caller
  2. NO OpenMP — parallelization happens at the orchestrator/codegen layer, not inside kernels
  3. Every kernel API must fully specify its inputs, outputs, workspace requirements, and memory layouts
  4. Pure computation only — deterministic results, no side effects

After making changes, run: make test && make llamacpp-parity-full

Definition in file rope_kernels_bf16.c.

Function Documentation

◆ rope_backward_bf16()

void rope_backward_bf16 ( const uint16_t *  d_out,
uint16_t *  d_x,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch_d_out,
float *  scratch_d_x 
)

Definition at line 52 of file rope_kernels_bf16.c.

63 {
64  if (!scratch_d_out || !scratch_d_x) return;
65 
66  size_t total = (size_t)num_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
67 
68  bf16_tensor_to_float(d_out, scratch_d_out, total);
69  rope_backward(scratch_d_out, scratch_d_x, cos_cache, sin_cache,
70  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset);
71  float_tensor_to_bf16(scratch_d_x, d_x, total);
72 }
static void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
Definition: bf16_utils.h:271
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250
void rope_backward(const float *d_out, float *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:238

References bf16_tensor_to_float(), float_tensor_to_bf16(), and rope_backward().

Referenced by rope_backward_qk_bf16().

◆ rope_backward_qk_bf16()

void rope_backward_qk_bf16 ( const uint16_t *  d_q_out,
const uint16_t *  d_k_out,
uint16_t *  d_q,
uint16_t *  d_k,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch_dq_out,
float *  scratch_dq,
float *  scratch_dk_out,
float *  scratch_dk 
)

Definition at line 103 of file rope_kernels_bf16.c.

119 {
120  if (!d_q_out || !d_k_out || !d_q || !d_k) return;
121 
122  rope_backward_bf16(d_q_out, d_q, cos_cache, sin_cache,
123  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset,
124  scratch_dq_out, scratch_dq);
125  rope_backward_bf16(d_k_out, d_k, cos_cache, sin_cache,
126  num_kv_heads, num_tokens, head_dim, aligned_head_dim, pos_offset,
127  scratch_dk_out, scratch_dk);
128 }
void rope_backward_bf16(const uint16_t *d_out, uint16_t *d_x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch_d_out, float *scratch_d_x)

References rope_backward_bf16().

◆ rope_forward_bf16()

void rope_forward_bf16 ( uint16_t *  x,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch 
)

Definition at line 28 of file rope_kernels_bf16.c.

37 {
38  if (!scratch) return;
39 
40  size_t total = (size_t)num_heads * (size_t)num_tokens * (size_t)aligned_head_dim;
41 
42  bf16_tensor_to_float(x, scratch, total);
43  rope_forward(scratch, cos_cache, sin_cache,
44  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset);
45  float_tensor_to_bf16(scratch, x, total);
46 }
void rope_forward(float *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset)
Definition: rope_kernels.c:180

References bf16_tensor_to_float(), float_tensor_to_bf16(), and rope_forward().

Referenced by rope_forward_qk_bf16().

◆ rope_forward_qk_bf16()

void rope_forward_qk_bf16 ( uint16_t *  q,
uint16_t *  k,
const float *  cos_cache,
const float *  sin_cache,
int  num_heads,
int  num_kv_heads,
int  num_tokens,
int  head_dim,
int  aligned_head_dim,
int  pos_offset,
float *  scratch_q,
float *  scratch_k 
)

Definition at line 79 of file rope_kernels_bf16.c.

91 {
92  if (!q || !k) return;
93 
94  rope_forward_bf16(q, cos_cache, sin_cache,
95  num_heads, num_tokens, head_dim, aligned_head_dim, pos_offset, scratch_q);
96  rope_forward_bf16(k, cos_cache, sin_cache,
97  num_kv_heads, num_tokens, head_dim, aligned_head_dim, pos_offset, scratch_k);
98 }
void rope_forward_bf16(uint16_t *x, const float *cos_cache, const float *sin_cache, int num_heads, int num_tokens, int head_dim, int aligned_head_dim, int pos_offset, float *scratch)

References rope_forward_bf16().