kv_cache_kernels.c File Reference

KV-cache helper kernels (head-major layout).

#include "ckernel_engine.h"
#include <stddef.h>
#include <string.h>


Functions

void kv_cache_repack_head_major_inplace (float *buf, int num_heads, int tokens, int cache_capacity, int aligned_head_dim)
 
void kv_cache_store (float *__restrict kv_cache_k, float *__restrict kv_cache_v, const float *__restrict k, const float *__restrict v, int layer, int pos, int num_kv_heads, int head_dim, int max_seq_len)
 
void kv_cache_write_head_major (const float *__restrict k_token, const float *__restrict v_token, float *__restrict k_cache, float *__restrict v_cache, int num_kv_heads, int token_index, int cache_capacity, int head_dim, int aligned_head_dim)
 
void logits_copy_to_position (const float *__restrict src, float *__restrict dst, int position, int vocab_size)
 Copy logits to position-indexed location in output buffer.
 

Detailed Description

KV-cache helper kernels (head-major layout)

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Small, explicit helpers used by the runtime/orchestrator to maintain per-layer KV caches during autoregressive decoding.

Layout: k_cache[kv_head, token, aligned_head_dim] and v_cache[kv_head, token, aligned_head_dim], with contiguous row-major storage and per-token stride aligned_head_dim.
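For orientation, a minimal indexing sketch implied by this layout (the helper name ckhm_index is illustrative, not part of the engine API):

    /* Flat offset of element d of token `token` in head `kv_head`:
     * each head owns cache_capacity rows of aligned_head_dim floats. */
    static inline size_t ckhm_index(int kv_head, int token, int d,
                                    int cache_capacity, int aligned_head_dim)
    {
        return ((size_t)kv_head * (size_t)cache_capacity + (size_t)token)
                   * (size_t)aligned_head_dim
               + (size_t)d;
    }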

Definition in file kv_cache_kernels.c.

Function Documentation

◆ kv_cache_repack_head_major_inplace()

void kv_cache_repack_head_major_inplace(float *buf,
                                        int num_heads,
                                        int tokens,
                                        int cache_capacity,
                                        int aligned_head_dim)

Definition at line 28 of file kv_cache_kernels.c.

{
    if (!buf) {
        return;
    }
    if (num_heads <= 0 || tokens <= 0 || cache_capacity <= 0 || aligned_head_dim <= 0) {
        return;
    }
    if (tokens > cache_capacity) {
        tokens = cache_capacity;
    }
    if (tokens == cache_capacity) {
        return;
    }

    const size_t old_head_stride = (size_t)tokens * (size_t)aligned_head_dim;
    const size_t new_head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
    const size_t bytes = (size_t)tokens * (size_t)aligned_head_dim * sizeof(float);

    // Move head blocks from high to low to avoid overwriting source data
    // for heads that have not yet been moved.
    for (int h = num_heads - 1; h >= 0; --h) {
        float *src = buf + (size_t)h * old_head_stride;
        float *dst = buf + (size_t)h * new_head_stride;
        memmove(dst, src, bytes);
    }
}

Referenced by qwen2_0_5b_decode_forward_prefill_impl().
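The repack converts a densely packed prefill cache, laid out as [num_heads, tokens, aligned_head_dim], into the full-capacity layout [num_heads, cache_capacity, aligned_head_dim] without a scratch buffer. A hedged usage sketch (the prefill flow and variable names are assumptions based on the reference above, not taken from the generated code):

    /* buf must already be sized for num_heads * cache_capacity *
     * aligned_head_dim floats; only the first prompt_len rows per
     * head contain data after prefill. */
    kv_cache_repack_head_major_inplace(k_cache, num_kv_heads,
                                       prompt_len, max_seq_len,
                                       aligned_head_dim);
    kv_cache_repack_head_major_inplace(v_cache, num_kv_heads,
                                       prompt_len, max_seq_len,
                                       aligned_head_dim);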

◆ kv_cache_store()

void kv_cache_store(float *__restrict kv_cache_k,
                    float *__restrict kv_cache_v,
                    const float *__restrict k,
                    const float *__restrict v,
                    int layer,
                    int pos,
                    int num_kv_heads,
                    int head_dim,
                    int max_seq_len)

Definition at line 101 of file kv_cache_kernels.c.

{
    (void)layer;
    kv_cache_write_head_major(k, v,
                              kv_cache_k, kv_cache_v,
                              num_kv_heads,
                              pos,
                              max_seq_len,
                              head_dim,
                              head_dim);
}

References kv_cache_write_head_major().
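kv_cache_store is thus a thin wrapper for the unpadded case: it forwards to kv_cache_write_head_major with cache_capacity = max_seq_len and aligned_head_dim = head_dim, and ignores layer because callers pass per-layer cache base pointers directly. A hedged call sketch (variable names are illustrative):

    /* Append the current token's K/V rows at position pos of this
     * layer's caches (rows are head_dim floats, no padding). */
    kv_cache_store(k_cache_layer, v_cache_layer,
                   k_proj, v_proj,
                   layer, pos,
                   num_kv_heads, head_dim, max_seq_len);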

◆ kv_cache_write_head_major()

void kv_cache_write_head_major(const float *__restrict k_token,
                               const float *__restrict v_token,
                               float *__restrict k_cache,
                               float *__restrict v_cache,
                               int num_kv_heads,
                               int token_index,
                               int cache_capacity,
                               int head_dim,
                               int aligned_head_dim)

Definition at line 60 of file kv_cache_kernels.c.

{
    if (!k_token || !v_token || !k_cache || !v_cache) {
        return;
    }
    if (num_kv_heads <= 0 || token_index < 0 || cache_capacity <= 0) {
        return;
    }
    if (token_index >= cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) {
        return;
    }

    const size_t head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
    const size_t token_stride = (size_t)aligned_head_dim;

    for (int h = 0; h < num_kv_heads; ++h) {
        const float *k_src = k_token + (size_t)h * token_stride;
        const float *v_src = v_token + (size_t)h * token_stride;

        float *k_dst = k_cache + (size_t)h * head_stride + (size_t)token_index * token_stride;
        float *v_dst = v_cache + (size_t)h * head_stride + (size_t)token_index * token_stride;

        for (int d = 0; d < head_dim; ++d) {
            k_dst[d] = k_src[d];
            v_dst[d] = v_src[d];
        }
        for (int d = head_dim; d < aligned_head_dim; ++d) {
            k_dst[d] = 0.0f;
            v_dst[d] = 0.0f;
        }
    }
}

Referenced by ck_layer_forward_rmsnorm_swiglu_decode(), ck_layer_forward_rmsnorm_swiglu_decode_fused(), ck_layer_forward_rmsnorm_swiglu_decode_fused_attn_impl(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), ck_layer_forward_rmsnorm_swiglu_decode_quant(), kv_cache_store(), mega_fused_attention_decode(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().
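Note the stride asymmetry: the source k_token/v_token are packed as [num_kv_heads, aligned_head_dim] for a single token, while the destinations use the full head-major cache layout, and the tail [head_dim, aligned_head_dim) of each written row is zero-filled. A hedged usage sketch (the 16-float alignment is an assumption for illustration, not the engine's actual padding rule):

    const int aligned_head_dim = (head_dim + 15) & ~15; /* assumed padding */

    /* Write decode token t into padded head-major caches. */
    kv_cache_write_head_major(k_token, v_token,
                              k_cache, v_cache,
                              num_kv_heads,
                              t, max_seq_len,
                              head_dim, aligned_head_dim);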

◆ logits_copy_to_position()

void logits_copy_to_position(const float *__restrict src,
                             float *__restrict dst,
                             int position,
                             int vocab_size)

Copy logits to position-indexed location in output buffer.

Used in decode mode to copy single-token logits from position 0 to the correct sequence position. This moves buffer management logic from codegen to the IR layer, making codegen "dumb": it just emits kernel calls, with no runtime if-statements.

Parameters
    src         Source logits buffer (single token) [vocab_size]
    dst         Destination logits buffer [max_seq_len, vocab_size]
    position    Token position index (0-based)
    vocab_size  Number of logits per token

Definition at line 134 of file kv_cache_kernels.c.

{
    if (!src || !dst || position < 0 || vocab_size <= 0) {
        return;
    }

    // Copy logits to dst[position * vocab_size : (position+1) * vocab_size]
    // Use memmove for safety in case src and dst overlap (e.g., src == dst)
    float *dst_pos = dst + (size_t)position * (size_t)vocab_size;
    memmove(dst_pos, src, (size_t)vocab_size * sizeof(float));
}

References vocab_size.
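A hedged usage sketch (buffer names are illustrative): in decode mode the model emits logits for one token into a scratch row, which this kernel relocates to the row for the current sequence position.

    /* logits_step holds vocab_size floats for the token just decoded;
     * logits_out is laid out as [max_seq_len, vocab_size]. */
    logits_copy_to_position(logits_step, logits_out, pos, vocab_size);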