← Back to C-Kernel-Engine Docs Doxygen Source Documentation
kv_cache_kernels.c
Go to the documentation of this file.
1 /**
2  * @file kv_cache_kernels.c
3  * @brief KV-cache helper kernels (head-major layout)
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. API must define: inputs, outputs, workspace, and memory layouts
10  * 4. Pure computation - deterministic, no side effects
11  *
12  * After changes: make test && make llamacpp-parity-full
13  *
14  * Small, explicit helpers used by the runtime/orchestrator to maintain
15  * per-layer KV caches during autoregressive decoding.
16  *
17  * Layout:
18  * k_cache[kv_head, token, aligned_head_dim]
19  * v_cache[kv_head, token, aligned_head_dim]
20  * with contiguous row-major storage and stride aligned_head_dim.
21  */
22 
23 #include "ckernel_engine.h"
24 
25 #include <stddef.h>
26 #include <string.h>
27 
/**
 * @brief Repack a head-major KV cache in place after its capacity grows.
 *
 * The cache was previously stored tightly packed with per-head stride
 * tokens * aligned_head_dim; once the per-head capacity becomes
 * cache_capacity, the stride grows to cache_capacity * aligned_head_dim.
 * Head blocks are moved from the highest head downward so no source block
 * is overwritten before it is read; memmove covers any residual overlap
 * within a single block move.
 *
 * @param buf              Cache buffer (in/out); must hold at least
 *                         num_heads * cache_capacity * aligned_head_dim floats.
 * @param num_heads        Number of KV heads.
 * @param tokens           Valid tokens currently stored per head (clamped to
 *                         cache_capacity).
 * @param cache_capacity   New per-head token capacity.
 * @param aligned_head_dim Padded per-token row stride, in floats.
 */
void kv_cache_repack_head_major_inplace(float *buf,
                                        int num_heads,
                                        int tokens,
                                        int cache_capacity,
                                        int aligned_head_dim)
{
    /* Defensive no-ops on NULL buffer or non-positive dimensions. */
    if (!buf) {
        return;
    }
    if (num_heads <= 0 || tokens <= 0 || cache_capacity <= 0 || aligned_head_dim <= 0) {
        return;
    }
    if (tokens > cache_capacity) {
        tokens = cache_capacity;
    }
    if (tokens == cache_capacity) {
        /* Old and new strides coincide: layout is already correct. */
        return;
    }

    const size_t old_head_stride = (size_t)tokens * (size_t)aligned_head_dim;
    const size_t new_head_stride = (size_t)cache_capacity * (size_t)aligned_head_dim;
    const size_t bytes = (size_t)tokens * (size_t)aligned_head_dim * sizeof(float);

    /* Move head blocks from high to low to avoid overwriting source data
     * for heads that have not yet been moved. */
    for (int h = num_heads - 1; h >= 0; --h) {
        float *src = buf + (size_t)h * old_head_stride;
        float *dst = buf + (size_t)h * new_head_stride;
        memmove(dst, src, bytes);
    }
}
59 
/**
 * @brief Append one token's K/V vectors into head-major caches.
 *
 * For each KV head h, copies head_dim floats from the packed per-token
 * inputs (per-head stride aligned_head_dim) into row token_index of that
 * head's cache slab, then zero-fills the alignment padding columns.
 *
 * Layout: cache[kv_head, token, aligned_head_dim], row-major, per-head
 * stride cache_capacity * aligned_head_dim floats.
 */
void kv_cache_write_head_major(const float *__restrict k_token,
                               const float *__restrict v_token,
                               float *__restrict k_cache,
                               float *__restrict v_cache,
                               int num_kv_heads,
                               int token_index,
                               int cache_capacity,
                               int head_dim,
                               int aligned_head_dim)
{
    /* Single combined guard: bad pointers or out-of-range dimensions. */
    if (!k_token || !v_token || !k_cache || !v_cache ||
        num_kv_heads <= 0 || token_index < 0 || cache_capacity <= 0 ||
        token_index >= cache_capacity || head_dim <= 0 || aligned_head_dim <= 0) {
        return;
    }

    const size_t row = (size_t)aligned_head_dim;            /* per-token stride */
    const size_t slab = (size_t)cache_capacity * row;       /* per-head stride  */
    const size_t dst_off = (size_t)token_index * row;       /* row offset       */

    for (int h = 0; h < num_kv_heads; ++h) {
        const size_t src_base = (size_t)h * row;
        float *kd = k_cache + (size_t)h * slab + dst_off;
        float *vd = v_cache + (size_t)h * slab + dst_off;

        /* Copy the valid head_dim entries, then pad the rest with zeros. */
        int d = 0;
        for (; d < head_dim; ++d) {
            kd[d] = k_token[src_base + (size_t)d];
            vd[d] = v_token[src_base + (size_t)d];
        }
        for (; d < aligned_head_dim; ++d) {
            kd[d] = 0.0f;
            vd[d] = 0.0f;
        }
    }
}
100 
/**
 * @brief Store one token's K/V vectors into a single layer's cache.
 *
 * Thin wrapper over kv_cache_write_head_major() with no head-dim padding
 * (aligned_head_dim == head_dim) and cache capacity max_seq_len. The layer
 * index is unused because callers pass per-layer base pointers.
 *
 * @param kv_cache_k   K cache base for this layer
 *                     [num_kv_heads, max_seq_len, head_dim]
 * @param kv_cache_v   V cache base for this layer (same layout)
 * @param k            Packed K vectors for the token [num_kv_heads, head_dim]
 * @param v            Packed V vectors for the token (same layout)
 * @param layer        Unused; kept for interface compatibility.
 * @param pos          0-based token position to write.
 * @param num_kv_heads Number of KV heads.
 * @param head_dim     Per-head vector width in floats.
 * @param max_seq_len  Per-head token capacity of the cache.
 */
void kv_cache_store(float *__restrict kv_cache_k,
                    float *__restrict kv_cache_v,
                    const float *__restrict k,
                    const float *__restrict v,
                    int layer,
                    int pos,
                    int num_kv_heads,
                    int head_dim,
                    int max_seq_len)
{
    (void)layer;
    kv_cache_write_head_major(k, v,
                              kv_cache_k, kv_cache_v,
                              num_kv_heads,
                              pos,
                              max_seq_len,
                              head_dim,
                              head_dim);
}
120 
/**
 * @brief Copy single-token logits to a position-indexed row of the output.
 *
 * Decode-mode helper: relocates one token's logits from a scratch buffer
 * to row `position` of the [max_seq_len, vocab_size] output buffer. This
 * keeps buffer management in the IR layer so codegen stays "dumb" - it
 * emits plain kernel calls with no runtime if-statements.
 *
 * @param src        Source logits buffer (single token) [vocab_size]
 * @param dst        Destination logits buffer [max_seq_len, vocab_size]
 * @param position   Token position index (0-based row into dst)
 * @param vocab_size Number of logits per token
 */
void logits_copy_to_position(const float *__restrict src,
                             float *__restrict dst,
                             int position,
                             int vocab_size)
{
    if (src == NULL || dst == NULL) {
        return;
    }
    if (position < 0 || vocab_size <= 0) {
        return;
    }

    /* memmove tolerates overlap (e.g. src aliasing row 0 of dst). */
    const size_t n = (size_t)vocab_size;
    memmove(dst + (size_t)position * n, src, n * sizeof(float));
}
void kv_cache_repack_head_major_inplace(float *buf, int num_heads, int tokens, int cache_capacity, int aligned_head_dim)
void kv_cache_write_head_major(const float *__restrict k_token, const float *__restrict v_token, float *__restrict k_cache, float *__restrict v_cache, int num_kv_heads, int token_index, int cache_capacity, int head_dim, int aligned_head_dim)
void kv_cache_store(float *__restrict kv_cache_k, float *__restrict kv_cache_v, const float *__restrict k, const float *__restrict v, int layer, int pos, int num_kv_heads, int head_dim, int max_seq_len)
void logits_copy_to_position(const float *__restrict src, float *__restrict dst, int position, int vocab_size)
Copy logits to position-indexed location in output buffer.
int vocab_size
Definition: true_bpe.h:185