KV-cache helper kernels (head-major layout) More...
Go to the source code of this file.
Functions | |
| void | kv_cache_repack_head_major_inplace (float *buf, int num_heads, int tokens, int cache_capacity, int aligned_head_dim) |
| void | kv_cache_store (float *__restrict kv_cache_k, float *__restrict kv_cache_v, const float *__restrict k, const float *__restrict v, int layer, int pos, int num_kv_heads, int head_dim, int max_seq_len) |
| void | kv_cache_write_head_major (const float *__restrict k_token, const float *__restrict v_token, float *__restrict k_cache, float *__restrict v_cache, int num_kv_heads, int token_index, int cache_capacity, int head_dim, int aligned_head_dim) |
| void | logits_copy_to_position (const float *__restrict src, float *__restrict dst, int position, int vocab_size) |
| Copy logits to position-indexed location in output buffer. More... | |
KV-cache helper kernels (head-major layout)
After modifying these kernels, run `make test && make llamacpp-parity-full` to verify correctness and parity with llama.cpp.
Small, explicit helpers used by the runtime/orchestrator to maintain per-layer KV caches during autoregressive decoding.
Layout: `k_cache[kv_head, token, aligned_head_dim]` and `v_cache[kv_head, token, aligned_head_dim]`, stored contiguously in row-major order with an innermost stride of `aligned_head_dim`.
Definition in file kv_cache_kernels.c.
| void kv_cache_repack_head_major_inplace | ( | float * | buf, |
| int | num_heads, | ||
| int | tokens, | ||
| int | cache_capacity, | ||
| int | aligned_head_dim | ||
| ) |
Definition at line 28 of file kv_cache_kernels.c.
Referenced by qwen2_0_5b_decode_forward_prefill_impl().
| void kv_cache_store | ( | float *__restrict | kv_cache_k, |
| float *__restrict | kv_cache_v, | ||
| const float *__restrict | k, | ||
| const float *__restrict | v, | ||
| int | layer, | ||
| int | pos, | ||
| int | num_kv_heads, | ||
| int | head_dim, | ||
| int | max_seq_len | ||
| ) |
Definition at line 101 of file kv_cache_kernels.c.
References kv_cache_write_head_major().
| void kv_cache_write_head_major | ( | const float *__restrict | k_token, |
| const float *__restrict | v_token, | ||
| float *__restrict | k_cache, | ||
| float *__restrict | v_cache, | ||
| int | num_kv_heads, | ||
| int | token_index, | ||
| int | cache_capacity, | ||
| int | head_dim, | ||
| int | aligned_head_dim | ||
| ) |
Definition at line 60 of file kv_cache_kernels.c.
Referenced by ck_layer_forward_rmsnorm_swiglu_decode(), ck_layer_forward_rmsnorm_swiglu_decode_fused(), ck_layer_forward_rmsnorm_swiglu_decode_fused_attn_impl(), ck_layer_forward_rmsnorm_swiglu_decode_q4_k(), ck_layer_forward_rmsnorm_swiglu_decode_quant(), kv_cache_store(), mega_fused_attention_decode(), qwen2_0_5b_decode_layer_0_decode(), qwen2_0_5b_decode_layer_10_decode(), qwen2_0_5b_decode_layer_11_decode(), qwen2_0_5b_decode_layer_12_decode(), qwen2_0_5b_decode_layer_13_decode(), qwen2_0_5b_decode_layer_14_decode(), qwen2_0_5b_decode_layer_15_decode(), qwen2_0_5b_decode_layer_16_decode(), qwen2_0_5b_decode_layer_17_decode(), qwen2_0_5b_decode_layer_18_decode(), qwen2_0_5b_decode_layer_19_decode(), qwen2_0_5b_decode_layer_1_decode(), qwen2_0_5b_decode_layer_20_decode(), qwen2_0_5b_decode_layer_21_decode(), qwen2_0_5b_decode_layer_22_decode(), qwen2_0_5b_decode_layer_23_decode(), qwen2_0_5b_decode_layer_2_decode(), qwen2_0_5b_decode_layer_3_decode(), qwen2_0_5b_decode_layer_4_decode(), qwen2_0_5b_decode_layer_5_decode(), qwen2_0_5b_decode_layer_6_decode(), qwen2_0_5b_decode_layer_7_decode(), qwen2_0_5b_decode_layer_8_decode(), and qwen2_0_5b_decode_layer_9_decode().
| void logits_copy_to_position | ( | const float *__restrict | src, |
| float *__restrict | dst, | ||
| int | position, | ||
| int | vocab_size | ||
| ) |
Copy logits to position-indexed location in output buffer.
Used in decode mode to copy single-token logits from position 0 of the source buffer to the correct sequence position in the destination. This moves buffer-management logic out of codegen and into the IR layer, keeping codegen "dumb": it only emits kernel calls, with no runtime if-statements.
| src | Source logits buffer (single token) [vocab_size] |
| dst | Destination logits buffer [max_seq_len, vocab_size] |
| position | Token position index (0-based) |
| vocab_size | Number of logits per token |
Definition at line 134 of file kv_cache_kernels.c.
References vocab_size.