[LEGACY] Parallel decode orchestration prototype — NOT USED by v6.6
#include <omp.h>
#include <string.h>
#include <math.h>
#include "ckernel_engine.h"
#include "ckernel_quant.h"
Go to the source code of this file.
Functions | |
| void | decode_layer_parallel (float *hidden, const void *ln1_weight, const void *ln2_weight, const void *WQ, const void *WK, const void *WV, const void *WO, const void *W_gate, const void *W_up, const void *W_down, float *k_cache, float *v_cache, int token_index, float *scratch, int embed_dim, int intermediate, int H, int H_kv, int head_dim, int max_seq, float eps, int num_threads) |
| int | get_optimal_decode_threads (void) |
| void | mlp_parallel (const void *ln2_q8, const void *W_gate, const void *W_up, const void *W_down, float *gate_buf, float *up_buf, float *swiglu_buf, void *down_q8, float *mlp_out, int intermediate, int embed_dim, int num_threads) |
| void | qkv_projection_parallel (const void *ln1_q8, const void *WQ, const void *WK, const void *WV, float *q_out, float *k_out, float *v_out, int H, int H_kv, int head_dim, int embed_dim, int num_threads) |
| static void | residual_add_parallel (const float *a, const float *b, float *out, int n, int ith, int nth) |
| static void | vec_scale_parallel (float *y, float scale, int n, int ith, int nth) |
| static void | vec_zero_parallel (float *y, int n, int ith, int nth) |
[LEGACY] Parallel decode orchestration prototype — NOT USED by v6.6
This file was an early prototype demonstrating llama.cpp-style OpenMP parallelization patterns. It is NOT compiled into the v6.6 build and has no callers in the generated inference code path.
v6.6 decode runs entirely through the generated code in: version/v6.6/src/generated/ck-kernel-inference.c → ck_model_decode_internal()
Threading for v6.6 is handled by ck_threadpool (include/ck_threadpool.h), which replaces the OpenMP approach used here.
Kept for reference only. See the original design notes below.
Original design (OpenMP, superseded):
Definition in file parallel_orchestration.c.
| void decode_layer_parallel | ( | float * | hidden, |
| const void * | ln1_weight, | ||
| const void * | ln2_weight, | ||
| const void * | WQ, | ||
| const void * | WK, | ||
| const void * | WV, | ||
| const void * | WO, | ||
| const void * | W_gate, | ||
| const void * | W_up, | ||
| const void * | W_down, | ||
| float * | k_cache, | ||
| float * | v_cache, | ||
| int | token_index, | ||
| float * | scratch, | ||
| int | embed_dim, | ||
| int | intermediate, | ||
| int | H, | ||
| int | H_kv, | ||
| int | head_dim, | ||
| int | max_seq, | ||
| float | eps, | ||
| int | num_threads | ||
| ) |
Process one transformer layer in parallel.
This demonstrates the full parallel pattern for a single layer. In production, this would be called in a loop for all layers.
Definition at line 254 of file parallel_orchestration.c.
References gemv_q4_k_q8_k_parallel_simd(), quantize_row_q8_k(), and residual_add_parallel().
| int get_optimal_decode_threads | ( | void | ) |
Definition at line 422 of file parallel_orchestration.c.
| void mlp_parallel | ( | const void * | ln2_q8, |
| const void * | W_gate, | ||
| const void * | W_up, | ||
| const void * | W_down, | ||
| float * | gate_buf, | ||
| float * | up_buf, | ||
| float * | swiglu_buf, | ||
| void * | down_q8, | ||
| float * | mlp_out, | ||
| int | intermediate, | ||
| int | embed_dim, | ||
| int | num_threads | ||
| ) |
Parallel MLP (gate/up + SwiGLU + down projection).
| ln2_q8 | Input: RMSNorm output quantized to Q8_K |
| W_gate | Gate weights [intermediate, embed] in Q4_K |
| W_up | Up weights [intermediate, embed] in Q4_K |
| W_down | Down weights [embed, intermediate] in Q4_K |
| gate_buf | Scratch: gate output [intermediate] |
| up_buf | Scratch: up output [intermediate] |
| swiglu_buf | Scratch: SwiGLU output [intermediate] |
| down_q8 | Scratch: down input quantized [intermediate Q8_K blocks] |
| mlp_out | Output: MLP output [embed] |
| intermediate | Intermediate dimension |
| embed_dim | Embedding dimension |
| num_threads | Number of threads (0 = auto) |
Definition at line 184 of file parallel_orchestration.c.
References gemv_q4_k_q8_k_parallel_simd(), and quantize_row_q8_k().
| void qkv_projection_parallel | ( | const void * | ln1_q8, |
| const void * | WQ, | ||
| const void * | WK, | ||
| const void * | WV, | ||
| float * | q_out, | ||
| float * | k_out, | ||
| float * | v_out, | ||
| int | H, | ||
| int | H_kv, | ||
| int | head_dim, | ||
| int | embed_dim, | ||
| int | num_threads | ||
| ) |
Parallel Q/K/V projection for single token decode.
| ln1_q8 | Input: RMSNorm output quantized to Q8_K [aligned_embed] |
| WQ | Q weights [H*head_dim, aligned_embed] in Q4_K |
| WK | K weights [H_kv*head_dim, aligned_embed] in Q4_K |
| WV | V weights [H_kv*head_dim, aligned_embed] in Q4_K |
| q_out | Output: Q vectors [H, head_dim] |
| k_out | Output: K vector [H_kv, head_dim] |
| v_out | Output: V vector [H_kv, head_dim] |
| H | Number of query heads |
| H_kv | Number of KV heads (GQA) |
| head_dim | Head dimension |
| embed_dim | Embedding dimension |
| num_threads | Number of threads to use (0 = auto) |
Definition at line 128 of file parallel_orchestration.c.
References gemv_q4_k_q8_k_parallel_simd().
static void residual_add_parallel (const float *a, const float *b, float *out, int n, int ith, int nth)
Single-token decode with parallel SIMD kernels.
This is the main decode function that processes one token through all layers. OpenMP parallel region is created ONCE at the top, and all kernels receive (ith, nth) to split their work.
Pattern:

    #pragma omp parallel
    {
        int ith = omp_get_thread_num();
        int nth = omp_get_num_threads();

        // Each kernel processes only its slice
        gemv_q4_k_q8_k_parallel_simd(..., ith, nth);
        #pragma omp barrier

        rmsnorm_parallel(..., ith, nth);  // (not implemented yet)
        #pragma omp barrier
        ...
    }
Definition at line 63 of file parallel_orchestration.c.
Referenced by decode_layer_parallel().
static void vec_scale_parallel (float *y, float scale, int n, int ith, int nth)
Definition at line 79 of file parallel_orchestration.c.
static void vec_zero_parallel (float *y, int n, int ith, int nth)
Definition at line 94 of file parallel_orchestration.c.