← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemv_omp.c
Go to the documentation of this file.
1 /*
2  * gemv_omp.c - OpenMP-parallel GEMV kernels for decode mode
3  *
4  * WARNING: These kernels use #pragma omp parallel for, which creates and
5  * joins threads on EVERY call. During inference, each decode token invokes
6  * kernels 500+ times. OpenMP fork/join overhead (~50-200us per call) makes
7  * these SLOWER than serial for real inference workloads:
8  *
9  * Measured on i7-3630QM (4C/8T), Qwen 0.5B:
10  * Serial kernels: 170 ms/tok (5.9 tok/s)
11  * OMP parallel: 327 ms/tok (3.1 tok/s) ← 1.9x SLOWER
12  *
13  * The math is correct (10/10 parity tests pass) but the threading model is
14  * wrong for this workload. OpenMP is designed for long-running parallel
15  * regions, not thousands of short kernel calls per token.
16  *
17  * TODO: Replace with a persistent pthread thread pool:
18  * - Create N worker pthreads once at startup
19  * - Workers spin/wait on a barrier or futex
20  * - Kernel dispatch: write work descriptor, signal barrier (~2-5us)
21  * - Workers execute rows, signal completion
22  * - This is what llama.cpp does (ggml_threadpool) to get 30 tok/s
23  * on the same hardware where these OMP kernels get 3.1 tok/s
24  *
25  * These are parallel variants of the serial GEMV kernels in
26  * gemm_kernels_q8_0.c and gemm_kernels_q5_0.c. The serial kernels
27  * remain untouched for multi-stream / multi-model serving where
28  * per-kernel parallelism is not wanted.
29  *
30  * All three kernels are row-parallel: each output y[row] is an
31  * independent dot product, so we partition rows across threads.
32  */
33 
34 #include <omp.h>
35 #include "ckernel_quant.h"
36 
37 /* Existing vec_dot dispatch functions (in gemm_kernels_q8_0.c / gemm_kernels_q5_0.c) */
38 extern void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy);
39 extern void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy);
40 extern void quantize_row_q8_0(const float *x, void *y, int k);
41 
42 /* ---------------------------------------------------------------------------
43  * gemv_q8_0_q8_0_parallel_omp (logits — 37% of decode time)
44  *
45  * Same 5-param signature as gemv_q8_0_q8_0 — drop-in swap.
46  * schedule(static) gives each thread one contiguous block of rows, so false sharing on y is limited to the cache lines at chunk boundaries.
47  * x_blocks is read-only shared (~1 KB for K=896) → stays in L1.
48  * Each thread reads disjoint weight rows → no contention.
49  * --------------------------------------------------------------------------- */
51  const void *W,
52  const void *x_q8,
53  int M, int K)
54 {
55  const block_q8_0 *w_blocks = (const block_q8_0 *)W;
56  const block_q8_0 *x_blocks = (const block_q8_0 *)x_q8;
57  const int blocks_per_row = K / QK8_0;
58 
59  #pragma omp parallel for schedule(static)
60  for (int row = 0; row < M; row++) {
61  vec_dot_q8_0_q8_0(K, &y[row],
62  &w_blocks[row * blocks_per_row],
63  x_blocks);
64  }
65 }
66 
67 /* ---------------------------------------------------------------------------
68  * gemv_q5_0_q8_0_parallel_omp (mlp_down — 10% of decode time)
69  *
70  * Same 5-param signature as gemv_q5_0_q8_0 — drop-in swap.
71  * --------------------------------------------------------------------------- */
73  const void *W,
74  const void *x_q8,
75  int M, int K)
76 {
77  const block_q5_0 *w_blocks = (const block_q5_0 *)W;
78  const block_q8_0 *x_blocks = (const block_q8_0 *)x_q8;
79  const int blocks_per_row = K / QK5_0;
80 
81  #pragma omp parallel for schedule(static)
82  for (int row = 0; row < M; row++) {
83  vec_dot_q5_0_q8_0(K, &y[row],
84  &w_blocks[row * blocks_per_row],
85  x_blocks);
86  }
87 }
88 
89 /* ---------------------------------------------------------------------------
90  * gemv_fused_q5_0_bias_parallel_omp (mlp_gate_up — 44% of decode time)
91  *
92  * Same 6-param signature as gemv_fused_q5_0_bias_dispatch — drop-in swap.
93  * Quantizes x (FP32→Q8_0) once serially, then runs the parallel GEMV.
94  * Quantization is O(K) (K=896 elements here), which is negligible next to the O(M*K) GEMV.
95  * --------------------------------------------------------------------------- */
97  const void *W,
98  const float *x,
99  const float *bias,
100  int M, int K)
101 {
102  const block_q5_0 *w_blocks = (const block_q5_0 *)W;
103  const int blocks_per_row = K / QK5_0;
104 
105  /* Quantize input ONCE (serial, fast — K=896 → 28 blocks = 952 bytes) */
106  block_q8_0 x_q8[K / QK8_0];
107  quantize_row_q8_0(x, (void *)x_q8, K);
108 
109  /* Parallel GEMV over output rows */
110  #pragma omp parallel for schedule(static)
111  for (int row = 0; row < M; row++) {
112  vec_dot_q5_0_q8_0(K, &y[row],
113  &w_blocks[row * blocks_per_row],
114  x_q8);
115  if (bias) y[row] += bias[row];
116  }
117 }
Quantization block structures for weight-only quantization.
#define QK5_0
Definition: ckernel_quant.h:67
#define QK8_0
void gemv_fused_q5_0_bias_parallel_omp(float *y, const void *W, const float *x, const float *bias, int M, int K)
Definition: gemv_omp.c:96
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q5_0 x Q8_0.
void gemv_q5_0_q8_0_parallel_omp(float *y, const void *W, const void *x_q8, int M, int K)
Definition: gemv_omp.c:72
void gemv_q8_0_q8_0_parallel_omp(float *y, const void *W, const void *x_q8, int M, int K)
Definition: gemv_omp.c:50
void quantize_row_q8_0(const float *x, void *y, int k)
Quantize FP32 to Q8_0 format (scalar reference)
void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q8_0 x Q8_0.