← Back to C-Kernel-Engine Docs Doxygen Source Documentation
fp16_convert.c
Go to the documentation of this file.
1 /**
2  * @file fp16_convert.c
3  * @brief FP32 <-> FP16 SIMD conversion utilities
4  *
5  * CK-ENGINE KERNEL RULES:
6  * =======================
7  * 1. NO malloc/free - memory via bump allocator, pointers passed in
8  * 2. NO OpenMP - parallelization at orchestrator/codegen layer
9  * 3. NO memcpy for layout - use strided access, not copies
10  * 4. API must define: inputs, outputs, workspace, and memory layouts
11  * 5. Pure computation - deterministic, no side effects
12  *
13  * After changes: make test && make llamacpp-parity-full
14  *
15  * These conversion functions use F16C hardware instructions (available on
16  * Intel Ivy Bridge and later, AMD Piledriver and later) for fast FP16/FP32
17  * conversion. FP16 (IEEE 754 half-precision) provides 2x memory savings
18  * with ~0.1% precision loss for KV cache storage.
19  *
20  * MEGA-FUSION BENEFIT:
21  * ====================
22  * FP16 KV cache doubles the context that fits in L3 cache:
23  * - FP32 KV: ~6K tokens in 6MB L3
24  * - FP16 KV: ~12K tokens in 6MB L3
25  * This extends mega-fusion's "hot zone" for longer sequences.
26  */
27 
28 #include <stdint.h>
29 #include <stddef.h>
30 #include <math.h>
31 
32 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
33 #include <immintrin.h>
34 #endif
35 
36 /* ============================================================================
37  * Scalar FP16 <-> FP32 Conversion
38  * ============================================================================
39  * These use F16C instructions when available, with software fallback.
40  */
41 
42 #if defined(__F16C__) || (defined(__AVX__) && !defined(__clang__))
43 /* Hardware F16C support */
44 
/**
 * @brief Convert single FP32 to FP16 (hardware)
 *
 * Uses the F16C scalar conversion (_cvtss_sh), rounding to nearest-even
 * via _MM_FROUND_TO_NEAREST_INT.
 *
 * NOTE(review): the enclosing guard also selects this branch for
 * (__AVX__ && !__clang__) without __F16C__; _cvtss_sh still requires F16C
 * hardware/compile support (AVX-only CPUs like Sandy Bridge lack it) —
 * confirm the build always passes -mf16c when only __AVX__ is defined.
 *
 * @param f FP32 value
 * @return FP16 value as uint16_t (IEEE 754 binary16 bit pattern)
 */
static inline uint16_t ck_fp32_to_fp16_scalar(float f) {
    return _cvtss_sh(f, _MM_FROUND_TO_NEAREST_INT);
}
53 
/**
 * @brief Convert single FP16 to FP32 (hardware)
 *
 * Uses the F16C scalar conversion (_cvtsh_ss). Widening FP16 -> FP32 is
 * exact, so no rounding mode is involved.
 *
 * @param h FP16 value as uint16_t (IEEE 754 binary16 bit pattern)
 * @return FP32 value
 */
static inline float ck_fp16_to_fp32_scalar(uint16_t h) {
    return _cvtsh_ss(h);
}
62 
63 #else
64 /* Software fallback for systems without F16C */
65 
/**
 * @brief Convert single FP32 to FP16 (software fallback)
 *
 * Bit-exact classification of the FP32 input, then narrowing to binary16.
 * The mantissa is truncated (round-toward-zero); the hardware F16C path
 * rounds to nearest-even, so results may differ by 1 ulp — acceptable for
 * the non-F16C fallback, but worth noting for parity testing.
 *
 * BUG FIX: the previous code tested `exp == 128` for NaN, but `exp` had
 * already been rebiased by (-127 + 15), so an FP32 Inf/NaN exponent field
 * (255) maps to 143, never 128. As a result (a) NaN inputs converted to
 * Infinity, and (b) finite values with FP32 exponent field 240 (~2^113)
 * and a nonzero mantissa incorrectly converted to NaN. We now classify on
 * the raw 8-bit exponent field before rebiasing.
 *
 * @param f FP32 value
 * @return FP16 value as uint16_t (IEEE 754 binary16 bit pattern)
 */
static inline uint16_t ck_fp32_to_fp16_scalar(float f) {
    union { float f; uint32_t u; } pun = { f };
    uint32_t x = pun.u;

    /* Extract sign, raw exponent field, and mantissa */
    uint32_t sign = (x >> 16) & 0x8000u;
    uint32_t e8   = (x >> 23) & 0xFFu;
    uint32_t m23  = x & 0x7FFFFFu;

    if (e8 == 255u) {
        /* FP32 Inf or NaN: classify on the raw field, before rebiasing */
        if (m23) {
            /* NaN: force quiet bit, keep top payload bits */
            return (uint16_t)(sign | 0x7E00u | (m23 >> 13));
        }
        return (uint16_t)(sign | 0x7C00u); /* Infinity */
    }

    int exp = (int)e8 - 127 + 15;  /* rebias FP32 -> FP16 exponent */
    uint32_t mant = m23 >> 13;     /* keep top 10 mantissa bits (truncate) */

    if (exp >= 31) {
        /* Finite overflow: saturate to Infinity */
        return (uint16_t)(sign | 0x7C00u);
    }
    if (exp <= 0) {
        /* Underflow to signed zero or FP16 denormal */
        if (exp < -10) return (uint16_t)sign;
        mant = (mant | 0x400u) >> (1 - exp); /* restore hidden bit, shift in */
        return (uint16_t)(sign | mant);
    }

    return (uint16_t)(sign | ((uint32_t)exp << 10) | mant);
}
90 
/**
 * @brief Convert single FP16 to FP32 (software fallback)
 *
 * Widening binary16 -> binary32 is exact: signed zeros, denormals
 * (renormalized into the FP32 range), Infinities, and NaNs (payload
 * preserved in the top mantissa bits) all round-trip without loss.
 *
 * @param h FP16 value as uint16_t (IEEE 754 binary16 bit pattern)
 * @return FP32 value
 */
static inline float ck_fp16_to_fp32_scalar(uint16_t h) {
    const uint32_t sign_bits = ((uint32_t)h & 0x8000u) << 16;
    uint32_t frac = h & 0x3FFu;
    int e = (h >> 10) & 0x1F;
    uint32_t bits;

    if (e == 31) {
        /* Infinity or NaN: max out the FP32 exponent, keep the payload */
        bits = sign_bits | 0x7F800000u | (frac << 13);
    } else if (e != 0) {
        /* Normal number: rebias exponent by (127 - 15) */
        bits = sign_bits | ((uint32_t)(e + 112) << 23) | (frac << 13);
    } else if (frac == 0) {
        /* Signed zero */
        bits = sign_bits;
    } else {
        /* FP16 denormal: shift until the hidden bit appears, adjust exp */
        do {
            frac <<= 1;
            e--;
        } while (!(frac & 0x400u));
        bits = sign_bits | ((uint32_t)(e + 113) << 23) | ((frac & 0x3FFu) << 13);
    }

    union { uint32_t u; float f; } pun = { bits };
    return pun.f;
}
118 #endif
119 
120 /* ============================================================================
121  * Vectorized FP32 -> FP16 Conversion
122  * ============================================================================ */
123 
124 #if defined(__AVX512F__)
/**
 * @brief Convert FP32 array to FP16 using AVX-512 (16 lanes per iteration)
 *
 * Vector body handles the largest multiple-of-16 prefix; the tail falls
 * back to the scalar converter. Unaligned loads/stores throughout, so no
 * alignment requirement on src/dst.
 *
 * @param src Source FP32 array
 * @param dst Destination FP16 array
 * @param n Number of elements
 */
void ck_fp32_to_fp16_avx512(const float *src, uint16_t *dst, int n) {
    if (src == NULL || dst == NULL || n <= 0) return;

    const int vec_end = n & ~15; /* largest multiple of 16 <= n */
    int idx;
    for (idx = 0; idx < vec_end; idx += 16) {
        __m512 f32 = _mm512_loadu_ps(src + idx);
        _mm256_storeu_si256((__m256i *)(dst + idx),
                            _mm512_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT));
    }

    /* Scalar tail */
    while (idx < n) {
        dst[idx] = ck_fp32_to_fp16_scalar(src[idx]);
        idx++;
    }
}
146 
/**
 * @brief Convert FP16 array to FP32 using AVX-512 (16 lanes per iteration)
 *
 * Vector body handles the largest multiple-of-16 prefix; the tail falls
 * back to the scalar converter. Unaligned loads/stores throughout.
 *
 * @param src Source FP16 array
 * @param dst Destination FP32 array
 * @param n Number of elements
 */
void ck_fp16_to_fp32_avx512(const uint16_t *src, float *dst, int n) {
    if (src == NULL || dst == NULL || n <= 0) return;

    const int vec_end = n & ~15; /* largest multiple of 16 <= n */
    int idx;
    for (idx = 0; idx < vec_end; idx += 16) {
        __m256i half = _mm256_loadu_si256((const __m256i *)(src + idx));
        _mm512_storeu_ps(dst + idx, _mm512_cvtph_ps(half));
    }

    /* Scalar tail */
    while (idx < n) {
        dst[idx] = ck_fp16_to_fp32_scalar(src[idx]);
        idx++;
    }
}
168 #endif /* __AVX512F__ */
169 
170 #if defined(__AVX__)
/**
 * @brief Convert FP32 array to FP16 using AVX + F16C (8 lanes per iteration)
 *
 * The vector path compiles in only when __F16C__ is defined; otherwise the
 * whole range is handled by the scalar converter below.
 *
 * @param src Source FP32 array
 * @param dst Destination FP16 array
 * @param n Number of elements
 */
void ck_fp32_to_fp16_avx(const float *src, uint16_t *dst, int n) {
    if (src == NULL || dst == NULL || n <= 0) return;

    int idx = 0;
#if defined(__F16C__)
    const int vec_end = n & ~7; /* largest multiple of 8 <= n */
    while (idx < vec_end) {
        __m256 f32 = _mm256_loadu_ps(src + idx);
        _mm_storeu_si128((__m128i *)(dst + idx),
                         _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT));
        idx += 8;
    }
#endif

    /* Scalar tail (or the full range without F16C) */
    for (; idx < n; idx++) {
        dst[idx] = ck_fp32_to_fp16_scalar(src[idx]);
    }
}
194 
/**
 * @brief Convert FP16 array to FP32 using AVX + F16C (8 lanes per iteration)
 *
 * The vector path compiles in only when __F16C__ is defined; otherwise the
 * whole range is handled by the scalar converter below.
 *
 * @param src Source FP16 array
 * @param dst Destination FP32 array
 * @param n Number of elements
 */
void ck_fp16_to_fp32_avx(const uint16_t *src, float *dst, int n) {
    if (src == NULL || dst == NULL || n <= 0) return;

    int idx = 0;
#if defined(__F16C__)
    const int vec_end = n & ~7; /* largest multiple of 8 <= n */
    while (idx < vec_end) {
        __m128i half = _mm_loadu_si128((const __m128i *)(src + idx));
        _mm256_storeu_ps(dst + idx, _mm256_cvtph_ps(half));
        idx += 8;
    }
#endif

    /* Scalar tail (or the full range without F16C) */
    for (; idx < n; idx++) {
        dst[idx] = ck_fp16_to_fp32_scalar(src[idx]);
    }
}
218 #endif /* __AVX__ */
219 
220 /* ============================================================================
221  * Generic Dispatch Functions
222  * ============================================================================ */
223 
224 /**
225  * @brief Convert FP32 row to FP16 (auto-select best implementation)
226  * @param src Source FP32 array
227  * @param dst Destination FP16 array (caller-allocated)
228  * @param n Number of elements
229  */
230 void ck_fp32_to_fp16_row(const float *src, uint16_t *dst, int n) {
231  if (!src || !dst || n <= 0) return;
232 
233 #if defined(__AVX512F__)
234  ck_fp32_to_fp16_avx512(src, dst, n);
235 #elif defined(__AVX__)
236  ck_fp32_to_fp16_avx(src, dst, n);
237 #else
238  for (int i = 0; i < n; i++) {
239  dst[i] = ck_fp32_to_fp16_scalar(src[i]);
240  }
241 #endif
242 }
243 
244 /**
245  * @brief Convert FP16 row to FP32 (auto-select best implementation)
246  * @param src Source FP16 array
247  * @param dst Destination FP32 array (caller-allocated)
248  * @param n Number of elements
249  */
250 void ck_fp16_to_fp32_row(const uint16_t *src, float *dst, int n) {
251  if (!src || !dst || n <= 0) return;
252 
253 #if defined(__AVX512F__)
254  ck_fp16_to_fp32_avx512(src, dst, n);
255 #elif defined(__AVX__)
256  ck_fp16_to_fp32_avx(src, dst, n);
257 #else
258  for (int i = 0; i < n; i++) {
259  dst[i] = ck_fp16_to_fp32_scalar(src[i]);
260  }
261 #endif
262 }
263 
264 /* ============================================================================
265  * 2D Conversion Functions (for matrices)
266  * ============================================================================ */
267 
/**
 * @brief Convert 2D FP32 matrix to FP16 with strided access
 *
 * Each row is converted independently via ck_fp32_to_fp16_row; row bases
 * are advanced by the respective strides (no copies, per kernel rule 3).
 *
 * @param src Source FP32 matrix [rows, src_stride]
 * @param dst Destination FP16 matrix [rows, dst_stride]
 * @param rows Number of rows
 * @param cols Number of columns (actual data per row)
 * @param src_stride Source stride (elements per row)
 * @param dst_stride Destination stride (elements per row)
 */
void ck_fp32_to_fp16_2d(const float *src, uint16_t *dst,
                        int rows, int cols,
                        int src_stride, int dst_stride) {
    if (src == NULL || dst == NULL || rows <= 0 || cols <= 0) return;

    const float *row_src = src;
    uint16_t *row_dst = dst;
    for (int r = 0; r < rows; r++) {
        ck_fp32_to_fp16_row(row_src, row_dst, cols);
        row_src += src_stride; /* step to next row base, avoids r*stride */
        row_dst += dst_stride;
    }
}
288 
/**
 * @brief Convert 2D FP16 matrix to FP32 with strided access
 *
 * Each row is converted independently via ck_fp16_to_fp32_row; row bases
 * are advanced by the respective strides (no copies, per kernel rule 3).
 *
 * @param src Source FP16 matrix [rows, src_stride]
 * @param dst Destination FP32 matrix [rows, dst_stride]
 * @param rows Number of rows
 * @param cols Number of columns (actual data per row)
 * @param src_stride Source stride (elements per row)
 * @param dst_stride Destination stride (elements per row)
 */
void ck_fp16_to_fp32_2d(const uint16_t *src, float *dst,
                        int rows, int cols,
                        int src_stride, int dst_stride) {
    if (src == NULL || dst == NULL || rows <= 0 || cols <= 0) return;

    const uint16_t *row_src = src;
    float *row_dst = dst;
    for (int r = 0; r < rows; r++) {
        ck_fp16_to_fp32_row(row_src, row_dst, cols);
        row_src += src_stride; /* step to next row base, avoids r*stride */
        row_dst += dst_stride;
    }
}
309 
310 /* ============================================================================
311  * In-place Conversion with Scratch Buffer
312  * ============================================================================ */
313 
314 /**
315  * @brief Convert FP32 to FP16 in-place using scratch buffer
316  *
317  * Useful when you want to downcast in place but need FP32 for computation.
318  * Writes FP16 to the lower half of scratch, then copies back.
319  *
320  * @param data FP32 array to convert (will contain FP16 in lower bits)
321  * @param scratch Temporary buffer, must be >= n * sizeof(uint16_t)
322  * @param n Number of elements
323  * @note After this call, data should be treated as uint16_t*
324  */
325 void ck_fp32_to_fp16_inplace(float *data, void *scratch, int n) {
326  if (!data || !scratch || n <= 0) return;
327 
328  uint16_t *tmp = (uint16_t*)scratch;
329  ck_fp32_to_fp16_row(data, tmp, n);
330 
331  /* Copy back (FP16 is half the size, so this is safe) */
332  uint16_t *dst = (uint16_t*)data;
333  for (int i = 0; i < n; i++) {
334  dst[i] = tmp[i];
335  }
336 }
337 
338 /* ============================================================================
339  * Mixed Precision Operations (compute in FP32, store in FP16)
340  * ============================================================================ */
341 
/**
 * @brief FMA in FP32, store result as FP16: dst = a * b + c
 *
 * The multiply-add is performed in FP32 and only the final result is
 * narrowed, preserving intermediate precision. Vector paths use a single
 * fused instruction when FMA is available; the non-FMA AVX path uses
 * mul+add (one extra rounding step, matching scalar semantics).
 *
 * @param a First FP32 operand array
 * @param b Second FP32 operand array
 * @param c Third FP32 operand array
 * @param dst Destination FP16 array
 * @param n Number of elements
 */
void ck_fma_f32_to_f16(const float *a, const float *b, const float *c,
                       uint16_t *dst, int n) {
    if (a == NULL || b == NULL || c == NULL || dst == NULL || n <= 0) return;

#if defined(__AVX512F__)
    const int vec_end = n & ~15;
    int idx;
    for (idx = 0; idx < vec_end; idx += 16) {
        __m512 x = _mm512_loadu_ps(a + idx);
        __m512 y = _mm512_loadu_ps(b + idx);
        __m512 z = _mm512_loadu_ps(c + idx);
        __m256i half = _mm512_cvtps_ph(_mm512_fmadd_ps(x, y, z),
                                       _MM_FROUND_TO_NEAREST_INT);
        _mm256_storeu_si256((__m256i *)(dst + idx), half);
    }
    while (idx < n) {
        dst[idx] = ck_fp32_to_fp16_scalar(a[idx] * b[idx] + c[idx]);
        idx++;
    }
#elif defined(__AVX__) && defined(__F16C__)
    const int vec_end = n & ~7;
    int idx;
    for (idx = 0; idx < vec_end; idx += 8) {
        __m256 x = _mm256_loadu_ps(a + idx);
        __m256 y = _mm256_loadu_ps(b + idx);
        __m256 z = _mm256_loadu_ps(c + idx);
#if defined(__FMA__)
        __m256 r = _mm256_fmadd_ps(x, y, z);
#else
        __m256 r = _mm256_add_ps(_mm256_mul_ps(x, y), z);
#endif
        _mm_storeu_si128((__m128i *)(dst + idx),
                         _mm256_cvtps_ph(r, _MM_FROUND_TO_NEAREST_INT));
    }
    while (idx < n) {
        dst[idx] = ck_fp32_to_fp16_scalar(a[idx] * b[idx] + c[idx]);
        idx++;
    }
#else
    for (int idx = 0; idx < n; idx++) {
        dst[idx] = ck_fp32_to_fp16_scalar(a[idx] * b[idx] + c[idx]);
    }
#endif
}
390 
/**
 * @brief Scale FP32 array and store as FP16: dst = scale * src
 *
 * The multiply is performed in FP32; only the product is narrowed to
 * FP16, so no precision is lost before the final conversion.
 *
 * @param src Source FP32 array
 * @param scale Scalar multiplier
 * @param dst Destination FP16 array
 * @param n Number of elements
 */
void ck_scale_f32_to_f16(const float *src, float scale, uint16_t *dst, int n) {
    if (src == NULL || dst == NULL || n <= 0) return;

#if defined(__AVX512F__)
    const __m512 vscale = _mm512_set1_ps(scale);
    const int vec_end = n & ~15;
    int idx;
    for (idx = 0; idx < vec_end; idx += 16) {
        __m512 prod = _mm512_mul_ps(_mm512_loadu_ps(src + idx), vscale);
        _mm256_storeu_si256((__m256i *)(dst + idx),
                            _mm512_cvtps_ph(prod, _MM_FROUND_TO_NEAREST_INT));
    }
    while (idx < n) {
        dst[idx] = ck_fp32_to_fp16_scalar(src[idx] * scale);
        idx++;
    }
#elif defined(__AVX__) && defined(__F16C__)
    const __m256 vscale = _mm256_set1_ps(scale);
    const int vec_end = n & ~7;
    int idx;
    for (idx = 0; idx < vec_end; idx += 8) {
        __m256 prod = _mm256_mul_ps(_mm256_loadu_ps(src + idx), vscale);
        _mm_storeu_si128((__m128i *)(dst + idx),
                         _mm256_cvtps_ph(prod, _MM_FROUND_TO_NEAREST_INT));
    }
    while (idx < n) {
        dst[idx] = ck_fp32_to_fp16_scalar(src[idx] * scale);
        idx++;
    }
#else
    for (int idx = 0; idx < n; idx++) {
        dst[idx] = ck_fp32_to_fp16_scalar(src[idx] * scale);
    }
#endif
}
void ck_scale_f32_to_f16(const float *src, float scale, uint16_t *dst, int n)
Scale FP32 array and store as FP16: dst = scale * src.
Definition: fp16_convert.c:398
void ck_fp16_to_fp32_2d(const uint16_t *src, float *dst, int rows, int cols, int src_stride, int dst_stride)
Convert 2D FP16 matrix to FP32 with strided access.
Definition: fp16_convert.c:298
void ck_fp32_to_fp16_2d(const float *src, uint16_t *dst, int rows, int cols, int src_stride, int dst_stride)
Convert 2D FP32 matrix to FP16 with strided access.
Definition: fp16_convert.c:277
void ck_fma_f32_to_f16(const float *a, const float *b, const float *c, uint16_t *dst, int n)
FMA in FP32, store result as FP16: dst = a * b + c.
Definition: fp16_convert.c:350
void ck_fp32_to_fp16_inplace(float *data, void *scratch, int n)
Convert FP32 to FP16 in-place using scratch buffer.
Definition: fp16_convert.c:325
void ck_fp16_to_fp32_row(const uint16_t *src, float *dst, int n)
Convert FP16 row to FP32 (auto-select best implementation)
Definition: fp16_convert.c:250
static uint16_t ck_fp32_to_fp16_scalar(float f)
Definition: fp16_convert.c:66
static float ck_fp16_to_fp32_scalar(uint16_t h)
Definition: fp16_convert.c:91
void ck_fp32_to_fp16_row(const float *src, uint16_t *dst, int n)
Convert FP32 row to FP16 (auto-select best implementation)
Definition: fp16_convert.c:230