FP32 <-> FP16 SIMD conversion utilities. More...

#include <stdint.h>
#include <stddef.h>
#include <math.h>

Functions
void	ck_fma_f32_to_f16 (const float *a, const float *b, const float *c, uint16_t *dst, int n)
	FMA in FP32, store result as FP16: dst = a * b + c. More...

void	ck_fp16_to_fp32_2d (const uint16_t *src, float *dst, int rows, int cols, int src_stride, int dst_stride)
	Convert 2D FP16 matrix to FP32 with strided access. More...

void	ck_fp16_to_fp32_row (const uint16_t *src, float *dst, int n)
	Convert FP16 row to FP32 (auto-select best implementation) More...

static float	ck_fp16_to_fp32_scalar (uint16_t h)

void	ck_fp32_to_fp16_2d (const float *src, uint16_t *dst, int rows, int cols, int src_stride, int dst_stride)
	Convert 2D FP32 matrix to FP16 with strided access. More...

void	ck_fp32_to_fp16_inplace (float *data, void *scratch, int n)
	Convert FP32 to FP16 in-place using scratch buffer. More...

void	ck_fp32_to_fp16_row (const float *src, uint16_t *dst, int n)
	Convert FP32 row to FP16 (auto-select best implementation) More...

static uint16_t	ck_fp32_to_fp16_scalar (float f)

void	ck_scale_f32_to_f16 (const float *src, float scale, uint16_t *dst, int n)
	Scale FP32 array and store as FP16: dst = scale * src. More...

Detailed Description

FP32 <-> FP16 SIMD conversion utilities.

CK-ENGINE KERNEL RULES:

NO malloc/free - memory via bump allocator, pointers passed in
NO OpenMP - parallelization at orchestrator/codegen layer
NO memcpy for layout - use strided access, not copies
API must define: inputs, outputs, workspace, and memory layouts
Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

These conversion functions use F16C hardware instructions (available on Intel Ivy Bridge and later, AMD Piledriver and later) for fast FP16/FP32 conversion. FP16 (IEEE 754 half-precision) provides 2x memory savings with ~0.1% precision loss for KV cache storage.

MEGA-FUSION BENEFIT:

FP16 KV cache doubles the context that fits in L3 cache:

FP32 KV: ~6K tokens in 6MB L3
FP16 KV: ~12K tokens in 6MB L3

This extends mega-fusion's "hot zone" for longer sequences.

Definition in file fp16_convert.c.

Function Documentation

◆ ck_fma_f32_to_f16()

void ck_fma_f32_to_f16	(	const float *	a,
		const float *	b,
		const float *	c,
		uint16_t *	dst,
		int	n
	)

FMA in FP32, store result as FP16: dst = a * b + c.

Parameters

a	First FP32 operand array
b	Second FP32 operand array
c	Third FP32 operand array
dst	Destination FP16 array
n	Number of elements

Definition at line 350 of file fp16_convert.c.

                                               {
     if (!a || !b || !c || !dst || n <= 0) return;
  
 #if defined(__AVX512F__)
     int i = 0;
     for (; i + 15 < n; i += 16) {
         __m512 va = _mm512_loadu_ps(a + i);
         __m512 vb = _mm512_loadu_ps(b + i);
         __m512 vc = _mm512_loadu_ps(c + i);
         __m512 vr = _mm512_fmadd_ps(va, vb, vc);
         __m256i vh = _mm512_cvtps_ph(vr, _MM_FROUND_TO_NEAREST_INT);
         _mm256_storeu_si256((__m256i*)(dst + i), vh);
     }
     for (; i < n; i++) {
         dst[i] = ck_fp32_to_fp16_scalar(a[i] * b[i] + c[i]);
     }
 #elif defined(__AVX__) && defined(__F16C__)
     int i = 0;
     for (; i + 7 < n; i += 8) {
         __m256 va = _mm256_loadu_ps(a + i);
         __m256 vb = _mm256_loadu_ps(b + i);
         __m256 vc = _mm256_loadu_ps(c + i);
 #if defined(__FMA__)
         __m256 vr = _mm256_fmadd_ps(va, vb, vc);
 #else
         __m256 vr = _mm256_add_ps(_mm256_mul_ps(va, vb), vc);
 #endif
         __m128i vh = _mm256_cvtps_ph(vr, _MM_FROUND_TO_NEAREST_INT);
         _mm_storeu_si128((__m128i*)(dst + i), vh);
     }
     for (; i < n; i++) {
         dst[i] = ck_fp32_to_fp16_scalar(a[i] * b[i] + c[i]);
     }
 #else
     for (int i = 0; i < n; i++) {
         dst[i] = ck_fp32_to_fp16_scalar(a[i] * b[i] + c[i]);
     }
 #endif
 }

References ck_fp32_to_fp16_scalar().

◆ ck_fp16_to_fp32_2d()

void ck_fp16_to_fp32_2d	(	const uint16_t *	src,
		float *	dst,
		int	rows,
		int	cols,
		int	src_stride,
		int	dst_stride
	)

Convert 2D FP16 matrix to FP32 with strided access.

Parameters

src	Source FP16 matrix [rows, src_stride]
dst	Destination FP32 matrix [rows, dst_stride]
rows	Number of rows
cols	Number of columns (actual data per row)
src_stride	Source stride (elements per row)
dst_stride	Destination stride (elements per row)

Definition at line 298 of file fp16_convert.c.

                                                          {
     if (!src || !dst || rows <= 0 || cols <= 0) return;
  
     for (int r = 0; r < rows; r++) {
         ck_fp16_to_fp32_row(src + (size_t)r * src_stride,
                             dst + (size_t)r * dst_stride,
                             cols);
     }
 }

References ck_fp16_to_fp32_row().

◆ ck_fp16_to_fp32_row()

void ck_fp16_to_fp32_row	(	const uint16_t *	src,
		float *	dst,
		int	n
	)

Convert FP16 row to FP32 (auto-select best implementation)

Parameters

src	Source FP16 array
dst	Destination FP32 array (caller-allocated)
n	Number of elements

Definition at line 250 of file fp16_convert.c.

                                                                  {
     if (!src || !dst || n <= 0) return;
  
 #if defined(__AVX512F__)
     ck_fp16_to_fp32_avx512(src, dst, n);
 #elif defined(__AVX__)
     ck_fp16_to_fp32_avx(src, dst, n);
 #else
     for (int i = 0; i < n; i++) {
         dst[i] = ck_fp16_to_fp32_scalar(src[i]);
     }
 #endif
 }

References ck_fp16_to_fp32_scalar().

Referenced by ck_fp16_to_fp32_2d().

◆ ck_fp16_to_fp32_scalar()

static float ck_fp16_to_fp32_scalar ( uint16_t h )

inline static

Definition at line 91 of file fp16_convert.c.

                                                        {
     uint32_t sign = ((uint32_t)h & 0x8000) << 16;
     int exp = (h >> 10) & 0x1F;
     uint32_t mant = h & 0x3FF;
  
     if (exp == 0) {
         if (mant == 0) {
             /* Zero */
             union { uint32_t u; float f; } u = { sign };
             return u.f;
         }
         /* Denormalized number */
         while (!(mant & 0x400)) {
             mant <<= 1;
             exp--;
         }
         exp++;
         mant &= 0x3FF;
     } else if (exp == 31) {
         /* Infinity or NaN */
         union { uint32_t u; float f; } u = { sign | 0x7F800000 | (mant << 13) };
         return u.f;
     }
  
     union { uint32_t u; float f; } u = { sign | ((uint32_t)(exp + 112) << 23) | (mant << 13) };
     return u.f;
 }

Referenced by ck_fp16_to_fp32_row().

◆ ck_fp32_to_fp16_2d()

void ck_fp32_to_fp16_2d	(	const float *	src,
		uint16_t *	dst,
		int	rows,
		int	cols,
		int	src_stride,
		int	dst_stride
	)

Convert 2D FP32 matrix to FP16 with strided access.

Parameters

src	Source FP32 matrix [rows, src_stride]
dst	Destination FP16 matrix [rows, dst_stride]
rows	Number of rows
cols	Number of columns (actual data per row)
src_stride	Source stride (elements per row)
dst_stride	Destination stride (elements per row)

Definition at line 277 of file fp16_convert.c.

                                                          {
     if (!src || !dst || rows <= 0 || cols <= 0) return;
  
     for (int r = 0; r < rows; r++) {
         ck_fp32_to_fp16_row(src + (size_t)r * src_stride,
                             dst + (size_t)r * dst_stride,
                             cols);
     }
 }

References ck_fp32_to_fp16_row().

◆ ck_fp32_to_fp16_inplace()

void ck_fp32_to_fp16_inplace	(	float *	data,
		void *	scratch,
		int	n
	)

Convert FP32 to FP16 in-place using scratch buffer.

Useful when you want to downcast in place but need FP32 for computation. Converts data to FP16 into scratch, then copies the FP16 values back over the first half of data.

Parameters

data	FP32 array to convert (on return, its first n * sizeof(uint16_t) bytes hold the n FP16 values)
scratch	Temporary buffer, must be >= n * sizeof(uint16_t)
n	Number of elements

Note: After this call, data should be treated as uint16_t*

Definition at line 325 of file fp16_convert.c.

                                                                 {
     if (!data || !scratch || n <= 0) return;
  
     uint16_t *tmp = (uint16_t*)scratch;
     ck_fp32_to_fp16_row(data, tmp, n);
  
     /* Copy back (FP16 is half the size, so this is safe) */
     uint16_t *dst = (uint16_t*)data;
     for (int i = 0; i < n; i++) {
         dst[i] = tmp[i];
     }
 }

References ck_fp32_to_fp16_row().

◆ ck_fp32_to_fp16_row()

void ck_fp32_to_fp16_row	(	const float *	src,
		uint16_t *	dst,
		int	n
	)

Convert FP32 row to FP16 (auto-select best implementation)

Parameters

src	Source FP32 array
dst	Destination FP16 array (caller-allocated)
n	Number of elements

Definition at line 230 of file fp16_convert.c.

                                                                  {
     if (!src || !dst || n <= 0) return;
  
 #if defined(__AVX512F__)
     ck_fp32_to_fp16_avx512(src, dst, n);
 #elif defined(__AVX__)
     ck_fp32_to_fp16_avx(src, dst, n);
 #else
     for (int i = 0; i < n; i++) {
         dst[i] = ck_fp32_to_fp16_scalar(src[i]);
     }
 #endif
 }

References ck_fp32_to_fp16_scalar().

Referenced by ck_fp32_to_fp16_2d(), and ck_fp32_to_fp16_inplace().

◆ ck_fp32_to_fp16_scalar()

/**
 * Convert one FP32 value to its IEEE 754 binary16 (FP16) bit pattern.
 *
 * inline static. Definition at line 66 of file fp16_convert.c.
 *
 * Rounds to nearest, ties to even. This is the rounding the vectorized
 * callers request with _MM_FROUND_TO_NEAREST_INT, so elements handled by the
 * scalar tail/fallback get the same result as elements handled in a vector
 * lane.
 *
 * Special cases:
 *   - NaN stays NaN (quiet bit set, top 10 payload bits kept).
 *   - +/-Infinity and finite values too large for FP16 become +/-Infinity.
 *   - Values below the FP16 normal range become FP16 subnormals or signed zero.
 *
 * @param f  FP32 value to convert
 * @return   FP16 bits (1 sign, 5 exponent, 10 fraction)
 */
static inline uint16_t ck_fp32_to_fp16_scalar(float f)
{
    union { float f; uint32_t u; } bits = { f };
    const uint32_t x = bits.u;

    /* Extract sign (moved to FP16 position), raw exponent, 23-bit fraction. */
    const uint32_t sign = (x >> 16) & 0x8000u;
    const uint32_t raw_exp = (x >> 23) & 0xFFu;
    const uint32_t mant = x & 0x7FFFFFu;

    /* Inf/NaN must be detected on the raw exponent: after re-biasing they sit
     * at 143, not at the FP16 maximum. */
    if (raw_exp == 0xFFu) {
        if (mant != 0) {
            return (uint16_t)(sign | 0x7E00u | (mant >> 13));  /* NaN */
        }
        return (uint16_t)(sign | 0x7C00u);  /* Infinity */
    }

    /* Exponent re-biased from FP32 (127) to FP16 (15). */
    const int exp = (int)raw_exp - 127 + 15;

    if (exp >= 31) {
        /* Finite but too large for FP16: overflow to infinity. */
        return (uint16_t)(sign | 0x7C00u);
    }

    if (exp <= 0) {
        /* Below the FP16 normal range. Anything under 2^-25 is less than
         * half the smallest subnormal and rounds to zero. */
        if (exp < -10) return (uint16_t)sign;

        /* Restore the implicit leading 1, then shift into subnormal position
         * while rounding on the discarded bits. shift is in [14, 24]. */
        const uint32_t full = mant | 0x800000u;
        const int shift = 14 - exp;
        const uint32_t dropped = full & ((1u << shift) - 1u);
        const uint32_t tie = 1u << (shift - 1);
        uint32_t sub = full >> shift;
        if (dropped > tie || (dropped == tie && (sub & 1u))) {
            sub++;  /* may carry into 0x0400, the smallest normal */
        }
        return (uint16_t)(sign | sub);
    }

    /* Normal range: keep the top 10 fraction bits, round on the low 13.
     * A carry out of the fraction bumps the exponent, and at the top of the
     * range yields 0x7C00 (infinity), which is the correct overflow result. */
    uint32_t half = ((uint32_t)exp << 10) | (mant >> 13);
    const uint32_t dropped = mant & 0x1FFFu;
    if (dropped > 0x1000u || (dropped == 0x1000u && (half & 1u))) {
        half++;
    }
    return (uint16_t)(sign | half);
}

Referenced by ck_fma_f32_to_f16(), ck_fp32_to_fp16_row(), and ck_scale_f32_to_f16().

◆ ck_scale_f32_to_f16()

void ck_scale_f32_to_f16	(	const float *	src,
		float	scale,
		uint16_t *	dst,
		int	n
	)

Scale FP32 array and store as FP16: dst = scale * src.

Parameters

src	Source FP32 array
scale	Scalar multiplier
dst	Destination FP16 array
n	Number of elements

Definition at line 398 of file fp16_convert.c.

                                                                               {
     if (!src || !dst || n <= 0) return;
  
 #if defined(__AVX512F__)
     __m512 vs = _mm512_set1_ps(scale);
     int i = 0;
     for (; i + 15 < n; i += 16) {
         __m512 vx = _mm512_loadu_ps(src + i);
         __m512 vr = _mm512_mul_ps(vx, vs);
         __m256i vh = _mm512_cvtps_ph(vr, _MM_FROUND_TO_NEAREST_INT);
         _mm256_storeu_si256((__m256i*)(dst + i), vh);
     }
     for (; i < n; i++) {
         dst[i] = ck_fp32_to_fp16_scalar(src[i] * scale);
     }
 #elif defined(__AVX__) && defined(__F16C__)
     __m256 vs = _mm256_set1_ps(scale);
     int i = 0;
     for (; i + 7 < n; i += 8) {
         __m256 vx = _mm256_loadu_ps(src + i);
         __m256 vr = _mm256_mul_ps(vx, vs);
         __m128i vh = _mm256_cvtps_ph(vr, _MM_FROUND_TO_NEAREST_INT);
         _mm_storeu_si128((__m128i*)(dst + i), vh);
     }
     for (; i < n; i++) {
         dst[i] = ck_fp32_to_fp16_scalar(src[i] * scale);
     }
 #else
     for (int i = 0; i < n; i++) {
         dst[i] = ck_fp32_to_fp16_scalar(src[i] * scale);
     }
 #endif
 }

References ck_fp32_to_fp16_scalar().

Functions

Detailed Description

CK-ENGINE KERNEL RULES:

MEGA-FUSION BENEFIT:

Function Documentation

◆ ck_fma_f32_to_f16()

◆ ck_fp16_to_fp32_2d()

◆ ck_fp16_to_fp32_row()

◆ ck_fp16_to_fp32_scalar()

◆ ck_fp32_to_fp16_2d()

◆ ck_fp32_to_fp16_inplace()

◆ ck_fp32_to_fp16_row()

◆ ck_fp32_to_fp16_scalar()

◆ ck_scale_f32_to_f16()