← Back to C-Kernel-Engine Docs Doxygen Source Documentation
bf16_utils.h
Go to the documentation of this file.
1 #ifndef BF16_UTILS_H
2 #define BF16_UTILS_H
3 
4 #include <stdint.h>
5 #include <stddef.h>
6 
7 #if defined(__AVX512F__)
8 #include <immintrin.h>
9 #endif
10 
11 // ============================================================================
12 // BF16 (Brain Floating Point) Format Overview
13 // ============================================================================
14 //
15 // BF16 is a 16-bit floating-point format designed for machine learning:
16 //
17 // FP32 (32 bits): [S][EEEEEEEE][MMMMMMMMMMMMMMMMMMMMMMM]
18 // 1 8 23 mantissa bits
19 //
20 // BF16 (16 bits): [S][EEEEEEEE][MMMMMMM]
21 // 1 8 7 mantissa bits
22 //
23 // Key insight: BF16 is the UPPER 16 bits of FP32. Same exponent range (-126 to
24 // +127), but only 7 mantissa bits instead of 23. This preserves the full
25 // dynamic range of FP32 while sacrificing precision.
26 //
27 // Why BF16 over FP16?
28 // - FP16 has only 5 exponent bits (range ±65504), causing overflow in ML
29 // - BF16 has 8 exponent bits (same as FP32), no overflow issues
30 // - Conversion is trivial: just truncate/round the lower 16 bits
31 //
32 // ============================================================================
33 // Scalar BF16 <-> FP32 conversion
34 // ============================================================================
35 
36 // BF16 to FP32: Zero-extend the 16-bit value to the upper half of FP32
37 // This is lossless - every BF16 value maps to exactly one FP32 value.
// BF16 -> FP32: widen the 16-bit pattern into the top half of a 32-bit word.
// Every BF16 value has an exact FP32 counterpart, so this direction is
// lossless (the low 16 mantissa bits are simply zero).
static inline float bf16_to_float(uint16_t v)
{
    union {
        uint32_t as_bits;
        float as_float;
    } conv = { .as_bits = (uint32_t)v << 16 };
    return conv.as_float;
}
47 
48 // FP32 to BF16: Extract upper 16 bits with ROUND-TO-NEAREST-EVEN rounding
49 //
50 // Why not just truncate (>> 16)?
51 // ─────────────────────────────
52 // Simple truncation always rounds DOWN, creating systematic negative bias.
53 // Over many operations, this compounds:
54 // - 1.999... truncates to 1.0 (lost 0.999)
55 // - 2.001... truncates to 2.0 (lost 0.001)
56 // - Average error: -0.5 LSB (always negative!)
57 //
58 // Round-to-nearest-even algorithm:
59 // ────────────────────────────────
60 // We add a rounding bias before truncating. The bias depends on the LSB
61 // (least significant bit) of the result to implement "banker's rounding":
62 //
63 // Fractional part < 0.5: Round down (add 0x7FFF, doesn't overflow to next)
64 // Fractional part > 0.5: Round up (add 0x7FFF, overflows to next)
65 // Fractional part = 0.5: Round to EVEN (add 0x7FFF + LSB)
66 // - If result LSB=0 (even), add 0x7FFF → stays same
67 // - If result LSB=1 (odd), add 0x8000 → rounds up
68 //
69 // The magic constant 0x7FFF is "almost half" of the lower 16 bits (0xFFFF).
70 // Adding 0x7FFF + lsb effectively rounds to nearest, with ties going to even.
71 //
72 // Example: Converting FP32 1.5f (exactly representable in BF16)
73 // FP32 bits: 0x3FC00000 = 0011 1111 1100 0000 | 0000 0000 0000 0000
74 // ^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^
75 // upper 16 (BF16) lower 16 (to discard)
76 // Lower 16 = 0x0000, LSB of upper = 0
77 // Add 0x7FFF + 0 = 0x7FFF: 0x3FC07FFF
78 // Shift >> 16: 0x3FC0 ✓ (no change, fraction was 0)
79 //
80 // Example: Converting FP32 1.0000001f (not exact in BF16)
81 // FP32 bits: 0x3F800001
82 // Lower 16 = 0x0001, LSB of upper = 0
// Add 0x7FFF: 0x3F808000 (0x0001 + 0x7FFF = 0x8000, no carry past bit 15)
84 // Shift >> 16: 0x3F80 (rounded down, fraction < 0.5)
85 //
// Example: Converting FP32 1.00390625f (exact halfway point between two BF16s)
// FP32 bits: 0x3F808000 — midway between BF16 0x3F80 (1.0) and 0x3F81 (1.0078125)
// Round-to-even picks 0x3F80, the candidate whose LSB is 0
89 //
// FP32 -> BF16 with round-to-nearest-even (see the algorithm notes above).
//
// NaN inputs are handled explicitly: without the guard below, the rounding
// add can carry out of the low 16 bits into the exponent field. For a NaN
// whose payload sits entirely in the low 16 bits (e.g. 0x7F800001), that
// carry produces 0x7F80 / 0xFF80 — i.e. infinity — silently converting a
// NaN into Inf. We instead return a quiet NaN with the sign preserved.
// Infinities are safe without the guard (low 16 bits are zero, no carry).
static inline uint16_t float_to_bf16(float f)
{
    union {
        uint32_t u;
        float f;
    } tmp;
    tmp.f = f;
    // NaN iff magnitude bits exceed the infinity pattern (exponent all ones,
    // mantissa nonzero). Force the quiet bit so the result is always a NaN.
    if ((tmp.u & 0x7FFFFFFFu) > 0x7F800000u) {
        return (uint16_t)((tmp.u >> 16) | 0x0040u);
    }
    // Bit 16 will be the LSB of the BF16 result after truncation; it selects
    // between the 0x7FFF and 0x8000 biases, implementing ties-to-even.
    uint32_t lsb = (tmp.u >> 16) & 1u;
    tmp.u += 0x7FFFu + lsb;
    // Truncate the (now rounded) low 16 bits.
    return (uint16_t)(tmp.u >> 16);
}
104 
105 // ============================================================================
106 // AVX-512 BF16 <-> FP32 conversion (16 elements at a time)
107 // ============================================================================
108 //
109 // AVX-512 provides 512-bit registers (16 floats) for efficient vectorization.
110 // We process 16 BF16 values at once, stored in a 256-bit register (__m256i).
111 //
112 // Memory layout for 16 BF16 values (32 bytes):
113 // __m256i: [bf16_0][bf16_1][bf16_2]...[bf16_15] (16 × 16-bit)
114 //
115 // After conversion to FP32 (64 bytes):
116 // __m512: [fp32_0][fp32_1][fp32_2]...[fp32_15] (16 × 32-bit)
117 //
118 // ============================================================================
119 // Native AVX-512 BF16 Instructions (when available)
120 // ============================================================================
121 //
122 // Some newer Intel Xeon processors have native BF16 support via AVX512_BF16:
123 // - Cooper Lake (3rd Gen Xeon Scalable)
124 // - Sapphire Rapids (4th Gen Xeon Scalable)
125 // - Emerald Rapids, Granite Rapids, etc.
126 //
127 // These provide single-instruction conversion:
128 // _mm512_cvtneps_pbh(a) : 16× FP32 → 16× BF16 (1 instruction vs 5-6)
129 // _mm512_dpbf16_ps(...) : BF16 dot product (for GEMM acceleration)
130 //
131 // For BF16→FP32, there's no single intrinsic, but the bit manipulation
132 // (zero-extend + shift) is already fast (2 instructions).
133 //
134 // Detection: Compile with -mavx512bf16 and check __AVX512BF16__
135 // ============================================================================
136 
137 #if defined(__AVX512F__)
138 
139 // BF16 to FP32 vectorized: Zero-extend 16-bit → 32-bit, then shift left 16
140 //
141 // Step-by-step for one element (but done for all 16 in parallel):
142 // Input BF16: 0x3F80 (represents 1.0f)
143 // Zero-extend: 0x00003F80
144 // Shift << 16: 0x3F800000 (IEEE-754 for 1.0f) ✓
145 //
static inline __m512 bf16x16_to_fp32(__m256i bf16_vec)
{
    // Widen each 16-bit lane to 32 bits, move the BF16 pattern into the high
    // half of its lane, and reinterpret the register as 16 packed floats.
    // No value conversion happens — only zero-extension and a bit shift —
    // so, like the scalar path, this direction is lossless.
    return _mm512_castsi512_ps(
        _mm512_slli_epi32(_mm512_cvtepu16_epi32(bf16_vec), 16));
}
155 
156 // FP32 to BF16 vectorized: Same round-to-nearest-even algorithm, SIMD version
157 //
158 // This is the vectorized equivalent of float_to_bf16() above.
159 // Each of the 16 lanes independently performs:
160 // 1. Extract LSB of would-be BF16 result
161 // 2. Add rounding bias (0x7FFF + lsb)
162 // 3. Truncate by right-shifting 16 bits
163 //
164 // With native AVX512_BF16 support, this is a single instruction!
165 //
166 #if defined(__AVX512BF16__)
167 // ─────────────────────────────────────────────────────────────────────────────
168 // NATIVE AVX-512 BF16: Single instruction FP32→BF16 conversion
169 // Available on: Cooper Lake, Sapphire Rapids, Emerald Rapids, Granite Rapids
170 // Compile with: -mavx512bf16 (GCC/Clang) or /arch:AVX512 (MSVC)
171 // ─────────────────────────────────────────────────────────────────────────────
// Native FP32 -> BF16: 16 lanes converted by a single AVX512_BF16 instruction.
// Rounding mode is round-to-nearest-even, matching the scalar fallback.
static inline __m256i fp32x16_to_bf16(__m512 fp32_vec)
{
    // _mm512_cvtneps_pbh: Convert 16x FP32 -> 16x BF16 in ONE instruction.
    // "ne" = nearest-even rounding, "ps" = packed single, "pbh" = packed bfloat16.
    __m256bh bf16_result = _mm512_cvtneps_pbh(fp32_vec);
    // NOTE(review): plain vector cast from __m256bh to __m256i — accepted by
    // GCC/Clang; confirm MSVC tolerates it or add _mm256_castph_si256-style
    // handling if MSVC support is required.
    return (__m256i)bf16_result;
}
180 #else
181 // ─────────────────────────────────────────────────────────────────────────────
182 // SOFTWARE FALLBACK: Bit manipulation (5-6 instructions)
183 // Works on all AVX-512 capable CPUs (Skylake-X, Ice Lake, etc.)
184 // ─────────────────────────────────────────────────────────────────────────────
static inline __m256i fp32x16_to_bf16(__m512 fp32_vec)
{
    const __m512i one = _mm512_set1_epi32(1);
    const __m512i almost_half = _mm512_set1_epi32(0x7FFF);

    // View the 16 floats as raw 32-bit patterns.
    __m512i bits = _mm512_castps_si512(fp32_vec);

    // Bit 16 of each lane becomes the LSB of that lane's BF16 result; it
    // selects between the 0x7FFF and 0x8000 rounding biases so that exact
    // ties round to even, mirroring the scalar float_to_bf16().
    __m512i tie_bit = _mm512_and_si512(_mm512_srli_epi32(bits, 16), one);
    __m512i bias = _mm512_add_epi32(almost_half, tie_bit);

    // Apply the bias, truncate the low 16 bits, and narrow each 32-bit lane
    // back down to 16 bits (the discarded upper halves are already zero).
    // NOTE(review): the bias add can carry a NaN whose payload sits only in
    // the low 16 bits up into the exponent, yielding infinity — confirm
    // whether NaN inputs matter for callers of this fallback.
    __m512i biased = _mm512_add_epi32(bits, bias);
    return _mm512_cvtepi32_epi16(_mm512_srli_epi32(biased, 16));
}
204 #endif /* __AVX512BF16__ */
205 
206 // Convenience: Load 16 BF16 values from memory and convert to FP32
207 static inline __m512 bf16_loadu_cvt_fp32(const uint16_t *ptr)
208 {
209  __m256i bf16_vec = _mm256_loadu_si256((const __m256i *)ptr);
210  return bf16x16_to_fp32(bf16_vec);
211 }
212 
213 // Convenience: Convert 16 FP32 values to BF16 and store to memory
214 static inline void fp32_cvt_storeu_bf16(uint16_t *ptr, __m512 fp32_vec)
215 {
216  __m256i bf16_vec = fp32x16_to_bf16(fp32_vec);
217  _mm256_storeu_si256((__m256i *)ptr, bf16_vec);
218 }
219 
220 #endif /* __AVX512F__ */
221 
222 // ============================================================================
223 // Tensor conversion functions (use SIMD when available)
224 // ============================================================================
225 //
226 // These functions convert entire tensors between BF16 and FP32.
227 // Two common usage patterns in neural network inference:
228 //
229 // Pattern 1: Convert-Compute-Convert
230 // - Load BF16 weights/activations
231 // - Convert entire tensor to FP32
232 // - Compute in FP32 for full precision
233 // - Convert result back to BF16
234 // - Good for: Simple ops, debugging, when memory isn't critical
235 //
236 // Pattern 2: Inline Conversion (preferred for performance)
237 // - Load BF16 values directly in compute kernel
238 // - Convert to FP32 in registers (no memory write)
239 // - Compute in FP32
240 // - Convert back to BF16 before storing
241 // - Good for: GEMM, activations, fused kernels
242 //
243 // The functions below implement Pattern 1. For Pattern 2, use
244 // bf16_loadu_cvt_fp32() and fp32_cvt_storeu_bf16() directly in kernels.
245 //
246 // ============================================================================
247 
248 // Convert BF16 tensor to FP32 tensor
249 // Uses AVX-512 when available (16 elements per iteration)
// Convert a BF16 tensor to FP32, element by element.
// With AVX-512 the bulk of the work runs 16 elements per iteration; the
// final (count % 16) elements fall back to the scalar conversion.
static inline void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
{
#if defined(__AVX512F__)
    size_t idx = 0;
    // Vectorized main loop: 16 conversions per iteration.
    while (idx + 16 <= count) {
        _mm512_storeu_ps(dst + idx, bf16_loadu_cvt_fp32(src + idx));
        idx += 16;
    }
    // Scalar tail for whatever remains.
    while (idx < count) {
        dst[idx] = bf16_to_float(src[idx]);
        ++idx;
    }
#else
    for (size_t idx = 0; idx < count; ++idx) {
        dst[idx] = bf16_to_float(src[idx]);
    }
#endif
}
267 
268 // Convert FP32 tensor to BF16 tensor
269 // Uses AVX-512 when available (16 elements per iteration)
270 // Applies round-to-nearest-even for each conversion
// Convert an FP32 tensor to BF16, element by element, with round-to-nearest-
// even applied to every value. With AVX-512 the bulk runs 16 elements per
// iteration; the final (count % 16) elements use the scalar conversion.
static inline void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
{
#if defined(__AVX512F__)
    size_t idx = 0;
    // Vectorized main loop: 16 conversions per iteration.
    while (idx + 16 <= count) {
        fp32_cvt_storeu_bf16(dst + idx, _mm512_loadu_ps(src + idx));
        idx += 16;
    }
    // Scalar tail for whatever remains.
    while (idx < count) {
        dst[idx] = float_to_bf16(src[idx]);
        ++idx;
    }
#else
    for (size_t idx = 0; idx < count; ++idx) {
        dst[idx] = float_to_bf16(src[idx]);
    }
#endif
}
288 
289 #endif /* BF16_UTILS_H */
static void float_tensor_to_bf16(const float *src, uint16_t *dst, size_t count)
Definition: bf16_utils.h:271
static uint16_t float_to_bf16(float f)
Definition: bf16_utils.h:90
static float bf16_to_float(uint16_t v)
Definition: bf16_utils.h:38
static void bf16_tensor_to_float(const uint16_t *src, float *dst, size_t count)
Definition: bf16_utils.h:250