← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ck_features.h
Go to the documentation of this file.
1 /**
2  * @file ck_features.h
3  * @brief CPU feature detection and dispatch macros
4  *
5  * Defines standardized macros for SIMD instruction set detection
6  * and kernel dispatch. Use these instead of CPU model checks.
7  *
8  * Feature Priority (best available):
9  * AMX (512-bit tile ops, Intel Sapphire Rapids+)
10  * AVX-512 (512-bit vector, Intel Skylake-X+)
11  * AVX2 (256-bit with FMA, Intel Haswell+)
12  * AVX (256-bit, Intel Sandy Bridge+)
13  * NEON/SVE2 (ARM)
14  * VSX/AltiVec (PowerPC)
15  * Reference (fallback)
16  */
17 
18 #ifndef CK_FEATURES_H
19 #define CK_FEATURES_H
20 
21 #include <stdint.h>
22 
23 /*============================================================================
24  * Compiler Feature Detection
25  * These are set by the compiler based on -march flags
26  *============================================================================*/
27 
/* Intel/x86
 *
 * Each CK_HAS_* macro below is defined to 1 when the compiler reports the
 * matching instruction set as enabled for this translation unit, and is
 * left undefined otherwise.
 */

/* AMX: GCC and Clang predefine per-extension macros (__AMX_TILE__, ...),
 * not a generic __AMX__.  __AMX__ is still honoured so that build systems
 * which define it by hand keep working. */
#if defined(__AMX_TILE__) || defined(__AMX__)
    #define CK_HAS_AMX 1
#endif

/* AVX-512 requires the F, BW and DQ subsets together. */
#if defined(__AVX512F__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
    #define CK_HAS_AVX512 1
#endif

/* AVX2 is only reported when FMA is enabled as well. */
#if defined(__AVX2__) && defined(__FMA__)
    #define CK_HAS_AVX2_FMA 1
#endif

#if defined(__AVX__)
    #define CK_HAS_AVX 1
#endif

/* AVX-VNNI (the non-AVX-512 encoding): the compiler macro is __AVXVNNI__.
 * __VNNI__ is kept only for hand-written -D__VNNI__ definitions. */
#if defined(__AVXVNNI__) || defined(__VNNI__)
    #define CK_HAS_VNNI 1
#endif

#if defined(__AVX512VNNI__)
    #define CK_HAS_AVX512VNNI 1
#endif
52 
/* ARM (only evaluated for AArch64 targets) */
#if defined(__aarch64__)
    #if defined(__ARM_FEATURE_SVE2)
        #define CK_HAS_SVE2 1
    #endif
    /* The ACLE feature macro for Advanced SIMD is __ARM_NEON; there is no
     * __ARM_FEATURE_NEON.  The old spelling is still accepted in case a
     * build defines it manually. */
    #if defined(__ARM_NEON) || defined(__ARM_FEATURE_NEON)
        #define CK_HAS_NEON 1
    #endif
#endif
62 
/* PowerPC: AltiVec and VSX are reported independently of each other. */
#ifdef __ALTIVEC__
#  define CK_HAS_ALTIVEC 1
#endif

#ifdef __VSX__
#  define CK_HAS_VSX 1
#endif

/* RISC-V: set when the compiler advertises __riscv_vector. */
#ifdef __riscv_vector
#  define CK_HAS_RVV 1
#endif
76 
77 /*============================================================================
78  * Runtime Feature Detection (CPUID/MSR)
79  * For compiled binaries that need runtime dispatch
80  *============================================================================*/
81 
82 #if defined(__x86_64__) || defined(__i386__)
83  #include <cpuid.h>
84 
/**
 * @brief Execute CPUID leaf 0 and report the value left in EAX.
 *
 * For leaf 0, EAX holds the highest basic CPUID leaf the processor
 * implements; the other three registers are read but discarded.
 *
 * @return EAX as returned by CPUID leaf 0.
 */
static inline uint32_t ck_cpuid_max_leaf(void) {
    uint32_t regs[4] = {0, 0, 0, 0};   /* EAX, EBX, ECX, EDX */

    __cpuid(0, regs[0], regs[1], regs[2], regs[3]);
    return regs[0];
}
90 
/**
 * @brief Execute CPUID leaf 1 and hand back all four result registers.
 *
 * @param eax Receives EAX; must not be NULL.
 * @param ebx Receives EBX; must not be NULL.
 * @param ecx Receives ECX; must not be NULL.
 * @param edx Receives EDX; must not be NULL.
 */
static inline void ck_cpuid_leaf1(uint32_t *eax, uint32_t *ebx,
                                  uint32_t *ecx, uint32_t *edx) {
    uint32_t a = 0;
    uint32_t b = 0;
    uint32_t c = 0;
    uint32_t d = 0;

    __cpuid(1, a, b, c, d);

    *eax = a;
    *ebx = b;
    *ecx = c;
    *edx = d;
}
95 
/**
 * @brief Execute CPUID leaf 7 (structured extended features) for a sub-leaf.
 *
 * Uses __cpuid_count from <cpuid.h> rather than the MSVC-style __cpuidex,
 * which older GCC/Clang <cpuid.h> versions do not provide.
 *
 * If the processor's highest basic leaf is below 7, all four outputs are
 * set to 0 instead of issuing the query: CPUID may answer an out-of-range
 * leaf with the data of a different leaf, which would then be misread as
 * feature bits.
 *
 * @param ecx_val Sub-leaf index placed in ECX before CPUID executes.
 * @param eax     Receives EAX; must not be NULL.
 * @param ebx     Receives EBX; must not be NULL.
 * @param ecx     Receives ECX; must not be NULL.
 * @param edx     Receives EDX; must not be NULL.
 */
static inline void ck_cpuid_leaf7(uint32_t ecx_val, uint32_t *eax,
                                  uint32_t *ebx, uint32_t *ecx,
                                  uint32_t *edx) {
    uint32_t max_leaf = 0, ign_b = 0, ign_c = 0, ign_d = 0;
    uint32_t a = 0, b = 0, c = 0, d = 0;

    __cpuid(0, max_leaf, ign_b, ign_c, ign_d);
    (void)ign_b;
    (void)ign_c;
    (void)ign_d;

    if (max_leaf >= 7) {
        __cpuid_count(7, ecx_val, a, b, c, d);
    }

    *eax = a;
    *ebx = b;
    *ecx = c;
    *edx = d;
}
106 
/**
 * @brief Check whether the OS has enabled the AMX tile state (XTILE).
 *
 * Two steps are required:
 *   1. CPUID leaf 1 must report XSAVE (ECX bit 26) and OSXSAVE (ECX bit 27).
 *      OSXSAVE only says that XGETBV may be executed; it does not say which
 *      state components the OS actually saves and restores.
 *   2. XCR0, read with XGETBV, must have both XTILECFG (bit 17) and
 *      XTILEDATA (bit 18) set.
 *
 * Checking only step 1 reports "supported" on every XSAVE-capable machine,
 * including ones with no AMX at all.
 *
 * NOTE(review): some operating systems additionally require a process to
 * request permission before using tile data; that is not checked here -
 * confirm against the target OS.
 *
 * @return 1 if the tile state is enabled in XCR0, 0 otherwise.
 */
static inline int ck_os_has_xtile(void) {
    const uint32_t cpuid_xsave   = UINT32_C(1) << 26;
    const uint32_t cpuid_osxsave = UINT32_C(1) << 27;
    const uint32_t xcr0_tile     = (UINT32_C(1) << 17) | (UINT32_C(1) << 18);

    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
    __cpuid(1, eax, ebx, ecx, edx);
    (void)eax;
    (void)ebx;
    (void)edx;

    /* XGETBV faults unless the OS has set OSXSAVE, so bail out first. */
    if (!(ecx & cpuid_xsave) || !(ecx & cpuid_osxsave)) {
        return 0;
    }

    uint32_t xcr0_lo = 0, xcr0_hi = 0;
    __asm__ __volatile__("xgetbv"
                         : "=a"(xcr0_lo), "=d"(xcr0_hi)
                         : "c"(0u));
    (void)xcr0_hi;

    return (xcr0_lo & xcr0_tile) == xcr0_tile;
}
114 #endif
115 
116 /*============================================================================
117  * Feature Flag Macros
118  * Use these in dispatch functions instead of CPU model checks
119  *============================================================================*/
120 
/* Widest vector width, in bits, among the CK_HAS_* features detected at
 * compile time.  CK_HAS_BEST_VECTOR is 1 when one of the listed SIMD
 * features was found and 0 when only the scalar fallback applies. */
#if defined(CK_HAS_AMX) || defined(CK_HAS_AVX512)
#  define CK_VECTOR_WIDTH 512
#  define CK_HAS_BEST_VECTOR 1
#elif defined(CK_HAS_AVX2_FMA) || defined(CK_HAS_AVX)
#  define CK_VECTOR_WIDTH 256
#  define CK_HAS_BEST_VECTOR 1
#elif defined(CK_HAS_NEON)
#  define CK_VECTOR_WIDTH 128
#  define CK_HAS_BEST_VECTOR 1
#else
#  define CK_VECTOR_WIDTH 32 /* Scalar fallback */
#  define CK_HAS_BEST_VECTOR 0
#endif
141 
/* AI acceleration: 1 when AMX, AVX-512 VNNI or VNNI was detected at
 * compile time, 0 otherwise.  Always defined, so it can be used in both
 * #if expressions and ordinary C expressions. */
#if defined(CK_HAS_AMX) || defined(CK_HAS_AVX512VNNI) || defined(CK_HAS_VNNI)
#  define CK_HAS_AI_ACCEL 1
#else
#  define CK_HAS_AI_ACCEL 0
#endif
152 
153 /*============================================================================
154  * Kernel Dispatch Macros
155  * Use these for clean dispatch in kernel functions
156  *============================================================================*/
157 
/**
 * @brief Forward a call to the GEMM kernel chosen at compile time.
 *
 * The kernel is picked in this order: gemm_amx, gemm_avx512, gemm_avx2,
 * gemm_avx, and finally gemm_ref when none of the matching CK_HAS_*
 * macros is defined.  All arguments are passed through untouched.
 *
 * Usage:
 *   CK_GEMM_DISPATCH(y, W, x, M, K);
 */
#if defined(CK_HAS_AMX)
#  define CK_GEMM_IMPL gemm_amx
#elif defined(CK_HAS_AVX512)
#  define CK_GEMM_IMPL gemm_avx512
#elif defined(CK_HAS_AVX2_FMA)
#  define CK_GEMM_IMPL gemm_avx2
#elif defined(CK_HAS_AVX)
#  define CK_GEMM_IMPL gemm_avx
#else
#  define CK_GEMM_IMPL gemm_ref
#endif

#define CK_GEMM_DISPATCH(...) CK_GEMM_IMPL(__VA_ARGS__)
176 
/**
 * @brief Forward a call to the GEMV kernel chosen at compile time.
 *
 * The kernel is picked in this order: gemv_amx, gemv_avx512, gemv_avx2,
 * gemv_avx, and finally gemv_ref when none of the matching CK_HAS_*
 * macros is defined.  All arguments are passed through untouched.
 */
#if defined(CK_HAS_AMX)
#  define CK_GEMV_IMPL gemv_amx
#elif defined(CK_HAS_AVX512)
#  define CK_GEMV_IMPL gemv_avx512
#elif defined(CK_HAS_AVX2_FMA)
#  define CK_GEMV_IMPL gemv_avx2
#elif defined(CK_HAS_AVX)
#  define CK_GEMV_IMPL gemv_avx
#else
#  define CK_GEMV_IMPL gemv_ref
#endif

#define CK_GEMV_DISPATCH(...) CK_GEMV_IMPL(__VA_ARGS__)
191 
/**
 * @brief Forward a call to the quantized GEMM kernel chosen at compile time.
 *
 * Intended for INT8/INT4 quantization with VNNI/AMX acceleration.  The
 * kernel is picked in this order: qgemm_amx, qgemm_avx512vnni, qgemm_vnni,
 * qgemm_avx2, and finally qgemm_ref when none of the matching CK_HAS_*
 * macros is defined.  All arguments are passed through untouched.
 */
#if defined(CK_HAS_AMX)
#  define CK_QGEMM_IMPL qgemm_amx
#elif defined(CK_HAS_AVX512VNNI)
#  define CK_QGEMM_IMPL qgemm_avx512vnni
#elif defined(CK_HAS_VNNI)
#  define CK_QGEMM_IMPL qgemm_vnni
#elif defined(CK_HAS_AVX2_FMA)
#  define CK_QGEMM_IMPL qgemm_avx2
#else
#  define CK_QGEMM_IMPL qgemm_ref
#endif

#define CK_QGEMM_DISPATCH(...) CK_QGEMM_IMPL(__VA_ARGS__)
207 
208 /*============================================================================
209  * Capability Reporting
210  *============================================================================*/
211 
/**
 * @brief CPU capability information structure
 */
typedef struct {
    const char *name;        /* Human-readable label of the selected feature set */
    int width;               /* Vector width in bits */
    int has_fma;             /* Fused multiply-add */
    int has_ai_accel;        /* AI-specific instructions (VNNI/AMX) */
    const char *best_kernel; /* Recommended kernel name */
} ck_capability_t;

/**
 * @brief Get current platform capabilities
 *
 * The result is fixed at compile time: it reflects which CK_HAS_* macros
 * were defined when this header was compiled, not what the running CPU
 * reports.  Exactly one feature set is described, chosen in the order
 * AMX, AVX-512, AVX2+FMA, AVX, NEON, AltiVec; if none is defined the
 * "unknown" / gemm_ref defaults are returned.
 *
 * has_ai_accel is 1 only when AMX, AVX-512 VNNI or VNNI was detected, so
 * plain AVX-512 without VNNI reports 0 and AVX2 with VNNI reports 1.
 *
 * Declared static inline so that including this header in a translation
 * unit that never calls the function does not trigger an unused-function
 * warning.
 *
 * NOTE(review): best_kernel may name gemm_neon or gemm_altivec, which the
 * CK_GEMM_DISPATCH macro never selects - confirm those kernels exist.
 *
 * @return Capability record; the string members point to string literals
 *         and must not be freed or modified.
 */
static inline ck_capability_t ck_get_capabilities(void) {
    ck_capability_t cap = {
        .name = "unknown",
        .width = 32,
        .has_fma = 0,
        .has_ai_accel = 0,
        .best_kernel = "gemm_ref"
    };

#if defined(CK_HAS_AMX)
    cap.name = "AMX (Intel Sapphire Rapids+)";
    cap.width = 512;
    cap.has_fma = 1;
    cap.best_kernel = "gemm_amx";
#elif defined(CK_HAS_AVX512)
    cap.name = "AVX-512 (Intel Skylake-X+)";
    cap.width = 512;
    cap.has_fma = 1;
    cap.best_kernel = "gemm_avx512";
#elif defined(CK_HAS_AVX2_FMA)
    cap.name = "AVX2+FMA (Intel Haswell+)";
    cap.width = 256;
    cap.has_fma = 1;
    cap.best_kernel = "gemm_avx2";
#elif defined(CK_HAS_AVX)
    cap.name = "AVX (Intel Sandy Bridge+)";
    cap.width = 256;
    cap.has_fma = 0;
    cap.best_kernel = "gemm_avx";
#elif defined(CK_HAS_NEON)
    cap.name = "NEON (ARM)";
    cap.width = 128;
    cap.has_fma = 1;
    cap.best_kernel = "gemm_neon";
#elif defined(CK_HAS_ALTIVEC)
    cap.name = "AltiVec (PowerPC)";
    cap.width = 128;
    cap.has_fma = 1;
    cap.best_kernel = "gemm_altivec";
#endif

    /* Same condition the header uses for CK_HAS_AI_ACCEL, repeated here so
     * the flag cannot drift from the detected instruction sets. */
#if defined(CK_HAS_AMX) || defined(CK_HAS_AVX512VNNI) || defined(CK_HAS_VNNI)
    cap.has_ai_accel = 1;
#endif

    return cap;
}
275 
276 #endif /* CK_FEATURES_H */
static ck_capability_t ck_get_capabilities(void)
Get current platform capabilities.
Definition: ck_features.h:226
CPU capability information structure.
Definition: ck_features.h:215
const char * best_kernel
Definition: ck_features.h:220
const char * name
Definition: ck_features.h:216