← Back to C-Kernel-Engine Docs Doxygen Source Documentation
cpu_features.c
Go to the documentation of this file.
1 /**
2  * CPU Feature Detection and Cache-Aware Parameter Tuning
3  *
4  * Detects CPU features, cache sizes, and core counts at runtime.
5  * Computes optimal GEMM blocking parameters based on actual hardware.
6  */
7 
8 #include "cpu_features.h"
9 #include <stdio.h>
10 #include <string.h>
11 #include <stdlib.h>
12 
#ifdef _WIN32
#include <windows.h>
#include <intrin.h>
#else
#include <unistd.h>
#endif

#ifdef __APPLE__
#include <sys/sysctl.h> /* sysctlbyname() for hw.physicalcpu */
#endif
19 
20 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
21 #define X86_CPU 1
22 #if defined(__GNUC__) || defined(__clang__)
23 #include <cpuid.h>
24 #endif
25 #endif
26 
27 // Global instances
31 
32 // =============================================================================
33 // CPUID helpers for x86
34 // =============================================================================
35 
36 #ifdef X86_CPU
// Execute CPUID for the given leaf/subleaf and return the four result
// registers through the out-pointers. On compilers without a known
// intrinsic, all registers read as zero so callers see "feature absent".
static void cpuid(int leaf, int subleaf, uint32_t* eax, uint32_t* ebx, uint32_t* ecx, uint32_t* edx) {
#if defined(__GNUC__) || defined(__clang__)
    // __cpuid_count is a macro that assigns into its last four lvalue
    // arguments, so passing the dereferenced pointers is correct here.
    __cpuid_count(leaf, subleaf, *eax, *ebx, *ecx, *edx);
#elif defined(_MSC_VER)
    int regs[4];
    __cpuidex(regs, leaf, subleaf);
    *eax = regs[0]; *ebx = regs[1]; *ecx = regs[2]; *edx = regs[3];
#else
    // Unknown compiler: report all-zero registers (no features detected).
    *eax = *ebx = *ecx = *edx = 0;
#endif
}
48 
// Query CPUID for the SIMD features the GEMM kernels select on.
// Bit positions per the Intel SDM: leaf 1 ECX[28] = AVX, ECX[12] = FMA;
// leaf 7 subleaf 0 EBX[5] = AVX2, EBX[16] = AVX-512 Foundation.
// NOTE(review): this trusts the CPU feature bits alone; strictly, OS
// support (OSXSAVE + XGETBV/XCR0 state) should also be verified before
// executing AVX/AVX-512 code -- confirm the kernels account for this.
static void detect_x86_features(CPUInfo* info) {
    uint32_t eax, ebx, ecx, edx;

    // Check max CPUID leaf
    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
    uint32_t max_leaf = eax;

    if (max_leaf >= 1) {
        cpuid(1, 0, &eax, &ebx, &ecx, &edx);
        info->has_avx = (ecx >> 28) & 1;
        info->has_fma = (ecx >> 12) & 1;
    }

    if (max_leaf >= 7) {
        cpuid(7, 0, &eax, &ebx, &ecx, &edx);
        info->has_avx2 = (ebx >> 5) & 1;
        info->has_avx512f = (ebx >> 16) & 1;
    }
}
68 
69 // Detect cache sizes using CPUID leaf 0x04 (Intel) or leaf 0x8000001D (AMD)
70 static void detect_x86_cache_sizes(CPUInfo* info) {
71  uint32_t eax, ebx, ecx, edx;
72 
73  // Try Intel deterministic cache parameters (leaf 0x04)
74  for (int index = 0; index < 16; index++) {
75  cpuid(0x04, index, &eax, &ebx, &ecx, &edx);
76 
77  int cache_type = eax & 0x1F;
78  if (cache_type == 0) break; // No more caches
79 
80  int cache_level = (eax >> 5) & 0x7;
81  int line_size = (ebx & 0xFFF) + 1;
82  int partitions = ((ebx >> 12) & 0x3FF) + 1;
83  int ways = ((ebx >> 22) & 0x3FF) + 1;
84  int sets = ecx + 1;
85 
86  size_t cache_size = (size_t)line_size * partitions * ways * sets;
87 
88  if (cache_type == 1 || cache_type == 3) { // Data or unified cache
89  if (cache_level == 1) {
90  info->l1d_size = cache_size;
91  info->l1_line_size = line_size;
92  } else if (cache_level == 2) {
93  info->l2_size = cache_size;
94  } else if (cache_level == 3) {
95  info->l3_size = cache_size;
96  }
97  }
98  }
99 
100  // If Intel method didn't work, try AMD method (leaf 0x8000001D)
101  if (info->l1d_size == 0) {
102  cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
103  if (eax >= 0x8000001D) {
104  for (int index = 0; index < 16; index++) {
105  cpuid(0x8000001D, index, &eax, &ebx, &ecx, &edx);
106 
107  int cache_type = eax & 0x1F;
108  if (cache_type == 0) break;
109 
110  int cache_level = (eax >> 5) & 0x7;
111  int line_size = (ebx & 0xFFF) + 1;
112  int partitions = ((ebx >> 12) & 0x3FF) + 1;
113  int ways = ((ebx >> 22) & 0x3FF) + 1;
114  int sets = ecx + 1;
115 
116  size_t cache_size = (size_t)line_size * partitions * ways * sets;
117 
118  if (cache_type == 1 || cache_type == 3) {
119  if (cache_level == 1) {
120  info->l1d_size = cache_size;
121  info->l1_line_size = line_size;
122  } else if (cache_level == 2) {
123  info->l2_size = cache_size;
124  } else if (cache_level == 3) {
125  info->l3_size = cache_size;
126  }
127  }
128  }
129  }
130  }
131 }
132 #endif
133 
134 // =============================================================================
135 // Linux sysfs fallback for cache detection
136 // =============================================================================
137 
138 #if defined(__linux__)
// Read /sys/devices/system/cpu/cpu<cpu>/cache/index<index>/size and
// return the size in bytes. The kernel formats the value as e.g. "32K"
// or "8M". Returns 0 if the file is missing or unparseable.
static size_t read_sysfs_cache_size(int cpu, int index) {
    char path[256];
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu, index);

    FILE* fp = fopen(path, "r");
    if (!fp) return 0;

    char line[32];
    char* got = fgets(line, sizeof(line), fp);
    fclose(fp);
    if (!got) return 0;

    // Parse "<number><unit>"; a bare number defaults to kilobytes,
    // matching the kernel's usual formatting.
    size_t bytes = 0;
    char suffix = 'K';
    sscanf(line, "%zu%c", &bytes, &suffix);

    switch (suffix) {
    case 'K': case 'k': bytes *= 1024; break;
    case 'M': case 'm': bytes *= 1024 * 1024; break;
    default: break;
    }
    return bytes;
}
163 
// Read the numeric cache level (1, 2, 3, ...) for the given sysfs cache
// index. Returns -1 if the file is missing or does not parse.
static int read_sysfs_cache_level(int cpu, int index) {
    char path[256];
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu, index);

    FILE* fp = fopen(path, "r");
    if (!fp) return -1;

    int level;
    int parsed = fscanf(fp, "%d", &level);
    fclose(fp);
    return (parsed == 1) ? level : -1;
}
177 
178 static void detect_linux_cache_sizes(CPUInfo* info) {
179  // Read from CPU 0's cache info
180  for (int index = 0; index < 10; index++) {
181  size_t size = read_sysfs_cache_size(0, index);
182  if (size == 0) break;
183 
184  int level = read_sysfs_cache_level(0, index);
185 
186  // Check if it's data or unified cache
187  char path[256];
188  snprintf(path, sizeof(path),
189  "/sys/devices/system/cpu/cpu0/cache/index%d/type", index);
190  FILE* f = fopen(path, "r");
191  if (f) {
192  char type[32] = {0};
193  if (fscanf(f, "%31s", type) != 1) type[0] = '\0';
194  fclose(f);
195 
196  if (strcmp(type, "Data") == 0 || strcmp(type, "Unified") == 0) {
197  if (level == 1) info->l1d_size = size;
198  else if (level == 2) info->l2_size = size;
199  else if (level == 3) info->l3_size = size;
200  }
201  }
202  }
203 }
204 
// Estimate the number of physical cores from /proc/cpuinfo.
// Strategy, in order of preference:
//   1. The "cpu cores" field (cores per package) -- returned directly
//      on first sight. NOTE(review): on multi-socket machines this
//      covers a single package only and will undercount -- confirm
//      whether multi-socket targets matter here.
//   2. max("core id") + 1 across all processor entries.
//   3. The raw "processor" entry count (includes hyperthreads).
// Returns at least 1; returns 1 if /proc/cpuinfo cannot be opened.
static int detect_linux_physical_cores(void) {
    // Count unique physical cores by reading core_id
    int max_core_id = -1;
    int num_processors = 0;

    FILE* f = fopen("/proc/cpuinfo", "r");
    if (!f) return 1;

    char line[256];
    while (fgets(line, sizeof(line), f)) {
        // One "processor" line per logical CPU.
        if (strncmp(line, "processor", 9) == 0) {
            num_processors++;
        }
        // "cpu cores : N" -- trust it when present and positive.
        if (strncmp(line, "cpu cores", 9) == 0) {
            int cores = 0;
            sscanf(line, "cpu cores : %d", &cores);
            if (cores > 0) {
                fclose(f);
                return cores;
            }
        }
        // Track the highest core id seen as a secondary estimate.
        if (strncmp(line, "core id", 7) == 0) {
            int core_id = 0;
            sscanf(line, "core id : %d", &core_id);
            if (core_id > max_core_id) max_core_id = core_id;
        }
    }
    fclose(f);

    // Fallback: use processor count (may include hyperthreads)
    if (max_core_id >= 0) {
        return max_core_id + 1;
    }
    return num_processors > 0 ? num_processors : 1;
}
240 #endif
241 
242 // =============================================================================
243 // Physical core detection
244 // =============================================================================
245 
// Best-effort physical core count for the current machine.
// Always returns at least 1.
static int detect_physical_cores(void) {
#if defined(__linux__)
    return detect_linux_physical_cores();
#elif defined(_WIN32)
    SYSTEM_INFO sysinfo;
    GetSystemInfo(&sysinfo);
    // dwNumberOfProcessors counts logical processors; halve it as a
    // hyperthreading estimate. BUG FIX: the old code returned the raw
    // division, which yields 0 on single-logical-CPU systems (and on
    // non-HT CPUs undercounts by 2x -- kept as a deliberate estimate).
    int cores = (int)(sysinfo.dwNumberOfProcessors / 2);
    return cores > 0 ? cores : 1;
#elif defined(__APPLE__)
    // sysctlbyname is declared in <sys/sysctl.h> (added to the include
    // block). On failure, fall back to a single core.
    int cores = 1;
    size_t len = sizeof(cores);
    if (sysctlbyname("hw.physicalcpu", &cores, &len, NULL, 0) != 0 || cores < 1) {
        cores = 1;
    }
    return cores;
#else
    return 1;
#endif
}
263 
264 // =============================================================================
265 // Compute optimal GEMM blocking parameters
266 // =============================================================================
267 
268 static void compute_gemm_params(const CPUInfo* cpu, GEMMParams* params) {
269  // Microkernel sizes based on SIMD width
270  // MUST match compile-time MR_FIXED/NR_FIXED in gemm_microkernel.c
271  if (cpu->has_avx512f) {
272  params->MR = 6; // 6 rows
273  params->NR = 32; // 32 cols (2 x ZMM registers)
274  } else if (cpu->has_fma) {
275  // AVX2+FMA: can use 6x16 with FMA hiding register spilling
276  params->MR = 6; // 6 rows
277  params->NR = 16; // 16 cols (2 x YMM registers)
278  } else if (cpu->has_avx || cpu->has_avx2) {
279  // AVX without FMA: use 4x16 to avoid register spilling
280  params->MR = 4; // 4 rows (reduced to fit in 16 YMM registers)
281  params->NR = 16; // 16 cols (2 x YMM registers)
282  } else {
283  params->MR = 4;
284  params->NR = 4;
285  }
286 
287  // Get cache sizes (use defaults if detection failed)
288  size_t l1 = cpu->l1d_size > 0 ? cpu->l1d_size : 32 * 1024; // Default 32KB
289  size_t l2 = cpu->l2_size > 0 ? cpu->l2_size : 256 * 1024; // Default 256KB
290  size_t l3 = cpu->l3_size > 0 ? cpu->l3_size : 8 * 1024 * 1024; // Default 8MB
291 
292  // BLIS-style blocking parameter computation
293  // Reference: "Anatomy of High-Performance Matrix Multiplication" (Goto & Van de Geijn)
294  //
295  // KC: Controls L1 usage
296  // - A micropanel: MR x KC
297  // - B micropanel: KC x NR (streamed from L2)
298  // - Want MR * KC * sizeof(float) to fit in ~half of L1
299  //
300  // MC: Controls L2 usage
301  // - A block: MC x KC should fit in L2
302  //
303  // NC: Controls L3 usage / main memory streaming
304  // - B panel: KC x NC
305 
306  // KC: Controls L1 usage
307  // For optimal performance, both A micropanel and B row should fit in L1:
308  // - A micropanel: MR * KC floats
309  // - B row for streaming: NR floats per iteration (small)
310  // Use ~25% of L1 for A micropanel to leave room for B and working set
311  size_t l1_for_a = (l1 * 25) / 100;
312  params->KC = (int)(l1_for_a / (params->MR * sizeof(float)));
313 
314  // Round KC to multiple of 8 for alignment
315  params->KC = (params->KC / 8) * 8;
316  if (params->KC < 64) params->KC = 64;
317  if (params->KC > 512) params->KC = 512; // Cap at 512 for better cache fit
318 
319  // MC: A block = MC * KC * 4 bytes should fit in ~80% of L2
320  size_t l2_for_a = (l2 * 80) / 100;
321  params->MC = (int)(l2_for_a / (params->KC * sizeof(float)));
322 
323  // Round MC to multiple of MR
324  params->MC = (params->MC / params->MR) * params->MR;
325  if (params->MC < params->MR * 4) params->MC = params->MR * 4;
326  if (params->MC > 512) params->MC = 512;
327 
328  // NC: B panel = KC * NC * 4 bytes
329  // For L3, we want good streaming, use ~50% of L3 / num_cores
330  size_t l3_per_core = l3 / (cpu->num_cores > 0 ? cpu->num_cores : 1);
331  size_t l3_for_b = (l3_per_core * 50) / 100;
332  params->NC = (int)(l3_for_b / (params->KC * sizeof(float)));
333 
334  // Round NC to multiple of NR
335  params->NC = (params->NC / params->NR) * params->NR;
336  if (params->NC < params->NR * 8) params->NC = params->NR * 8;
337  if (params->NC > 8192) params->NC = 8192;
338 }
339 
340 // =============================================================================
341 // Public API
342 // =============================================================================
343 
344 void cpu_features_init(void) {
345  if (g_cpu_initialized) return;
346 
347  memset(&g_cpu_info, 0, sizeof(g_cpu_info));
348  memset(&g_gemm_params, 0, sizeof(g_gemm_params));
349 
350  // Detect SIMD features
351 #ifdef X86_CPU
352  detect_x86_features(&g_cpu_info);
353  detect_x86_cache_sizes(&g_cpu_info);
354 #endif
355 
356  // Linux sysfs fallback for cache sizes
357 #if defined(__linux__)
358  if (g_cpu_info.l1d_size == 0) {
359  detect_linux_cache_sizes(&g_cpu_info);
360  }
361 #endif
362 
363  // Detect physical cores
365 
366  // Compute GEMM parameters based on detected hardware
368 
369  g_cpu_initialized = 1;
370 }
371 
374  return &g_gemm_params;
375 }
376 
377 const CPUInfo* get_cpu_info(void) {
379  return &g_cpu_info;
380 }
381 
382 void print_cpu_info(void) {
384 
385  printf("=== CPU Info ===\n");
386  printf("Physical cores: %d\n", g_cpu_info.num_cores);
387  printf("L1 Data Cache: %zu KB\n", g_cpu_info.l1d_size / 1024);
388  printf("L2 Cache: %zu KB\n", g_cpu_info.l2_size / 1024);
389  printf("L3 Cache: %zu MB\n", g_cpu_info.l3_size / (1024 * 1024));
390  printf("Cache line: %zu bytes\n", g_cpu_info.l1_line_size);
391  printf("AVX: %s\n", g_cpu_info.has_avx ? "yes" : "no");
392  printf("AVX2: %s\n", g_cpu_info.has_avx2 ? "yes" : "no");
393  printf("AVX-512F: %s\n", g_cpu_info.has_avx512f ? "yes" : "no");
394  printf("FMA: %s\n", g_cpu_info.has_fma ? "yes" : "no");
395  printf("\n=== GEMM Blocking Parameters ===\n");
396  printf("MR (microkernel rows): %d\n", g_gemm_params.MR);
397  printf("NR (microkernel cols): %d\n", g_gemm_params.NR);
398  printf("MC (M block): %d\n", g_gemm_params.MC);
399  printf("NC (N block): %d\n", g_gemm_params.NC);
400  printf("KC (K block): %d\n", g_gemm_params.KC);
401  printf("\n");
402 }
const CPUInfo * get_cpu_info(void)
Definition: cpu_features.c:377
void print_cpu_info(void)
Definition: cpu_features.c:382
static void compute_gemm_params(const CPUInfo *cpu, GEMMParams *params)
Definition: cpu_features.c:268
CPUInfo g_cpu_info
Definition: cpu_features.c:28
static int detect_physical_cores(void)
Definition: cpu_features.c:246
int g_cpu_initialized
Definition: cpu_features.c:30
void cpu_features_init(void)
Definition: cpu_features.c:344
const GEMMParams * get_gemm_params(void)
Definition: cpu_features.c:372
GEMMParams g_gemm_params
Definition: cpu_features.c:29
size_t l1d_size
Definition: cpu_features.h:21
size_t l3_size
Definition: cpu_features.h:23
int has_avx2
Definition: cpu_features.h:27
int has_fma
Definition: cpu_features.h:29
int has_avx
Definition: cpu_features.h:26
int has_avx512f
Definition: cpu_features.h:28
int num_cores
Definition: cpu_features.h:25
size_t l2_size
Definition: cpu_features.h:22
size_t l1_line_size
Definition: cpu_features.h:24