← Back to C-Kernel-Engine Docs Doxygen Source Documentation
cpu_features.c File Reference
#include "cpu_features.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>

Go to the source code of this file.

Functions

static void compute_gemm_params (const CPUInfo *cpu, GEMMParams *params)
 
void cpu_features_init (void)
 
static int detect_physical_cores (void)
 
const CPUInfo * get_cpu_info (void)
 
const GEMMParams * get_gemm_params (void)
 
void print_cpu_info (void)
 

Variables

CPUInfo g_cpu_info = {0}
 
int g_cpu_initialized = 0
 
GEMMParams g_gemm_params = {0}
 

Function Documentation

◆ compute_gemm_params()

static void compute_gemm_params ( const CPUInfo * cpu,
GEMMParams * params 
)
static

Definition at line 268 of file cpu_features.c.

268  {
269  // Microkernel sizes based on SIMD width
270  // MUST match compile-time MR_FIXED/NR_FIXED in gemm_microkernel.c
271  if (cpu->has_avx512f) {
272  params->MR = 6; // 6 rows
273  params->NR = 32; // 32 cols (2 x ZMM registers)
274  } else if (cpu->has_fma) {
275  // AVX2+FMA: can use 6x16 with FMA hiding register spilling
276  params->MR = 6; // 6 rows
277  params->NR = 16; // 16 cols (2 x YMM registers)
278  } else if (cpu->has_avx || cpu->has_avx2) {
279  // AVX without FMA: use 4x16 to avoid register spilling
280  params->MR = 4; // 4 rows (reduced to fit in 16 YMM registers)
281  params->NR = 16; // 16 cols (2 x YMM registers)
282  } else {
283  params->MR = 4;
284  params->NR = 4;
285  }
286 
287  // Get cache sizes (use defaults if detection failed)
288  size_t l1 = cpu->l1d_size > 0 ? cpu->l1d_size : 32 * 1024; // Default 32KB
289  size_t l2 = cpu->l2_size > 0 ? cpu->l2_size : 256 * 1024; // Default 256KB
290  size_t l3 = cpu->l3_size > 0 ? cpu->l3_size : 8 * 1024 * 1024; // Default 8MB
291 
292  // BLIS-style blocking parameter computation
293  // Reference: "Anatomy of High-Performance Matrix Multiplication" (Goto & Van de Geijn)
294  //
295  // KC: Controls L1 usage
296  // - A micropanel: MR x KC
297  // - B micropanel: KC x NR (streamed from L2)
298  // - Want MR * KC * sizeof(float) to fit in ~25% of L1 (see KC computation below)
299  //
300  // MC: Controls L2 usage
301  // - A block: MC x KC should fit in L2
302  //
303  // NC: Controls L3 usage / main memory streaming
304  // - B panel: KC x NC
305 
306  // KC: Controls L1 usage
307  // For optimal performance, both A micropanel and B row should fit in L1:
308  // - A micropanel: MR * KC floats
309  // - B row for streaming: NR floats per iteration (small)
310  // Use ~25% of L1 for A micropanel to leave room for B and working set
311  size_t l1_for_a = (l1 * 25) / 100;
312  params->KC = (int)(l1_for_a / (params->MR * sizeof(float)));
313 
314  // Round KC to multiple of 8 for alignment
315  params->KC = (params->KC / 8) * 8;
316  if (params->KC < 64) params->KC = 64;
317  if (params->KC > 512) params->KC = 512; // Cap at 512 for better cache fit
318 
319  // MC: A block = MC * KC * 4 bytes should fit in ~80% of L2
320  size_t l2_for_a = (l2 * 80) / 100;
321  params->MC = (int)(l2_for_a / (params->KC * sizeof(float)));
322 
323  // Round MC to multiple of MR
324  params->MC = (params->MC / params->MR) * params->MR;
325  if (params->MC < params->MR * 4) params->MC = params->MR * 4;
326  if (params->MC > 512) params->MC = 512;
327 
328  // NC: B panel = KC * NC * 4 bytes
329  // For L3, we want good streaming, use ~50% of L3 / num_cores
330  size_t l3_per_core = l3 / (cpu->num_cores > 0 ? cpu->num_cores : 1);
331  size_t l3_for_b = (l3_per_core * 50) / 100;
332  params->NC = (int)(l3_for_b / (params->KC * sizeof(float)));
333 
334  // Round NC to multiple of NR
335  params->NC = (params->NC / params->NR) * params->NR;
336  if (params->NC < params->NR * 8) params->NC = params->NR * 8;
337  if (params->NC > 8192) params->NC = 8192;
338 }
size_t l1d_size
Definition: cpu_features.h:21
size_t l3_size
Definition: cpu_features.h:23
int has_avx2
Definition: cpu_features.h:27
int has_fma
Definition: cpu_features.h:29
int has_avx
Definition: cpu_features.h:26
int has_avx512f
Definition: cpu_features.h:28
int num_cores
Definition: cpu_features.h:25
size_t l2_size
Definition: cpu_features.h:22

References CPUInfo::has_avx, CPUInfo::has_avx2, CPUInfo::has_avx512f, CPUInfo::has_fma, GEMMParams::KC, CPUInfo::l1d_size, CPUInfo::l2_size, CPUInfo::l3_size, GEMMParams::MC, GEMMParams::MR, GEMMParams::NC, GEMMParams::NR, and CPUInfo::num_cores.

Referenced by cpu_features_init().

◆ cpu_features_init()

void cpu_features_init ( void  )

Definition at line 344 of file cpu_features.c.

344  {
345  if (g_cpu_initialized) return;
346 
347  memset(&g_cpu_info, 0, sizeof(g_cpu_info));
348  memset(&g_gemm_params, 0, sizeof(g_gemm_params));
349 
350  // Detect SIMD features
351 #ifdef X86_CPU
352  detect_x86_features(&g_cpu_info);
353  detect_x86_cache_sizes(&g_cpu_info);
354 #endif
355 
356  // Linux sysfs fallback for cache sizes
357 #if defined(__linux__)
358  if (g_cpu_info.l1d_size == 0) {
359  detect_linux_cache_sizes(&g_cpu_info);
360  }
361 #endif
362 
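363  // Detect physical cores
364  g_cpu_info.num_cores = detect_physical_cores();
365 
366  // Compute GEMM parameters based on detected hardware
367  compute_gemm_params(&g_cpu_info, &g_gemm_params);
368 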
369  g_cpu_initialized = 1;
370 }
static void compute_gemm_params(const CPUInfo *cpu, GEMMParams *params)
Definition: cpu_features.c:268
CPUInfo g_cpu_info
Definition: cpu_features.c:28
static int detect_physical_cores(void)
Definition: cpu_features.c:246
int g_cpu_initialized
Definition: cpu_features.c:30
GEMMParams g_gemm_params
Definition: cpu_features.c:29

References compute_gemm_params(), detect_physical_cores(), g_cpu_info, g_cpu_initialized, g_gemm_params, CPUInfo::l1d_size, and CPUInfo::num_cores.

Referenced by get_cpu_info(), get_gemm_params(), and print_cpu_info().

◆ detect_physical_cores()

static int detect_physical_cores ( void  )
static

Definition at line 246 of file cpu_features.c.

246  {
247 #if defined(__linux__)
248  return detect_linux_physical_cores();
249 #elif defined(_WIN32)
250  SYSTEM_INFO sysinfo;
251  GetSystemInfo(&sysinfo);
252  // Windows: this gives logical processors, divide by 2 for HT estimate
253  return sysinfo.dwNumberOfProcessors / 2;
254 #elif defined(__APPLE__)
255  int cores = 1;
256  size_t len = sizeof(cores);
257  sysctlbyname("hw.physicalcpu", &cores, &len, NULL, 0);
258  return cores;
259 #else
260  return 1;
261 #endif
262 }

Referenced by cpu_features_init().

◆ get_cpu_info()

const CPUInfo* get_cpu_info ( void  )

Definition at line 377 of file cpu_features.c.

377  {
378  if (!g_cpu_initialized) cpu_features_init();
379  return &g_cpu_info;
380 }
void cpu_features_init(void)
Definition: cpu_features.c:344

References cpu_features_init(), g_cpu_info, and g_cpu_initialized.

Referenced by gemm_init_threads().

◆ get_gemm_params()

const GEMMParams* get_gemm_params ( void  )

Definition at line 372 of file cpu_features.c.

372  {
373  if (!g_cpu_initialized) cpu_features_init();
374  return &g_gemm_params;
375 }

References cpu_features_init(), g_cpu_initialized, and g_gemm_params.

◆ print_cpu_info()

void print_cpu_info ( void  )

Definition at line 382 of file cpu_features.c.

382  {
383  if (!g_cpu_initialized) cpu_features_init();
384 
385  printf("=== CPU Info ===\n");
386  printf("Physical cores: %d\n", g_cpu_info.num_cores);
387  printf("L1 Data Cache: %zu KB\n", g_cpu_info.l1d_size / 1024);
388  printf("L2 Cache: %zu KB\n", g_cpu_info.l2_size / 1024);
389  printf("L3 Cache: %zu MB\n", g_cpu_info.l3_size / (1024 * 1024));
390  printf("Cache line: %zu bytes\n", g_cpu_info.l1_line_size);
391  printf("AVX: %s\n", g_cpu_info.has_avx ? "yes" : "no");
392  printf("AVX2: %s\n", g_cpu_info.has_avx2 ? "yes" : "no");
393  printf("AVX-512F: %s\n", g_cpu_info.has_avx512f ? "yes" : "no");
394  printf("FMA: %s\n", g_cpu_info.has_fma ? "yes" : "no");
395  printf("\n=== GEMM Blocking Parameters ===\n");
396  printf("MR (microkernel rows): %d\n", g_gemm_params.MR);
397  printf("NR (microkernel cols): %d\n", g_gemm_params.NR);
398  printf("MC (M block): %d\n", g_gemm_params.MC);
399  printf("NC (N block): %d\n", g_gemm_params.NC);
400  printf("KC (K block): %d\n", g_gemm_params.KC);
401  printf("\n");
402 }
size_t l1_line_size
Definition: cpu_features.h:24

References cpu_features_init(), g_cpu_info, g_cpu_initialized, g_gemm_params, CPUInfo::has_avx, CPUInfo::has_avx2, CPUInfo::has_avx512f, CPUInfo::has_fma, GEMMParams::KC, CPUInfo::l1_line_size, CPUInfo::l1d_size, CPUInfo::l2_size, CPUInfo::l3_size, GEMMParams::MC, GEMMParams::MR, GEMMParams::NC, GEMMParams::NR, and CPUInfo::num_cores.

Variable Documentation

◆ g_cpu_info

CPUInfo g_cpu_info = {0}

CPU Feature Detection and Cache-Aware Parameter Tuning

Detects CPU features, cache sizes, and core counts at runtime. Computes optimal GEMM blocking parameters based on actual hardware.

Definition at line 28 of file cpu_features.c.

Referenced by cpu_features_init(), get_cpu_info(), and print_cpu_info().

◆ g_cpu_initialized

int g_cpu_initialized = 0

Definition at line 30 of file cpu_features.c.

Referenced by cpu_features_init(), get_cpu_info(), get_gemm_params(), and print_cpu_info().

◆ g_gemm_params

GEMMParams g_gemm_params = {0}

Definition at line 29 of file cpu_features.c.

Referenced by cpu_features_init(), get_gemm_params(), and print_cpu_info().