20 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
22 #if defined(__GNUC__) || defined(__clang__)
/*
 * cpuid: portable wrapper around the x86 CPUID instruction.
 *
 * leaf/subleaf select the query (EAX/ECX inputs); the four output
 * pointers receive EAX..EDX. On unsupported compilers/architectures all
 * four outputs are zeroed, which downstream code reads as "no features".
 *
 * Fixes vs. the truncated original: the MSVC path's `regs` array is
 * declared, the fallback branch is properly closed with #endif, and the
 * GNU branch is additionally gated on x86 (GCC's __cpuid_count exists
 * only there).
 */
static void cpuid(int leaf, int subleaf,
                  uint32_t* eax, uint32_t* ebx, uint32_t* ecx, uint32_t* edx) {
#if (defined(__GNUC__) || defined(__clang__)) && \
    (defined(__x86_64__) || defined(__i386__))
    __cpuid_count(leaf, subleaf, *eax, *ebx, *ecx, *edx);
#elif defined(_MSC_VER)
    int regs[4]; /* __cpuidex writes EAX..EDX into regs[0..3] */
    __cpuidex(regs, leaf, subleaf);
    *eax = (uint32_t)regs[0]; *ebx = (uint32_t)regs[1];
    *ecx = (uint32_t)regs[2]; *edx = (uint32_t)regs[3];
#else
    /* Unknown toolchain: report no CPUID support. */
    *eax = *ebx = *ecx = *edx = 0;
#endif
}
/*
 * Detect x86 ISA features via CPUID and record them on `info`.
 * Leaf 0 reports the highest standard leaf; leaf 1 ECX carries the AVX
 * (bit 28) and FMA (bit 12) feature flags.
 * NOTE(review): the lines following the leaf-7 query are elided from
 * this chunk — confirm in the full file that leaf 7 is only issued when
 * max_leaf >= 7, and which CPUInfo fields its EBX/ECX bits populate.
 */
49 static void detect_x86_features(
CPUInfo* info) {
50 uint32_t eax, ebx, ecx, edx;
/* Leaf 0: EAX = highest supported standard CPUID leaf. */
53 cpuid(0, 0, &eax, &ebx, &ecx, &edx);
54 uint32_t max_leaf = eax;
/* Leaf 1: baseline feature flags live in ECX/EDX. */
57 cpuid(1, 0, &eax, &ebx, &ecx, &edx);
58 info->
has_avx = (ecx >> 28) & 1;
59 info->
has_fma = (ecx >> 12) & 1;
/* Leaf 7, subleaf 0: extended features (AVX2 / AVX-512 family). */
63 cpuid(7, 0, &eax, &ebx, &ecx, &edx);
70 static void detect_x86_cache_sizes(
CPUInfo* info) {
71 uint32_t eax, ebx, ecx, edx;
74 for (
int index = 0; index < 16; index++) {
75 cpuid(0x04, index, &eax, &ebx, &ecx, &edx);
77 int cache_type = eax & 0x1F;
78 if (cache_type == 0)
break;
80 int cache_level = (eax >> 5) & 0x7;
81 int line_size = (ebx & 0xFFF) + 1;
82 int partitions = ((ebx >> 12) & 0x3FF) + 1;
83 int ways = ((ebx >> 22) & 0x3FF) + 1;
86 size_t cache_size = (size_t)line_size * partitions * ways * sets;
88 if (cache_type == 1 || cache_type == 3) {
89 if (cache_level == 1) {
92 }
else if (cache_level == 2) {
94 }
else if (cache_level == 3) {
102 cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
103 if (eax >= 0x8000001D) {
104 for (
int index = 0; index < 16; index++) {
105 cpuid(0x8000001D, index, &eax, &ebx, &ecx, &edx);
107 int cache_type = eax & 0x1F;
108 if (cache_type == 0)
break;
110 int cache_level = (eax >> 5) & 0x7;
111 int line_size = (ebx & 0xFFF) + 1;
112 int partitions = ((ebx >> 12) & 0x3FF) + 1;
113 int ways = ((ebx >> 22) & 0x3FF) + 1;
116 size_t cache_size = (size_t)line_size * partitions * ways * sets;
118 if (cache_type == 1 || cache_type == 3) {
119 if (cache_level == 1) {
122 }
else if (cache_level == 2) {
124 }
else if (cache_level == 3) {
138 #if defined(__linux__)
/*
 * Read /sys/devices/system/cpu/cpu<cpu>/cache/index<index>/size, which
 * holds a human-readable size such as "32K" or "8M".
 *
 * Returns the size in bytes, or 0 when the file is missing or
 * unparsable.
 *
 * Fixes: `unit` is initialized (sscanf may match only the number,
 * leaving it untouched — reading it then would be UB), the sscanf
 * result is checked, and the stream is closed on every path.
 */
static size_t read_sysfs_cache_size(int cpu, int index) {
    char path[256];
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu, index);

    FILE* f = fopen(path, "r");
    if (!f) return 0;

    char buf[64];
    if (!fgets(buf, sizeof(buf), f)) {
        fclose(f);
        return 0;
    }
    fclose(f);

    size_t size = 0;
    char unit = '\0'; /* stays NUL if the value has no unit suffix */
    if (sscanf(buf, "%zu%c", &size, &unit) < 1) return 0;

    if (unit == 'K' || unit == 'k') size *= 1024;
    else if (unit == 'M' || unit == 'm') size *= 1024 * 1024;
    return size;
}
/*
 * Read /sys/devices/system/cpu/cpu<cpu>/cache/index<index>/level.
 *
 * Returns the cache level (1, 2, 3, ...) or -1 when the file is missing
 * or unparsable. The stream is closed on every path.
 */
static int read_sysfs_cache_level(int cpu, int index) {
    char path[256];
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu, index);

    FILE* f = fopen(path, "r");
    if (!f) return -1;

    int level = -1;
    if (fscanf(f, "%d", &level) != 1) level = -1;
    fclose(f);
    return level;
}
178 static void detect_linux_cache_sizes(
CPUInfo* info) {
180 for (
int index = 0; index < 10; index++) {
181 size_t size = read_sysfs_cache_size(0, index);
182 if (size == 0)
break;
184 int level = read_sysfs_cache_level(0, index);
188 snprintf(path,
sizeof(path),
189 "/sys/devices/system/cpu/cpu0/cache/index%d/type", index);
190 FILE* f = fopen(path,
"r");
193 if (fscanf(f,
"%31s", type) != 1) type[0] =
'\0';
196 if (strcmp(type,
"Data") == 0 || strcmp(type,
"Unified") == 0) {
197 if (level == 1) info->
l1d_size = size;
198 else if (level == 2) info->
l2_size = size;
199 else if (level == 3) info->
l3_size = size;
/*
 * Count physical cores by parsing /proc/cpuinfo.
 *
 * Preference order:
 *   1. the "cpu cores" field (physical cores per package),
 *   2. max observed "core id" + 1,
 *   3. the number of "processor" entries (logical CPUs),
 *   4. 1 as a last resort (also when /proc/cpuinfo cannot be opened).
 *
 * NOTE(review): reconstructed from a truncated original — the early
 * return on "cpu cores" matches the visible control flow but the elided
 * lines should be checked; it undercounts multi-socket systems.
 */
static int detect_linux_physical_cores(void) {
    int max_core_id = -1;
    int num_processors = 0;

    FILE* f = fopen("/proc/cpuinfo", "r");
    if (!f) return 1;

    char line[512];
    while (fgets(line, sizeof(line), f)) {
        if (strncmp(line, "processor", 9) == 0) {
            num_processors++;
        } else if (strncmp(line, "cpu cores", 9) == 0) {
            int cores = 0;
            if (sscanf(line, "cpu cores : %d", &cores) == 1 && cores > 0) {
                fclose(f);
                return cores;
            }
        } else if (strncmp(line, "core id", 7) == 0) {
            int core_id = -1;
            sscanf(line, "core id : %d", &core_id);
            if (core_id > max_core_id) max_core_id = core_id;
        }
    }
    fclose(f);

    if (max_core_id >= 0) {
        return max_core_id + 1;
    }
    return num_processors > 0 ? num_processors : 1;
}
/*
 * Per-OS dispatch for physical-core detection (fragment — the enclosing
 * detect_physical_cores() opening, the Windows SYSTEM_INFO declaration,
 * and the tail of the Apple branch are elided from this chunk).
 */
247 #if defined(__linux__)
248 return detect_linux_physical_cores();
249 #elif defined(_WIN32)
/* NOTE(review): dwNumberOfProcessors counts *logical* CPUs; dividing by
 * 2 assumes SMT is enabled and undercounts machines without it.
 * GetLogicalProcessorInformation gives the true physical count. */
251 GetSystemInfo(&sysinfo);
253 return sysinfo.dwNumberOfProcessors / 2;
254 #elif defined(__APPLE__)
/* macOS exposes the physical core count directly via sysctl. */
256 size_t len =
sizeof(cores);
257 sysctlbyname(
"hw.physicalcpu", &cores, &len, NULL, 0);
/*
 * GEMM blocking-parameter heuristics (fragment of compute_gemm_params —
 * the lines deriving l1, l2, l3_per_core and the MR/NR register-tile
 * defaults are elided from this chunk).
 *
 * KC: depth of the packed A panel. Sized so an MR x KC float panel
 * fills ~25% of L1, rounded down to a multiple of 8, clamped to
 * [64, 512].
 */
311 size_t l1_for_a = (l1 * 25) / 100;
312 params->
KC = (int)(l1_for_a / (params->
MR *
sizeof(
float)));
315 params->
KC = (params->
KC / 8) * 8;
316 if (params->
KC < 64) params->
KC = 64;
317 if (params->
KC > 512) params->
KC = 512;
/*
 * MC: rows of the packed A block. Sized so an MC x KC float block
 * fills ~80% of L2, rounded down to a multiple of MR, kept in
 * [MR*4, 512].
 */
320 size_t l2_for_a = (l2 * 80) / 100;
321 params->
MC = (int)(l2_for_a / (params->
KC *
sizeof(
float)));
324 params->
MC = (params->
MC / params->
MR) * params->
MR;
325 if (params->
MC < params->
MR * 4) params->
MC = params->
MR * 4;
326 if (params->
MC > 512) params->
MC = 512;
/*
 * NC: columns of the packed B block. Sized so a KC x NC float block
 * fills ~50% of the per-core L3 share, rounded down to a multiple of
 * NR, kept in [NR*8, 8192].
 * NOTE(review): the divisions assume MR and NR are non-zero — the KC
 * clamp above guarantees KC >= 64, but MR/NR come from elided code;
 * confirm they are validated there.
 */
331 size_t l3_for_b = (l3_per_core * 50) / 100;
332 params->
NC = (int)(l3_for_b / (params->
KC *
sizeof(
float)));
335 params->
NC = (params->
NC / params->
NR) * params->
NR;
336 if (params->
NC < params->
NR * 8) params->
NC = params->
NR * 8;
337 if (params->
NC > 8192) params->
NC = 8192;
357 #if defined(__linux__)
385 printf(
"=== CPU Info ===\n");
395 printf(
"\n=== GEMM Blocking Parameters ===\n");
const CPUInfo * get_cpu_info(void)
void print_cpu_info(void)
static void compute_gemm_params(const CPUInfo *cpu, GEMMParams *params)
static int detect_physical_cores(void)
void cpu_features_init(void)
const GEMMParams * get_gemm_params(void)