18 #include <sys/utsname.h>
19 #include <sys/sysinfo.h>
31 char *
end = str + strlen(str) - 1;
32 while (
end > str && isspace(*
end)) *
end-- =
'\0';
39 FILE *f = fopen(path,
"r");
41 if (!fgets(buf, buf_size, f)) {
59 return strtoull(buf, NULL, 10);
62 static int run_command(
const char *cmd,
char *output,
size_t output_size) {
63 FILE *fp = popen(cmd,
"r");
67 while (total < output_size - 1) {
68 size_t n = fread(output + total, 1, output_size - 1 - total, fp);
73 int status = pclose(fp);
74 return WEXITSTATUS(status);
79 for (
const char *p = hex_mask; *p; p++) {
80 if (*p ==
',' || *p ==
'\n')
continue;
82 if (*p >=
'0' && *p <=
'9') val = *p -
'0';
83 else if (*p >=
'a' && *p <=
'f') val = *p -
'a' + 10;
84 else if (*p >=
'A' && *p <=
'F') val = *p -
'A' + 10;
85 while (val) { count += val & 1; val >>= 1; }
93 if (!flags || !flag)
return 0;
94 size_t flag_len = strlen(flag);
95 const char *p = flags;
96 while ((p = strstr(p, flag)) != NULL) {
98 int start_ok = (p == flags) || (*(p - 1) ==
' ');
100 char after = *(p + flag_len);
101 int end_ok = (after ==
'\0') || (after ==
' ') || (after ==
'\n');
102 if (start_ok && end_ok)
return 1;
113 memset(cpu, 0,
sizeof(*cpu));
115 FILE *f = fopen(
"/proc/cpuinfo",
"r");
119 int processor_count = 0;
120 int physical_id_max = -1;
121 int core_id_max = -1;
123 while (fgets(line,
sizeof(line), f)) {
124 char *colon = strchr(line,
':');
125 if (!colon)
continue;
128 char *value = colon + 1;
133 if (strcmp(key,
"processor") == 0) {
135 }
else if (strcmp(key,
"model name") == 0 && cpu->
model_name[0] ==
'\0') {
137 }
else if (strcmp(key,
"vendor_id") == 0 && cpu->
vendor[0] ==
'\0') {
139 }
else if (strcmp(key,
"cpu family") == 0 && cpu->
family == 0) {
140 cpu->
family = atoi(value);
141 }
else if (strcmp(key,
"model") == 0 && cpu->
model == 0) {
142 cpu->
model = atoi(value);
143 }
else if (strcmp(key,
"stepping") == 0 && cpu->
stepping == 0) {
145 }
else if (strcmp(key,
"cpu MHz") == 0 && cpu->
base_freq_mhz == 0) {
147 }
else if (strcmp(key,
"physical id") == 0) {
148 int id = atoi(value);
149 if (
id > physical_id_max) physical_id_max =
id;
150 }
else if (strcmp(key,
"core id") == 0) {
151 int id = atoi(value);
152 if (
id > core_id_max) core_id_max =
id;
153 }
else if (strcmp(key,
"flags") == 0) {
172 cpu->
sockets = physical_id_max + 1;
176 int cores_per_socket =
read_file_int(
"/sys/devices/system/cpu/cpu0/topology/core_cpus_list");
177 if (cores_per_socket < 0) {
184 int unique_cores = 0;
188 snprintf(path,
sizeof(path),
189 "/sys/devices/system/cpu/cpu%d/topology/core_id", i);
191 if (core_id >= 0 && core_id <
MAX_CPUS && !seen_cores[core_id]) {
192 seen_cores[core_id] = 1;
203 int max_freq =
read_file_int(
"/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq");
238 int type_order_a = (strcmp(ca->
type,
"Data") == 0) ? 0 :
239 (strcmp(ca->
type,
"Instruction") == 0) ? 1 : 2;
240 int type_order_b = (strcmp(cb->
type,
"Data") == 0) ? 0 :
241 (strcmp(cb->
type,
"Instruction") == 0) ? 1 : 2;
243 return type_order_a - type_order_b;
247 memset(cache, 0,
sizeof(*cache));
249 const char *base =
"/sys/devices/system/cpu/cpu0/cache";
250 DIR *dir = opendir(base);
253 struct dirent *entry;
254 while ((entry = readdir(dir)) != NULL) {
255 if (strncmp(entry->d_name,
"index", 5) != 0)
continue;
260 snprintf(path,
sizeof(path),
"%s/%s/level", base, entry->d_name);
263 snprintf(path,
sizeof(path),
"%s/%s/type", base, entry->d_name);
266 snprintf(path,
sizeof(path),
"%s/%s/size", base, entry->d_name);
272 snprintf(path,
sizeof(path),
"%s/%s/coherency_line_size", base, entry->d_name);
275 snprintf(path,
sizeof(path),
"%s/%s/ways_of_associativity", base, entry->d_name);
278 snprintf(path,
sizeof(path),
"%s/%s/shared_cpu_map", base, entry->d_name);
284 if (ci->
level == 3) {
304 memset(numa, 0,
sizeof(*numa));
306 const char *base =
"/sys/devices/system/node";
307 DIR *dir = opendir(base);
314 if (sysinfo(&si) == 0) {
321 struct dirent *entry;
322 while ((entry = readdir(dir)) != NULL) {
323 if (strncmp(entry->d_name,
"node", 4) != 0)
continue;
324 if (!isdigit(entry->d_name[4]))
continue;
326 int node_id = atoi(entry->d_name + 4);
335 snprintf(path,
sizeof(path),
"%s/%s/meminfo", base, entry->d_name);
336 FILE *f = fopen(path,
"r");
339 while (fgets(line,
sizeof(line), f)) {
341 if (sscanf(line,
"Node %*d MemTotal: %lu kB", &val) == 1) {
343 }
else if (sscanf(line,
"Node %*d MemFree: %lu kB", &val) == 1) {
351 snprintf(path,
sizeof(path),
"%s/%s/cpulist", base, entry->d_name);
356 char *
token = strtok_r(cpulist,
",", &saveptr);
363 }
else if (sscanf(
token,
"%d", &
start) == 1) {
366 token = strtok_r(NULL,
",", &saveptr);
376 snprintf(path,
sizeof(path),
"%s/node0/distance", base);
380 char *
token = strtok_r(dist_str,
" ", &saveptr);
382 while (
token && col < numa->num_nodes) {
384 token = strtok_r(NULL,
" ", &saveptr);
395 #include <sys/time.h>
401 snprintf(path,
sizeof(path),
"/sys/devices/system/cpu/cpu%d/node0", cpu);
404 for (
int node = 0; node < 16; node++) {
405 snprintf(path,
sizeof(path),
"/sys/devices/system/node/node%d/cpu%d", node, cpu);
406 if (access(path, F_OK) == 0) {
415 return sched_getcpu();
429 const size_t SIZE = 256 * 1024 * 1024;
430 const size_t COUNT = SIZE /
sizeof(double);
431 const int ITERATIONS = 3;
437 if (numa_node_out) *numa_node_out = numa_node;
445 int num_threads = omp_get_max_threads();
446 if (num_threads_out) *num_threads_out = num_threads;
448 if (num_threads_out) *num_threads_out = 1;
452 double *a = NULL, *b = NULL, *c = NULL;
453 if (posix_memalign((
void**)&a, 64, SIZE) != 0 ||
454 posix_memalign((
void**)&b, 64, SIZE) != 0 ||
455 posix_memalign((
void**)&c, 64, SIZE) != 0) {
465 for (
size_t i = 0; i < COUNT; i++) {
472 for (
size_t i = 0; i < COUNT; i++) {
479 const double scalar = 3.0;
482 clock_gettime(CLOCK_MONOTONIC, &
start);
484 for (
int iter = 0; iter < ITERATIONS; iter++) {
485 #pragma omp parallel for schedule(static)
486 for (
size_t i = 0; i < COUNT; i++) {
487 c[i] = a[i] + scalar * b[i];
489 __asm__
volatile(
"" :::
"memory");
492 clock_gettime(CLOCK_MONOTONIC, &
end);
495 volatile double sum = 0;
496 for (
size_t i = 0; i < COUNT; i += COUNT/10) {
506 double elapsed_sec = (
end.tv_sec -
start.tv_sec) +
507 (
end.tv_nsec -
start.tv_nsec) / 1e9;
510 double total_bytes = (double)SIZE * 3.0 * ITERATIONS;
511 double bandwidth_gbs = (total_bytes / elapsed_sec) / (1024.0 * 1024.0 * 1024.0);
513 return (
float)bandwidth_gbs;
524 if (measured_bw_gbs <= 0 || memory_speed_mhz <= 0)
return 0;
528 float bw_per_channel = (memory_speed_mhz * 8.0f) / 1000.0f;
535 float efficiency = 0.75f;
538 float estimated_channels = measured_bw_gbs / (bw_per_channel * efficiency);
541 if (estimated_channels < 1.3f)
return 1;
542 if (estimated_channels < 2.5f)
return 2;
543 if (estimated_channels < 3.5f)
return 3;
544 if (estimated_channels < 5.0f)
return 4;
545 if (estimated_channels < 7.0f)
return 6;
554 memset(mem, 0,
sizeof(*mem));
557 FILE *f = fopen(
"/proc/meminfo",
"r");
560 while (fgets(line,
sizeof(line), f)) {
562 if (sscanf(line,
"MemTotal: %lu kB", &val) == 1) {
564 }
else if (sscanf(line,
"MemAvailable: %lu kB", &val) == 1) {
566 }
else if (sscanf(line,
"Cached: %lu kB", &val) == 1) {
575 if (
run_command(
"dmidecode -t memory 2>/dev/null", output,
sizeof(output)) == 0 &&
576 strlen(output) > 100) {
578 char *line = strtok(output,
"\n");
584 if (strstr(line,
"Memory Device")) {
587 memset(current_slot, 0,
sizeof(*current_slot));
590 }
else if (current_slot) {
595 if (sscanf(line,
"Size: %lu MB", &val) == 1) {
599 }
else if (sscanf(line,
"Size: %lu GB", &val) == 1) {
600 current_slot->
size_mb = val * 1024;
603 }
else if (strstr(line,
"Size: No Module")) {
605 }
else if (sscanf(line,
"Speed: %d MT/s", &ival) == 1 ||
606 sscanf(line,
"Speed: %d MHz", &ival) == 1) {
609 }
else if (sscanf(line,
"Type: %63s", str) == 1) {
610 strncpy(current_slot->
type, str,
sizeof(current_slot->
type) - 1);
614 }
else if (sscanf(line,
"Locator: %63s", str) == 1) {
615 strncpy(current_slot->
locator, str,
sizeof(current_slot->
locator) - 1);
616 }
else if (sscanf(line,
"Rank: %d", &ival) == 1) {
617 current_slot->
rank = ival;
618 }
else if (sscanf(line,
"Data Width: %d bits", &ival) == 1) {
623 line = strtok(NULL,
"\n");
655 float bytes_per_transfer = 8.0f;
725 float per_lane[] = {0, 0.25f, 0.5f, 0.985f, 1.969f, 3.938f, 7.877f};
726 if (gen < 1 || gen > 6) gen = 3;
727 return per_lane[gen] * width;
731 memset(pcie, 0,
sizeof(*pcie));
734 if (
run_command(
"lspci -vvv 2>/dev/null", output,
sizeof(output)) != 0) {
739 char *line = strtok(output,
"\n");
743 if (strlen(line) > 0 && isxdigit(line[0]) && line[2] ==
':') {
746 memset(current, 0,
sizeof(*current));
752 char *name_start = strchr(line,
':');
754 name_start = strchr(name_start + 1,
':');
757 while (*name_start ==
' ') name_start++;
764 current->
is_gpu = (strstr(line,
"VGA") != NULL ||
765 strstr(line,
"3D controller") != NULL ||
766 strstr(line,
"Display") != NULL);
767 current->
is_nic = (strstr(line,
"Ethernet") != NULL ||
768 strstr(line,
"Network") != NULL ||
769 strstr(line,
"InfiniBand") != NULL);
770 current->
is_nvme = (strstr(line,
"Non-Volatile memory") != NULL);
772 }
else if (current) {
774 if (strstr(line,
"LnkCap:")) {
775 char *speed = strstr(line,
"Speed ");
776 char *width = strstr(line,
"Width x");
779 if (sscanf(speed,
"Speed %fGT/s", >s) == 1) {
791 }
else if (strstr(line,
"LnkSta:")) {
792 char *speed = strstr(line,
"Speed ");
793 char *width = strstr(line,
"Width x");
796 if (sscanf(speed,
"Speed %fGT/s", >s) == 1) {
806 sscanf(width,
"Width x%d", ¤t->
link_width);
811 line = strtok(NULL,
"\n");
833 memset(net, 0,
sizeof(*net));
835 const char *base =
"/sys/class/net";
836 DIR *dir = opendir(base);
839 struct dirent *entry;
840 while ((entry = readdir(dir)) != NULL) {
841 if (entry->d_name[0] ==
'.')
continue;
842 if (strcmp(entry->d_name,
"lo") == 0)
continue;
846 memset(nic, 0,
sizeof(*nic));
848 strncpy(nic->
name, entry->d_name,
sizeof(nic->
name) - 1);
853 snprintf(path,
sizeof(path),
"%s/%s/operstate", base, entry->d_name);
856 nic->
is_up = (strcmp(state,
"up") == 0);
860 snprintf(path,
sizeof(path),
"%s/%s/speed", base, entry->d_name);
868 snprintf(path,
sizeof(path),
"%s/%s/mtu", base, entry->d_name);
872 snprintf(path,
sizeof(path),
"%s/%s/address", base, entry->d_name);
876 snprintf(path,
sizeof(path),
"%s/%s/device/driver", base, entry->d_name);
877 char driver_link[512];
878 ssize_t len = readlink(path, driver_link,
sizeof(driver_link) - 1);
880 driver_link[len] =
'\0';
881 char *driver_name = strrchr(driver_link,
'/');
883 strncpy(nic->
driver, driver_name + 1,
sizeof(nic->
driver) - 1);
888 snprintf(path,
sizeof(path),
"/sys/class/infiniband/%s", entry->d_name);
889 if (access(path, F_OK) == 0) {
895 if (strstr(nic->
driver,
"mlx") || strstr(nic->
driver,
"bnxt") ||
896 strstr(nic->
driver,
"qed")) {
902 snprintf(path,
sizeof(path),
"%s/%s/device", base, entry->d_name);
904 len = readlink(path, pci_link,
sizeof(pci_link) - 1);
906 pci_link[len] =
'\0';
907 char *pci = strrchr(pci_link,
'/');
937 memset(aff, 0,
sizeof(*aff));
940 const char *omp_threads = getenv(
"OMP_NUM_THREADS");
947 const char *omp_bind = getenv(
"OMP_PROC_BIND");
955 const char *omp_places = getenv(
"OMP_PLACES");
964 if (sched_getaffinity(0,
sizeof(
mask), &
mask) == 0) {
966 if (CPU_ISSET(i, &
mask)) {
980 memset(topo, 0,
sizeof(*topo));
986 if (uname(&uts) == 0) {
988 "%s %s", uts.sysname, uts.release);
1012 memset(recs, 0,
sizeof(*recs));
1021 strcpy(r->
title,
"Memory Slots Available");
1023 "Only %d of %d memory slots populated. Adding more DIMMs "
1024 "could increase memory bandwidth.",
1027 "Consider adding %d more matching DIMMs for better bandwidth",
1036 strcpy(r->
title,
"Single-Channel Memory");
1038 "Running in single-channel mode significantly reduces memory bandwidth. "
1039 "This will impact training performance.");
1041 "Add a second DIMM in the correct slot to enable dual-channel mode");
1049 strcpy(r->
title,
"OpenMP Affinity Not Set");
1051 "OpenMP thread affinity is not configured. Threads may migrate "
1052 "between cores causing cache misses and NUMA penalties.");
1054 "export OMP_PROC_BIND=close OMP_PLACES=cores");
1062 strcpy(r->
title,
"Slow Network for Distributed Training");
1064 "Maximum network bandwidth is %.2f GB/s. This will be a "
1065 "significant bottleneck for distributed training.",
1068 "Consider upgrading to 10GbE+ or InfiniBand for distributed training");
1076 strcpy(r->
title,
"No RDMA-Capable NICs");
1078 "No RDMA-capable network adapters detected. RDMA enables direct "
1079 "memory access between nodes, reducing latency for gradient sync.");
1081 "Consider Mellanox ConnectX or Intel E810 for RDMA support");
1089 strcpy(r->
title,
"Limited SIMD Support");
1091 "CPU does not support AVX2. Kernel performance will be limited.");
1092 strcpy(r->
action,
"AVX2+ CPUs provide significantly better performance");
1100 strcpy(r->
title,
"Multi-NUMA System Detected");
1102 "System has %d NUMA nodes. Cross-node memory access is slower. "
1103 "Ensure data locality for best performance.",
1106 "Use numactl --localalloc or NUMA-aware memory allocation");
1121 uint64_t model_size_mb) {
1127 return model_size_mb / effective_bw;
int ways_of_associativity
float measured_bandwidth_gbs
float theoretical_bandwidth_gbs
NetworkInterface interfaces[8]
Recommendation recommendations[32]
RecommendationPriority priority
RecommendationCategory category
int topology_estimate_channels_from_bandwidth(float measured_bw_gbs, int memory_speed_mhz, const char *memory_type)
static uint64_t read_file_uint64(const char *path)
int topology_discover_memory(MemoryInfo *mem)
static int cache_compare(const void *a, const void *b)
int topology_discover_pcie(PCIeTopology *pcie)
int topology_discover(SystemTopology *topo)
int topology_discover_cpu(CPUInfo *cpu)
static int get_current_cpu(void)
static int get_numa_node_for_cpu(int cpu)
float topology_measure_memory_bandwidth(void)
int topology_discover_cache(CacheTopology *cache)
static void trim_string(char *str)
static int run_command(const char *cmd, char *output, size_t output_size)
float topology_measure_memory_bandwidth_ex(int *numa_node_out, int *num_threads_out)
static int count_set_bits(const char *hex_mask)
int topology_discover_network(NetworkTopology *net)
float topology_estimate_network_training_time(const NetworkTopology *net, uint64_t model_size_mb)
static int read_file_int(const char *path)
int topology_generate_recommendations(const SystemTopology *topo, RecommendationList *recs)
static float pcie_bandwidth_gbs(int gen, int width)
float topology_estimate_memory_bandwidth(const MemoryInfo *mem)
int topology_discover_affinity(AffinityInfo *aff)
static int has_cpu_flag(const char *flags, const char *flag)
static int read_file_string(const char *path, char *buf, size_t buf_size)
int topology_discover_numa(NUMATopology *numa)
int32_t int32_t int32_t int32_t int32_t mask