← Back to C-Kernel-Engine Docs Doxygen Source Documentation
system_topology.c
Go to the documentation of this file.
1 /*
2  * system_topology.c - System hardware topology discovery implementation
3  *
4  * Probes system hardware via /proc, /sys, and external tools to provide
5  * comprehensive information for distributed training optimization.
6  */
7 
8 #define _GNU_SOURCE
9 #include "system_topology.h"
10 
#include <ctype.h>
#include <dirent.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysinfo.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <unistd.h>
20 
21 #ifdef _OPENMP
22 #include <omp.h>
23 #endif
24 
25 // ═══════════════════════════════════════════════════════════════════════════════
26 // Helper Functions
27 // ═══════════════════════════════════════════════════════════════════════════════
28 
// Strip leading and trailing whitespace from a NUL-terminated string, in place.
//
// str: buffer to modify; a NULL pointer is ignored.
//
// The result is always left at the start of the buffer. A string that is empty
// or consists only of whitespace becomes "".
static void trim_string(char *str) {
    if (!str) return;

    // Work with a length instead of an end pointer so that an empty string
    // never produces a pointer in front of the buffer.
    size_t len = strlen(str);

    // <ctype.h> classifiers require a value representable as unsigned char,
    // hence the casts (plain char may be negative for bytes >= 0x80).
    while (len > 0 && isspace((unsigned char)str[len - 1])) len--;
    str[len] = '\0';

    size_t lead = 0;
    while (lead < len && isspace((unsigned char)str[lead])) lead++;

    // Regions overlap, so memmove (not memcpy); +1 carries the terminator.
    if (lead > 0) memmove(str, str + lead, len - lead + 1);
}
37 
// Read the first line of a file into buf and trim surrounding whitespace.
//
// path:     file to open for reading.
// buf:      destination buffer.
// buf_size: capacity of buf in bytes, including the terminator.
//
// Returns 0 on success. Returns -1 when an argument is unusable, the file
// cannot be opened, or nothing could be read; in those cases buf (when
// usable) holds an empty string, so callers that ignore the return value
// never see stale or indeterminate bytes.
static int read_file_string(const char *path, char *buf, size_t buf_size) {
    if (!buf || buf_size == 0) return -1;
    buf[0] = '\0';
    if (!path) return -1;

    FILE *f = fopen(path, "r");
    if (!f) return -1;

    // fgets takes an int count; the buffers used in this file are small.
    char *got = fgets(buf, (int)buf_size, f);
    fclose(f);

    if (!got) {
        // After a read error the array contents are indeterminate.
        buf[0] = '\0';
        return -1;
    }

    trim_string(buf);
    return 0;
}
49 
// Read the first line of a file and convert it with atoi().
//
// Returns the converted value, or -1 when read_file_string() reports failure.
// A file whose content is literally "-1" is therefore indistinguishable from
// a read failure.
static int read_file_int(const char *path) {
    char text[64];
    int status = read_file_string(path, text, sizeof(text));
    return (status < 0) ? -1 : atoi(text);
}
55 
// Read the first line of a file and convert it as a base-10 unsigned value.
//
// Returns the strtoull() result, or 0 when read_file_string() reports failure.
static uint64_t read_file_uint64(const char *path) {
    char text[64];
    uint64_t value = 0;

    if (read_file_string(path, text, sizeof(text)) >= 0) {
        value = strtoull(text, NULL, 10);
    }
    return value;
}
61 
// Run a shell command through popen() and capture its standard output.
//
// cmd:         command line handed to popen().
// output:      destination buffer; always NUL-terminated when usable.
// output_size: capacity of output in bytes, including the terminator.
//
// Output that does not fit is discarded (the captured text is truncated).
//
// Returns the command's exit status (0-255) when it terminated normally.
// Returns -1 when an argument is unusable, popen()/pclose() fails, or the
// command did not exit normally (for example, it was killed by a signal).
static int run_command(const char *cmd, char *output, size_t output_size) {
    // output_size == 0 would make "output_size - 1" wrap around.
    if (!cmd || !output || output_size == 0) return -1;
    output[0] = '\0';

    FILE *fp = popen(cmd, "r");
    if (!fp) return -1;

    const size_t capacity = output_size - 1;
    size_t total = 0;
    while (total < capacity) {
        size_t n = fread(output + total, 1, capacity - total, fp);
        if (n == 0) break;
        total += n;
    }
    output[total] = '\0';

    // Drain whatever did not fit. Closing the pipe while the child is still
    // writing would kill it with SIGPIPE and make its exit status meaningless.
    if (total == capacity) {
        char discard[4096];
        while (fread(discard, 1, sizeof(discard), fp) > 0) {
            // intentionally empty
        }
    }

    int status = pclose(fp);

    // WEXITSTATUS is only meaningful for a normal exit; without this check a
    // signal-killed child would be reported as exit code 0.
    if (status == -1 || !WIFEXITED(status)) return -1;
    return WEXITSTATUS(status);
}
76 
// Count the 1-bits in a textual hexadecimal mask.
//
// hex_mask: NUL-terminated string of hex digits. Any character that is not a
//           hex digit (for example ',' or '\n') contributes nothing.
//
// Returns the total number of set bits across all hex digits.
static int count_set_bits(const char *hex_mask) {
    int total = 0;
    const char *cursor = hex_mask;

    while (*cursor != '\0') {
        const char ch = *cursor++;
        int nibble;

        if (ch >= '0' && ch <= '9') {
            nibble = ch - '0';
        } else if (ch >= 'a' && ch <= 'f') {
            nibble = ch - 'a' + 10;
        } else if (ch >= 'A' && ch <= 'F') {
            nibble = ch - 'A' + 10;
        } else {
            nibble = 0;
        }

        // Clear the lowest set bit once per iteration.
        for (; nibble != 0; nibble &= nibble - 1) {
            total++;
        }
    }
    return total;
}
89 
// Check whether a CPU flag appears as a complete word in a flags string.
//
// flags: space-separated flag list.
// flag:  flag name to look for.
//
// A match must start at the beginning of flags or right after a space, and
// must be followed by the end of the string, a space, or a newline. This
// keeps "avx" from matching inside "avx2".
//
// Returns 1 when the flag is present, 0 otherwise. NULL arguments and an
// empty flag name yield 0.
static int has_cpu_flag(const char *flags, const char *flag) {
    if (!flags || !flag) return 0;

    const size_t flag_len = strlen(flag);
    // strstr() matches an empty needle at every position, including the
    // terminator; stepping past that would read beyond the string.
    if (flag_len == 0) return 0;

    for (const char *hit = strstr(flags, flag); hit != NULL;
         hit = strstr(hit + 1, flag)) {
        const int start_ok = (hit == flags) || (hit[-1] == ' ');
        const char after = hit[flag_len];
        const int end_ok = (after == '\0') || (after == ' ') || (after == '\n');
        if (start_ok && end_ok) return 1;
    }
    return 0;
}
107 
108 // ═══════════════════════════════════════════════════════════════════════════════
109 // CPU Discovery
110 // ═══════════════════════════════════════════════════════════════════════════════
111 
113  memset(cpu, 0, sizeof(*cpu));
114 
115  FILE *f = fopen("/proc/cpuinfo", "r");
116  if (!f) return -1;
117 
118  char line[4096]; // Flags line can be very long on modern CPUs (AVX-512 + AMX)
119  int processor_count = 0;
120  int physical_id_max = -1;
121  int core_id_max = -1;
122 
123  while (fgets(line, sizeof(line), f)) {
124  char *colon = strchr(line, ':');
125  if (!colon) continue;
126 
127  char *key = line;
128  char *value = colon + 1;
129  *colon = '\0';
130  trim_string(key);
131  trim_string(value);
132 
133  if (strcmp(key, "processor") == 0) {
134  processor_count++;
135  } else if (strcmp(key, "model name") == 0 && cpu->model_name[0] == '\0') {
136  strncpy(cpu->model_name, value, sizeof(cpu->model_name) - 1);
137  } else if (strcmp(key, "vendor_id") == 0 && cpu->vendor[0] == '\0') {
138  strncpy(cpu->vendor, value, sizeof(cpu->vendor) - 1);
139  } else if (strcmp(key, "cpu family") == 0 && cpu->family == 0) {
140  cpu->family = atoi(value);
141  } else if (strcmp(key, "model") == 0 && cpu->model == 0) {
142  cpu->model = atoi(value);
143  } else if (strcmp(key, "stepping") == 0 && cpu->stepping == 0) {
144  cpu->stepping = atoi(value);
145  } else if (strcmp(key, "cpu MHz") == 0 && cpu->base_freq_mhz == 0) {
146  cpu->base_freq_mhz = atof(value);
147  } else if (strcmp(key, "physical id") == 0) {
148  int id = atoi(value);
149  if (id > physical_id_max) physical_id_max = id;
150  } else if (strcmp(key, "core id") == 0) {
151  int id = atoi(value);
152  if (id > core_id_max) core_id_max = id;
153  } else if (strcmp(key, "flags") == 0) {
154  // Use word-boundary-aware matching for CPU flags
155  cpu->has_sse4_2 = has_cpu_flag(value, "sse4_2");
156  cpu->has_avx = has_cpu_flag(value, "avx");
157  cpu->has_avx2 = has_cpu_flag(value, "avx2");
158  cpu->has_avx512f = has_cpu_flag(value, "avx512f");
159  cpu->has_avx512bw = has_cpu_flag(value, "avx512bw");
160  cpu->has_avx512vl = has_cpu_flag(value, "avx512vl");
161  cpu->has_avx512_bf16 = has_cpu_flag(value, "avx512_bf16");
162  cpu->has_amx_tile = has_cpu_flag(value, "amx_tile");
163  cpu->has_amx_int8 = has_cpu_flag(value, "amx_int8");
164  cpu->has_amx_bf16 = has_cpu_flag(value, "amx_bf16");
165  cpu->has_amx = cpu->has_amx_tile || cpu->has_amx_int8 || cpu->has_amx_bf16;
166  cpu->has_vnni = has_cpu_flag(value, "avx512_vnni") || has_cpu_flag(value, "avx_vnni");
167  }
168  }
169  fclose(f);
170 
171  cpu->logical_cores = processor_count;
172  cpu->sockets = physical_id_max + 1;
173  if (cpu->sockets < 1) cpu->sockets = 1;
174 
175  // Read from /sys for more accurate core count
176  int cores_per_socket = read_file_int("/sys/devices/system/cpu/cpu0/topology/core_cpus_list");
177  if (cores_per_socket < 0) {
178  // Fallback: estimate from logical cores and sockets
179  cpu->physical_cores = cpu->logical_cores / 2; // Assume HT
180  cpu->cores_per_socket = cpu->physical_cores / cpu->sockets;
181  } else {
182  // Count unique core IDs
183  char path[256];
184  int unique_cores = 0;
185  int seen_cores[MAX_CPUS] = {0};
186 
187  for (int i = 0; i < cpu->logical_cores && i < MAX_CPUS; i++) {
188  snprintf(path, sizeof(path),
189  "/sys/devices/system/cpu/cpu%d/topology/core_id", i);
190  int core_id = read_file_int(path);
191  if (core_id >= 0 && core_id < MAX_CPUS && !seen_cores[core_id]) {
192  seen_cores[core_id] = 1;
193  unique_cores++;
194  }
195  }
196  cpu->physical_cores = unique_cores > 0 ? unique_cores : cpu->logical_cores / 2;
197  cpu->cores_per_socket = cpu->physical_cores / cpu->sockets;
198  }
199 
200  cpu->threads_per_core = cpu->logical_cores / cpu->physical_cores;
201 
202  // Try to get max frequency
203  int max_freq = read_file_int("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq");
204  if (max_freq > 0) {
205  cpu->max_freq_mhz = max_freq / 1000.0f;
206  }
207 
208  // Estimate PCIe lanes based on CPU model
209  if (strstr(cpu->model_name, "Xeon") || strstr(cpu->model_name, "EPYC")) {
210  cpu->pcie_lanes_total = 64; // Server CPUs typically have more
211  cpu->pcie_generation = 4;
212  } else if (strstr(cpu->model_name, "i9") || strstr(cpu->model_name, "i7")) {
213  cpu->pcie_lanes_total = 20;
214  cpu->pcie_generation = cpu->has_avx512f ? 4 : 3;
215  } else {
216  cpu->pcie_lanes_total = 16;
217  cpu->pcie_generation = 3;
218  }
219 
220  return 0;
221 }
222 
223 // ═══════════════════════════════════════════════════════════════════════════════
224 // Cache Discovery
225 // ═══════════════════════════════════════════════════════════════════════════════
226 
227 // Sort caches by level (L1 first), then by type (Data before Instruction before Unified)
228 static int cache_compare(const void *a, const void *b) {
229  const CacheInfo *ca = (const CacheInfo *)a;
230  const CacheInfo *cb = (const CacheInfo *)b;
231 
232  // First sort by level (L1 < L2 < L3)
233  if (ca->level != cb->level) {
234  return ca->level - cb->level;
235  }
236 
237  // Within same level, sort by type: Data, Instruction, Unified
238  int type_order_a = (strcmp(ca->type, "Data") == 0) ? 0 :
239  (strcmp(ca->type, "Instruction") == 0) ? 1 : 2;
240  int type_order_b = (strcmp(cb->type, "Data") == 0) ? 0 :
241  (strcmp(cb->type, "Instruction") == 0) ? 1 : 2;
242 
243  return type_order_a - type_order_b;
244 }
245 
247  memset(cache, 0, sizeof(*cache));
248 
249  const char *base = "/sys/devices/system/cpu/cpu0/cache";
250  DIR *dir = opendir(base);
251  if (!dir) return -1;
252 
253  struct dirent *entry;
254  while ((entry = readdir(dir)) != NULL) {
255  if (strncmp(entry->d_name, "index", 5) != 0) continue;
256 
257  char path[512];
258  CacheInfo *ci = &cache->levels[cache->num_levels];
259 
260  snprintf(path, sizeof(path), "%s/%s/level", base, entry->d_name);
261  ci->level = read_file_int(path);
262 
263  snprintf(path, sizeof(path), "%s/%s/type", base, entry->d_name);
264  read_file_string(path, ci->type, sizeof(ci->type));
265 
266  snprintf(path, sizeof(path), "%s/%s/size", base, entry->d_name);
267  char size_str[32];
268  if (read_file_string(path, size_str, sizeof(size_str)) == 0) {
269  ci->size_kb = atoi(size_str); // Usually in KB with 'K' suffix
270  }
271 
272  snprintf(path, sizeof(path), "%s/%s/coherency_line_size", base, entry->d_name);
273  ci->line_size_bytes = read_file_int(path);
274 
275  snprintf(path, sizeof(path), "%s/%s/ways_of_associativity", base, entry->d_name);
277 
278  snprintf(path, sizeof(path), "%s/%s/shared_cpu_map", base, entry->d_name);
279  char cpu_map[256];
280  if (read_file_string(path, cpu_map, sizeof(cpu_map)) == 0) {
281  ci->shared_by_cores = count_set_bits(cpu_map);
282  }
283 
284  if (ci->level == 3) {
285  cache->l3_total_kb = ci->size_kb; // Will be multiplied if multiple
286  }
287 
288  cache->num_levels++;
289  if (cache->num_levels >= MAX_CACHE_LEVELS) break;
290  }
291  closedir(dir);
292 
293  // Sort by level (L1 → L2 → L3) and type (Data → Instruction → Unified)
294  qsort(cache->levels, cache->num_levels, sizeof(CacheInfo), cache_compare);
295 
296  return 0;
297 }
298 
299 // ═══════════════════════════════════════════════════════════════════════════════
300 // NUMA Discovery
301 // ═══════════════════════════════════════════════════════════════════════════════
302 
304  memset(numa, 0, sizeof(*numa));
305 
306  const char *base = "/sys/devices/system/node";
307  DIR *dir = opendir(base);
308  if (!dir) {
309  // No NUMA, single node system
310  numa->num_nodes = 1;
311  numa->nodes[0].node_id = 0;
312 
313  struct sysinfo si;
314  if (sysinfo(&si) == 0) {
315  numa->nodes[0].memory_total_mb = si.totalram / (1024 * 1024);
316  numa->nodes[0].memory_free_mb = si.freeram / (1024 * 1024);
317  }
318  return 0;
319  }
320 
321  struct dirent *entry;
322  while ((entry = readdir(dir)) != NULL) {
323  if (strncmp(entry->d_name, "node", 4) != 0) continue;
324  if (!isdigit(entry->d_name[4])) continue;
325 
326  int node_id = atoi(entry->d_name + 4);
327  if (node_id >= MAX_NUMA_NODES) continue;
328 
329  NUMANode *node = &numa->nodes[numa->num_nodes];
330  node->node_id = node_id;
331 
332  char path[512];
333 
334  // Memory info
335  snprintf(path, sizeof(path), "%s/%s/meminfo", base, entry->d_name);
336  FILE *f = fopen(path, "r");
337  if (f) {
338  char line[256];
339  while (fgets(line, sizeof(line), f)) {
340  uint64_t val;
341  if (sscanf(line, "Node %*d MemTotal: %lu kB", &val) == 1) {
342  node->memory_total_mb = val / 1024;
343  } else if (sscanf(line, "Node %*d MemFree: %lu kB", &val) == 1) {
344  node->memory_free_mb = val / 1024;
345  }
346  }
347  fclose(f);
348  }
349 
350  // CPU list
351  snprintf(path, sizeof(path), "%s/%s/cpulist", base, entry->d_name);
352  char cpulist[512];
353  if (read_file_string(path, cpulist, sizeof(cpulist)) == 0) {
354  // Parse CPU list (e.g., "0-7,16-23")
355  char *saveptr;
356  char *token = strtok_r(cpulist, ",", &saveptr);
357  while (token && node->num_cpus < MAX_CPUS) {
358  int start, end;
359  if (sscanf(token, "%d-%d", &start, &end) == 2) {
360  for (int i = start; i <= end && node->num_cpus < MAX_CPUS; i++) {
361  node->cpu_list[node->num_cpus++] = i;
362  }
363  } else if (sscanf(token, "%d", &start) == 1) {
364  node->cpu_list[node->num_cpus++] = start;
365  }
366  token = strtok_r(NULL, ",", &saveptr);
367  }
368  }
369 
370  numa->num_nodes++;
371  }
372  closedir(dir);
373 
374  // Read NUMA distances
375  char path[512];
376  snprintf(path, sizeof(path), "%s/node0/distance", base);
377  char dist_str[256];
378  if (read_file_string(path, dist_str, sizeof(dist_str)) == 0) {
379  char *saveptr;
380  char *token = strtok_r(dist_str, " ", &saveptr);
381  int col = 0;
382  while (token && col < numa->num_nodes) {
383  numa->distances[0][col++] = atoi(token);
384  token = strtok_r(NULL, " ", &saveptr);
385  }
386  }
387 
388  return 0;
389 }
390 
391 // ═══════════════════════════════════════════════════════════════════════════════
392 // Memory Bandwidth Measurement
393 // ═══════════════════════════════════════════════════════════════════════════════
394 
395 #include <sys/time.h>
396 #include <time.h>
397 
// Find the NUMA node that a logical CPU belongs to.
//
// Probes /sys/devices/system/node/node<N>/cpu<cpu> for N in [0, 16) and
// returns the first N for which the entry exists.
//
// cpu: logical CPU index; a negative value (such as a failed sched_getcpu()
//      result) is not probed.
//
// Returns the node index, or 0 when no entry is found.
static int get_numa_node_for_cpu(int cpu) {
    enum { NODE_PROBE_LIMIT = 16 };

    if (cpu < 0) return 0;

    char path[256];
    for (int node = 0; node < NODE_PROBE_LIMIT; node++) {
        snprintf(path, sizeof(path), "/sys/devices/system/node/node%d/cpu%d", node, cpu);
        if (access(path, F_OK) == 0) {
            return node;
        }
    }
    return 0; // Default to node 0
}
412 
// Report the CPU the calling thread is currently running on.
//
// Thin wrapper over sched_getcpu(); its result is returned unchanged,
// including whatever value it reports on failure.
static int get_current_cpu(void) {
    const int cpu_index = sched_getcpu();
    return cpu_index;
}
417 
418 // Measure actual memory bandwidth using streaming operations (STREAM-like benchmark)
419 // Returns bandwidth in GB/s, or -1 on error
420 // Also returns the NUMA node used for the test via numa_node pointer (if not NULL)
421 //
422 // NUMA/SNC Considerations:
423 // - Pins all threads to the NUMA node of the main thread
424 // - Allocates memory on that same NUMA node (via mbind or first-touch)
425 // - This ensures we measure LOCAL bandwidth, not cross-SNC/cross-socket
426 //
427 float topology_measure_memory_bandwidth_ex(int *numa_node_out, int *num_threads_out) {
428  // Use 256 MB buffer - large enough to exceed L3 cache
429  const size_t SIZE = 256 * 1024 * 1024;
430  const size_t COUNT = SIZE / sizeof(double);
431  const int ITERATIONS = 3;
432 
433  // Detect which NUMA node we're on
434  int current_cpu = get_current_cpu();
435  int numa_node = get_numa_node_for_cpu(current_cpu);
436 
437  if (numa_node_out) *numa_node_out = numa_node;
438 
439  // Set thread affinity for consistent NUMA placement
440  #ifdef _OPENMP
441  omp_set_dynamic(0); // Disable dynamic thread adjustment
442 
443  // Try to limit threads to current NUMA node's CPUs
444  // This is approximate - for precise control use numactl
445  int num_threads = omp_get_max_threads();
446  if (num_threads_out) *num_threads_out = num_threads;
447  #else
448  if (num_threads_out) *num_threads_out = 1;
449  #endif
450 
451  // Allocate aligned buffers
452  double *a = NULL, *b = NULL, *c = NULL;
453  if (posix_memalign((void**)&a, 64, SIZE) != 0 ||
454  posix_memalign((void**)&b, 64, SIZE) != 0 ||
455  posix_memalign((void**)&c, 64, SIZE) != 0) {
456  if (a) free(a);
457  if (b) free(b);
458  if (c) free(c);
459  return -1.0f;
460  }
461 
462  // First-touch initialization on the MAIN thread only
463  // This ensures all memory is allocated on the NUMA node of the main thread
464  // Critical for SNC: we want ALL memory on ONE SNC cluster, not spread across
465  for (size_t i = 0; i < COUNT; i++) {
466  a[i] = 1.0;
467  b[i] = 2.0;
468  c[i] = 0.0;
469  }
470 
471  // Warm up pass - single threaded to keep memory local
472  for (size_t i = 0; i < COUNT; i++) {
473  c[i] = a[i] + b[i];
474  }
475 
476  // Now run the timed test with OpenMP
477  // All threads read from the same NUMA node's memory
478  // This measures the bandwidth that ONE NUMA node can deliver
479  const double scalar = 3.0;
480 
481  struct timespec start, end;
482  clock_gettime(CLOCK_MONOTONIC, &start);
483 
484  for (int iter = 0; iter < ITERATIONS; iter++) {
485  #pragma omp parallel for schedule(static)
486  for (size_t i = 0; i < COUNT; i++) {
487  c[i] = a[i] + scalar * b[i];
488  }
489  __asm__ volatile("" ::: "memory");
490  }
491 
492  clock_gettime(CLOCK_MONOTONIC, &end);
493 
494  // Prevent optimizer from removing the computation
495  volatile double sum = 0;
496  for (size_t i = 0; i < COUNT; i += COUNT/10) {
497  sum += c[i];
498  }
499  (void)sum;
500 
501  free(a);
502  free(b);
503  free(c);
504 
505  // Calculate bandwidth
506  double elapsed_sec = (end.tv_sec - start.tv_sec) +
507  (end.tv_nsec - start.tv_nsec) / 1e9;
508 
509  // Triad: 2 reads + 1 write = 3 arrays × size
510  double total_bytes = (double)SIZE * 3.0 * ITERATIONS;
511  double bandwidth_gbs = (total_bytes / elapsed_sec) / (1024.0 * 1024.0 * 1024.0);
512 
513  return (float)bandwidth_gbs;
514 }
515 
516 // Simple wrapper for backward compatibility
518  return topology_measure_memory_bandwidth_ex(NULL, NULL);
519 }
520 
// Estimate the memory channel count from a measured bandwidth.
//
// measured_bw_gbs:  measured bandwidth in GB/s.
// memory_speed_mhz: memory transfer rate used for the per-channel peak
//                   (speed * 8 bytes / 1000 = GB/s per channel).
// memory_type:      not consulted by this function.
//
// The per-channel peak is derated by a fixed 75% efficiency factor before
// dividing, then the ratio is bucketed to one of 1, 2, 3, 4, 6 or 8.
//
// Returns the bucketed channel count, or 0 when either numeric input is
// not positive.
int topology_estimate_channels_from_bandwidth(float measured_bw_gbs, int memory_speed_mhz, const char *memory_type) {
    // Exclusive upper bound of each bucket and the channel count it maps to.
    static const struct {
        float below;
        int channels;
    } buckets[] = {
        {1.3f, 1},
        {2.5f, 2},
        {3.5f, 3}, // Unusual but possible
        {5.0f, 4},
        {7.0f, 6},
    };

    (void)memory_type;

    if (measured_bw_gbs <= 0 || memory_speed_mhz <= 0) {
        return 0;
    }

    const float per_channel_gbs = (memory_speed_mhz * 8.0f) / 1000.0f;
    const float derate = 0.75f; // Conservative share of the theoretical peak
    const float estimate = measured_bw_gbs / (per_channel_gbs * derate);

    for (size_t i = 0; i < sizeof(buckets) / sizeof(buckets[0]); i++) {
        if (estimate < buckets[i].below) {
            return buckets[i].channels;
        }
    }
    return 8;
}
548 
549 // ═══════════════════════════════════════════════════════════════════════════════
550 // Memory Discovery
551 // ═══════════════════════════════════════════════════════════════════════════════
552 
554  memset(mem, 0, sizeof(*mem));
555 
556  // Basic memory info from /proc/meminfo
557  FILE *f = fopen("/proc/meminfo", "r");
558  if (f) {
559  char line[256];
560  while (fgets(line, sizeof(line), f)) {
561  uint64_t val;
562  if (sscanf(line, "MemTotal: %lu kB", &val) == 1) {
563  mem->total_mb = val / 1024;
564  } else if (sscanf(line, "MemAvailable: %lu kB", &val) == 1) {
565  mem->available_mb = val / 1024;
566  } else if (sscanf(line, "Cached: %lu kB", &val) == 1) {
567  mem->cached_mb = val / 1024;
568  }
569  }
570  fclose(f);
571  }
572 
573  // Try to get DIMM info via dmidecode (requires root)
574  char output[8192];
575  if (run_command("dmidecode -t memory 2>/dev/null", output, sizeof(output)) == 0 &&
576  strlen(output) > 100) {
577 
578  char *line = strtok(output, "\n");
579  MemorySlot *current_slot = NULL;
580 
581  while (line) {
582  trim_string(line);
583 
584  if (strstr(line, "Memory Device")) {
585  if (mem->num_slots < MAX_MEMORY_SLOTS) {
586  current_slot = &mem->slots[mem->num_slots++];
587  memset(current_slot, 0, sizeof(*current_slot));
588  current_slot->slot_number = mem->num_slots;
589  }
590  } else if (current_slot) {
591  uint64_t val;
592  int ival;
593  char str[64];
594 
595  if (sscanf(line, "Size: %lu MB", &val) == 1) {
596  current_slot->size_mb = val;
597  current_slot->populated = true;
598  mem->slots_populated++;
599  } else if (sscanf(line, "Size: %lu GB", &val) == 1) {
600  current_slot->size_mb = val * 1024;
601  current_slot->populated = true;
602  mem->slots_populated++;
603  } else if (strstr(line, "Size: No Module")) {
604  current_slot->populated = false;
605  } else if (sscanf(line, "Speed: %d MT/s", &ival) == 1 ||
606  sscanf(line, "Speed: %d MHz", &ival) == 1) {
607  current_slot->speed_mhz = ival;
608  if (mem->memory_speed_mhz == 0) mem->memory_speed_mhz = ival;
609  } else if (sscanf(line, "Type: %63s", str) == 1) {
610  strncpy(current_slot->type, str, sizeof(current_slot->type) - 1);
611  if (mem->memory_type[0] == '\0') {
612  strncpy(mem->memory_type, str, sizeof(mem->memory_type) - 1);
613  }
614  } else if (sscanf(line, "Locator: %63s", str) == 1) {
615  strncpy(current_slot->locator, str, sizeof(current_slot->locator) - 1);
616  } else if (sscanf(line, "Rank: %d", &ival) == 1) {
617  current_slot->rank = ival;
618  } else if (sscanf(line, "Data Width: %d bits", &ival) == 1) {
619  current_slot->data_width_bits = ival;
620  }
621  }
622 
623  line = strtok(NULL, "\n");
624  }
625  }
626 
627  // Estimate channel configuration
628  if (mem->slots_populated > 0) {
629  if (mem->slots_populated == 1) {
630  strcpy(mem->channel_config, "Single-channel");
631  mem->num_channels = 1;
632  mem->channels_populated = 1;
633  } else if (mem->slots_populated == 2) {
634  strcpy(mem->channel_config, "Dual-channel");
635  mem->num_channels = 2;
636  mem->channels_populated = 2;
637  } else if (mem->slots_populated == 4) {
638  strcpy(mem->channel_config, "Quad-channel");
639  mem->num_channels = 4;
640  mem->channels_populated = 4;
641  } else if (mem->slots_populated >= 6) {
642  strcpy(mem->channel_config, "Hexa-channel or more");
643  mem->num_channels = 6;
645  } else {
646  snprintf(mem->channel_config, sizeof(mem->channel_config),
647  "%d DIMMs", mem->slots_populated);
648  mem->num_channels = mem->slots_populated;
650  }
651 
652  // Estimate bandwidth
653  // DDR4: speed * 8 bytes * channels
654  // DDR5: speed * 8 bytes * channels (but DDR5 has 2 channels per DIMM)
655  float bytes_per_transfer = 8.0f;
656  if (strstr(mem->memory_type, "DDR5")) {
658  (mem->memory_speed_mhz * bytes_per_transfer * mem->channels_populated * 2) / 1000.0f;
659  } else {
661  (mem->memory_speed_mhz * bytes_per_transfer * mem->channels_populated) / 1000.0f;
662  }
663  }
664 
665  // Always measure actual bandwidth (quick ~0.5s test)
666  // Use extended version to get NUMA node and thread count for transparency
668  &mem->bw_test_numa_node,
669  &mem->bw_test_num_threads
670  );
671 
672  // If dmidecode didn't give us channel info, estimate from measured bandwidth
673  if (mem->slots_populated == 0 && mem->measured_bandwidth_gbs > 0) {
674  // Try to detect memory speed from /sys or assume DDR4-3200
675  if (mem->memory_speed_mhz == 0) {
676  // Common defaults: DDR4-2666, DDR4-3200, DDR5-4800
677  // Assume DDR4-3200 as a reasonable default
678  mem->memory_speed_mhz = 3200;
679  strcpy(mem->memory_type, "DDR4");
680  }
681 
682  // Estimate channels from measured bandwidth
685 
686  // Update channel config string
687  switch (mem->estimated_channels) {
688  case 1:
689  strcpy(mem->channel_config, "Single-channel (estimated)");
690  break;
691  case 2:
692  strcpy(mem->channel_config, "Dual-channel (estimated)");
693  break;
694  case 4:
695  strcpy(mem->channel_config, "Quad-channel (estimated)");
696  break;
697  case 6:
698  strcpy(mem->channel_config, "Hexa-channel (estimated)");
699  break;
700  case 8:
701  strcpy(mem->channel_config, "Octa-channel (estimated)");
702  break;
703  default:
704  snprintf(mem->channel_config, sizeof(mem->channel_config),
705  "%d-channel (estimated)", mem->estimated_channels);
706  }
707 
709  mem->num_channels = mem->estimated_channels;
710 
711  // Recalculate theoretical based on estimated channels
713  (mem->memory_speed_mhz * 8.0f * mem->estimated_channels) / 1000.0f;
714  }
715 
716  return 0;
717 }
718 
719 // ═══════════════════════════════════════════════════════════════════════════════
720 // PCIe Discovery
721 // ═══════════════════════════════════════════════════════════════════════════════
722 
// Compute link bandwidth in GB/s for a PCIe generation and lane count.
//
// gen:   PCIe generation; values outside 1..6 are treated as generation 3.
// width: number of lanes.
//
// Returns the per-lane rate for the generation multiplied by width.
static float pcie_bandwidth_gbs(int gen, int width) {
    // GB/s per lane, indexed by generation (encoding overhead included);
    // slot 0 is a placeholder so the index equals the generation number.
    static const float gbs_per_lane[7] = {
        0.0f, 0.25f, 0.5f, 0.985f, 1.969f, 3.938f, 7.877f
    };

    const int index = (gen >= 1 && gen <= 6) ? gen : 3;
    return gbs_per_lane[index] * width;
}
729 
731  memset(pcie, 0, sizeof(*pcie));
732 
733  char output[32768];
734  if (run_command("lspci -vvv 2>/dev/null", output, sizeof(output)) != 0) {
735  return -1;
736  }
737 
738  PCIeDevice *current = NULL;
739  char *line = strtok(output, "\n");
740 
741  while (line) {
742  // New device line: "00:1f.0 ISA bridge: Intel..."
743  if (strlen(line) > 0 && isxdigit(line[0]) && line[2] == ':') {
744  if (pcie->num_devices < MAX_PCIE_DEVICES) {
745  current = &pcie->devices[pcie->num_devices++];
746  memset(current, 0, sizeof(*current));
747 
748  // Parse BDF
749  sscanf(line, "%x:%x.%x", &current->bus, &current->device, &current->function);
750 
751  // Get device name (after the type)
752  char *name_start = strchr(line, ':');
753  if (name_start) {
754  name_start = strchr(name_start + 1, ':');
755  if (name_start) {
756  name_start++;
757  while (*name_start == ' ') name_start++;
758  strncpy(current->device_name, name_start,
759  sizeof(current->device_name) - 1);
760  }
761  }
762 
763  // Check device type
764  current->is_gpu = (strstr(line, "VGA") != NULL ||
765  strstr(line, "3D controller") != NULL ||
766  strstr(line, "Display") != NULL);
767  current->is_nic = (strstr(line, "Ethernet") != NULL ||
768  strstr(line, "Network") != NULL ||
769  strstr(line, "InfiniBand") != NULL);
770  current->is_nvme = (strstr(line, "Non-Volatile memory") != NULL);
771  }
772  } else if (current) {
773  // Parse LnkCap and LnkSta for PCIe info
774  if (strstr(line, "LnkCap:")) {
775  char *speed = strstr(line, "Speed ");
776  char *width = strstr(line, "Width x");
777  if (speed) {
778  float gts;
779  if (sscanf(speed, "Speed %fGT/s", &gts) == 1) {
780  if (gts >= 64) current->link_speed_max = 6;
781  else if (gts >= 32) current->link_speed_max = 5;
782  else if (gts >= 16) current->link_speed_max = 4;
783  else if (gts >= 8) current->link_speed_max = 3;
784  else if (gts >= 5) current->link_speed_max = 2;
785  else current->link_speed_max = 1;
786  }
787  }
788  if (width) {
789  sscanf(width, "Width x%d", &current->link_width_max);
790  }
791  } else if (strstr(line, "LnkSta:")) {
792  char *speed = strstr(line, "Speed ");
793  char *width = strstr(line, "Width x");
794  if (speed) {
795  float gts;
796  if (sscanf(speed, "Speed %fGT/s", &gts) == 1) {
797  if (gts >= 64) current->link_speed = 6;
798  else if (gts >= 32) current->link_speed = 5;
799  else if (gts >= 16) current->link_speed = 4;
800  else if (gts >= 8) current->link_speed = 3;
801  else if (gts >= 5) current->link_speed = 2;
802  else current->link_speed = 1;
803  }
804  }
805  if (width) {
806  sscanf(width, "Width x%d", &current->link_width);
807  }
808  }
809  }
810 
811  line = strtok(NULL, "\n");
812  }
813 
814  // Calculate bandwidths and summary
815  for (int i = 0; i < pcie->num_devices; i++) {
816  PCIeDevice *d = &pcie->devices[i];
819 
820  if (d->link_width > 0) {
821  pcie->total_lanes_used += d->link_width;
822  }
823  }
824 
825  return 0;
826 }
827 
828 // ═══════════════════════════════════════════════════════════════════════════════
829 // Network Discovery
830 // ═══════════════════════════════════════════════════════════════════════════════
831 
833  memset(net, 0, sizeof(*net));
834 
835  const char *base = "/sys/class/net";
836  DIR *dir = opendir(base);
837  if (!dir) return -1;
838 
839  struct dirent *entry;
840  while ((entry = readdir(dir)) != NULL) {
841  if (entry->d_name[0] == '.') continue;
842  if (strcmp(entry->d_name, "lo") == 0) continue; // Skip loopback
843 
844  if (net->num_interfaces >= MAX_NICS) break;
845  NetworkInterface *nic = &net->interfaces[net->num_interfaces];
846  memset(nic, 0, sizeof(*nic));
847 
848  strncpy(nic->name, entry->d_name, sizeof(nic->name) - 1);
849 
850  char path[512];
851 
852  // Check if interface is up
853  snprintf(path, sizeof(path), "%s/%s/operstate", base, entry->d_name);
854  char state[32];
855  if (read_file_string(path, state, sizeof(state)) == 0) {
856  nic->is_up = (strcmp(state, "up") == 0);
857  }
858 
859  // Get speed
860  snprintf(path, sizeof(path), "%s/%s/speed", base, entry->d_name);
861  int speed = read_file_int(path);
862  if (speed > 0) {
863  nic->speed_mbps = speed;
864  nic->has_link = true;
865  }
866 
867  // Get MTU
868  snprintf(path, sizeof(path), "%s/%s/mtu", base, entry->d_name);
869  nic->mtu = read_file_int(path);
870 
871  // Get MAC address
872  snprintf(path, sizeof(path), "%s/%s/address", base, entry->d_name);
873  read_file_string(path, nic->mac_address, sizeof(nic->mac_address));
874 
875  // Get driver
876  snprintf(path, sizeof(path), "%s/%s/device/driver", base, entry->d_name);
877  char driver_link[512];
878  ssize_t len = readlink(path, driver_link, sizeof(driver_link) - 1);
879  if (len > 0) {
880  driver_link[len] = '\0';
881  char *driver_name = strrchr(driver_link, '/');
882  if (driver_name) {
883  strncpy(nic->driver, driver_name + 1, sizeof(nic->driver) - 1);
884  }
885  }
886 
887  // Check for InfiniBand
888  snprintf(path, sizeof(path), "/sys/class/infiniband/%s", entry->d_name);
889  if (access(path, F_OK) == 0) {
890  nic->is_infiniband = true;
891  nic->supports_rdma = true;
892  }
893 
894  // Check for RoCE capability
895  if (strstr(nic->driver, "mlx") || strstr(nic->driver, "bnxt") ||
896  strstr(nic->driver, "qed")) {
897  nic->supports_roce = true;
898  nic->supports_rdma = true;
899  }
900 
901  // Get PCI address
902  snprintf(path, sizeof(path), "%s/%s/device", base, entry->d_name);
903  char pci_link[512];
904  len = readlink(path, pci_link, sizeof(pci_link) - 1);
905  if (len > 0) {
906  pci_link[len] = '\0';
907  char *pci = strrchr(pci_link, '/');
908  if (pci) {
909  strncpy(nic->pci_address, pci + 1, sizeof(nic->pci_address) - 1);
910  }
911  }
912 
913  // Calculate bandwidth
914  float bandwidth = nic->speed_mbps / 8000.0f; // Mbps to GB/s
915 
916  if (bandwidth > net->max_bandwidth_gbs) {
917  net->max_bandwidth_gbs = bandwidth;
919  }
920 
921  if (nic->supports_rdma) {
922  net->has_rdma = true;
923  }
924 
925  net->num_interfaces++;
926  }
927  closedir(dir);
928 
929  return 0;
930 }
931 
932 // ═══════════════════════════════════════════════════════════════════════════════
933 // Affinity Discovery
934 // ═══════════════════════════════════════════════════════════════════════════════
935 
937  memset(aff, 0, sizeof(*aff));
938 
939  // OpenMP settings
940  const char *omp_threads = getenv("OMP_NUM_THREADS");
941  if (omp_threads) {
942  aff->omp_num_threads = atoi(omp_threads);
943  } else {
944  aff->omp_num_threads = sysconf(_SC_NPROCESSORS_ONLN);
945  }
946 
947  const char *omp_bind = getenv("OMP_PROC_BIND");
948  if (omp_bind) {
949  strncpy(aff->omp_proc_bind, omp_bind, sizeof(aff->omp_proc_bind) - 1);
950  aff->affinity_set = true;
951  } else {
952  strcpy(aff->omp_proc_bind, "not set");
953  }
954 
955  const char *omp_places = getenv("OMP_PLACES");
956  if (omp_places) {
957  strncpy(aff->omp_places, omp_places, sizeof(aff->omp_places) - 1);
958  } else {
959  strcpy(aff->omp_places, "not set");
960  }
961 
962  // Current process affinity
963  cpu_set_t mask;
964  if (sched_getaffinity(0, sizeof(mask), &mask) == 0) {
965  for (int i = 0; i < MAX_CPUS && aff->num_affinity_cpus < MAX_CPUS; i++) {
966  if (CPU_ISSET(i, &mask)) {
967  aff->affinity_cpus[aff->num_affinity_cpus++] = i;
968  }
969  }
970  }
971 
972  return 0;
973 }
974 
975 // ═══════════════════════════════════════════════════════════════════════════════
976 // Main Discovery Function
977 // ═══════════════════════════════════════════════════════════════════════════════
978 
// Top-level discovery entry point: zeroes *topo, records the hostname, the
// kernel identification string and whether the process runs as root, then
// invokes the per-subsystem discovery routines. Always returns 0; the return
// values of gethostname() and topology_discover_cpu() are not checked.
// NOTE(review): the signature line is absent from this listing (the embedded
// numbering jumps from 978 to 980); the symbol index gives it as
// `int topology_discover(SystemTopology *topo)`.
980  memset(topo, 0, sizeof(*topo));
981 
982  // Get hostname and kernel version
983  gethostname(topo->hostname, sizeof(topo->hostname));
984 
985  struct utsname uts;
986  if (uname(&uts) == 0) {
// Stored as "<sysname> <release>"; left empty (from the memset) if uname() fails.
987  snprintf(topo->kernel_version, sizeof(topo->kernel_version),
988  "%s %s", uts.sysname, uts.release);
989  }
990 
991  // Check for root access
// Uses the effective UID, so a setuid-root binary also counts as root.
992  topo->has_root_access = (geteuid() == 0);
993 
994  // Run all discovery functions
995  topology_discover_cpu(&topo->cpu);
// NOTE(review): listing lines 996-1001 are missing here; presumably the
// remaining topology_discover_* calls. Confirm against the original file.
1002 
1003  return 0;
1004 }
1005 
1006 // ═══════════════════════════════════════════════════════════════════════════════
1007 // Recommendations Generation
1008 // ═══════════════════════════════════════════════════════════════════════════════
1009 
1011  RecommendationList *recs) {
// Builds a list of tuning recommendations from an already-discovered topology.
// *recs is zeroed first; each check below that fires appends one entry via
// recs->recommendations[recs->num_recommendations++]. Always returns 0.
// There is no capacity check on the append; this function adds at most seven
// entries (one per check).
// NOTE(review): the first signature line is absent from this listing; the
// symbol index gives the full signature as
// `int topology_generate_recommendations(const SystemTopology *topo,
// RecommendationList *recs)`.
// NOTE(review): after every `Recommendation *r = ...` line the embedded
// numbering skips two lines (e.g. 1018 -> 1021); presumably the r->priority
// and r->category assignments. Confirm against the original file.
1012  memset(recs, 0, sizeof(*recs));
1013 
1014  // Memory recommendations
// Fires only when some, but not all, slots are populated.
1015  if (topo->memory.slots_populated > 0 &&
1016  topo->memory.slots_populated < topo->memory.num_slots) {
1017 
1018  Recommendation *r = &recs->recommendations[recs->num_recommendations++];
1021  strcpy(r->title, "Memory Slots Available");
1022  snprintf(r->description, sizeof(r->description),
1023  "Only %d of %d memory slots populated. Adding more DIMMs "
1024  "could increase memory bandwidth.",
1025  topo->memory.slots_populated, topo->memory.num_slots);
1026  snprintf(r->action, sizeof(r->action),
1027  "Consider adding %d more matching DIMMs for better bandwidth",
1028  topo->memory.num_slots - topo->memory.slots_populated);
1029  }
1030 
1031  // Single-channel warning
// Can fire in addition to the "slots available" entry above.
1032  if (topo->memory.channels_populated == 1 && topo->memory.num_slots > 1) {
1033  Recommendation *r = &recs->recommendations[recs->num_recommendations++];
1036  strcpy(r->title, "Single-Channel Memory");
1037  strcpy(r->description,
1038  "Running in single-channel mode significantly reduces memory bandwidth. "
1039  "This will impact training performance.");
1040  strcpy(r->action,
1041  "Add a second DIMM in the correct slot to enable dual-channel mode");
1042  }
1043 
1044  // Affinity recommendations
// affinity_set is written by topology_discover_affinity() when OMP_PROC_BIND
// is present in the environment.
1045  if (!topo->affinity.affinity_set) {
1046  Recommendation *r = &recs->recommendations[recs->num_recommendations++];
1049  strcpy(r->title, "OpenMP Affinity Not Set");
1050  strcpy(r->description,
1051  "OpenMP thread affinity is not configured. Threads may migrate "
1052  "between cores causing cache misses and NUMA penalties.");
1053  strcpy(r->action,
1054  "export OMP_PROC_BIND=close OMP_PLACES=cores");
1055  }
1056 
1057  // Network recommendations
// NOTE(review): network discovery computes bandwidth as speed_mbps / 8000, so
// 10 GbE is 1.25 GB/s and this 1.0 GB/s threshold corresponds to 8 Gb/s.
// A bandwidth of 0 (e.g. no interface speed discovered) also satisfies the
// condition, because it is not gated on num_interfaces.
1058  if (topo->network.max_bandwidth_gbs < 1.0f) { // Less than 10 GbE
1059  Recommendation *r = &recs->recommendations[recs->num_recommendations++];
1062  strcpy(r->title, "Slow Network for Distributed Training");
1063  snprintf(r->description, sizeof(r->description),
1064  "Maximum network bandwidth is %.2f GB/s. This will be a "
1065  "significant bottleneck for distributed training.",
1066  topo->network.max_bandwidth_gbs);
1067  strcpy(r->action,
1068  "Consider upgrading to 10GbE+ or InfiniBand for distributed training");
1069  }
1070 
1071  // RDMA recommendation
// Skipped when no interfaces were found at all.
1072  if (!topo->network.has_rdma && topo->network.num_interfaces > 0) {
1073  Recommendation *r = &recs->recommendations[recs->num_recommendations++];
1076  strcpy(r->title, "No RDMA-Capable NICs");
1077  strcpy(r->description,
1078  "No RDMA-capable network adapters detected. RDMA enables direct "
1079  "memory access between nodes, reducing latency for gradient sync.");
1080  strcpy(r->action,
1081  "Consider Mellanox ConnectX or Intel E810 for RDMA support");
1082  }
1083 
1084  // SIMD recommendations
1085  if (!topo->cpu.has_avx2) {
1086  Recommendation *r = &recs->recommendations[recs->num_recommendations++];
1089  strcpy(r->title, "Limited SIMD Support");
1090  strcpy(r->description,
1091  "CPU does not support AVX2. Kernel performance will be limited.");
1092  strcpy(r->action, "AVX2+ CPUs provide significantly better performance");
1093  }
1094 
1095  // NUMA warning for multi-socket
// The test is on the NUMA node count only; socket count is not consulted.
1096  if (topo->numa.num_nodes > 1) {
1097  Recommendation *r = &recs->recommendations[recs->num_recommendations++];
1100  strcpy(r->title, "Multi-NUMA System Detected");
1101  snprintf(r->description, sizeof(r->description),
1102  "System has %d NUMA nodes. Cross-node memory access is slower. "
1103  "Ensure data locality for best performance.",
1104  topo->numa.num_nodes);
1105  strcpy(r->action,
1106  "Use numactl --localalloc or NUMA-aware memory allocation");
1107  }
1108 
1109  return 0;
1110 }
1111 
1112 // ═══════════════════════════════════════════════════════════════════════════════
1113 // Utility Functions
1114 // ═══════════════════════════════════════════════════════════════════════════════
1115 
1117  return mem->theoretical_bandwidth_gbs;
1118 }
1119 
1121  uint64_t model_size_mb) {
1122  if (net->max_bandwidth_gbs <= 0) return -1;
1123 
1124  // Time to transfer model_size_mb in seconds
1125  // Account for protocol overhead (~10%)
1126  float effective_bw = net->max_bandwidth_gbs * 0.9f * 1024; // Convert to MB/s
1127  return model_size_mb / effective_bw;
1128 }
int affinity_cpus[256]
char omp_places[64]
char omp_proc_bind[32]
int logical_cores
bool has_avx512bw
char model_name[256]
int pcie_generation
bool has_vnni
bool has_avx512_bf16
float max_freq_mhz
char vendor[64]
int has_avx2
Definition: cpu_features.h:27
int cores_per_socket
bool has_amx_int8
int has_avx
Definition: cpu_features.h:26
int physical_cores
int has_avx512f
Definition: cpu_features.h:28
bool has_amx_bf16
int pcie_lanes_total
int threads_per_core
bool has_amx
bool has_avx512vl
float base_freq_mhz
bool has_sse4_2
bool has_amx_tile
char type[16]
int line_size_bytes
int ways_of_associativity
int shared_by_cores
CacheInfo levels[4]
float measured_bandwidth_gbs
int channels_populated
uint64_t total_mb
float theoretical_bandwidth_gbs
int estimated_channels
uint64_t available_mb
char memory_type[32]
char channel_config[64]
int bw_test_num_threads
uint64_t cached_mb
MemorySlot slots[16]
char type[32]
uint64_t size_mb
char locator[64]
int cpu_list[256]
uint64_t memory_free_mb
uint64_t memory_total_mb
int distances[8][8]
NUMANode nodes[8]
NetworkInterface interfaces[8]
char device_name[256]
float bandwidth_max_gbs
float bandwidth_gbs
PCIeDevice devices[32]
Recommendation recommendations[32]
RecommendationPriority priority
RecommendationCategory category
char description[512]
CacheTopology cache
NUMATopology numa
char hostname[256]
PCIeTopology pcie
NetworkTopology network
MemoryInfo memory
char kernel_version[128]
AffinityInfo affinity
int topology_estimate_channels_from_bandwidth(float measured_bw_gbs, int memory_speed_mhz, const char *memory_type)
static uint64_t read_file_uint64(const char *path)
int topology_discover_memory(MemoryInfo *mem)
static int cache_compare(const void *a, const void *b)
int topology_discover_pcie(PCIeTopology *pcie)
int topology_discover(SystemTopology *topo)
int topology_discover_cpu(CPUInfo *cpu)
static int get_current_cpu(void)
static int get_numa_node_for_cpu(int cpu)
float topology_measure_memory_bandwidth(void)
int topology_discover_cache(CacheTopology *cache)
static void trim_string(char *str)
static int run_command(const char *cmd, char *output, size_t output_size)
float topology_measure_memory_bandwidth_ex(int *numa_node_out, int *num_threads_out)
static int count_set_bits(const char *hex_mask)
int topology_discover_network(NetworkTopology *net)
float topology_estimate_network_training_time(const NetworkTopology *net, uint64_t model_size_mb)
static int read_file_int(const char *path)
int topology_generate_recommendations(const SystemTopology *topo, RecommendationList *recs)
static float pcie_bandwidth_gbs(int gen, int width)
float topology_estimate_memory_bandwidth(const MemoryInfo *mem)
int topology_discover_affinity(AffinityInfo *aff)
static int has_cpu_flag(const char *flags, const char *flag)
static int read_file_string(const char *path, char *buf, size_t buf_size)
int topology_discover_numa(NUMATopology *numa)
#define MAX_CACHE_LEVELS
#define MAX_NUMA_NODES
#define MAX_MEMORY_SLOTS
#define MAX_CPUS
@ REC_PRIORITY_MEDIUM
@ REC_PRIORITY_HIGH
@ REC_PRIORITY_LOW
#define MAX_NICS
@ REC_CATEGORY_AFFINITY
@ REC_CATEGORY_CPU
@ REC_CATEGORY_MEMORY
@ REC_CATEGORY_NETWORK
#define MAX_PCIE_DEVICES
int32_t int32_t int32_t int32_t int32_t mask
Definition: tokenizer.h:233
int32_t id
Definition: tokenizer.h:315
const char * token
Definition: tokenizer.h:306
uint32_t end
Definition: utf8.c:215
uint32_t start
Definition: utf8.c:214