/* ANSI escape sequences (ESC '[' <n> 'm') used to style terminal output. */
/* Attribute codes: 0 clears all styling, 1 selects bold. */
20 #define RESET "\033[0m"
21 #define BOLD "\033[1m"
/* Foreground colours, codes 31-37. */
24 #define RED "\033[31m"
25 #define GREEN "\033[32m"
26 #define YELLOW "\033[33m"
27 #define BLUE "\033[34m"
28 #define MAGENTA "\033[35m"
29 #define CYAN "\033[36m"
30 #define WHITE "\033[37m"
/* Background colours, codes 41-43. */
32 #define BG_RED "\033[41m"
33 #define BG_GREEN "\033[42m"
34 #define BG_YELLOW "\033[43m"
/*
 * C(color): yields `color` when `use_colors` is non-zero, otherwise the empty
 * string, so callers can always pass the result to a "%s" conversion.
 * `use_colors` is not defined in this span; NOTE(review): presumably a
 * file-scope flag cleared by the --no-color option - confirm.
 * The argument is substituted unparenthesized, so pass a single identifier or
 * string-literal macro, not a compound expression.
 */
39 #define C(color) (use_colors ? color : "")
45 static const char*
format_size(uint64_t size_mb,
char *buf,
size_t buf_size) {
46 if (size_mb >= 1024 * 1024) {
47 snprintf(buf, buf_size,
"%.1f TB", size_mb / (1024.0 * 1024.0));
48 }
else if (size_mb >= 1024) {
49 snprintf(buf, buf_size,
"%.1f GB", size_mb / 1024.0);
51 snprintf(buf, buf_size,
"%lu MB", (
unsigned long)size_mb);
58 snprintf(buf, buf_size,
"%.1f GB/s", bw_gbs);
60 snprintf(buf, buf_size,
"%.0f MB/s", bw_gbs * 1024);
66 printf(
"\n%s",
C(
BOLD));
67 printf(
"═══════════════════════════════════════════════════════════════════════════════\n");
68 printf(
" %s\n", title);
69 printf(
"═══════════════════════════════════════════════════════════════════════════════%s\n",
75 printf(
" ────────────────────────────────────────────────────────────────────────────\n");
79 for (
int i = 0; i < level; i++) {
82 printf(
" %s── ", is_last ?
"└" :
"├");
109 char simd_buf[256] =
"";
111 strcat(simd_buf,
"AVX-512");
113 char sub_features[64] =
"";
117 if (sub_features[0]) {
119 sub_features[strlen(sub_features) - 2] =
'\0';
120 strcat(simd_buf,
" (");
121 strcat(simd_buf, sub_features);
122 strcat(simd_buf,
")");
124 strcat(simd_buf,
" ");
126 strcat(simd_buf,
"AVX2 ");
128 strcat(simd_buf,
"AVX ");
130 strcat(simd_buf,
"SSE4.2 ");
133 if (cpu->
has_vnni) strcat(simd_buf,
"VNNI ");
137 strcat(simd_buf,
"AMX");
138 char amx_features[32] =
"";
141 if (amx_features[0]) {
142 amx_features[strlen(amx_features) - 1] =
'\0';
143 strcat(simd_buf,
"(");
144 strcat(simd_buf, amx_features);
145 strcat(simd_buf,
") ");
147 strcat(simd_buf,
" ");
159 simd_buf[0] ? simd_buf :
"Basic",
165 print_warning(
"No AVX2 support - kernel performance will be limited");
173 printf(
" %sSource: /sys/devices/system/cpu/cpu0/cache/%s\n\n",
184 if (instances < 1) instances = 1;
188 int total_kb = c->
size_kb * instances;
192 if (total_kb >= 1024) {
193 snprintf(size_str,
sizeof(size_str),
"%d MiB", total_kb / 1024);
195 snprintf(size_str,
sizeof(size_str),
"%d KiB", total_kb);
199 char instance_str[32] =
"";
201 snprintf(instance_str,
sizeof(instance_str),
" (%d instances)", instances);
203 snprintf(instance_str,
sizeof(instance_str),
" (%d instance)", instances);
208 c->
type[0] ==
'D' ?
'd' : (c->
type[0] ==
'I' ?
'i' :
' '),
209 size_str, instance_str);
217 printf(
" %sSource: /sys/devices/system/node/%s\n",
C(
DIM),
C(
RESET));
221 printf(
"\n %s✓ Single NUMA node (Uniform Memory Access)%s\n",
C(
GREEN),
C(
RESET));
222 printf(
" %s All memory is local - no NUMA penalties%s\n",
C(
DIM),
C(
RESET));
223 printf(
"\n %sNote: Sub-NUMA Clustering (SNC) / NUMA-Per-Socket (NPS) not detected.%s\n",
225 printf(
" %s On Xeon/EPYC, check BIOS settings or run: numactl --hardware%s\n",
233 if (sockets > 0 && numa->
num_nodes > sockets) {
234 int nodes_per_socket = numa->
num_nodes / sockets;
235 printf(
"\n %s⚠ Sub-NUMA detected: %d NUMA nodes on %d socket(s) = SNC%d or NPS%d%s\n",
237 printf(
" %s Intel: Sub-NUMA Clustering (SNC) | AMD: NUMA-Per-Socket (NPS)%s\n",
239 printf(
" %s Each sub-node has its own memory channels for lower latency%s\n",
241 }
else if (sockets > 1) {
243 printf(
"\n %sMulti-socket system: %d sockets, %d NUMA nodes%s\n",
245 printf(
" %s SNC/NPS not enabled - each socket is one NUMA node%s\n",
247 printf(
" %s 💡 Enable SNC in BIOS to partition channels for lower latency%s\n",
254 for (
int i = 0; i < numa->
num_nodes; i++) {
256 int is_last = (i == numa->
num_nodes - 1);
267 printf(
"\n NUMA Distances (10=local, higher=remote):\n");
269 for (
int i = 0; i < numa->
num_nodes; i++) {
273 for (
int i = 0; i < numa->
num_nodes; i++) {
275 for (
int j = 0; j < numa->
num_nodes; j++) {
288 printf(
"\n %s💡 Per-node bandwidth: numactl --cpunodebind=0 --membind=0 ./build/show_config%s\n",
296 printf(
" %sSource: /proc/meminfo, dmidecode (if root), STREAM benchmark%s\n\n",
299 char total_buf[32], avail_buf[32], theo_bw_buf[32], meas_bw_buf[32];
305 printf(
" %sTotal: %s%s (%s available)\n",
320 printf(
"\n %sBandwidth Analysis:%s\n",
C(
CYAN),
C(
RESET));
324 printf(
" %s├── Theoretical: %d MT/s × 8 bytes × %d channel(s) = %s%s\n",
330 printf(
" %s│ └── SNC potential: %d ch ÷ 2 = SNC2 (%d ch/node), ÷ 4 = SNC4 (%d ch/node)%s\n",
336 printf(
" %s│ └── SNC potential: %d ch ÷ 2 = SNC2 (%d ch/node)%s\n",
342 printf(
" %s├── Theoretical: %s (estimated)%s\n",
351 printf(
" %s├── Measured: %s%s%s (%.0f%% efficiency)%s\n",
355 printf(
" %s│ Method: STREAM Triad (c[i] = a[i] + s*b[i])%s\n",
357 printf(
" %s│ Buffer: 256 MB × 3 arrays, 3 iterations%s\n",
359 printf(
" %s│ NUMA node: %d (memory allocated on this node)%s\n",
361 printf(
" %s│ Threads: %d (OMP_NUM_THREADS)%s\n",
363 printf(
" %s└── Formula: (256MB × 3 × 3) / time = GB/s%s\n",
369 printf(
"\n DIMM Layout:\n");
370 for (
int i = 0; i < mem->
num_slots; i++) {
375 printf(
" %s[%s]%s %s: %s%s @ %d MT/s%s\n",
379 printf(
" %s[%s]%s EMPTY\n",
386 print_warning(
"Single-channel mode - bandwidth reduced by ~50%%");
390 printf(
" %s💡 Tip:%s Add %d more DIMM(s) for better bandwidth\n",
398 int gpu_count = 0, nic_count = 0, nvme_count = 0;
411 const char *type_icon =
" ";
412 const char *type_color =
"";
432 name[
sizeof(name) - 1] =
'\0';
434 strcpy(name + 42,
"...");
437 printf(
" %s%s%s%-45s%s x%d Gen%d %s%s%s",
444 printf(
" %s(capable: x%d Gen%d)%s",
450 if (gpu_count == 0 && nic_count == 0 && nvme_count == 0) {
451 printf(
" %sNo significant PCIe devices detected%s\n",
C(
DIM),
C(
RESET));
454 printf(
"\n Summary: %d GPU(s), %d NIC(s), %d NVMe(s)\n",
455 gpu_count, nic_count, nvme_count);
462 printf(
" %sNo network interfaces detected%s\n",
C(
DIM),
C(
RESET));
471 const char *status_icon = n->
is_up && n->
has_link ?
"✓" :
"✗";
477 printf(
" %s%s%s %s%-10s%s ",
478 C(status_color), status_icon,
C(
RESET),
483 const char *speed_color =
"";
487 else speed_color =
RED;
489 printf(
"%s%6lu Mbps%s (%s) ",
510 printf(
"\n For Distributed Training:\n");
513 print_ok(
"100 GbE+ available - excellent for distributed training");
515 printf(
" %s✓%s 10 GbE available - good for small clusters\n",
518 print_warning(
"Only 1 GbE - significant bottleneck for distributed training");
520 print_warning(
"Very slow network - distributed training not recommended");
524 print_ok(
"RDMA capable NIC detected - low-latency gradient sync possible");
540 printf(
" %s💡 Recommendation:%s export OMP_PROC_BIND=close OMP_PLACES=cores\n",
548 print_ok(
"No significant issues detected!");
557 const char *priority_icon =
"";
558 const char *priority_color =
"";
562 priority_color =
RED;
566 priority_color =
RED;
574 priority_color =
GREEN;
578 printf(
"\n %s %s%s%s\n", priority_icon,
C(priority_color), r->
title,
C(
RESET));
587 char mem_bw_buf[32], net_bw_buf[32];
591 printf(
" Single Node Capacity:\n");
601 printf(
"\n Estimated Gradient Sync Time (single allreduce):\n");
603 uint64_t model_sizes[] = {100, 500, 1000, 7000};
604 const char *model_names[] = {
"100 MB (BERT-base)",
"500 MB (GPT-2)",
605 "1 GB (ResNet-50 batch)",
"7 GB (LLaMA-7B)"};
607 for (
int i = 0; i < 4; i++) {
609 &topo->
network, model_sizes[i]);
611 const char *time_color =
"";
612 if (sync_time < 0.1f) time_color =
GREEN;
613 else if (sync_time < 1.0f) time_color =
YELLOW;
614 else time_color =
RED;
616 printf(
" %-25s %s%8.2f sec%s\n",
617 model_names[i],
C(time_color), sync_time,
C(
RESET));
621 printf(
"\n Multi-Node Projection (assuming identical nodes):\n");
622 int nodes[] = {2, 4, 8, 16};
623 for (
int i = 0; i < 4; i++) {
628 char total_mem_buf[32];
629 format_size(total_mem, total_mem_buf,
sizeof(total_mem_buf));
631 printf(
" %2d nodes: %4d cores, %s memory\n",
632 n, total_cores, total_mem_buf);
636 printf(
"\n Ring-AllReduce Topology (4 nodes):\n");
637 printf(
" %s┌─────────┐ ┌─────────┐%s\n",
C(
CYAN),
C(
RESET));
638 printf(
" %s│ Node 0 │────→│ Node 1 │%s\n",
C(
CYAN),
C(
RESET));
639 printf(
" %s│ Worker │ │ Worker │%s\n",
C(
CYAN),
C(
RESET));
640 printf(
" %s└────↑────┘ └────│────┘%s\n",
C(
CYAN),
C(
RESET));
643 printf(
" %s┌────│────┐ ┌────↓────┐%s\n",
C(
CYAN),
C(
RESET));
644 printf(
" %s│ Node 3 │←────│ Node 2 │%s\n",
C(
CYAN),
C(
RESET));
645 printf(
" %s│ Worker │ │ Worker │%s\n",
C(
CYAN),
C(
RESET));
646 printf(
" %s└─────────┘ └─────────┘%s\n",
C(
CYAN),
C(
RESET));
655 printf(
" %sNote:%s Running without root - some info may be unavailable\n",
680 int main(
int argc,
char *argv[]) {
682 for (
int i = 1; i < argc; i++) {
683 if (strcmp(argv[i],
"--no-color") == 0) {
686 if (strcmp(argv[i],
"--help") == 0 || strcmp(argv[i],
"-h") == 0) {
687 printf(
"Usage: %s [OPTIONS]\n", argv[0]);
688 printf(
"\nDisplay system hardware configuration for C-Kernel-Engine\n");
689 printf(
"\nOptions:\n");
690 printf(
" --no-color Disable colored output\n");
691 printf(
" --help, -h Show this help message\n");
703 fprintf(stderr,
"Error: Failed to discover system topology\n");
int main(int argc, char *argv[])
static void print_ok(const char *msg)
void topology_print_memory(const MemoryInfo *mem)
static const char * format_bandwidth(float bw_gbs, char *buf, size_t buf_size)
void topology_print_network(const NetworkTopology *net)
void topology_print_pcie(const PCIeTopology *pcie)
void topology_print_distributed_potential(const SystemTopology *topo)
static void print_tree_item(int level, int is_last, const char *fmt,...)
void topology_print_numa(const NUMATopology *numa, int sockets)
static void print_section(const char *title)
void topology_print_cpu(const CPUInfo *cpu)
static void print_warning(const char *msg)
void topology_print_affinity(const AffinityInfo *aff)
static const char * format_size(uint64_t size_mb, char *buf, size_t buf_size)
void topology_print_recommendations(const RecommendationList *recs)
void topology_print_summary(const SystemTopology *topo)
static void print_header(const char *title)
void topology_print_cache(const CacheTopology *cache, int logical_cores)
float measured_bandwidth_gbs
float theoretical_bandwidth_gbs
NetworkInterface interfaces[8]
Recommendation recommendations[32]
RecommendationPriority priority
int topology_discover(SystemTopology *topo)
float topology_estimate_network_training_time(const NetworkTopology *net, uint64_t model_size_mb)
int topology_generate_recommendations(const SystemTopology *topo, RecommendationList *recs)