← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_microkernel.c File Reference

GEMM Microkernel - High-Performance Register-Blocked Matrix Multiplication. More...

#include "ckernel_engine.h"
#include "cpu_features.h"
#include <string.h>
#include <stdlib.h>
#include <stdio.h>

Go to the source code of this file.

Macros

#define GEMM_BACKEND   "Native"
 
#define KC   (get_gemm_params()->KC)
 
#define MC   (get_gemm_params()->MC)
 
#define MR   (MR_FIXED)
 
#define MR_FIXED   4
 
#define NC   (get_gemm_params()->NC)
 
#define NR   (NR_FIXED)
 
#define NR_FIXED   4
 
#define PACK_THRESHOLD   256
 

Functions

const char * gemm_get_backend (void)
 
static void gemm_init_threads (void)
 
void gemm_microkernel (const float *A, const float *B, float *C, int M, int N, int K, int B_transposed)
 
void gemm_microkernel_blocked (const float *A, const float *B, float *C, int M, int N, int K)
 
void gemm_microkernel_blocked_bt (const float *A, const float *B, float *C, int M, int N, int K)
 
static void gemm_microkernel_edge (int m, int n, int K, const float *A, int lda, const float *B, int ldb, float *C, int ldc, int first_k)
 
void gemm_microkernel_packed (const float *A, const float *B, float *C, int M, int N, int K)
 
static void gemm_microkernel_sequential (const float *A, const float *B, float *C, int M, int N, int K)
 
static void pack_a_panel (const float *A, int lda, float *Ap, int mc, int kc, int mr)
 
static void pack_b_panel (const float *B, int ldb, float *Bp, int kc, int nc, int nr)
 

Variables

static int g_threads_initialized = 0
 

Detailed Description

GEMM Microkernel - High-Performance Register-Blocked Matrix Multiplication.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

This file implements optimized GEMM microkernels with multiple backends:

  1. USE_MKL: Intel MKL cblas_sgemm (best performance on Intel CPUs)
  2. USE_ONEDNN: Intel oneDNN matmul primitive (Apache 2.0 licensed)
  3. Native: Our own AVX-512/AVX2/AVX microkernels (no dependencies)

Build with: make USE_MKL=1 # Use Intel MKL make USE_ONEDNN=1 # Use Intel oneDNN make # Use native kernels

Layout: C[M,N] = A[M,K] @ B[K,N] (row-major)

Definition in file gemm_microkernel.c.

Macro Definition Documentation

◆ GEMM_BACKEND

#define GEMM_BACKEND   "Native"

Definition at line 45 of file gemm_microkernel.c.

◆ KC

#define KC   (get_gemm_params()->KC)

Definition at line 230 of file gemm_microkernel.c.

◆ MC

#define MC   (get_gemm_params()->MC)

Definition at line 228 of file gemm_microkernel.c.

◆ MR

#define MR   (MR_FIXED)

Definition at line 226 of file gemm_microkernel.c.

◆ MR_FIXED

#define MR_FIXED   4

Definition at line 221 of file gemm_microkernel.c.

◆ NC

#define NC   (get_gemm_params()->NC)

Definition at line 229 of file gemm_microkernel.c.

◆ NR

#define NR   (NR_FIXED)

Definition at line 227 of file gemm_microkernel.c.

◆ NR_FIXED

#define NR_FIXED   4

Definition at line 222 of file gemm_microkernel.c.

◆ PACK_THRESHOLD

#define PACK_THRESHOLD   256

Definition at line 1132 of file gemm_microkernel.c.

Function Documentation

◆ gemm_get_backend()

const char* gemm_get_backend ( void  )

Definition at line 1160 of file gemm_microkernel.c.

1160  {
1161  return GEMM_BACKEND;
1162 }
#define GEMM_BACKEND

References GEMM_BACKEND.

◆ gemm_init_threads()

static void gemm_init_threads ( void  )
static

Definition at line 915 of file gemm_microkernel.c.

915  {
916  if (g_threads_initialized) return;
917 
918 #ifdef _OPENMP
919  const CPUInfo* cpu = get_cpu_info();
920  int physical_cores = cpu->num_cores;
921 
922  // Only use physical cores - hyperthreading hurts compute-bound GEMM
923  if (physical_cores > 0) {
924  int current_max = omp_get_max_threads();
925  // Only reduce if we have more threads than physical cores
926  if (current_max > physical_cores) {
927  omp_set_num_threads(physical_cores);
928  }
929  }
930 #endif
932 }
const CPUInfo * get_cpu_info(void)
Definition: cpu_features.c:377
static int g_threads_initialized
int num_cores
Definition: cpu_features.h:25

References g_threads_initialized, get_cpu_info(), and CPUInfo::num_cores.

Referenced by gemm_microkernel_blocked().

◆ gemm_microkernel()

void gemm_microkernel ( const float *  A,
const float *  B,
float *  C,
int  M,
int  N,
int  K,
int  B_transposed 
)

Definition at line 1134 of file gemm_microkernel.c.

1141 {
1142  if (B_transposed) {
1143  gemm_microkernel_blocked_bt(A, B, C, M, N, K);
1144  } else {
1145  // Use packed version for large matrices
1146  if (M >= PACK_THRESHOLD && N >= PACK_THRESHOLD && K >= PACK_THRESHOLD) {
1147  gemm_microkernel_packed(A, B, C, M, N, K);
1148  } else {
1149  gemm_microkernel_blocked(A, B, C, M, N, K);
1150  }
1151  }
1152 }
#define PACK_THRESHOLD
void gemm_microkernel_blocked(const float *A, const float *B, float *C, int M, int N, int K)
void gemm_microkernel_blocked_bt(const float *A, const float *B, float *C, int M, int N, int K)
void gemm_microkernel_packed(const float *A, const float *B, float *C, int M, int N, int K)
#define C(color)
Definition: show_config.c:39

References C, gemm_microkernel_blocked(), gemm_microkernel_blocked_bt(), gemm_microkernel_packed(), and PACK_THRESHOLD.

Referenced by gemm_blocked_serial().

◆ gemm_microkernel_blocked()

void gemm_microkernel_blocked ( const float *  A,
const float *  B,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 934 of file gemm_microkernel.c.

940 {
941  const int mr = MR;
942  const int nr = NR;
943 
944  // Use sequential version for small matrices to avoid OpenMP overhead
945  // Threshold tuned for typical 4-8 core systems
946  if ((size_t)M * N * K <= 512ULL * 512 * 512) {
947  gemm_microkernel_sequential(A, B, C, M, N, K);
948  return;
949  }
950 
951  // Initialize thread count to physical cores (once)
953 
954  // Zero output first
955  #pragma omp parallel for schedule(static)
956  for (int i = 0; i < M; i++) {
957  memset(&C[i * N], 0, N * sizeof(float));
958  }
959 
960  // Block over K (outermost - for accumulation across all threads)
961  for (int k0 = 0; k0 < K; k0 += KC) {
962  int kb = (k0 + KC <= K) ? KC : (K - k0);
963  int first_k = (k0 == 0);
964 
965  // Parallelize over M rows - each thread gets a chunk of M
966  // This gives better cache locality than tile-level parallelism
967  #pragma omp parallel for schedule(static)
968  for (int m0 = 0; m0 < M; m0 += mr) {
969  int mr_actual = (m0 + mr <= M) ? mr : (M - m0);
970 
971  // Each thread processes all N tiles for its M rows
972  for (int n0 = 0; n0 < N; n0 += nr) {
973  int nr_actual = (n0 + nr <= N) ? nr : (N - n0);
974 
975  const float *A_tile = &A[m0 * K + k0];
976  const float *B_tile = &B[k0 * N + n0];
977  float *C_tile = &C[m0 * N + n0];
978 
979  if (mr_actual == mr && nr_actual == nr) {
980 #if defined(__AVX512F__)
981  gemm_microkernel_6x32_avx512(kb, A_tile, K, B_tile, N, C_tile, N, first_k);
982 #elif defined(__FMA__)
983  gemm_microkernel_6x16_avx(kb, A_tile, K, B_tile, N, C_tile, N, first_k);
984 #elif defined(__AVX__)
985  gemm_microkernel_4x16_avx(kb, A_tile, K, B_tile, N, C_tile, N, first_k);
986 #else
987  gemm_microkernel_edge(mr_actual, nr_actual, kb, A_tile, K, B_tile, N, C_tile, N, first_k);
988 #endif
989  } else {
990  gemm_microkernel_edge(mr_actual, nr_actual, kb, A_tile, K, B_tile, N, C_tile, N, first_k);
991  }
992  }
993  }
994  }
995 }
static void gemm_microkernel_edge(int m, int n, int K, const float *A, int lda, const float *B, int ldb, float *C, int ldc, int first_k)
static void gemm_microkernel_sequential(const float *A, const float *B, float *C, int M, int N, int K)
static void gemm_init_threads(void)
#define KC
#define MR
#define NR

References C, gemm_init_threads(), gemm_microkernel_edge(), gemm_microkernel_sequential(), KC, MR, and NR.

Referenced by gemm_microkernel(), and gemm_microkernel_packed().

◆ gemm_microkernel_blocked_bt()

void gemm_microkernel_blocked_bt ( const float *  A,
const float *  B,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 1058 of file gemm_microkernel.c.

1064 {
1065  // Zero output first
1066  #pragma omp parallel for schedule(static)
1067  for (int i = 0; i < M; i++) {
1068  memset(&C[i * N], 0, N * sizeof(float));
1069  }
1070 
1071  const int mr = MR;
1072  const int nr = NR;
1073 
1074  #pragma omp parallel for schedule(dynamic) collapse(2)
1075  for (int m0 = 0; m0 < M; m0 += MC) {
1076  for (int n0 = 0; n0 < N; n0 += NC) {
1077  int mb = (m0 + MC <= M) ? MC : (M - m0);
1078  int nb = (n0 + NC <= N) ? NC : (N - n0);
1079 
1080  for (int k0 = 0; k0 < K; k0 += KC) {
1081  int kb = (k0 + KC <= K) ? KC : (K - k0);
1082  int first_k = (k0 == 0);
1083 
1084  for (int m1 = 0; m1 < mb; m1 += mr) {
1085  int mr_actual = (m1 + mr <= mb) ? mr : (mb - m1);
1086 
1087  for (int n1 = 0; n1 < nb; n1 += nr) {
1088  int nr_actual = (n1 + nr <= nb) ? nr : (nb - n1);
1089 
1090  const float *A_tile = &A[(m0 + m1) * K + k0];
1091  const float *B_tile = &B[(n0 + n1) * K + k0];
1092  float *C_tile = &C[(m0 + m1) * N + (n0 + n1)];
1093 
1094  if (mr_actual == mr && nr_actual == nr) {
1095 #if defined(__AVX512F__)
1096  gemm_microkernel_6x32_bt_avx512(kb, A_tile, K, B_tile, K, C_tile, N, first_k);
1097 #else
1098  // Scalar fallback for B-transposed
1099  for (int i = 0; i < mr; i++) {
1100  for (int j = 0; j < nr; j++) {
1101  float sum = first_k ? 0.0f : C_tile[i * N + j];
1102  for (int kk = 0; kk < kb; kk++) {
1103  sum += A_tile[i * K + kk] * B_tile[j * K + kk];
1104  }
1105  C_tile[i * N + j] = sum;
1106  }
1107  }
1108 #endif
1109  } else {
1110  // Edge case
1111  for (int i = 0; i < mr_actual; i++) {
1112  for (int j = 0; j < nr_actual; j++) {
1113  float sum = first_k ? 0.0f : C_tile[i * N + j];
1114  for (int kk = 0; kk < kb; kk++) {
1115  sum += A_tile[i * K + kk] * B_tile[j * K + kk];
1116  }
1117  C_tile[i * N + j] = sum;
1118  }
1119  }
1120  }
1121  }
1122  }
1123  }
1124  }
1125  }
1126 }
#define NC
#define MC

References C, KC, MC, MR, NC, and NR.

Referenced by gemm_microkernel().

◆ gemm_microkernel_edge()

static void gemm_microkernel_edge ( int  m,
int  n,
int  K,
const float *  A,
int  lda,
const float *  B,
int  ldb,
float *  C,
int  ldc,
int  first_k 
)
static

Definition at line 737 of file gemm_microkernel.c.

744 {
745  for (int i = 0; i < m; i++) {
746  for (int j = 0; j < n; j++) {
747  float sum = first_k ? 0.0f : C[i * ldc + j];
748  for (int k = 0; k < K; k++) {
749  sum += A[i * lda + k] * B[k * ldb + j];
750  }
751  C[i * ldc + j] = sum;
752  }
753  }
754 }

References C.

Referenced by gemm_microkernel_blocked(), and gemm_microkernel_sequential().

◆ gemm_microkernel_packed()

void gemm_microkernel_packed ( const float *  A,
const float *  B,
float *  C,
int  M,
int  N,
int  K 
)

Definition at line 840 of file gemm_microkernel.c.

846 {
847  // Use tile-parallel blocked version - scales better on many-core systems
848  gemm_microkernel_blocked(A, B, C, M, N, K);
849 }

References C, and gemm_microkernel_blocked().

Referenced by gemm_microkernel().

◆ gemm_microkernel_sequential()

static void gemm_microkernel_sequential ( const float *  A,
const float *  B,
float *  C,
int  M,
int  N,
int  K 
)
static

Definition at line 862 of file gemm_microkernel.c.

868 {
869  // Zero output
870  for (int i = 0; i < M; i++) {
871  memset(&C[i * N], 0, N * sizeof(float));
872  }
873 
874  const int mr = MR;
875  const int nr = NR;
876 
877  // Block over K
878  for (int k0 = 0; k0 < K; k0 += KC) {
879  int kb = (k0 + KC <= K) ? KC : (K - k0);
880  int first_k = (k0 == 0);
881 
882  // Loop over tiles
883  for (int m0 = 0; m0 < M; m0 += mr) {
884  int mr_actual = (m0 + mr <= M) ? mr : (M - m0);
885 
886  for (int n0 = 0; n0 < N; n0 += nr) {
887  int nr_actual = (n0 + nr <= N) ? nr : (N - n0);
888 
889  const float *A_tile = &A[m0 * K + k0];
890  const float *B_tile = &B[k0 * N + n0];
891  float *C_tile = &C[m0 * N + n0];
892 
893  if (mr_actual == mr && nr_actual == nr) {
894 #if defined(__AVX512F__)
895  gemm_microkernel_6x32_avx512(kb, A_tile, K, B_tile, N, C_tile, N, first_k);
896 #elif defined(__FMA__)
897  gemm_microkernel_6x16_avx(kb, A_tile, K, B_tile, N, C_tile, N, first_k);
898 #elif defined(__AVX__)
899  gemm_microkernel_4x16_avx(kb, A_tile, K, B_tile, N, C_tile, N, first_k);
900 #else
901  gemm_microkernel_edge(mr_actual, nr_actual, kb, A_tile, K, B_tile, N, C_tile, N, first_k);
902 #endif
903  } else {
904  gemm_microkernel_edge(mr_actual, nr_actual, kb, A_tile, K, B_tile, N, C_tile, N, first_k);
905  }
906  }
907  }
908  }
909 }

References C, gemm_microkernel_edge(), KC, MR, and NR.

Referenced by gemm_microkernel_blocked().

◆ pack_a_panel()

static void pack_a_panel ( const float *  A,
int  lda,
float *  Ap,
int  mc,
int  kc,
int  mr 
)
static

Definition at line 761 of file gemm_microkernel.c.

766 {
767  #pragma omp parallel for schedule(static) if(mc > 64)
768  for (int i = 0; i < mc; i += mr) {
769  int rows = (i + mr <= mc) ? mr : (mc - i);
770  float *Ap_panel = &Ap[(i / mr) * mr * kc];
771 
772  for (int p = 0; p < rows; p++) {
773  const float *A_row = &A[(i + p) * lda];
774  float *Ap_row = &Ap_panel[p * kc];
775 
776  // Vectorized copy
777  int k = 0;
778 #if defined(__AVX__)
779  for (; k <= kc - 8; k += 8) {
780  _mm256_storeu_ps(&Ap_row[k], _mm256_loadu_ps(&A_row[k]));
781  }
782 #endif
783  for (; k < kc; k++) {
784  Ap_row[k] = A_row[k];
785  }
786  }
787  // Zero pad if partial panel
788  for (int p = rows; p < mr; p++) {
789  memset(&Ap_panel[p * kc], 0, kc * sizeof(float));
790  }
791  }
792 }

◆ pack_b_panel()

static void pack_b_panel ( const float *  B,
int  ldb,
float *  Bp,
int  kc,
int  nc,
int  nr 
)
static

Definition at line 795 of file gemm_microkernel.c.

800 {
801  #pragma omp parallel for schedule(static) if(nc > 128)
802  for (int j = 0; j < nc; j += nr) {
803  int cols = (j + nr <= nc) ? nr : (nc - j);
804  float *Bp_panel = &Bp[(j / nr) * nr * kc];
805 
806  for (int k = 0; k < kc; k++) {
807  const float *B_row = &B[k * ldb + j];
808  float *Bp_row = &Bp_panel[k * nr];
809 
810  // Copy cols and zero-pad
811  int c = 0;
812 #if defined(__AVX512F__)
813  for (; c <= cols - 16; c += 16) {
814  _mm512_store_ps(&Bp_row[c], _mm512_loadu_ps(&B_row[c]));
815  }
816 #elif defined(__AVX__)
817  for (; c <= cols - 8; c += 8) {
818  _mm256_store_ps(&Bp_row[c], _mm256_loadu_ps(&B_row[c]));
819  }
820 #endif
821  for (; c < cols; c++) {
822  Bp_row[c] = B_row[c];
823  }
824  for (; c < nr; c++) {
825  Bp_row[c] = 0.0f;
826  }
827  }
828  }
829 }

Variable Documentation

◆ g_threads_initialized

int g_threads_initialized = 0
static

Definition at line 912 of file gemm_microkernel.c.

Referenced by gemm_init_threads().