← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_amx.c File Reference

AMX (Advanced Matrix Extensions) GEMM kernels. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdbool.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

bool amx_available (void)
 
void gemv_q4_k_q8_k_amx (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q4_k_q8_k_avx (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q4_k_q8_k_avx2 (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q4_k_q8_k_ref (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q4_k_q8_k_vnni (float *y, const void *W, const void *x_q8, int M, int K)
 

Detailed Description

AMX (Advanced Matrix Extensions) GEMM kernels.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Intel AMX provides dedicated matrix multiply hardware:

  • 8 tile registers (TMM0-TMM7), each up to 1KB
  • TDPBSSD: INT8 signed dot product (A signed, B signed)
  • TDPBSUD: INT8 mixed sign (A signed, B unsigned)
  • TDPBUSD: INT8 mixed sign (A unsigned, B signed)
  • TDPBUUD: INT8 unsigned dot product
  • TDPBF16PS: BF16 dot product to FP32

Tile dimensions:

  • Max: 16 rows x 64 bytes (1024 bytes per tile)
  • For INT8: 16x64 elements
  • For BF16: 16x32 elements

Performance:

  • AMX INT8: ~2000 INT8 ops/cycle (vs ~256 for AVX-512 VNNI)
  • AMX BF16: ~1000 BF16 ops/cycle
  • Expected 8-16x speedup over AVX-512 for large GEMM

Requirements:

  • Sapphire Rapids or newer (4th Gen Xeon)
  • Linux kernel 5.16+ with AMX support
  • Compiler: GCC 11+, Clang 12+, ICX 2022+

Definition in file gemm_kernels_amx.c.

Function Documentation

◆ amx_available()

bool amx_available ( void  )

Runtime check for Intel AMX support. Note: the current implementation is a stub that unconditionally returns false, so the AMX code path is never selected at runtime.

Definition at line 262 of file gemm_kernels_amx.c.

262  {
263  return false;
264 }

◆ gemv_q4_k_q8_k_amx()

void gemv_q4_k_q8_k_amx ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Q4_K-weight x Q8_K-activation GEMV entry point for the AMX dispatch tier. No AMX implementation exists yet: the call is resolved at compile time to the best available fallback — AVX-512 VNNI, then AVX2, then AVX, then the reference kernel.

Definition at line 247 of file gemm_kernels_amx.c.

247  {
248  /* No AMX support - cascade through fallbacks: AVX-512 VNNI → AVX2 → AVX → ref */
249 #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
250  gemv_q4_k_q8_k_vnni(y, W, x_q8, M, K);
251 #elif defined(__AVX2__)
252  gemv_q4_k_q8_k_avx2(y, W, x_q8, M, K);
253 #elif defined(__AVX__)
254  gemv_q4_k_q8_k_avx(y, W, x_q8, M, K);
255 #else
256  gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
257 #endif
258 }
void gemv_q4_k_q8_k_avx2(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_vnni(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_avx(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q4_k_q8_k_avx(), gemv_q4_k_q8_k_avx2(), gemv_q4_k_q8_k_ref(), and gemv_q4_k_q8_k_vnni().

◆ gemv_q4_k_q8_k_avx()

void gemv_q4_k_q8_k_avx ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Definition at line 251 of file gemm_kernels_q4k_avx.c.

255 {
256  gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
257 }
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q4_k_q8_k_ref().

Referenced by gemv_q4_k_q8_k_amx().

◆ gemv_q4_k_q8_k_avx2()

void gemv_q4_k_q8_k_avx2 ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Definition at line 89 of file gemm_kernels_q4k_q8k_avx2.c.

93 {
94  /* TODO: Implement AVX2 version with correct Q4_K memory layout.
95  * For now, fall back to reference implementation which has been
96  * fixed to use the correct layout.
97  */
98  gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
99 }
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q4_k_q8_k_ref().

Referenced by gemv_q4_k_q8_k_amx().

◆ gemv_q4_k_q8_k_ref()

void gemv_q4_k_q8_k_ref ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Reference (scalar) implementation. Validates its arguments (returns silently on NULL pointers or non-positive M/K), then computes one dot product per output row via dot_q4_k_q8_k_ref(). NOTE(review): blocks_per_row uses integer division K / QK_K, so the implementation appears to assume K is an exact multiple of QK_K — a non-multiple K would silently truncate; confirm against callers.

Definition at line 177 of file gemm_kernels_q4k_q8k.c.

181 {
182  if (!y || !W || !x_q8 || M <= 0 || K <= 0) {
183  return;
184  }
185 
186  const block_q4_K *blocks = (const block_q4_K *)W;
187  const block_q8_K *x = (const block_q8_K *)x_q8;
188  const int blocks_per_row = K / QK_K;
189 
190  for (int row = 0; row < M; ++row) {
191  const block_q4_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
192  y[row] = dot_q4_k_q8_k_ref(w_row, x, K);
193  }
194 }
#define QK_K
static float dot_q4_k_q8_k_ref(const block_q4_K *w, const block_q8_K *x, int k)

Referenced by gemv_q4_k_q8_k_amx().

◆ gemv_q4_k_q8_k_vnni()

void gemv_q4_k_q8_k_vnni ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Definition at line 95 of file gemm_kernels_q4k_q8k_vnni.c.

99 {
100  /* TODO: Implement VNNI version with correct Q4_K memory layout.
101  * For now, fall back to reference implementation which has been
102  * fixed to use the correct layout.
103  */
104  gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
105 }
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q4_k_q8_k_ref().

Referenced by gemv_q4_k_q8_k_amx().