← Back to C-Kernel-Engine Docs Doxygen Source Documentation
gemm_kernels_amx.c File Reference

AMX (Advanced Matrix Extensions) GEMM kernels. More...

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdbool.h>
#include "ckernel_quant.h"

Go to the source code of this file.

Functions

bool amx_available (void)
 
void gemv_q4_k_q8_k_amx (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q4_k_q8_k_avx (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q4_k_q8_k_avx2 (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q4_k_q8_k_ref (float *y, const void *W, const void *x_q8, int M, int K)
 
void gemv_q4_k_q8_k_vnni (float *y, const void *W, const void *x_q8, int M, int K)
 

Detailed Description

AMX (Advanced Matrix Extensions) GEMM kernels.

CK-ENGINE KERNEL RULES:

  1. NO malloc/free - memory via bump allocator, pointers passed in
  2. NO OpenMP - parallelization at orchestrator/codegen layer
  3. API must define: inputs, outputs, workspace, and memory layouts
  4. Pure computation - deterministic, no side effects

After changes: make test && make llamacpp-parity-full

Intel AMX provides dedicated matrix multiply hardware:

  • 8 tile registers (TMM0-TMM7), each up to 1KB
  • TDPBSSD: INT8 signed dot product (A signed, B signed)
  • TDPBSUD: INT8 mixed sign (A signed, B unsigned)
  • TDPBUSD: INT8 mixed sign (A unsigned, B signed)
  • TDPBUUD: INT8 unsigned dot product
  • TDPBF16PS: BF16 dot product to FP32

Tile dimensions:

  • Max: 16 rows x 64 bytes (1024 bytes per tile)
  • For INT8: 16x64 elements
  • For BF16: 16x32 elements

Performance:

  • AMX INT8: ~2000 INT8 ops/cycle (vs ~256 for AVX-512 VNNI)
  • AMX BF16: ~1000 BF16 ops/cycle
  • Expected 8-16x speedup over AVX-512 for large GEMM

Requirements:

  • Sapphire Rapids or newer (4th Gen Xeon)
  • Linux kernel 5.16+ with AMX support
  • Compiler: GCC 11+, Clang 12+, ICX 2022+

Definition in file gemm_kernels_amx.c.

Function Documentation

◆ amx_available()

bool amx_available ( void  )

Runtime check for Intel AMX support. Note: the current implementation is a stub that unconditionally returns false, so the AMX code path is never selected at runtime.

Definition at line 262 of file gemm_kernels_amx.c.

262  {
263  return false;
264 }

◆ gemv_q4_k_q8_k_amx()

void gemv_q4_k_q8_k_amx ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Q4_K-weight x Q8_K-activation GEMV entry point for the AMX dispatch tier. No AMX implementation exists yet: the call is resolved at compile time to the best available fallback — AVX-512 VNNI, then AVX2, then AVX, then the reference kernel.

Definition at line 247 of file gemm_kernels_amx.c.

247  {
248  /* No AMX support - cascade through fallbacks: AVX-512 VNNI → AVX2 → AVX → ref */
249 #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
250  gemv_q4_k_q8_k_vnni(y, W, x_q8, M, K);
251 #elif defined(__AVX2__)
252  gemv_q4_k_q8_k_avx2(y, W, x_q8, M, K);
253 #elif defined(__AVX__)
254  gemv_q4_k_q8_k_avx(y, W, x_q8, M, K);
255 #else
256  gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
257 #endif
258 }
void gemv_q4_k_q8_k_avx2(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_vnni(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)
void gemv_q4_k_q8_k_avx(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q4_k_q8_k_avx(), gemv_q4_k_q8_k_avx2(), gemv_q4_k_q8_k_ref(), and gemv_q4_k_q8_k_vnni().

◆ gemv_q4_k_q8_k_avx()

void gemv_q4_k_q8_k_avx ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Definition at line 251 of file gemm_kernels_q4k_avx.c.

255 {
256  gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
257 }
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q4_k_q8_k_ref().

Referenced by gemv_q4_k_q8_k_amx().

◆ gemv_q4_k_q8_k_avx2()

void gemv_q4_k_q8_k_avx2 ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Definition at line 89 of file gemm_kernels_q4k_q8k_avx2.c.

93 {
94  /* TODO: Implement AVX2 version with correct Q4_K memory layout.
95  * For now, fall back to reference implementation which has been
96  * fixed to use the correct layout.
97  */
98  gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
99 }
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q4_k_q8_k_ref().

Referenced by gemv_q4_k_q8_k_amx().

◆ gemv_q4_k_q8_k_ref()

void gemv_q4_k_q8_k_ref ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Reference (scalar) implementation. Validates its arguments (returns silently on NULL pointers or non-positive M/K), then computes one dot product per output row via dot_q4_k_q8_k_ref(). NOTE(review): blocks_per_row uses integer division K / QK_K, so the implementation appears to assume K is an exact multiple of QK_K — a non-multiple K would silently truncate; confirm against callers.

Definition at line 177 of file gemm_kernels_q4k_q8k.c.

181 {
182  if (!y || !W || !x_q8 || M <= 0 || K <= 0) {
183  return;
184  }
185 
186  const block_q4_K *blocks = (const block_q4_K *)W;
187  const block_q8_K *x = (const block_q8_K *)x_q8;
188  const int blocks_per_row = K / QK_K;
189 
190  for (int row = 0; row < M; ++row) {
191  const block_q4_K *w_row = blocks + (size_t)row * (size_t)blocks_per_row;
192  y[row] = dot_q4_k_q8_k_ref(w_row, x, K);
193  }
194 }
#define QK_K
static float dot_q4_k_q8_k_ref(const block_q4_K *w, const block_q8_K *x, int k)

Referenced by gemv_q4_k_q8_k_amx().

◆ gemv_q4_k_q8_k_vnni()

void gemv_q4_k_q8_k_vnni ( float *  y,
const void *  W,
const void *  x_q8,
int  M,
int  K 
)

Definition at line 95 of file gemm_kernels_q4k_q8k_vnni.c.

99 {
100  /* TODO: Implement VNNI version with correct Q4_K memory layout.
101  * For now, fall back to reference implementation which has been
102  * fixed to use the correct layout.
103  */
104  gemv_q4_k_q8_k_ref(y, W, x_q8, M, K);
105 }
void gemv_q4_k_q8_k_ref(float *y, const void *W, const void *x_q8, int M, int K)

References gemv_q4_k_q8_k_ref().

Referenced by gemv_q4_k_q8_k_amx().