← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ckernel_quant.h
Go to the documentation of this file.
1 /**
2  * @file ckernel_quant.h
3  * @brief Quantization block structures for weight-only quantization
4  *
5  * Defines block structures for various quantization formats used in LLM inference.
6  * Primary focus on Q4_K_M which is commonly used for LLM weight compression.
7  *
8  * Block structures are compatible with llama.cpp/GGML for model loading.
9  */
10 
11 #ifndef CKERNEL_QUANT_H
12 #define CKERNEL_QUANT_H
13 
14 #include <stdint.h>
15 #include <stddef.h>
16 #include "ckernel_dtype.h"
17 
18 #ifdef __cplusplus
19 extern "C" {
20 #endif
21 
/* ============================================================================
 * Half-Precision Type (FP16 - IEEE 754)
 * ============================================================================ */

/* Raw 16-bit storage for an IEEE 754 binary16 value (1 sign bit, 5 exponent
 * bits, 10 mantissa bits). Use ck_fp16_to_fp32()/ck_fp32_to_fp16() below to
 * convert; this typedef carries no arithmetic semantics of its own. */
typedef uint16_t ck_half;
27 
28 /* ============================================================================
29  * Q4_0: Simple 4-bit Quantization
30  * - 32 weights per block
31  * - 1 FP16 scale per block
32  * - 18 bytes per 32 weights = 4.5 bits/weight
33  * ============================================================================ */
34 
#define QK4_0 32

/* Layout must match GGML's block_q4_0 byte-for-byte (models are loaded
 * directly from GGUF files) — do not reorder or pad members. */
typedef struct {
    ck_half d;             /* 2 bytes: scale (delta) */
    uint8_t qs[QK4_0 / 2]; /* 16 bytes: 32 x 4-bit weights (2 per byte).
                            * NOTE(review): presumably low nibble = weight i,
                            * high nibble = weight i+16 as in GGML — confirm
                            * against the dequant kernels. */
} block_q4_0;
/* Total: 18 bytes per 32 weights */
42 
43 /* ============================================================================
44  * Q4_1: Simple 4-bit Quantization with Min
45  * - 32 weights per block
46  * - 2 FP16 values: scale (d) and min (m)
47  * - 20 bytes per 32 weights = 5.0 bits/weight
48  * ============================================================================ */
49 
#define QK4_1 32

/* Like Q4_0 but affine: dequantized weight = d * q + m instead of the
 * symmetric d * (q - offset). Layout must match GGML's block_q4_1. */
typedef struct {
    ck_half d;             /* 2 bytes: scale (delta) */
    ck_half m;             /* 2 bytes: minimum (additive offset) */
    uint8_t qs[QK4_1 / 2]; /* 16 bytes: 32 x 4-bit weights (2 per byte) */
} block_q4_1;
/* Total: 20 bytes per 32 weights */
58 
59 /* ============================================================================
60  * Q5_0: Simple 5-bit Quantization
61  * - 32 weights per block
62  * - 1 FP16 scale per block
63  * - Low 4 bits stored like Q4_0, high 1 bit packed separately
64  * - 22 bytes per 32 weights = 5.5 bits/weight
65  * ============================================================================ */
66 
#define QK5_0 32

/* 5-bit quantization: each weight's low 4 bits live in qs (like Q4_0) and
 * its 5th (high) bit is packed into the qh bitfield. Layout must match
 * GGML's block_q5_0. */
typedef struct {
    ck_half d;             /* 2 bytes: scale (delta) */
    uint8_t qh[4];         /* 4 bytes: high 1-bit of each weight (32 bits total) */
    uint8_t qs[QK5_0 / 2]; /* 16 bytes: low 4-bits of 32 weights (2 per byte) */
} block_q5_0;
/* Total: 22 bytes per 32 weights */
75 
76 /* ============================================================================
77  * Q5_1: Simple 5-bit Quantization with Min
78  * - 32 weights per block
79  * - 2 FP16 values: scale (d) and min (m)
80  * - Low 4 bits stored like Q4_1, high 1 bit packed separately
81  * - 24 bytes per 32 weights = 6.0 bits/weight
82  * ============================================================================ */
83 
#define QK5_1 32

/* Affine variant of Q5_0 (adds a per-block minimum, like Q4_1 vs Q4_0).
 * Layout must match GGML's block_q5_1. */
typedef struct {
    ck_half d;             /* 2 bytes: scale (delta) */
    ck_half m;             /* 2 bytes: minimum (additive offset) */
    uint8_t qh[4];         /* 4 bytes: high 1-bit of each weight (32 bits total) */
    uint8_t qs[QK5_1 / 2]; /* 16 bytes: low 4-bits of 32 weights (2 per byte) */
} block_q5_1;
/* Total: 24 bytes per 32 weights */
93 
94 /* ============================================================================
95  * Q8_0: Simple 8-bit Quantization
96  * - 32 weights per block
97  * - 1 FP16 scale per block
98  * - 34 bytes per 32 weights = 8.5 bits/weight
99  * ============================================================================ */
100 
#define QK8_0 32

/* 8-bit symmetric quantization; also used for quantized activations in the
 * Q5_0 x Q8_0 GEMM path below. Layout must match GGML's block_q8_0. */
typedef struct {
    ck_half d;        /* 2 bytes: scale */
    int8_t qs[QK8_0]; /* 32 bytes: 32 x 8-bit signed weights */
} block_q8_0;
/* Total: 34 bytes per 32 weights */
108 
109 /* ============================================================================
110  * Q4_K: K-Quant 4-bit with Nested Scales (Primary Target)
111  * - 256 weights per super-block
112  * - 8 sub-blocks of 32 weights each
113  * - Two-level scaling: super-block FP16 + sub-block 6-bit
114  * - 144 bytes per 256 weights = 4.5 bits/weight
115  *
116  * This is the format used by Q4_K_M, Q4_K_S, Q4_K_L variants.
117  * The M/S/L suffix indicates quantization aggressiveness, not structure.
118  * ============================================================================ */
119 
#define QK_K 256        /* weights per K-quant super-block */
#define K_SCALE_SIZE 12 /* packed bytes holding 8 x 6-bit scales + 8 x 6-bit mins */

/* K-quant super-block: 8 sub-blocks of 32 weights, each with its own 6-bit
 * scale and min (see unpack_q4_k_scales() for the packing). The FP16 d/dmin
 * pair rescales the 6-bit values. Layout must match GGML's block_q4_K. */
typedef struct {
    ck_half d;                    /* 2 bytes: super-block scale */
    ck_half dmin;                 /* 2 bytes: super-block minimum */
    uint8_t scales[K_SCALE_SIZE]; /* 12 bytes: 8 sub-block scales + 8 sub-block mins (6-bit packed) */
    uint8_t qs[QK_K / 2];         /* 128 bytes: 256 x 4-bit weights */
} block_q4_K;
/* Total: 144 bytes per 256 weights */
130 
131 /* ============================================================================
132  * Q5_K: K-Quant 5-bit with Nested Scales
133  * - 256 weights per super-block
134  * - 8 sub-blocks of 32 weights each
135  * - Two-level scaling: super-block FP16 + sub-block 6-bit
136  * - High bit per weight stored separately (1 bit each)
137  * - 176 bytes per 256 weights = 5.5 bits/weight
138  * ============================================================================ */
139 
/* Same two-level scaling scheme as block_q4_K (identical scales[] packing;
 * see unpack_q5_k_scales()), plus one extra high bit per weight in qh.
 * Layout must match GGML's block_q5_K. */
typedef struct {
    ck_half d;                    /* 2 bytes: super-block scale */
    ck_half dmin;                 /* 2 bytes: super-block minimum */
    uint8_t scales[K_SCALE_SIZE]; /* 12 bytes: 8 sub-block scales + 8 sub-block mins (6-bit packed) */
    uint8_t qh[QK_K / 8];         /* 32 bytes: high 1-bit for 256 weights */
    uint8_t qs[QK_K / 2];         /* 128 bytes: 256 x 4-bit weights */
} block_q5_K;
/* Total: 176 bytes per 256 weights */
148 
149 /* ============================================================================
150  * Q6_K: K-Quant 6-bit (per-16 scales)
151  * - 256 weights per block
152  * - 16 sub-blocks of 16 weights each
153  * - Stored as low 4 bits (ql) + high 2 bits (qh) + int8 scales
154  * ============================================================================ */
155 
/* 6-bit K-quant: 16 sub-blocks of 16 weights, one signed 8-bit scale each
 * (no per-sub-block min). Note d comes LAST here, unlike the other K-quant
 * blocks — the member order must match GGML's block_q6_K exactly. */
typedef struct {
    uint8_t ql[QK_K / 2];     /* 128 bytes: low 4 bits */
    uint8_t qh[QK_K / 4];     /* 64 bytes: high 2 bits */
    int8_t scales[QK_K / 16]; /* 16 bytes: 16 sub-block scales */
    ck_half d;                /* 2 bytes: super-block scale */
} block_q6_K;
/* Total: 210 bytes per 256 weights */
163 
164 /* ============================================================================
165  * Q8_K: K-Quant 8-bit (used for activations in some ops)
166  * - 256 weights per super-block
167  * - 1 FP32 scale per block (not FP16 like others!)
168  * ============================================================================ */
169 
/* 8-bit K-quant, primarily used for quantized activations (see
 * quantize_row_q8_k_sse / rmsnorm_q8_k_fused). Layout must match GGML's
 * block_q8_K. */
typedef struct {
    float d;                  /* 4 bytes: scale — FP32 here, not FP16 like the others! */
    int8_t qs[QK_K];          /* 256 bytes: 256 x 8-bit signed weights */
    int16_t bsums[QK_K / 16]; /* 32 bytes: sums of each 16-quant group, used by
                               * dot-product kernels as an optimization */
} block_q8_K;
/* Total: 292 bytes per 256 weights */
176 
177 /* ============================================================================
178  * Size Calculation Utilities
179  * ============================================================================ */
180 
181 /**
182  * @brief Get the block size (number of weights per block) for a quant type
183  */
184 static inline size_t ck_quant_block_size(int type) {
185  switch (type) {
186  case 0: return QK4_0; /* Q4_0 */
187  case 1: return QK8_0; /* Q8_0 */
188  case 2: return QK_K; /* Q4_K */
189  case 3: return QK_K; /* Q8_K */
190  case CK_DT_Q4_1: return QK4_1;
191  case CK_DT_Q5_0: return QK5_0;
192  case CK_DT_Q5_1: return QK5_1;
193  case CK_DT_Q5_K: return QK_K;
194  case CK_DT_Q6_K: return QK_K;
195  default: return 1;
196  }
197 }
198 
199 /**
200  * @brief Get the byte size per block for a quant type
201  */
202 static inline size_t ck_quant_type_size(int type) {
203  switch (type) {
204  case 0: return sizeof(block_q4_0);
205  case 1: return sizeof(block_q8_0);
206  case 2: return sizeof(block_q4_K);
207  case 3: return sizeof(block_q8_K);
208  case CK_DT_Q4_1: return sizeof(block_q4_1);
209  case CK_DT_Q5_0: return sizeof(block_q5_0);
210  case CK_DT_Q5_1: return sizeof(block_q5_1);
211  case CK_DT_Q5_K: return sizeof(block_q5_K);
212  case CK_DT_Q6_K: return sizeof(block_q6_K);
213  default: return 4; /* FP32 */
214  }
215 }
216 
/**
 * @brief Calculate total bytes needed for n_elements with given quant type
 *
 * Rounds UP to a whole number of blocks, so a row whose length is not an
 * exact multiple of the block size is never under-allocated (the previous
 * truncating division dropped the final partial block). For the usual case
 * where n_elements is a multiple of the block size the result is unchanged.
 *
 * @param type       quant type code (legacy 0-3 or CK_DT_*)
 * @param n_elements number of weights in the row; must be >= 0
 * @return size in bytes of the quantized row
 */
static inline size_t ck_quant_row_size(int type, int64_t n_elements) {
    size_t block_size = ck_quant_block_size(type);
    size_t type_size = ck_quant_type_size(type);
    size_t n_blocks = ((size_t)n_elements + block_size - 1) / block_size;
    return n_blocks * type_size;
}
225 
226 /* ============================================================================
227  * Q4_K Scale Unpacking Utilities
228  *
229  * The scales[12] array packs 8 scales and 8 mins in 6-bit format.
230  * Unpacking is non-trivial due to the bit packing.
231  * ============================================================================ */
232 
/**
 * @brief Unpack Q4_K sub-block scales and mins
 *
 * @param scales The packed scales[12] array from block_q4_K
 * @param sc Output: 8 unpacked scale values (multiply by super-block d)
 * @param m Output: 8 unpacked min values (multiply by super-block dmin)
 *
 * This matches llama.cpp's get_scale_min_k4() function exactly.
 * The 12-byte scales array layout:
 * - bytes 0-3: 6-bit scales[0-3] (high 2 bits used for scales[4-7])
 * - bytes 4-7: 6-bit mins[0-3] (high 2 bits used for mins[4-7])
 * - bytes 8-11: low 4 bits for scales[4-7], high 4 bits for mins[4-7]
 */
static inline void unpack_q4_k_scales(const uint8_t *scales,
                                      uint8_t *sc, uint8_t *m) {
    for (int i = 0; i < 8; i++) {
        if (i < 4) {
            /* Entries 0-3: 6-bit values stored directly */
            sc[i] = (uint8_t)(scales[i] & 0x3F);
            m[i]  = (uint8_t)(scales[i + 4] & 0x3F);
        } else {
            /* Entries 4-7: low nibble from bytes 8-11; the two high bits
             * come from the top of bytes 0-3 (scales) / 4-7 (mins) */
            sc[i] = (uint8_t)((scales[i + 4] & 0x0F) | ((scales[i - 4] >> 6) << 4));
            m[i]  = (uint8_t)((scales[i + 4] >> 4) | ((scales[i] >> 6) << 4));
        }
    }
}
271 
/**
 * @brief Unpack Q5_K sub-block scales and mins
 *
 * @param scales The packed scales[12] array from block_q5_K
 * @param sc Output: 8 unpacked scale values (multiply by super-block d)
 * @param m Output: 8 unpacked min values (multiply by super-block dmin)
 *
 * Q5_K uses the same 6-bit packed format as Q4_K for scales/mins.
 * The 12-byte scales array layout is identical:
 * - bytes 0-3: 6-bit scales[0-3] (high 2 bits used for scales[4-7])
 * - bytes 4-7: 6-bit mins[0-3] (high 2 bits used for mins[4-7])
 * - bytes 8-11: low 4 bits for scales[4-7], high 4 bits for mins[4-7]
 */
static inline void unpack_q5_k_scales(const uint8_t *scales,
                                      uint8_t *sc, uint8_t *m) {
    int j;
    /* Entries 0-3: 6-bit values stored directly in bytes 0-3 / 4-7 */
    for (j = 0; j < 4; j++) {
        sc[j] = (uint8_t)(scales[j] & 0x3F);
        m[j]  = (uint8_t)(scales[j + 4] & 0x3F);
    }
    /* Entries 4-7: low nibble from bytes 8-11, high 2 bits from the top
     * of bytes 0-3 (scales) and 4-7 (mins) */
    for (j = 4; j < 8; j++) {
        sc[j] = (uint8_t)((scales[j + 4] & 0x0F) | ((scales[j - 4] >> 6) << 4));
        m[j]  = (uint8_t)((scales[j + 4] >> 4) | ((scales[j] >> 6) << 4));
    }
}
290 
291 /* ============================================================================
292  * FP16 Conversion Utilities
293  *
294  * Three variants:
295  * _soft - Pure C bit manipulation (always available, portable)
296  * _simd - F16C hardware instruction (vcvtph2ps/vcvtps2ph, Ivy Bridge+)
297  * (default) - Auto-dispatches to best available at compile time
298  * ============================================================================ */
299 
/**
 * @brief Convert FP16 (ck_half) to FP32 — software implementation
 *
 * Pure bit manipulation, no FP arithmetic. Handles all binary16 encodings:
 * signed zero, subnormal, normal, infinity, and NaN (payload preserved in
 * the widened mantissa). The conversion is exact — every FP16 value is
 * representable in FP32.
 */
static inline float ck_fp16_to_fp32_soft(ck_half h) {
    const uint32_t sgn = (uint32_t)(h & 0x8000) << 16;
    uint32_t e = (h >> 10) & 0x1F;
    uint32_t frac = h & 0x03FF;
    uint32_t bits;

    if (e == 31) {
        /* Inf (frac == 0) or NaN (frac != 0): FP32 exponent all-ones */
        bits = sgn | 0x7F800000 | (frac << 13);
    } else if (e != 0) {
        /* Normal: rebias exponent from 15 to 127 (127 - 15 = 112) */
        bits = sgn | ((e + 112) << 23) | (frac << 13);
    } else if (frac == 0) {
        /* Signed zero */
        bits = sgn;
    } else {
        /* Subnormal in FP16 is normal in FP32: renormalize the fraction.
         * frac <= 0x3FF here, so the loop shifts at least once. */
        e = 1;
        do {
            frac <<= 1;
            e--;
        } while ((frac & 0x400) == 0);
        bits = sgn | ((e + 112) << 23) | ((frac & 0x3FF) << 13);
    }

    union { uint32_t u; float f; } pun;
    pun.u = bits;
    return pun.f;
}
333 
/**
 * @brief Convert FP32 to FP16 (ck_half) — software implementation
 *
 * Fix: the previous version folded NaN inputs into the generic overflow
 * branch and returned +/-Infinity (0x7C00), silently destroying NaN-ness.
 * NaN inputs now stay NaN, with the quiet bit forced and the top 10 payload
 * bits preserved.
 *
 * Rounding of finite values is truncation (round toward zero), as before;
 * the F16C hardware path rounds to nearest even, so the two paths may
 * differ by 1 ulp. Out-of-range finite values overflow to infinity and
 * values below the subnormal range underflow to signed zero.
 */
static inline ck_half ck_fp32_to_fp16_soft(float f) {
    union { uint32_t u; float f; } u;
    u.f = f;

    uint32_t sign = (u.u >> 16) & 0x8000;
    uint32_t f_exp = (u.u >> 23) & 0xFF;
    uint32_t mant = (u.u >> 13) & 0x3FF; /* top 10 FP32 mantissa bits */

    if (f_exp == 0xFF) {
        if ((u.u & 0x007FFFFF) != 0) {
            /* NaN: force quiet bit (0x200) so the result is NaN even when
             * the truncated payload bits are all zero */
            return (ck_half)(sign | 0x7E00 | mant);
        }
        return (ck_half)(sign | 0x7C00); /* infinity */
    }

    int32_t exp = (int32_t)f_exp - 127 + 15;

    if (exp <= 0) {
        if (exp < -10) {
            return (ck_half)sign; /* too small even for a subnormal: +/-0 */
        }
        /* Subnormal: restore the implicit leading 1, shift into place */
        mant = (mant | 0x400) >> (1 - exp);
        return (ck_half)(sign | mant);
    }
    if (exp >= 31) {
        return (ck_half)(sign | 0x7C00); /* finite overflow -> infinity */
    }

    return (ck_half)(sign | ((uint32_t)exp << 10) | mant);
}
357 
/* --------------------------------------------------------------------------
 * F16C hardware conversion (requires Intel Ivy Bridge+ or AMD Piledriver+)
 * The scalar intrinsics _cvtsh_ss / _cvtss_sh compile to the F16C
 * instructions vcvtph2ps / vcvtps2ph operating on a single lane.
 * (Note: vcvtsh2ss/vcvtss2sh are AVX512-FP16 instructions, not F16C.)
 * -------------------------------------------------------------------------- */
#if defined(__F16C__)
#include <immintrin.h>

/**
 * @brief Convert FP16 to FP32 — F16C hardware (vcvtph2ps)
 */
static inline float ck_fp16_to_fp32_simd(ck_half h) {
    return _cvtsh_ss(h);
}

/**
 * @brief Convert FP32 to FP16 — F16C hardware (vcvtps2ph),
 *        round-to-nearest-even
 */
static inline ck_half ck_fp32_to_fp16_simd(float f) {
    return (ck_half)_cvtss_sh(f, _MM_FROUND_TO_NEAREST_INT);
}
#endif /* __F16C__ */
379 
/* --------------------------------------------------------------------------
 * Default dispatch: selects hardware SIMD when available, else software.
 * Dispatch is at compile time via __F16C__ (set by -mf16c / -march=native).
 * NOTE(review): the two fp32->fp16 paths round differently — the soft
 * encoder truncates while F16C rounds to nearest even — so results can
 * differ by 1 ulp depending on build flags; confirm this is acceptable.
 * -------------------------------------------------------------------------- */
static inline float ck_fp16_to_fp32(ck_half h) {
#if defined(__F16C__)
    return ck_fp16_to_fp32_simd(h);
#else
    return ck_fp16_to_fp32_soft(h);
#endif
}

static inline ck_half ck_fp32_to_fp16(float f) {
#if defined(__F16C__)
    return ck_fp32_to_fp16_simd(f);
#else
    return ck_fp32_to_fp16_soft(f);
#endif
}
398 
/* Convenience macros */
#define CK_FP16_TO_FP32(x) ck_fp16_to_fp32(x)
#define CK_FP32_TO_FP16(x) ck_fp32_to_fp16(x)
/* NOTE: the _SIMD macros are only usable when compiled with __F16C__
 * defined (e.g. -mf16c); otherwise the underlying functions do not exist
 * and their use is a compile error. Prefer the auto-dispatch macros above. */
#define CK_FP16_TO_FP32_SIMD(x) ck_fp16_to_fp32_simd(x)
#define CK_FP32_TO_FP16_SIMD(x) ck_fp32_to_fp16_simd(x)
#define CK_FP16_TO_FP32_SOFT(x) ck_fp16_to_fp32_soft(x)
#define CK_FP32_TO_FP16_SOFT(x) ck_fp32_to_fp16_soft(x)

/* Legacy compatibility (for files that used the old GGML-style names) */
#define ggml_fp16_to_fp32 ck_fp16_to_fp32
#define ggml_fp32_to_fp16 ck_fp32_to_fp16
#define GGML_FP16_TO_FP32 CK_FP16_TO_FP32
#define GGML_FP32_TO_FP16 CK_FP32_TO_FP16
413 
/* ============================================================================
 * SSE Optimized Kernels (implemented in the corresponding .c files)
 *
 * NOTE(review): parameter conventions below — C = A * B^T (+ bias) with
 * A [M x K] activations and quantized weights B — are inferred from the
 * gemm_nt_* naming; confirm against the kernel implementations.
 * ============================================================================ */

/* FP32 activations x Q5_0 weights GEMM (SSE, v2) */
void gemm_nt_q5_0_sse_v2(const float *A, const void *B, const float *bias, float *C, int M, int N, int K);
/* FP32 activations x Q6_K weights GEMM (SSE) */
void gemm_nt_q6_k_sse(const float *A, const void *B, const float *bias, float *C, int M, int N, int K);
/* Scalar reference counterpart of gemm_nt_q6_k_sse */
void gemm_nt_q6_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K);
/* GEMV: Q4_K weight matrix x Q8_K-quantized activation vector (SSE) */
void gemv_q4_k_q8_k_sse(float *y, const void *W, const void *x_q8, int M, int K);
/* Quantize k FP32 values into block_q8_K blocks (SSE) */
void quantize_row_q8_k_sse(const float *x, void *vy, int k);
/* RMSNorm fused with Q8_K quantization of the normalized output */
void rmsnorm_q8_k_fused(const float *input, const float *gamma, void *vy, int tokens, int d_model, int aligned_embed_dim, float eps);

/* Q5_K kernels (reference implementation) */
void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K);
void gemm_nt_q5_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K);
/* Q5_K entry points (non-_ref variants) */
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K);
void gemm_nt_q5_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K);

/* INT8 activation batch GEMM kernels (Q5_0 weights x Q8_0 activations) */
void gemm_nt_q5_0_q8_0(const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K);
/* AVX-unrolled variant of gemm_nt_q5_0_q8_0 */
void gemm_nt_q5_0_q8_0_unroll_avx(const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K);
/* Dot product of n elements, result written to *s */
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy);
void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy);
/* Quantize k FP32 values into block_q8_0 blocks (scalar reference) */
void quantize_row_q8_0(const float *x, void *vy, int k);
437 
438 #ifdef __cplusplus
439 }
440 #endif
441 
442 #endif /* CKERNEL_QUANT_H */
@ CK_DT_Q5_0
Definition: ckernel_dtype.h:44
@ CK_DT_Q5_K
Definition: ckernel_dtype.h:46
@ CK_DT_Q6_K
Definition: ckernel_dtype.h:41
@ CK_DT_Q4_1
Definition: ckernel_dtype.h:39
@ CK_DT_Q5_1
Definition: ckernel_dtype.h:45
void gemm_nt_q5_0_sse_v2(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
#define QK5_0
Definition: ckernel_quant.h:67
void gemv_q5_k_ref(float *y, const void *W, const float *x, int M, int K)
static float ck_fp16_to_fp32_soft(ck_half h)
Convert FP16 (ck_half) to FP32 — software implementation.
#define K_SCALE_SIZE
uint16_t ck_half
Definition: ckernel_quant.h:26
void gemm_nt_q6_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void gemv_q5_k(float *y, const void *W, const float *x, int M, int K)
#define QK5_1
Definition: ckernel_quant.h:84
void gemm_nt_q6_k_sse(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void vec_dot_q5_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q5_0 x Q8_0.
static size_t ck_quant_type_size(int type)
Get the byte size per block for a quant type.
void rmsnorm_q8_k_fused(const float *input, const float *gamma, void *vy, int tokens, int d_model, int aligned_embed_dim, float eps)
#define QK4_0
Definition: ckernel_quant.h:35
void gemm_nt_q5_k_ref(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
static ck_half ck_fp32_to_fp16(float f)
#define QK4_1
Definition: ckernel_quant.h:50
void gemm_nt_q5_0_q8_0(const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)
Batch GEMM with Q5_0 weights and Q8_0 activations for prefill.
ck_half ggml_half
void gemm_nt_q5_0_q8_0_unroll_avx(const void *A_q8, const void *B_q5, const float *bias, float *C, int M, int N, int K)
static float ck_fp16_to_fp32(ck_half h)
static void unpack_q5_k_scales(const uint8_t *scales, uint8_t *sc, uint8_t *m)
Unpack Q5_K sub-block scales and mins.
void quantize_row_q8_k_sse(const float *x, void *vy, int k)
static size_t ck_quant_block_size(int type)
Get the block size (number of weights per block) for a quant type.
static void unpack_q4_k_scales(const uint8_t *scales, uint8_t *sc, uint8_t *m)
Unpack Q4_K sub-block scales and mins.
void quantize_row_q8_0(const float *x, void *vy, int k)
Quantize FP32 to Q8_0 format (scalar reference)
static size_t ck_quant_row_size(int type, int64_t n_elements)
Calculate total bytes needed for n_elements with given quant type.
void vec_dot_q8_0_q8_0(int n, float *s, const void *vx, const void *vy)
Auto-dispatch quantized dot product Q8_0 x Q8_0.
static ck_half ck_fp32_to_fp16_soft(float f)
Convert FP32 to FP16 (ck_half) — software implementation.
#define QK8_0
void gemm_nt_q5_k(const float *A, const void *B, const float *bias, float *C, int M, int N, int K)
void gemv_q4_k_q8_k_sse(float *y, const void *W, const void *x_q8, int M, int K)
#define QK_K
#define C(color)
Definition: show_config.c:39
ck_half d
Definition: ckernel_quant.h:38
ck_half m
Definition: ckernel_quant.h:54
ck_half d
Definition: ckernel_quant.h:53
ck_half dmin
ck_half d
Definition: ckernel_quant.h:70
ck_half m
Definition: ckernel_quant.h:88
ck_half d
Definition: ckernel_quant.h:87
ck_half dmin