183 if (!embed_weight || !tokens || num_tokens <= 0)
return;
194 float *hidden = malloc(num_tokens * (num_layers + 1) * embed_dim *
sizeof(
float));
196 fprintf(stderr,
"Failed to allocate hidden states\n");
201 float *q = malloc(num_heads * head_dim *
sizeof(
float));
202 float *k = malloc(num_kv_heads * head_dim *
sizeof(
float));
203 float *v = malloc(num_kv_heads * head_dim *
sizeof(
float));
204 float *attn = malloc(num_heads * head_dim *
sizeof(
float));
205 float *mlp = malloc(intermediate *
sizeof(
float));
207 if (!q || !k || !v || !attn || !mlp) {
208 fprintf(stderr,
"Failed to allocate temp buffers\n");
219 const float *ln1_gamma = NULL;
220 const float *ln2_gamma = NULL;
221 const float *wq = NULL, *wk = NULL, *wv = NULL, *wo = NULL;
222 const float *w1 = NULL, *w2 = NULL;
225 #pragma omp parallel for schedule(dynamic, 1)
226 for (
int t = 0; t < num_tokens; t++) {
227 float *h = hidden + t * (num_layers + 1) * embed_dim;
233 for (
int layer = 0; layer < num_layers; layer++) {
235 float *layer_out = h + embed_dim;
238 simple_rmsnorm(layer_in, ln1_gamma, layer_in, 1, embed_dim, 1e-6f);
241 gemm_nt(layer_in, wq, q, 1, num_heads * head_dim, embed_dim);
242 gemm_nt(layer_in, wk, k, 1, num_kv_heads * head_dim, embed_dim);
243 gemm_nt(layer_in, wv, v, 1, num_kv_heads * head_dim, embed_dim);
253 gemm_nt(attn, wo, layer_out, 1, embed_dim, num_heads * head_dim);
259 simple_rmsnorm(layer_in, ln2_gamma, layer_in, 1, embed_dim, 1e-6f);
262 gemm_nt(layer_in, w1, mlp, 1, 2 * intermediate, embed_dim);
263 silu(mlp, 2 * intermediate);
264 gemm_nt(mlp, w2, layer_out, 1, embed_dim, intermediate);
271 memcpy(hidden + t * (num_layers + 1) * embed_dim +
272 num_layers * embed_dim, h, embed_dim *
sizeof(
float));
276 float *final_out = malloc(num_tokens * embed_dim *
sizeof(
float));
278 simple_rmsnorm(hidden + num_layers * embed_dim, ln1_gamma, final_out,
279 num_tokens, embed_dim, 1e-6f);
/* NOTE(review): the lines below are bodiless function signatures and
 * value-less macros with no terminating semicolons — as written they are
 * not valid C. This looks like an extraction artifact (a symbol index for
 * the truncated forward-pass function above) rather than real declarations;
 * confirm against the original file before editing. */
/* presumably: element-wise residual[i] += addend[i] over n floats — TODO confirm */
static void residual_add(float *residual, float *addend, int n)
/* presumably: gathers one embed_dim row of weight per token id into output — TODO confirm */
static void simple_embedding(const int32_t *tokens, int num_tokens, const float *weight, float *output, int vocab_size, int embed_dim)
/* value-less here; the full file presumably defines the MLP intermediate size — TODO confirm */
#define MODEL_INTERMEDIATE
/* presumably: scaled dot-product attention; num_kv_heads != num_heads suggests grouped-query attention — TODO confirm */
static void simple_attention(const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int seq_len, int head_dim)
/* presumably: in-place SiLU activation over n elements (called on the MLP buffer at original line 263) — TODO confirm */
static void silu(float *x, int n)
/* presumably: output = input * weight^T, i.e. GEMM with transposed weight (rows x common times cols x common) — TODO confirm */
static void gemm_nt(const float *input, const float *weight, float *output, int rows, int cols, int common)
/* value-less here; presumably the key/value head count for GQA — TODO confirm */
#define MODEL_NUM_KV_HEADS
/* presumably: RMS normalization of (tokens x d_model) with per-channel gamma and epsilon eps — TODO confirm */
static void simple_rmsnorm(const float *input, const float *gamma, float *output, int tokens, int d_model, float eps)
/* presumably: applies rotary position embeddings in place over seq_len positions — TODO confirm */
static void apply_rope(float *x, int seq_len, int head_dim)