/* Model hyperparameters — values match Qwen2-0.5B geometry:
 * 14 heads * 64 head_dim == 896 == embed dim; 2 KV heads => GQA group of 7. */
24 #define MODEL_EMBED_DIM 896
25 #define MODEL_NUM_LAYERS 24
26 #define MODEL_NUM_HEADS 14
27 #define MODEL_NUM_KV_HEADS 2
28 #define MODEL_HEAD_DIM 64
/* Fused gate+up MLP width (see 2 * intermediate usage in v6_prefill). */
29 #define MODEL_INTERMEDIATE_SIZE 4864
/* NOTE(review): 128256 is Llama-3's vocabulary size; Qwen2-0.5B (which main()
 * claims this model is) uses 151936 — confirm against the weights file. */
30 #define MODEL_VOCAB_SIZE 128256
31 #define MODEL_MAX_SEQ_LEN 32768
/* NOTE(review): the three constants below duplicate MODEL_EMBED_DIM,
 * MODEL_INTERMEDIATE_SIZE and MODEL_MAX_SEQ_LEN respectively — consider
 * deriving them instead of re-stating the literals. */
34 #define ALIGN_EMBED 896
36 #define MODEL_INTERMEDIATE 4864
37 #define ALIGN_CONTEXT 32768
40 static void simple_rmsnorm(
const float *input,
const float *gamma,
float *output,
41 int tokens,
int d_model,
float eps) {
42 for (
int t = 0; t < tokens; t++) {
43 const float *in_row = input + t * d_model;
44 float *out_row = output + t * d_model;
47 float variance = 0.0f;
48 for (
int i = 0; i < d_model; i++) {
49 variance += in_row[i] * in_row[i];
54 float scale = 1.0f / sqrtf(variance + eps);
55 for (
int i = 0; i < d_model; i++) {
56 out_row[i] = in_row[i] * gamma[i] * scale;
64 for (
int i = 1; i < n; i++) {
65 if (x[i] > max_val) max_val = x[i];
69 for (
int i = 0; i < n; i++) {
70 x[i] = expf(x[i] - max_val);
74 for (
int i = 0; i < n; i++) {
81 float *output,
int num_heads,
int num_kv_heads,
82 int seq_len,
int head_dim) {
83 int hidden_dim = num_heads * head_dim;
86 for (
int h = 0; h < num_heads; h++) {
87 const float *q_head = q + h * head_dim;
88 const float *k_head = k;
89 const float *v_head = v;
92 int repeat = num_heads / num_kv_heads;
94 float *out_head = output + h * head_dim;
98 for (
int t = 0; t < seq_len; t++) {
100 for (
int d = 0; d < head_dim; d++) {
101 score += q_head[d] * k_head[t * head_dim + d];
103 scores[t] =
score / sqrtf((
float)head_dim);
107 for (
int t = 0; t < seq_len; t++) {
108 if (t >= seq_len - 1) {
109 scores[t] = scores[t];
119 for (
int d = 0; d < head_dim; d++) {
121 for (
int t = 0; t < seq_len; t++) {
122 sum += scores[t] * v_head[t * head_dim + d];
/*
 * Token embedding lookup: for each of `num_tokens` token ids, copy the
 * matching row of the [vocab_size x embed_dim] embedding table `weight`
 * into `output` (row t gets the embedding of tokens[t]).
 *
 * Out-of-range ids (negative or >= vocab_size) produce a zero row
 * instead of reading out of bounds — the memcpy/memset split in the
 * original fragment implied this branch; the condition line itself was
 * missing and is restored here explicitly.
 */
static void simple_embedding(const int32_t *tokens, int num_tokens,
                             const float *weight, float *output,
                             int vocab_size, int embed_dim) {
    for (int t = 0; t < num_tokens; t++) {
        int token_id = tokens[t];
        float *dst = output + (size_t)t * embed_dim;
        if (token_id >= 0 && token_id < vocab_size) {
            memcpy(dst, weight + (size_t)token_id * embed_dim,
                   (size_t)embed_dim * sizeof(float));
        } else {
            /* invalid id: zero the row rather than read garbage */
            memset(dst, 0, (size_t)embed_dim * sizeof(float));
        }
    }
}
/*
 * Matrix multiply with a transposed right operand:
 *   output[rows x cols] = input[rows x common] * weight[cols x common]^T
 * i.e. output[r][c] = dot(input row r, weight row c).
 * Naive triple loop; both operands are read row-contiguously.
 */
static void gemm_nt(const float *input, const float *weight, float *output,
                    int rows, int cols, int common) {
    for (int r = 0; r < rows; r++) {
        const float *in_row = input + (size_t)r * common;
        float *out_row = output + (size_t)r * cols;
        for (int c = 0; c < cols; c++) {
            const float *w_row = weight + (size_t)c * common;
            float acc = 0.0f;
            for (int i = 0; i < common; i++) {
                acc += in_row[i] * w_row[i];
            }
            out_row[c] = acc;
        }
    }
}
159 static void silu(
float *x,
int n) {
160 for (
int i = 0; i < n; i++) {
161 x[i] = x[i] / (1.0f + expf(-x[i]));
/*
 * Residual connection: residual[i] += addend[i] for i in [0, n).
 */
static void residual_add(float *residual, float *addend, int n) {
    int i = 0;
    while (i < n) {
        residual[i] = residual[i] + addend[i];
        i++;
    }
}
/* Rotary position embedding over x (in place, presumably per (position,
 * head_dim-pair)).
 * NOTE(review): only the signature is visible in this chunk — the body
 * (original lines ~174-180) is not shown, and nothing in the visible
 * v6_prefill path calls apply_rope, so q/k may never be rotated.
 * TODO confirm against the full file. */
173 static void apply_rope(
float *x,
int seq_len,
int head_dim) {
/* Prefill pass: embed `num_tokens` tokens and run them through all
 * transformer layers, producing final hidden states.
 * NOTE(review): this chunk is missing many original lines (signature tail,
 * weight loading, attention call, residual adds, frees); the comments
 * below flag issues visible in what IS shown. */
181 void v6_prefill(
const float *embed_weight,
const int32_t *tokens,
int num_tokens,
/* Guard against null inputs and empty prompts. */
183 if (!embed_weight || !tokens || num_tokens <= 0)
return;
/* Per-token hidden states: (num_layers + 1) slots of embed_dim each. */
194 float *hidden = malloc(num_tokens * (num_layers + 1) * embed_dim *
sizeof(
float));
196 fprintf(stderr,
"Failed to allocate hidden states\n");
201 float *q = malloc(num_heads * head_dim *
sizeof(
float));
202 float *k = malloc(num_kv_heads * head_dim *
sizeof(
float));
203 float *v = malloc(num_kv_heads * head_dim *
sizeof(
float));
204 float *attn = malloc(num_heads * head_dim *
sizeof(
float));
/* NOTE(review): mlp holds `intermediate` floats, but the gemm at original
 * line 262 writes 2 * intermediate outputs into it — heap overflow unless
 * a missing line resizes it. TODO confirm. */
205 float *mlp = malloc(intermediate *
sizeof(
float));
207 if (!q || !k || !v || !attn || !mlp) {
208 fprintf(stderr,
"Failed to allocate temp buffers\n");
/* NOTE(review): all weight pointers are NULL here and no assignment is
 * visible before they are passed to simple_rmsnorm/gemm_nt below — as
 * shown this would dereference NULL. Presumably weight loading lives in
 * the missing lines 209-218/223-224; verify. */
219 const float *ln1_gamma = NULL;
220 const float *ln2_gamma = NULL;
221 const float *wq = NULL, *wk = NULL, *wv = NULL, *wo = NULL;
222 const float *w1 = NULL, *w2 = NULL;
/* NOTE(review): data race — q/k/v/attn/mlp are single shared scratch
 * buffers, but every OMP thread writes them inside this parallel loop.
 * They must be allocated per thread (or made private). */
225 #pragma omp parallel for schedule(dynamic, 1)
226 for (
int t = 0; t < num_tokens; t++) {
227 float *h = hidden + t * (num_layers + 1) * embed_dim;
233 for (
int layer = 0; layer < num_layers; layer++) {
/* NOTE(review): layer_out is always h + embed_dim regardless of `layer`,
 * and no advance of h/layer_in is visible — every layer appears to reuse
 * the same two slots. Residual adds are also absent from view. */
235 float *layer_out = h + embed_dim;
/* In-place normalize destroys layer_in's pre-norm value, so the residual
 * stream cannot be re-added afterwards — verify against missing lines. */
238 simple_rmsnorm(layer_in, ln1_gamma, layer_in, 1, embed_dim, 1e-6f);
241 gemm_nt(layer_in, wq, q, 1, num_heads * head_dim, embed_dim);
242 gemm_nt(layer_in, wk, k, 1, num_kv_heads * head_dim, embed_dim);
243 gemm_nt(layer_in, wv, v, 1, num_kv_heads * head_dim, embed_dim);
/* (attention call for this token is in the missing lines 244-252) */
253 gemm_nt(attn, wo, layer_out, 1, embed_dim, num_heads * head_dim);
259 simple_rmsnorm(layer_in, ln2_gamma, layer_in, 1, embed_dim, 1e-6f);
/* Fused gate+up projection (2 * intermediate outputs) — see mlp-size
 * note above; also no gate*up elementwise product is visible before the
 * down projection below, which reads only the first half. */
262 gemm_nt(layer_in, w1, mlp, 1, 2 * intermediate, embed_dim);
263 silu(mlp, 2 * intermediate);
264 gemm_nt(mlp, w2, layer_out, 1, embed_dim, intermediate);
271 memcpy(hidden + t * (num_layers + 1) * embed_dim +
272 num_layers * embed_dim, h, embed_dim *
sizeof(
float));
/* NOTE(review): no free() for hidden/final_out/q/k/v/attn/mlp is visible
 * in this chunk — leaks unless released in the missing tail. */
276 float *final_out = malloc(num_tokens * embed_dim *
sizeof(
float));
/* NOTE(review): rows of `hidden` are strided by (num_layers+1)*embed_dim,
 * but this call treats them as contiguous embed_dim rows — for t > 0 it
 * normalizes the wrong data. Needs a gather or per-token calls. */
278 simple_rmsnorm(hidden + num_layers * embed_dim, ln1_gamma, final_out,
279 num_tokens, embed_dim, 1e-6f);
/* CLI entry point: parse <weights.bin> plus -p/-t/-h options and print
 * placeholder output (no real inference in this version).
 * NOTE(review): several lines are missing from this chunk — the argc < 2
 * usage check guarding argv[1], the max_tokens declaration/default, and
 * the `prompt = argv[++i]` assignment in the -p branch. Verify they exist
 * in the full file; as shown, argv[1] is read unconditionally. */
295 int main(
int argc,
char **argv) {
296 printf(
"=== V6 Simple CLI ===\n");
297 printf(
"Generic kernel implementation\n");
298 printf(
"OMP parallelization for prefill\n\n");
301 printf(
"Usage: %s <weights.bin> [options]\n", argv[0]);
302 printf(
"\nOptions:\n");
303 printf(
" -p, --prompt <text> Input prompt\n");
304 printf(
" -t, --tokens <n> Max tokens (default: 50)\n");
305 printf(
" -h, --help Show help\n");
/* NOTE(review): weights_path is never used in the visible code. */
309 const char *weights_path = argv[1];
310 const char *prompt =
"Hello";
/* Option parsing: flags may consume the following argv entry; reading
 * argv[++i] without checking i + 1 < argc can walk past argv — confirm
 * whether the missing lines guard this. */
313 for (
int i = 2; i < argc; i++) {
314 if (strcmp(argv[i],
"-p") == 0 || strcmp(argv[i],
"--prompt") == 0) {
316 }
else if (strcmp(argv[i],
"-t") == 0 || strcmp(argv[i],
"--tokens") == 0) {
/* atoi gives 0 on garbage with no error signal — strtol would be safer. */
317 max_tokens = atoi(argv[++i]);
318 }
else if (strcmp(argv[i],
"-h") == 0 || strcmp(argv[i],
"--help") == 0) {
319 printf(
"Usage: %s <weights.bin> [options]\n", argv[0]);
324 printf(
"Model: Qwen2 0.5B (generic kernels)\n");
325 printf(
"Prompt: %s\n", prompt);
326 printf(
"Max tokens: %d\n", max_tokens);
327 printf(
"\n[Note: This is a simplified v6 implementation using generic kernels]\n");
328 printf(
"[Real weights loading and inference would require full implementation]\n");
331 printf(
"\nAssistant: (v6 placeholder - full implementation pending)\n");
static void residual_add(float *residual, float *addend, int n)
static void simple_embedding(const int32_t *tokens, int num_tokens, const float *weight, float *output, int vocab_size, int embed_dim)
#define MODEL_INTERMEDIATE
int main(int argc, char **argv)
static void simple_attention(const float *q, const float *k, const float *v, float *output, int num_heads, int num_kv_heads, int seq_len, int head_dim)
static void silu(float *x, int n)
static void gemm_nt(const float *input, const float *weight, float *output, int rows, int cols, int common)
void v6_prefill(const float *embed_weight, const int32_t *tokens, int num_tokens, float *logits)
#define MODEL_NUM_KV_HEADS
static void softmax(float *x, int n)
static void simple_rmsnorm(const float *input, const float *gamma, float *output, int tokens, int d_model, float eps)
#define MODEL_MAX_SEQ_LEN
static void apply_rope(float *x, int seq_len, int head_dim)