/* Version string; printed by the startup banner as "v<version>". */
24 #define CK_VERSION "6.0.0"
/*
 * ANSI escape sequences (ESC [ ... m) for terminal text styling:
 * attribute reset, bold, dim, and green / yellow / cyan foreground.
 */
27 #define ANSI_RESET "\033[0m"
28 #define ANSI_BOLD "\033[1m"
29 #define ANSI_DIM "\033[2m"
30 #define ANSI_GREEN "\033[0;32m"
31 #define ANSI_YELLOW "\033[0;33m"
32 #define ANSI_CYAN "\033[0;36m"
35 #include "ck-kernel-inference.h"
36 #include "ck-kernel-prefill.h"
44 size_t runtime_offset;
/*
 * Running-maximum scan over probs[]: the best value is seeded from
 * probs[start].
 * NOTE(review): the enclosing function header is not visible in this view;
 * this is presumably the body of sample_topk - confirm.
 */
51 float best_val = probs[
start];
/* Strict comparison: on a tie the earlier best value is kept. */
54 if (probs[i] > best_val) {
/*
 * load_manifest - parse the weights manifest at `path` into a heap array.
 *
 * What the visible code does: opens the file in text mode, seeks to the end
 * and back to the start, reads the file into a malloc-allocated buffer of
 * size + 1 bytes, then walks the text twice with strstr looking for the
 * literal key "name": - the second walk fills one ManifestEntry per match
 * (name, dtype, file_offset, size, runtime_offset). This is plain substring
 * scanning, not a JSON parser.
 *
 * path:        manifest file to read.
 * entries:     out; receives a malloc-allocated array of `count` entries.
 * num_entries: out; presumably receives the entry count (the assignment is
 *              not visible in this view - confirm).
 *
 * The caller (load_weights) treats any non-zero return as failure.
 */
63 static int load_manifest(
const char *path, ManifestEntry **entries,
int *num_entries) {
64 FILE *f = fopen(path,
"r");
66 fprintf(stderr,
"Failed to open manifest: %s\n", path);
/* Seek to the end and rewind; `size` is set on a line not shown here. */
71 fseek(f, 0, SEEK_END);
73 fseek(f, 0, SEEK_SET);
/*
 * One extra byte is allocated beyond `size` (room for a terminator).
 * NOTE(review): no NULL check on malloc and no check of the fread return
 * value is visible here - confirm they are handled.
 */
75 char *content = malloc(size + 1);
76 fread(content, 1, size, f);
/* First walk over every "name": key (presumably counting entries). */
83 while ((p = strstr(p,
"\"name\":")) != NULL) {
/* NOTE(review): count * sizeof(...) is not visibly checked for NULL. */
88 *entries = malloc(count *
sizeof(ManifestEntry));
/* Second walk: fill at most `count` entries. */
94 while ((p = strstr(p,
"\"name\":")) != NULL && idx < count) {
/*
 * NOTE(review): p points at the opening quote of the key itself, so this
 * strchr matches that same quote and `start` lands on the key text rather
 * than on the value, unless p is advanced past the key somewhere not
 * shown - verify. A NULL result from strchr would also be incremented.
 */
95 char *
start = strchr(p,
'"') + 1;
/* Clamp the length so the copy plus terminator fits the name field. */
98 if (len >=
sizeof((*entries)[idx].name)) len =
sizeof((*entries)[idx].name) - 1;
99 strncpy((*entries)[idx].name,
start, len);
/* strncpy does not terminate on truncation; terminate explicitly. */
100 (*entries)[idx].name[len] =
'\0';
/*
 * The remaining keys are searched forward from p with no upper bound, so
 * NOTE(review): a key missing from this entry could match the one that
 * belongs to a later entry - confirm the manifest always has every key.
 */
103 char *dtype_p = strstr(p,
"\"dtype\":");
/* NOTE(review): same opening-quote concern as for the name lookup. */
105 char *d_start = strchr(dtype_p,
'"') + 1;
106 char *d_end = strchr(d_start,
'"');
107 size_t d_len = d_end - d_start;
/* Clamp and terminate exactly as for the name field. */
108 if (d_len >=
sizeof((*entries)[idx].dtype)) d_len =
sizeof((*entries)[idx].dtype) - 1;
109 strncpy((*entries)[idx].dtype, d_start, d_len);
110 (*entries)[idx].dtype[d_len] =
'\0';
114 char *fo_p = strstr(p,
"\"file_offset\":");
/* + 14 skips the 14 characters of the quoted key and its colon. */
/* NOTE(review): the sscanf return value is discarded here and below. */
116 sscanf(fo_p + 14,
"%zu", &(*entries)[idx].file_offset);
120 char *size_p = strstr(p,
"\"size\":");
/* + 7 skips the quoted key and its colon. */
122 sscanf(size_p + 7,
"%zu", &(*entries)[idx].size);
126 char *ro_p = strstr(p,
"\"runtime_offset\":");
/* + 17 skips the quoted key and its colon. */
128 sscanf(ro_p + 17,
"%zu", &(*entries)[idx].runtime_offset);
/*
 * load_weights - copy weight data from the BUMP file into model memory.
 *
 * What the visible code does: opens bump_path read-only, measures it with
 * lseek, loads the manifest through load_manifest, maps the whole file with
 * a private read-only mmap, and for every manifest entry copies from
 * (file base + file_offset) to (model->base + runtime_offset). The mapping
 * is released with munmap afterwards.
 *
 * model:         destination; model->base and model->total_bytes are read.
 * bump_path:     weights file to map.
 * manifest_path: manifest describing where each entry lives.
 *
 * The caller (run_inference) treats any non-zero return as failure.
 */
140 static int load_weights(QWEN2_DECODEModel *model,
const char *bump_path,
141 const char *manifest_path) {
142 printf(
"[INFO] Loading weights from: %s\n", bump_path);
145 int fd = open(bump_path, O_RDONLY);
147 fprintf(stderr,
"Failed to open BUMP file: %s\n", bump_path);
/*
 * File size is taken from the end-of-file offset, then the descriptor is
 * rewound. NOTE(review): no check for an lseek result of -1 is visible.
 */
152 off_t file_size = lseek(fd, 0, SEEK_END);
153 lseek(fd, 0, SEEK_SET);
154 printf(
"[INFO] BUMP file size: %ld bytes\n", (
long)file_size);
157 ManifestEntry *entries = NULL;
159 if (
load_manifest(manifest_path, &entries, &num_entries) != 0) {
163 printf(
"[INFO] Manifest entries: %d\n", num_entries);
/* Map the entire file read-only; entries are copied out of this mapping. */
166 void *bump_base = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
167 if (bump_base == MAP_FAILED) {
168 fprintf(stderr,
"Failed to mmap BUMP file\n");
176 for (
int i = 0; i < num_entries; i++) {
177 ManifestEntry *e = &entries[i];
/*
 * Destination bounds check against the model allocation.
 * NOTE(review): the sum can wrap if the manifest holds huge values, and no
 * matching check of file_offset + size against file_size is visible for
 * the source side of the copy - confirm.
 */
179 if (e->runtime_offset + e->size > model->total_bytes) {
180 fprintf(stderr,
"Warning: Entry %s exceeds model memory\n", e->name);
/* Copy this entry from the mapped file into the model buffer. */
184 memcpy((
char *)model->base + e->runtime_offset,
185 (
char *)bump_base + e->file_offset,
189 munmap(bump_base, file_size);
192 printf(
"[INFO] Weights loaded successfully\n");
/*
 * run_inference (the first line of the parameter list is not visible here).
 *
 * Visible flow: load the tokenizer, allocate the model, load weights,
 * tokenize the prompt, run one forward pass over all prompt tokens, then
 * loop up to max_tokens times: decode one step, pick the next token with
 * sample_topk, print its text and append it to a fixed-size output buffer.
 * The model is freed on the visible failure paths and at the end.
 *
 * NOTE(review): `temperature` is not referenced on any visible line; the
 * sample_topk call receives only the logits, vocabulary size and topk -
 * confirm whether temperature is meant to be applied.
 */
198 const char *tokenizer_path,
const char *prompt,
199 int max_tokens,
float temperature,
int topk) {
201 printf(
"\n C-Kernel-Engine v6 Inference\n");
206 printf(
"[INFO] Loading tokenizer: %s\n", tokenizer_path);
209 fprintf(stderr,
"Failed to init tokenizer\n");
214 fprintf(stderr,
"Failed to load tokenizer: %s\n", tokenizer_path);
221 QWEN2_DECODEModel model;
222 printf(
"[INFO] Allocating model (%zu bytes)...\n", (
size_t)QWEN2_DECODE_TOTAL_BYTES);
224 if (qwen2_decode_model_allocate(&model) != 0) {
225 fprintf(stderr,
"Failed to allocate model\n");
229 printf(
"[INFO] Model allocated at %p\n", model.base);
/* On a weight-loading failure the model allocation is released. */
232 if (
load_weights(&model, bump_path, manifest_path) != 0) {
233 qwen2_decode_model_free(&model);
241 printf(
"[INFO] Tokenized prompt: %d tokens\n", num_tokens);
/* A zero or negative token count is treated as a tokenization failure. */
243 if (num_tokens <= 0) {
244 fprintf(stderr,
"Failed to tokenize prompt\n");
245 qwen2_decode_model_free(&model);
/* Debug dump: each token id with its text, or "?" when no text is found. */
251 printf(
"[INFO] Input tokens: ");
252 for (
int i = 0; i < num_tokens; i++) {
255 printf(
"%d(%s) ", tokens[i], tok_str);
257 printf(
"%d(?) ", tokens[i]);
/* Prefill: a single forward call over the whole prompt. */
263 printf(
"[INFO] Running prefill for %d tokens...\n", num_tokens);
264 qwen2_decode_forward(&model, tokens, num_tokens);
/* Logits are read in place from the model buffer at the footer offset. */
267 float *logits = (
float *)((
char *)model.base + QWEN2_DECODE_FOOTER.logits);
270 printf(
"\n[INFO] Generating %d tokens...\n", max_tokens);
/* Hard-coded end-of-sequence token id. */
273 int eos_token = 151645;
/* Generation is seeded with the last prompt token. */
274 int32_t
token = tokens[num_tokens - 1];
277 char output_buffer[4096];
279 output_buffer[0] =
'\0';
281 for (
int i = 0; i < max_tokens; i++) {
/*
 * NOTE(review): the third argument is the constant 0 on every iteration;
 * if it is a position or cache index, confirm that is intended.
 */
283 qwen2_decode_decode(&model, &
token, 0);
/* Re-derive the logits pointer after each decode step. */
286 logits = (
float *)((
char *)model.base + QWEN2_DECODE_FOOTER.logits);
289 int next_token =
sample_topk(logits, QWEN2_DECODE_VOCAB_SIZE, topk);
294 printf(
"%s", tok_str);
/* Append only when the text still fits; otherwise it is left out. */
298 size_t len = strlen(tok_str);
299 if (output_pos + len <
sizeof(output_buffer) - 1) {
300 strcpy(output_buffer + output_pos, tok_str);
/*
 * NOTE(review): the EOS test is an else-if, so it is evaluated only when
 * the preceding condition (not visible here) is false - confirm that an
 * EOS token cannot take the other branch and keep generating.
 */
303 }
else if (next_token == eos_token) {
305 printf(
"\n[INFO] EOS token received\n");
313 printf(
"\n\n[INFO] Inference complete\n");
316 qwen2_decode_model_free(&model);
/*
 * Startup banner: six printf calls emit the ASCII-art logo one row at a
 * time, followed by a version line built from CK_VERSION. Backslashes in
 * the art are doubled because they sit inside C string literals.
 * NOTE(review): the function header is not visible in this view; this is
 * presumably the body of print_banner - confirm.
 */
326 printf(
" ____ _ _ _ _ _ _ _ _____ _ _ \n");
327 printf(
" | _ \\| | | (_) | | (_) | | | |/ ___| | | |\n");
328 printf(
" | |_) | |_ _ ___| |_ __ _| |_ ___ _____| | | |\\ `--.| |__ __| |\n");
329 printf(
" | _ <| | | | |/ __| | |/ _` | __| \\ \\ / / _ \\ | | | |`--. \\ '_ \\ / _` |\n");
330 printf(
" | |_) | | |_| | (__| | | (_| | |_| |\\ V / __/ |_| |/\\__/ / | | | (_| |\n");
331 printf(
" |____/|_|\\__,_|\\___|_|_|\\__,_|\\__|_| \\_/ \\___|\\___|\\___/\\____/|_| |_|\\__,_|\n");
/* Version line. */
334 printf(
" v%s - Native C Inference\n",
CK_VERSION);
/*
 * main - command-line entry point.
 *
 * Parses options, fills in default manifest and tokenizer paths, checks
 * that the weights and tokenizer files exist, then hands everything to
 * run_inference and returns its result.
 *
 * Defaults visible here: prompt "Hello", max_tokens 100, temperature 0.7.
 * The declaration of topk is not visible; the help text states 40.
 */
338 int main(
int argc,
char **argv) {
339 const char *bump_path = NULL;
340 const char *manifest_path = NULL;
341 const char *tokenizer_path = NULL;
342 const char *prompt =
"Hello";
343 int max_tokens = 100;
344 float temperature = 0.7f;
/*
 * Option loop. Every value-taking option also requires i + 1 < argc, so an
 * option given as the last argument without a value does not match its
 * branch and falls through to whatever handling follows (not visible).
 */
348 for (
int i = 1; i < argc; i++) {
349 if (strcmp(argv[i],
"-h") == 0 || strcmp(argv[i],
"--help") == 0) {
351 printf(
"Usage: %s <weights.bump> [options]\n", argv[0]);
352 printf(
"\nOptions:\n");
353 printf(
" -m, --model <file> Weights BUMP file\n");
354 printf(
" -t, --tokenizer <file> Tokenizer JSON file\n");
355 printf(
" -p, --prompt <text> Prompt (default: Hello)\n");
/*
 * NOTE(review): the help text advertises a long form --tokens, but the
 * visible parsing branch below compares against -n only.
 */
356 printf(
" -n, --tokens <n> Max tokens (default: 100)\n");
357 printf(
" --temp <float> Temperature (default: 0.7)\n");
358 printf(
" --top-k <n> Top-k (default: 40)\n");
359 printf(
" -h, --help Show help\n");
362 else if ((strcmp(argv[i],
"-m") == 0 || strcmp(argv[i],
"--model") == 0) && i + 1 < argc) {
363 bump_path = argv[++i];
365 else if ((strcmp(argv[i],
"-t") == 0 || strcmp(argv[i],
"--tokenizer") == 0) && i + 1 < argc) {
366 tokenizer_path = argv[++i];
368 else if ((strcmp(argv[i],
"-p") == 0 || strcmp(argv[i],
"--prompt") == 0) && i + 1 < argc) {
/*
 * NOTE(review): atoi and atof report no errors; non-numeric input silently
 * becomes 0. Negative or zero values are not rejected on any visible line.
 */
371 else if (strcmp(argv[i],
"-n") == 0 && i + 1 < argc) {
372 max_tokens = atoi(argv[++i]);
374 else if (strcmp(argv[i],
"--temp") == 0 && i + 1 < argc) {
375 temperature = atof(argv[++i]);
377 else if (strcmp(argv[i],
"--top-k") == 0 && i + 1 < argc) {
378 topk = atoi(argv[++i]);
/* A weights file is mandatory; report and show usage when it is missing. */
384 fprintf(stderr,
"Error: No weights file specified\n");
385 fprintf(stderr,
"Usage: %s <weights.bump> [options]\n", argv[0]);
/*
 * Fallback paths. NOTE(review): no visible option assigns manifest_path,
 * so this default appears to be the only value it ever takes - confirm.
 */
390 if (!manifest_path) {
391 manifest_path =
"generated/weights_manifest.json";
393 if (!tokenizer_path) {
394 tokenizer_path =
"generated/tokenizer.json";
/*
 * Existence checks for the weights and tokenizer files. No equivalent
 * check for the manifest is visible; a missing manifest is reported later
 * by load_manifest when fopen fails.
 */
397 if (access(bump_path, F_OK) != 0) {
398 fprintf(stderr,
"Error: Weights file not found: %s\n", bump_path);
402 if (access(tokenizer_path, F_OK) != 0) {
403 fprintf(stderr,
"Error: Tokenizer not found: %s\n", tokenizer_path);
/* The process exit status is whatever run_inference returns. */
407 return run_inference(bump_path, manifest_path, tokenizer_path,
408 prompt, max_tokens, temperature, topk);
int ck_tokenizer_init(CKTokenizer *tok)
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
int ck_tokenizer_load(CKTokenizer *tok, const char *path)
int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
void ck_tokenizer_free(CKTokenizer *tok)
static int ck_tokenizer_vocab_size(const CKTokenizer *tok)
int main(int argc, char **argv)
static int load_weights(QWEN2_DECODEModel *model, const char *bump_path, const char *manifest_path)
static int load_manifest(const char *path, ManifestEntry **entries, int *num_entries)
static int run_inference(const char *bump_path, const char *manifest_path, const char *tokenizer_path, const char *prompt, int max_tokens, float temperature, int topk)
static void print_banner(void)
static int sample_topk(float *probs, int vocab_size, int topk)