24 #define CK_VERSION "6.0.0"
27 #define ANSI_RESET "\033[0m"
28 #define ANSI_BOLD "\033[1m"
29 #define ANSI_DIM "\033[2m"
30 #define ANSI_GREEN "\033[0;32m"
31 #define ANSI_YELLOW "\033[0;33m"
32 #define ANSI_CYAN "\033[0;36m"
35 #include "ck-kernel-inference.h"
36 #include "ck-kernel-prefill.h"
/* Fragment of the ManifestEntry struct (the struct declaration itself is
 * outside this excerpt). runtime_offset is the destination offset inside the
 * model's allocated memory region: load_weights() below copies each entry to
 * (char *)model->base + e->runtime_offset. */
44 size_t runtime_offset;
/* Fragment of sample_topk() — only part of its body is visible here.
 * What is shown is a linear argmax-style scan: best_val seeds from
 * probs[start] and is compared against each probs[i].
 * NOTE(review): the loop bounds, the index update on a better hit, and the
 * relationship to the topk parameter are all outside this excerpt — confirm
 * against the full source before assuming this is a plain argmax. */
51 float best_val = probs[
start];
54 if (probs[i] > best_val) {
/* load_manifest — parse a JSON weights manifest with ad-hoc string scanning
 * (strstr/strchr/sscanf), not a real JSON parser.
 *
 * path        in   manifest file to read
 * entries     out  malloc'd array of ManifestEntry; presumably owned/freed by
 *                  the caller — TODO confirm against load_weights
 * num_entries out  number of parsed entries
 * returns     0 on success, non-zero on failure (error paths are not fully
 *             visible in this excerpt)
 *
 * Pass 1 counts "\"name\":" occurrences to size the array; pass 2 re-scans
 * and extracts name, dtype, file_offset, size and runtime_offset per entry.
 *
 * NOTE(review): several hardening gaps are visible even in this partial view:
 *  - malloc(size + 1) and malloc(count * sizeof(ManifestEntry)) results are
 *    used without a NULL check (at least on the visible lines);
 *  - fread()'s return value is ignored, so a short read goes undetected;
 *  - strchr(p, '"') + 1 assumes the quote exists — a malformed manifest
 *    makes this NULL + 1 (undefined behavior);
 *  - sscanf() return values are unchecked, so a missing numeric field leaves
 *    the corresponding struct member uninitialized or stale.
 * The missing lines (fclose/free/returns) may handle some of this — verify
 * before changing. */
63 static int load_manifest(
const char *path, ManifestEntry **entries,
int *num_entries) {
64 FILE *f = fopen(path,
"r");
66 fprintf(stderr,
"Failed to open manifest: %s\n", path);
/* Determine file size via seek-to-end; the ftell() that captures `size`
 * is on a line missing from this excerpt. */
71 fseek(f, 0, SEEK_END);
73 fseek(f, 0, SEEK_SET);
/* Slurp the whole manifest into memory (+1 for a NUL terminator,
 * presumably written on a missing line) so strstr can scan it. */
75 char *content = malloc(size + 1);
76 fread(content, 1, size, f);
/* Pass 1: count entries by occurrences of the "name" key. */
83 while ((p = strstr(p,
"\"name\":")) != NULL) {
88 *entries = malloc(count *
sizeof(ManifestEntry));
/* Pass 2: extract the fields of each entry. */
94 while ((p = strstr(p,
"\"name\":")) != NULL && idx < count) {
95 char *
start = strchr(p,
'"') + 1;
/* Clamp to the destination buffer; combined with the explicit NUL below
 * this avoids strncpy's unterminated-copy pitfall. */
98 if (len >=
sizeof((*entries)[idx].name)) len =
sizeof((*entries)[idx].name) - 1;
99 strncpy((*entries)[idx].name,
start, len);
100 (*entries)[idx].name[len] =
'\0';
/* dtype: same quoted-string extraction pattern as name. */
103 char *dtype_p = strstr(p,
"\"dtype\":");
105 char *d_start = strchr(dtype_p,
'"') + 1;
106 char *d_end = strchr(d_start,
'"');
107 size_t d_len = d_end - d_start;
108 if (d_len >=
sizeof((*entries)[idx].dtype)) d_len =
sizeof((*entries)[idx].dtype) - 1;
109 strncpy((*entries)[idx].dtype, d_start, d_len);
110 (*entries)[idx].dtype[d_len] =
'\0';
/* Numeric fields: skip past the key text by its literal length
 * (strlen("\"file_offset\":") == 14, "\"size\":" == 7,
 * "\"runtime_offset\":" == 17) and parse with %zu. */
114 char *fo_p = strstr(p,
"\"file_offset\":");
116 sscanf(fo_p + 14,
"%zu", &(*entries)[idx].file_offset);
120 char *size_p = strstr(p,
"\"size\":");
122 sscanf(size_p + 7,
"%zu", &(*entries)[idx].size);
126 char *ro_p = strstr(p,
"\"runtime_offset\":");
128 sscanf(ro_p + 17,
"%zu", &(*entries)[idx].runtime_offset);
/* load_weights — map a BUMP weights file and copy each manifest entry into
 * the model's pre-allocated memory block.
 *
 * model         in/out  target model; must already have base/total_bytes set
 * bump_path     in      path to the raw weights (BUMP) file
 * manifest_path in      path to the JSON manifest describing entry layout
 * returns       0 on success, non-zero on failure (the exact failure returns
 *               are on lines missing from this excerpt)
 *
 * Flow: open + size the file, parse the manifest, mmap the file read-only,
 * then memcpy each entry from file_offset to runtime_offset, and munmap. */
140 static int load_weights(QWEN2_DECODEModel *model,
const char *bump_path,
141 const char *manifest_path) {
142 printf(
"[INFO] Loading weights from: %s\n", bump_path);
145 int fd = open(bump_path, O_RDONLY);
147 fprintf(stderr,
"Failed to open BUMP file: %s\n", bump_path);
/* File size via lseek to end, then rewind for the mmap below. */
152 off_t file_size = lseek(fd, 0, SEEK_END);
153 lseek(fd, 0, SEEK_SET);
154 printf(
"[INFO] BUMP file size: %ld bytes\n", (
long)file_size);
157 ManifestEntry *entries = NULL;
159 if (
load_manifest(manifest_path, &entries, &num_entries) != 0) {
163 printf(
"[INFO] Manifest entries: %d\n", num_entries);
/* Read-only private mapping; entries are copied out, so no writes occur
 * through this mapping. */
166 void *bump_base = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
167 if (bump_base == MAP_FAILED) {
168 fprintf(stderr,
"Failed to mmap BUMP file\n");
176 for (
int i = 0; i < num_entries; i++) {
177 ManifestEntry *e = &entries[i];
/* Bounds check against the model's allocation before copying.
 * NOTE(review): this only warns — whether the entry is then skipped or
 * still copied depends on lines missing from this excerpt. Also note no
 * corresponding check of file_offset + size against file_size is
 * visible — confirm the full source validates the read side too. */
179 if (e->runtime_offset + e->size > model->total_bytes) {
180 fprintf(stderr,
"Warning: Entry %s exceeds model memory\n", e->name);
/* Copy entry payload: file image -> model memory.
 * (The size argument of this memcpy is on a missing line.) */
184 memcpy((
char *)model->base + e->runtime_offset,
185 (
char *)bump_base + e->file_offset,
189 munmap(bump_base, file_size);
192 printf(
"[INFO] Weights loaded successfully\n");
/* (continuation of the run_inference signature — the opening line with the
 * function name and the bump_path/manifest_path parameters is above this
 * excerpt)
 *
 * run_inference — end-to-end generation: allocate the model, load weights,
 * build the tokenizer from data embedded in the weights blob, tokenize the
 * prompt, run prefill, then decode up to max_tokens with top-k sampling.
 * Returns 0 on success, non-zero on any failure path (allocation, weights,
 * tokenizer, tokenization).
 * NOTE(review): tokenizer_path is explicitly unused ((void) cast below) and
 * temperature does not appear in any visible line — presumably sampling is
 * top-k only; confirm against sample_topk. */
const char *tokenizer_path,
199 int max_tokens,
float temperature,
int topk) {
201 printf(
"\n C-Kernel-Engine v6 Inference\n");
/* Tokenizer is loaded from the weights file, not from a separate path. */
205 (void)tokenizer_path;
209 fprintf(stderr,
"Failed to init tokenizer\n");
/* Model lives in one large flat allocation (base + total bytes). */
214 QWEN2_DECODEModel model;
215 printf(
"[INFO] Allocating model (%zu bytes)...\n", (
size_t)QWEN2_DECODE_TOTAL_BYTES);
217 if (qwen2_decode_model_allocate(&model) != 0) {
218 fprintf(stderr,
"Failed to allocate model\n");
222 printf(
"[INFO] Model allocated at %p\n", model.base);
225 if (
load_weights(&model, bump_path, manifest_path) != 0) {
226 qwen2_decode_model_free(&model);
/* Tokenizer tables (vocab offsets/strings, BPE merges) are resolved as
 * offsets into the model blob itself. */
232 const int vocab_size = QWEN2_DECODE_VOCAB_SIZE;
233 const int num_merges = QWEN2_DECODE_NUM_MERGES;
234 const int vocab_bytes = QWEN2_DECODE_TOTAL_VOCAB_BYTES;
236 fprintf(stderr,
"Tokenizer data missing in model\n");
237 qwen2_decode_model_free(&model);
242 const int32_t *vocab_offsets = (
const int32_t *)QWEN2_DECODE_PTR(&model, QWEN2_DECODE_HEADER.vocab_offsets);
243 const char *vocab_strings = (
const char *)((
char *)model.base + QWEN2_DECODE_HEADER.vocab_strings);
245 ? (
const int32_t *)((
char *)model.base + QWEN2_DECODE_HEADER.vocab_merges)
249 fprintf(stderr,
"Failed to load tokenizer from model\n");
250 qwen2_decode_model_free(&model);
255 printf(
"[INFO] Tokenizer loaded: %d tokens\n",
vocab_size);
259 printf(
"[INFO] Tokenized prompt: %d tokens\n", num_tokens);
261 if (num_tokens <= 0) {
262 fprintf(stderr,
"Failed to tokenize prompt\n");
263 qwen2_decode_model_free(&model);
/* Echo the prompt tokens with their string forms for debugging. */
269 printf(
"[INFO] Input tokens: ");
270 for (
int i = 0; i < num_tokens; i++) {
273 printf(
"%d(%s) ", tokens[i], tok_str);
275 printf(
"%d(?) ", tokens[i]);
/* Prefill processes the whole prompt in one forward pass. */
281 printf(
"[INFO] Running prefill for %d tokens...\n", num_tokens);
282 qwen2_decode_forward(&model, tokens, num_tokens);
/* Logits live at a fixed offset (footer) inside the model blob. */
285 float *logits = (
float *)((
char *)model.base + QWEN2_DECODE_FOOTER.logits);
288 printf(
"\n[INFO] Generating %d tokens...\n", max_tokens);
/* NOTE(review): 151645 is hard-coded — presumably Qwen2's
 * <|im_end|> end-of-turn token id; confirm against the model's config. */
291 int eos_token = 151645;
292 int32_t
token = tokens[num_tokens - 1];
295 char output_buffer[4096];
297 output_buffer[0] =
'\0';
/* Autoregressive decode loop: feed the last token, sample the next. */
299 for (
int i = 0; i < max_tokens; i++) {
301 qwen2_decode_decode(&model, &
token, 0);
/* Re-derive the logits pointer each step.
 * NOTE(review): looks redundant with line 285 unless decode relocates
 * the buffer — confirm. */
304 logits = (
float *)((
char *)model.base + QWEN2_DECODE_FOOTER.logits);
307 int next_token =
sample_topk(logits, QWEN2_DECODE_VOCAB_SIZE, topk);
312 printf(
"%s", tok_str);
/* Accumulate the detokenized text, silently truncating past 4 KiB. */
316 size_t len = strlen(tok_str);
317 if (output_pos + len <
sizeof(output_buffer) - 1) {
318 strcpy(output_buffer + output_pos, tok_str);
321 }
else if (next_token == eos_token) {
323 printf(
"\n[INFO] EOS token received\n");
331 printf(
"\n\n[INFO] Inference complete\n");
334 qwen2_decode_model_free(&model);
/* print_banner body (the `static void print_banner(void)` opener is above
 * this excerpt): prints the ASCII-art logo followed by the version string.
 * The backslashes in the art are doubled because they sit inside C string
 * literals. */
344 printf(
" ____ _ _ _ _ _ _ _ _____ _ _ \n");
345 printf(
" | _ \\| | | (_) | | (_) | | | |/ ___| | | |\n");
346 printf(
" | |_) | |_ _ ___| |_ __ _| |_ ___ _____| | | |\\ `--.| |__ __| |\n");
347 printf(
" | _ <| | | | |/ __| | |/ _` | __| \\ \\ / / _ \\ | | | |`--. \\ '_ \\ / _` |\n");
348 printf(
" | |_) | | |_| | (__| | | (_| | |_| |\\ V / __/ |_| |/\\__/ / | | | (_| |\n");
349 printf(
" |____/|_|\\__,_|\\___|_|_|\\__,_|\\__|_| \\_/ \\___|\\___|\\___/\\____/|_| |_|\\__,_|\n");
/* Version line uses the CK_VERSION macro defined at the top of the file. */
352 printf(
" v%s - Native C Inference\n",
CK_VERSION);
/* main — CLI entry point: parse arguments, apply defaults, validate the
 * weights path, and delegate to run_inference().
 * Flags: -m/--model, -t/--tokenizer (accepted but unused downstream),
 * -p/--prompt, -n/--tokens, --temp, --top-k, -h/--help.
 * NOTE(review): the declarations of `topk` and the -p prompt assignment are
 * on lines missing from this excerpt; --top-k's default of 40 is stated only
 * in the help text. No -h flag is ever checked for manifest_path, so the
 * manifest always falls back to generated/weights_manifest.json. */
356 int main(
int argc,
char **argv) {
357 const char *bump_path = NULL;
358 const char *manifest_path = NULL;
359 const char *tokenizer_path = NULL;
360 const char *prompt =
"Hello";
361 int max_tokens = 100;
362 float temperature = 0.7f;
/* Flag parsing: value-taking flags consume argv[++i] only when a next
 * argument exists (i + 1 < argc). */
366 for (
int i = 1; i < argc; i++) {
367 if (strcmp(argv[i],
"-h") == 0 || strcmp(argv[i],
"--help") == 0) {
369 printf(
"Usage: %s <weights.bump> [options]\n", argv[0]);
370 printf(
"\nOptions:\n");
371 printf(
" -m, --model <file> Weights BUMP file\n");
372 printf(
" -t, --tokenizer <file> (unused) tokenizer is loaded from weights\n");
373 printf(
" -p, --prompt <text> Prompt (default: Hello)\n");
374 printf(
" -n, --tokens <n> Max tokens (default: 100)\n");
375 printf(
" --temp <float> Temperature (default: 0.7)\n");
376 printf(
" --top-k <n> Top-k (default: 40)\n");
377 printf(
" -h, --help Show help\n");
380 }
else if ((strcmp(argv[i],
"-m") == 0 || strcmp(argv[i],
"--model") == 0) && i + 1 < argc) {
381 bump_path = argv[++i];
383 }
else if ((strcmp(argv[i],
"-t") == 0 || strcmp(argv[i],
"--tokenizer") == 0) && i + 1 < argc) {
384 tokenizer_path = argv[++i];
386 }
else if ((strcmp(argv[i],
"-p") == 0 || strcmp(argv[i],
"--prompt") == 0) && i + 1 < argc) {
/* NOTE(review): atoi/atof silently return 0 on malformed input —
 * strtol/strtod with error checks would reject bad values. */
389 }
else if (strcmp(argv[i],
"-n") == 0 && i + 1 < argc) {
390 max_tokens = atoi(argv[++i]);
392 }
else if (strcmp(argv[i],
"--temp") == 0 && i + 1 < argc) {
393 temperature = atof(argv[++i]);
395 }
else if (strcmp(argv[i],
"--top-k") == 0 && i + 1 < argc) {
396 topk = atoi(argv[++i]);
/* A weights file is mandatory; everything else has a default. */
402 fprintf(stderr,
"Error: No weights file specified\n");
403 fprintf(stderr,
"Usage: %s <weights.bump> [options]\n", argv[0]);
408 if (!manifest_path) {
409 manifest_path =
"generated/weights_manifest.json";
/* Fail fast with a clear message if the weights file is absent. */
411 if (access(bump_path, F_OK) != 0) {
412 fprintf(stderr,
"Error: Weights file not found: %s\n", bump_path);
416 return run_inference(bump_path, manifest_path, tokenizer_path,
417 prompt, max_tokens, temperature, topk);
void ck_tokenizer_free(CKTokenizer *tok)
int ck_true_bpe_encode(CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
void ck_true_bpe_free(CKTrueBPE *bpe)
CKTrueBPE * ck_true_bpe_create(void)
int ck_true_bpe_load_binary(CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
const char * ck_true_bpe_id_to_token(const CKTrueBPE *bpe, int32_t id)
int ck_true_bpe_load_binary(CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
int main(int argc, char **argv)
static int load_weights(QWEN2_DECODEModel *model, const char *bump_path, const char *manifest_path)
static int load_manifest(const char *path, ManifestEntry **entries, int *num_entries)
static int run_inference(const char *bump_path, const char *manifest_path, const char *tokenizer_path, const char *prompt, int max_tokens, float temperature, int topk)
static void print_banner(void)
static int sample_topk(float *probs, int vocab_size, int topk)