← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ck_cli_v6.6.c
Go to the documentation of this file.
1 /*
2  * C-Kernel-Engine v6.6 Native CLI
3  *
4  * Features:
5  * - Model auto-discovery from cache
6  * - Readline support for history/editing
7  * - Chat template support (Qwen, LLaMA, etc.)
8  * - Temperature/top-p sampling
9  * - Streaming output
10  *
11  * Usage:
12  * ck-cli-v6.6 --model <name> # Auto-discover from cache
13  * ck-cli-v6.6 <libmodel.so> <weights.bump> # Direct paths
14  * ck-cli-v6.6 --lib <.so> --weights <.bump> # Named args
15  */
16 
17 #define _GNU_SOURCE
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <stdint.h>
21 #include <string.h>
22 #include <stdbool.h>
23 #include <errno.h>
24 #include <signal.h>
25 #include <dlfcn.h>
26 #include <unistd.h>
27 #include <time.h>
28 #include <math.h>
29 #include <dirent.h>
30 #include <sys/stat.h>
31 
32 #ifdef HAVE_READLINE
33 #include <readline/readline.h>
34 #include <readline/history.h>
35 #endif
36 
37 #include "tokenizer/true_bpe.h"
38 #include "ck_features.h"
39 
#define CK_CLI_VERSION "6.6.0"                 /* CLI version shown in the banner */
#define CK_CLI_DEFAULT_MAX_TOKENS 256          /* Generation cap when --max-tokens is absent */
#define CK_CLI_EOS_MAX 8                       /* Max distinct EOS token IDs tracked */
#define CK_CLI_OUTPUT_BUF_SIZE 4096            /* Stdout batching buffer size (bytes) */
#define CK_CLI_MAX_CONTEXT 32768               /* Hard upper bound on context/KV length */
#define CK_CLI_HISTORY_FILE ".ck_cli_history"  /* Readline history file name — presumably under $HOME; confirm at use site */
46 
/* Interrupt state shared with the SIGINT handler. */
static volatile sig_atomic_t g_exit_requested = 0;    /* set: quit the CLI */
static volatile sig_atomic_t g_generation_active = 0; /* nonzero while decoding */

/* Per-prompt timing accumulators (reported when --timing is set). */
static double g_prefill_time_ms = 0.0;
static double g_decode_time_ms = 0.0;
static int g_decode_count = 0;
static int g_prompt_tokens = 0;

/*
 * SIGINT handler: a Ctrl-C during generation only stops the current
 * generation; a Ctrl-C at the prompt requests process exit.
 */
static void handle_sigint(int sig) {
    (void)sig;
    if (!g_generation_active) {
        g_exit_requested = 1;
    } else {
        g_generation_active = 0; /* Stop generation but don't exit */
    }
}
64 
65 /* ============================================================================
66  * Model API Types
67  * ============================================================================ */
68 
/* Function-pointer types for the symbols resolved from the model .so. */
typedef int (*init_t)(const char *weights_path);               /* ck_model_init */
typedef int (*embed_t)(const int32_t *tokens, int num_tokens); /* load prompt tokens */
typedef int (*forward_t)(float *logits_out);                   /* prefill forward pass */
typedef int (*kv_enable_t)(int capacity);                      /* enable KV cache */
typedef void (*kv_reset_t)(void);                              /* clear KV cache */
typedef int (*decode_t)(int32_t token, float *logits_out);     /* one decode step */
typedef int (*sample_argmax_t)(void);                          /* model-side greedy sample */
typedef float *(*get_logits_t)(void);                          /* borrow logits buffer */
typedef int (*get_int_t)(void);                                /* generic int getter */
typedef void *(*get_ptr_t)(void);                              /* generic pointer getter */
typedef void (*free_t)(void);                                  /* release model resources */

/*
 * Resolved entry points of a dynamically loaded model library.
 * init/embed/forward/decode are required; the vocab accessors
 * (get_vocab_size, get_offsets, get_strings) are also checked at load time.
 * All other members may be NULL — callers must test before use.
 */
typedef struct {
    void *handle;                 /* dlopen handle */
    init_t init;
    embed_t embed;
    forward_t forward;
    kv_enable_t kv_enable;        /* optional */
    kv_reset_t kv_reset;          /* optional */
    decode_t decode;
    sample_argmax_t sample;       /* optional; CLI can sample from logits instead */
    get_logits_t get_logits;      /* optional */
    get_int_t get_logits_stride;  /* optional; row stride of the logits buffer */
    get_int_t get_context;        /* optional; model context window */
    get_int_t get_vocab_size;
    get_int_t get_num_merges;
    get_int_t get_vocab_bytes;
    get_int_t get_active_tokens;  /* optional; rows valid in the logits buffer */
    get_ptr_t get_offsets;
    get_ptr_t get_strings;
    get_ptr_t get_merges;
    free_t free_fn;               /* optional */
} ModelAPI;
102 
103 /* ============================================================================
104  * Chat Template Types
105  * ============================================================================ */
106 
/*
 * Known chat-template families; values index g_templates.
 * NOTE(review): the enum body was lost in extraction and has been
 * reconstructed from the g_templates designators and the call sites
 * (detect_chat_template, eos_pattern_init).
 */
typedef enum {
    CHAT_TEMPLATE_NONE = 0,  /* raw prompt, no wrapping */
    CHAT_TEMPLATE_QWEN,
    CHAT_TEMPLATE_LLAMA,
    CHAT_TEMPLATE_CHATML,
    CHAT_TEMPLATE_MISTRAL,
} ChatTemplateType;

/* Literal affixes wrapped around the system/user/assistant turns. */
typedef struct {
    ChatTemplateType type;
    const char *system_prefix;
    const char *system_suffix;
    const char *user_prefix;
    const char *user_suffix;
    const char *assistant_prefix;
    const char *assistant_suffix;
} ChatTemplate;
124 
125 static const ChatTemplate g_templates[] = {
126  [CHAT_TEMPLATE_NONE] = {
127  .type = CHAT_TEMPLATE_NONE,
128  .system_prefix = "", .system_suffix = "\n",
129  .user_prefix = "", .user_suffix = "\n",
130  .assistant_prefix = "", .assistant_suffix = "",
131  },
132  [CHAT_TEMPLATE_QWEN] = {
133  .type = CHAT_TEMPLATE_QWEN,
134  .system_prefix = "<|im_start|>system\n",
135  .system_suffix = "<|im_end|>\n",
136  .user_prefix = "<|im_start|>user\n",
137  .user_suffix = "<|im_end|>\n",
138  .assistant_prefix = "<|im_start|>assistant\n",
139  .assistant_suffix = "<|im_end|>",
140  },
141  [CHAT_TEMPLATE_LLAMA] = {
142  .type = CHAT_TEMPLATE_LLAMA,
143  .system_prefix = "[INST] <<SYS>>\n",
144  .system_suffix = "\n<</SYS>>\n\n",
145  .user_prefix = "",
146  .user_suffix = " [/INST]",
147  .assistant_prefix = " ",
148  .assistant_suffix = " </s><s>[INST] ",
149  },
151  .type = CHAT_TEMPLATE_CHATML,
152  .system_prefix = "<|im_start|>system\n",
153  .system_suffix = "<|im_end|>\n",
154  .user_prefix = "<|im_start|>user\n",
155  .user_suffix = "<|im_end|>\n",
156  .assistant_prefix = "<|im_start|>assistant\n",
157  .assistant_suffix = "<|im_end|>",
158  },
160  .type = CHAT_TEMPLATE_MISTRAL,
161  .system_prefix = "",
162  .system_suffix = "\n\n",
163  .user_prefix = "[INST] ",
164  .user_suffix = " [/INST]",
165  .assistant_prefix = "",
166  .assistant_suffix = "</s> ",
167  },
168 };
169 
170 /* ============================================================================
171  * CLI Options
172  * ============================================================================ */
173 
/* Parsed command-line options controlling model selection and generation. */
typedef struct {
    const char *model_name;    /* Model name for auto-discovery */
    const char *lib_path;      /* Explicit path to the compiled model .so */
    const char *weights_path;  /* Explicit path to the weights .bump file */
    const char *prompt_once;   /* Single prompt; non-interactive when set */
    const char *system_prompt; /* Optional system message for the template */
    int max_tokens;            /* Generation cap; <=0 → CK_CLI_DEFAULT_MAX_TOKENS */
    int context_override;      /* Override context size; <=0 → model default */
    float temperature;         /* <=0 selects greedy (argmax) sampling */
    float top_p;               /* Nucleus mass; <=0 selects greedy */
    bool ignore_eos;           /* Keep generating past EOS markers */
    bool stream;               /* Flush output after every token */
    bool timing;               /* Print prefill/decode statistics */
    bool verbose;              /* Debug logging to stderr */
    bool no_chat_template;     /* Force CHAT_TEMPLATE_NONE */
    ChatTemplateType chat_template;  /* Selected/detected template */
    int eos_ids[CK_CLI_EOS_MAX];     /* Token IDs treated as end-of-sequence */
    int eos_count;                   /* Valid entries in eos_ids */
} CLIOptions;
193 
194 /* ============================================================================
195  * Cache Discovery
196  * ============================================================================ */
197 
/*
 * Return the model cache directory ($HOME/.cache/ck-engine-v6.6/models,
 * falling back to /tmp when HOME is unset). Returns a pointer to a static
 * buffer — not thread-safe, valid until the next call.
 */
static const char *get_cache_dir(void) {
    static char cache_path[4096];
    const char *home = getenv("HOME");
    snprintf(cache_path, sizeof(cache_path), "%s/.cache/ck-engine-v6.6/models",
             home ? home : "/tmp");
    return cache_path;
}
205 
/*
 * Search the cache for a directory whose name contains model_name and that
 * holds both "ck-kernel-inference.so" and "weights.bump". On success the
 * two full paths are copied (always NUL-terminated) into lib_out and
 * weights_out, each of capacity out_size. Returns true on a match.
 *
 * Fix: the original used strncpy(dst, src, out_size - 1), which does not
 * NUL-terminate when the path fills the buffer; snprintf always does.
 */
static bool find_model_in_cache(const char *model_name, char *lib_out, char *weights_out, size_t out_size) {
    const char *cache_dir = get_cache_dir();
    DIR *dir = opendir(cache_dir);
    if (!dir) return false;

    struct dirent *entry;
    while ((entry = readdir(dir)) != NULL) {
        if (entry->d_name[0] == '.') continue;

        /* Substring match on the directory name. */
        if (strstr(entry->d_name, model_name) == NULL) continue;

        char so_path[4096], bump_path[4096];
        snprintf(so_path, sizeof(so_path), "%s/%s/ck-kernel-inference.so",
                 cache_dir, entry->d_name);
        snprintf(bump_path, sizeof(bump_path), "%s/%s/weights.bump",
                 cache_dir, entry->d_name);

        /* Both files must exist for the directory to count as a model. */
        struct stat st;
        if (stat(so_path, &st) == 0 && stat(bump_path, &st) == 0) {
            snprintf(lib_out, out_size, "%s", so_path);
            snprintf(weights_out, out_size, "%s", bump_path);
            closedir(dir);
            return true;
        }
    }
    closedir(dir);
    return false;
}
237 
238 /* ============================================================================
239  * EOS Token Loading
240  * ============================================================================ */
241 
242 static bool load_eos_from_vocab_json(const char *weights_path, CLIOptions *opt) {
243  if (!weights_path || !opt) return false;
244 
245  /* Construct vocab.json path from weights path */
246  char vocab_path[4096];
247  const char *slash = strrchr(weights_path, '/');
248  if (!slash) return false;
249 
250  size_t dir_len = (size_t)(slash - weights_path);
251  if (dir_len + 12 >= sizeof(vocab_path)) return false;
252 
253  memcpy(vocab_path, weights_path, dir_len);
254  vocab_path[dir_len] = '\0';
255  strcat(vocab_path, "/vocab.json");
256 
257  FILE *f = fopen(vocab_path, "r");
258  if (!f) return false;
259 
260  /* Simple JSON parsing for special_tokens */
261  char buf[8192];
262  size_t n = fread(buf, 1, sizeof(buf) - 1, f);
263  fclose(f);
264  buf[n] = '\0';
265 
266  /* Look for "special_tokens" section */
267  const char *st = strstr(buf, "\"special_tokens\"");
268  if (!st) return false;
269 
270  /* Extract eos token */
271  const char *eos = strstr(st, "\"eos\"");
272  if (eos) {
273  const char *colon = strchr(eos, ':');
274  if (colon) {
275  int eos_id = atoi(colon + 1);
276  if (eos_id > 0) {
277  opt->eos_ids[0] = eos_id;
278  opt->eos_count = 1;
279  }
280  }
281  }
282 
283  /* Extract bos token (often used as im_end for chat) */
284  const char *bos = strstr(st, "\"bos\"");
285  if (bos) {
286  const char *colon = strchr(bos, ':');
287  if (colon) {
288  int bos_id = atoi(colon + 1);
289  if (bos_id > 0 && bos_id != opt->eos_ids[0]) {
290  opt->eos_ids[opt->eos_count++] = bos_id;
291  }
292  }
293  }
294 
295  return opt->eos_count > 0;
296 }
297 
/* Print every cache entry that contains a compiled model .so. */
static void list_available_models(void) {
    const char *cache_dir = get_cache_dir();
    DIR *dir = opendir(cache_dir);
    if (!dir) {
        fprintf(stderr, "No models found in %s\n", cache_dir);
        return;
    }

    printf("Available models in %s:\n", cache_dir);
    int count = 0;
    for (struct dirent *entry = readdir(dir); entry != NULL; entry = readdir(dir)) {
        if (entry->d_name[0] == '.') continue;

        /* A directory counts as a model if it holds the inference .so. */
        char so_path[4096];
        snprintf(so_path, sizeof(so_path), "%s/%s/ck-kernel-inference.so",
                 cache_dir, entry->d_name);

        struct stat st;
        if (stat(so_path, &st) == 0) {
            printf(" - %s\n", entry->d_name);
            count++;
        }
    }
    closedir(dir);

    if (count == 0) {
        printf(" (none found)\n");
    }
}
330 
331 /* ============================================================================
332  * Sampling
333  * ============================================================================ */
334 
335 static int sample_top_p(float *logits, int vocab_size, float temperature, float top_p) {
336  if (temperature <= 0.0f || top_p <= 0.0f) {
337  /* Argmax */
338  int best = 0;
339  float best_val = logits[0];
340  for (int i = 1; i < vocab_size; i++) {
341  if (logits[i] > best_val) {
342  best_val = logits[i];
343  best = i;
344  }
345  }
346  return best;
347  }
348 
349  /* Apply temperature */
350  float max_logit = logits[0];
351  for (int i = 1; i < vocab_size; i++) {
352  if (logits[i] > max_logit) max_logit = logits[i];
353  }
354 
355  float sum = 0.0f;
356  for (int i = 0; i < vocab_size; i++) {
357  logits[i] = expf((logits[i] - max_logit) / temperature);
358  sum += logits[i];
359  }
360 
361  /* Normalize to probabilities */
362  for (int i = 0; i < vocab_size; i++) {
363  logits[i] /= sum;
364  }
365 
366  /* Sort indices by probability (simple selection for top-p) */
367  /* For efficiency, we'll do nucleus sampling with cumulative sum */
368  float cumsum = 0.0f;
369  float threshold = (float)rand() / (float)RAND_MAX * top_p;
370 
371  /* Find nucleus tokens and sample */
372  int *indices = (int *)malloc(vocab_size * sizeof(int));
373  float *probs = (float *)malloc(vocab_size * sizeof(float));
374  for (int i = 0; i < vocab_size; i++) {
375  indices[i] = i;
376  probs[i] = logits[i];
377  }
378 
379  /* Simple sort (for small vocab, bubble sort is fine; for large, use qsort) */
380  for (int i = 0; i < vocab_size - 1; i++) {
381  for (int j = i + 1; j < vocab_size; j++) {
382  if (probs[j] > probs[i]) {
383  float tmp_p = probs[i]; probs[i] = probs[j]; probs[j] = tmp_p;
384  int tmp_i = indices[i]; indices[i] = indices[j]; indices[j] = tmp_i;
385  }
386  }
387  cumsum += probs[i];
388  if (cumsum >= top_p) break;
389  }
390 
391  /* Sample from nucleus */
392  float r = (float)rand() / (float)RAND_MAX * cumsum;
393  float acc = 0.0f;
394  int result = indices[0];
395  for (int i = 0; cumsum > 0 && i < vocab_size; i++) {
396  acc += probs[i];
397  if (acc >= r) {
398  result = indices[i];
399  break;
400  }
401  if (acc >= cumsum) break;
402  }
403 
404  free(indices);
405  free(probs);
406  return result;
407 }
408 
409 /* ============================================================================
410  * Output Helpers
411  * ============================================================================ */
412 
413 /**
414  * Decode GPT-2 byte-level BPE representation back to actual bytes.
415  *
416  * GPT-2's tokenizer maps certain bytes to Unicode code points:
417  * - Bytes 0x00-0x20 → U+0100-U+0120 (Ā Ć ċ ... Ġ)
418  * - Bytes 0x7F-0xA0 → U+017F-U+01A0
419  * - Printable ASCII (0x21-0x7E) stays as-is
420  *
421  * This function reverses that mapping.
422  *
423  * @param token Input BPE token string (UTF-8)
424  * @param out Output buffer for decoded bytes
425  * @param max Size of output buffer
426  * @return Number of bytes written (not including NUL)
427  */
428 static int decode_bpe_token(const char *token, char *out, int max) {
429  if (!token || max <= 0) return 0;
430 
431  const unsigned char *src = (const unsigned char *)token;
432  int out_len = 0;
433 
434  while (*src && out_len < max - 1) {
435  unsigned int codepoint;
436  int bytes;
437 
438  /* Decode UTF-8 to codepoint */
439  if ((src[0] & 0x80) == 0) {
440  /* Single byte ASCII */
441  codepoint = src[0];
442  bytes = 1;
443  } else if ((src[0] & 0xE0) == 0xC0 && (src[1] & 0xC0) == 0x80) {
444  /* Two byte sequence */
445  codepoint = ((src[0] & 0x1F) << 6) | (src[1] & 0x3F);
446  bytes = 2;
447  } else if ((src[0] & 0xF0) == 0xE0 && (src[1] & 0xC0) == 0x80 && (src[2] & 0xC0) == 0x80) {
448  /* Three byte sequence */
449  codepoint = ((src[0] & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F);
450  bytes = 3;
451  } else if ((src[0] & 0xF8) == 0xF0 && (src[1] & 0xC0) == 0x80 &&
452  (src[2] & 0xC0) == 0x80 && (src[3] & 0xC0) == 0x80) {
453  /* Four byte sequence */
454  codepoint = ((src[0] & 0x07) << 18) | ((src[1] & 0x3F) << 12) |
455  ((src[2] & 0x3F) << 6) | (src[3] & 0x3F);
456  bytes = 4;
457  } else {
458  /* Invalid UTF-8, copy byte as-is */
459  out[out_len++] = (char)*src;
460  src++;
461  continue;
462  }
463 
464  /* Check if this is a GPT-2 byte-encoded character */
465  if (codepoint >= 0x100 && codepoint <= 0x120) {
466  /* Bytes 0x00-0x20: U+0100-U+0120 → byte = codepoint - 0x100 */
467  out[out_len++] = (char)(codepoint - 0x100);
468  } else if (codepoint >= 0x17F && codepoint <= 0x1A0) {
469  /* Bytes 0x7F-0xA0: U+017F-U+01A0 → byte = codepoint - 0x100 */
470  out[out_len++] = (char)(codepoint - 0x100);
471  } else if (codepoint < 0x80) {
472  /* Regular ASCII - copy as-is */
473  out[out_len++] = (char)codepoint;
474  } else if (codepoint == 0x2581) {
475  /* SentencePiece space marker ▁ (U+2581) → space */
476  out[out_len++] = ' ';
477  } else {
478  /* Other UTF-8 characters - copy original bytes */
479  for (int i = 0; i < bytes && out_len < max - 1; i++) {
480  out[out_len++] = (char)src[i];
481  }
482  }
483 
484  src += bytes;
485  }
486 
487  out[out_len] = '\0';
488  return out_len;
489 }
490 
/* Write the batched output to stdout and reset the buffer length. */
static void output_flush(char *buf, size_t *len) {
    if (*len > 0) {
        fwrite(buf, 1, *len, stdout);
        *len = 0;
    }
}
496 
497 static void output_append(char *buf, size_t *len, const char *text) {
498  if (!text || !*text) return;
499  size_t n = strlen(text);
500  if (*len + n >= CK_CLI_OUTPUT_BUF_SIZE) {
501  output_flush(buf, len);
502  }
503  if (n >= CK_CLI_OUTPUT_BUF_SIZE) {
504  fwrite(text, 1, n, stdout);
505  return;
506  }
507  memcpy(buf + *len, text, n);
508  *len += n;
509 }
510 
511 static void output_token(char *buf, size_t *len, const char *token) {
512  if (!token || !*token) return;
513 
514  /* Decode BPE byte-level encoding to actual bytes */
515  char decoded[1024];
516  int n = decode_bpe_token(token, decoded, sizeof(decoded));
517  if (n > 0) {
518  output_append(buf, len, decoded);
519  }
520 }
521 
522 /* ============================================================================
523  * Model Loading
524  * ============================================================================ */
525 
/*
 * Look up `name` in the loaded library. Stores the result through out_ptr
 * (possibly NULL for optional symbols). Returns false only when a required
 * symbol is missing, in which case out_ptr is left untouched.
 */
static bool resolve_symbol(void *handle, const char *name, void **out_ptr, bool required) {
    void *sym = dlsym(handle, name);
    if (sym || !required) {
        if (out_ptr) *out_ptr = sym;
        return true;
    }
    fprintf(stderr, "Error: missing symbol %s\n", name);
    return false;
}
535 
536 static bool load_model_api(const char *lib_path, ModelAPI *api) {
537  if (!lib_path || !api) return false;
538  memset(api, 0, sizeof(*api));
539  api->handle = dlopen(lib_path, RTLD_NOW);
540  if (!api->handle) {
541  fprintf(stderr, "Error: dlopen failed: %s\n", dlerror());
542  return false;
543  }
544 
545  if (!resolve_symbol(api->handle, "ck_model_init", (void **)&api->init, true)) return false;
546  if (!resolve_symbol(api->handle, "ck_model_embed_tokens", (void **)&api->embed, true)) return false;
547  if (!resolve_symbol(api->handle, "ck_model_forward", (void **)&api->forward, true)) return false;
548  if (!resolve_symbol(api->handle, "ck_model_decode", (void **)&api->decode, true)) return false;
549  resolve_symbol(api->handle, "ck_model_sample_argmax", (void **)&api->sample, false); /* Optional - we can sample from logits */
550  resolve_symbol(api->handle, "ck_model_get_logits", (void **)&api->get_logits, false);
551  resolve_symbol(api->handle, "ck_model_get_logits_stride", (void **)&api->get_logits_stride, false);
552  resolve_symbol(api->handle, "ck_model_kv_cache_enable", (void **)&api->kv_enable, false);
553  resolve_symbol(api->handle, "ck_model_kv_cache_reset", (void **)&api->kv_reset, false);
554  resolve_symbol(api->handle, "ck_model_get_context_window", (void **)&api->get_context, false);
555  resolve_symbol(api->handle, "ck_model_get_vocab_size", (void **)&api->get_vocab_size, false);
556  resolve_symbol(api->handle, "ck_model_get_num_merges", (void **)&api->get_num_merges, false);
557  resolve_symbol(api->handle, "ck_model_get_vocab_strings_size", (void **)&api->get_vocab_bytes, false);
558  resolve_symbol(api->handle, "ck_model_get_active_tokens", (void **)&api->get_active_tokens, false);
559  resolve_symbol(api->handle, "ck_model_get_vocab_offsets", (void **)&api->get_offsets, false);
560  resolve_symbol(api->handle, "ck_model_get_vocab_strings", (void **)&api->get_strings, false);
561  resolve_symbol(api->handle, "ck_model_get_vocab_merges", (void **)&api->get_merges, false);
562  resolve_symbol(api->handle, "ck_model_free", (void **)&api->free_fn, false);
563 
564  if (!api->get_vocab_size || !api->get_offsets || !api->get_strings) {
565  fprintf(stderr, "Error: vocab accessors missing from model\n");
566  return false;
567  }
568  return true;
569 }
570 
571 /* ============================================================================
572  * Chat Template Application
573  * ============================================================================ */
574 
575 static ChatTemplateType detect_chat_template(const char *model_name) {
576  if (!model_name) return CHAT_TEMPLATE_CHATML;
577 
578  /* Lowercase comparison */
579  char lower[256];
580  strncpy(lower, model_name, sizeof(lower) - 1);
581  for (char *p = lower; *p; p++) *p = (*p >= 'A' && *p <= 'Z') ? *p + 32 : *p;
582 
583  if (strstr(lower, "qwen")) return CHAT_TEMPLATE_QWEN;
584  if (strstr(lower, "llama")) return CHAT_TEMPLATE_LLAMA;
585  if (strstr(lower, "mistral")) return CHAT_TEMPLATE_MISTRAL;
586 
587  return CHAT_TEMPLATE_CHATML; /* Default */
588 }
589 
590 static char *apply_chat_template(const ChatTemplate *tmpl, const char *system, const char *user) {
591  size_t needed = 0;
592  if (system && *system) {
593  needed += strlen(tmpl->system_prefix) + strlen(system) + strlen(tmpl->system_suffix);
594  }
595  needed += strlen(tmpl->user_prefix) + strlen(user) + strlen(tmpl->user_suffix);
596  needed += strlen(tmpl->assistant_prefix);
597  needed += 1; /* null terminator */
598 
599  char *result = (char *)malloc(needed);
600  if (!result) return NULL;
601 
602  result[0] = '\0';
603  if (system && *system) {
604  strcat(result, tmpl->system_prefix);
605  strcat(result, system);
606  strcat(result, tmpl->system_suffix);
607  }
608  strcat(result, tmpl->user_prefix);
609  strcat(result, user);
610  strcat(result, tmpl->user_suffix);
611  strcat(result, tmpl->assistant_prefix);
612 
613  return result;
614 }
615 
616 /* ============================================================================
617  * EOS Token Handling
618  * ============================================================================ */
619 
620 static bool is_eos_token(const CLIOptions *opt, int token) {
621  if (!opt || opt->ignore_eos) return false;
622  for (int i = 0; i < opt->eos_count; i++) {
623  if (opt->eos_ids[i] == token) return true;
624  }
625  return false;
626 }
627 
628 /**
629  * Text-based EOS pattern detection with pending output buffering.
630  *
631  * When special tokens like <|im_end|> are tokenized as regular text
632  * (e.g., !, im, _end, !), we need to detect the pattern in the output
633  * and avoid outputting the partial pattern tokens.
634  *
635  * This is a workaround for tokenizers that don't properly encode special tokens.
636  */
637 #define EOS_PATTERN_BUF_SIZE 64
638 #define EOS_PENDING_MAX 8
639 
640 typedef struct {
641  char pattern_buf[EOS_PATTERN_BUF_SIZE]; /* Accumulated text for pattern matching */
642  int pattern_len;
643  char *pending[EOS_PENDING_MAX]; /* Pending token texts (not yet output) */
644  int pending_count;
645  const char *target_pattern; /* Pattern to detect */
646  const char *partial_prefix; /* Prefix that might start the pattern */
647 } EOSPatternState;
648 
649 static EOSPatternState g_eos_state = {0};
650 
651 static void eos_pattern_reset(void) {
652  g_eos_state.pattern_len = 0;
653  g_eos_state.pattern_buf[0] = '\0';
654  for (int i = 0; i < g_eos_state.pending_count; i++) {
655  free(g_eos_state.pending[i]);
656  g_eos_state.pending[i] = NULL;
657  }
658  g_eos_state.pending_count = 0;
659  g_eos_state.target_pattern = NULL;
660  g_eos_state.partial_prefix = NULL;
661 }
662 
665  switch (tmpl) {
666  case CHAT_TEMPLATE_QWEN:
668  g_eos_state.target_pattern = "im_end";
669  g_eos_state.partial_prefix = "im";
670  break;
671  case CHAT_TEMPLATE_LLAMA:
673  g_eos_state.target_pattern = "</s>";
674  g_eos_state.partial_prefix = "</";
675  break;
676  default:
677  break;
678  }
679 }
680 
681 /**
682  * Check if token might be start of EOS pattern.
683  */
684 static bool eos_is_potential_prefix(const char *token) {
685  if (!token || !g_eos_state.partial_prefix) return false;
686 
687  /* Check if current accumulated buffer + token could start the pattern */
688  size_t tlen = strlen(token);
689  size_t plen = g_eos_state.pattern_len;
690  size_t target_len = g_eos_state.target_pattern ? strlen(g_eos_state.target_pattern) : 0;
691 
692  /* If buffer + token contains partial match of target, it's a potential prefix */
693  if (target_len == 0) return false;
694 
695  /* Build temp buffer */
696  char temp[EOS_PATTERN_BUF_SIZE];
697  if (plen + tlen >= EOS_PATTERN_BUF_SIZE) return false;
698  memcpy(temp, g_eos_state.pattern_buf, plen);
699  memcpy(temp + plen, token, tlen);
700  temp[plen + tlen] = '\0';
701 
702  /* Check if temp is a prefix of target or contains start of target */
703  const char *target = g_eos_state.target_pattern;
704  size_t temp_len = plen + tlen;
705 
706  /* Look for any suffix of temp that is a prefix of target */
707  for (size_t i = 0; i < temp_len; i++) {
708  size_t remaining = temp_len - i;
709  if (remaining > target_len) remaining = target_len;
710  if (strncmp(temp + i, target, remaining) == 0) {
711  return true;
712  }
713  }
714 
715  return false;
716 }
717 
718 /**
719  * Process a token for EOS pattern detection.
720  *
721  * @param token_text The token text to process
722  * @param out_buf Output buffer for safe-to-output text
723  * @param out_len Current length of output buffer
724  * @param tmpl Chat template type
725  * @return true if EOS pattern detected, false otherwise
726  */
727 static bool eos_pattern_process(const char *token_text, char *out_buf, size_t *out_len,
728  void (*output_fn)(char*, size_t*, const char*),
729  ChatTemplateType tmpl) {
730  if (!token_text || !g_eos_state.target_pattern) {
731  /* No pattern to match - output directly */
732  if (token_text && output_fn) output_fn(out_buf, out_len, token_text);
733  return false;
734  }
735 
736  /* Append to pattern buffer */
737  size_t tlen = strlen(token_text);
738  if (g_eos_state.pattern_len + (int)tlen < EOS_PATTERN_BUF_SIZE - 1) {
739  memcpy(g_eos_state.pattern_buf + g_eos_state.pattern_len, token_text, tlen);
740  g_eos_state.pattern_len += (int)tlen;
741  g_eos_state.pattern_buf[g_eos_state.pattern_len] = '\0';
742  }
743 
744  /* Check if pattern is complete */
745  if (strstr(g_eos_state.pattern_buf, g_eos_state.target_pattern)) {
746  /* EOS detected - don't output pending tokens */
748  return true;
749  }
750 
751  /* Check if this could still be part of the pattern */
752  if (eos_is_potential_prefix(token_text)) {
753  /* Hold this token - might be part of EOS */
754  if (g_eos_state.pending_count < EOS_PENDING_MAX) {
755  g_eos_state.pending[g_eos_state.pending_count] = strdup(token_text);
756  g_eos_state.pending_count++;
757  }
758  return false;
759  }
760 
761  /* Not part of pattern - flush pending tokens and this one */
762  for (int i = 0; i < g_eos_state.pending_count; i++) {
763  if (output_fn) output_fn(out_buf, out_len, g_eos_state.pending[i]);
764  free(g_eos_state.pending[i]);
765  g_eos_state.pending[i] = NULL;
766  }
767  g_eos_state.pending_count = 0;
768  g_eos_state.pattern_len = 0;
769  g_eos_state.pattern_buf[0] = '\0';
770 
771  if (output_fn) output_fn(out_buf, out_len, token_text);
772  return false;
773 }
774 
775 static bool parse_eos_ids(const char *arg, CLIOptions *opt) {
776  if (!arg || !opt) return false;
777  opt->eos_count = 0;
778  const char *p = arg;
779  while (*p && opt->eos_count < CK_CLI_EOS_MAX) {
780  char *end = NULL;
781  long v = strtol(p, &end, 10);
782  if (end == p) break;
783  opt->eos_ids[opt->eos_count++] = (int)v;
784  p = end;
785  if (*p == ',') p++;
786  }
787  return opt->eos_count > 0;
788 }
789 
790 /* ============================================================================
791  * Prompt Execution
792  * ============================================================================ */
793 
794 static int run_prompt(ModelAPI *api, CKTrueBPE *tokenizer, CLIOptions *opt, const char *input) {
795  if (!api || !tokenizer || !opt || !input) return -1;
796  if (g_exit_requested) return -1;
797 
798  int ctx = opt->context_override;
799  if (ctx <= 0 && api->get_context) ctx = api->get_context();
800  if (ctx <= 0) ctx = 4096;
801  if (ctx > CK_CLI_MAX_CONTEXT) ctx = CK_CLI_MAX_CONTEXT;
802 
803  int max_tokens = opt->max_tokens > 0 ? opt->max_tokens : CK_CLI_DEFAULT_MAX_TOKENS;
804 
805  /* Apply chat template if enabled */
806  const ChatTemplate *tmpl = &g_templates[opt->no_chat_template ? CHAT_TEMPLATE_NONE : opt->chat_template];
807  char *formatted = apply_chat_template(tmpl, opt->system_prompt, input);
808  if (!formatted) {
809  fprintf(stderr, "Error: failed to format prompt\n");
810  return -1;
811  }
812 
813  if (opt->verbose) {
814  printf("[DEBUG] Formatted prompt:\n%s\n", formatted);
815  }
816 
817  int32_t *ids = (int32_t *)malloc((size_t)ctx * sizeof(int32_t));
818  if (!ids) {
819  fprintf(stderr, "Error: failed to allocate token buffer\n");
820  free(formatted);
821  return -1;
822  }
823 
824  int n = ck_true_bpe_encode(tokenizer, formatted, -1, ids, ctx);
825  free(formatted);
826 
827  if (n <= 0) {
828  fprintf(stderr, "[Tokenizer] failed to encode prompt\n");
829  free(ids);
830  return -1;
831  }
832  if (n > ctx - max_tokens) {
833  n = ctx - max_tokens;
834  if (opt->verbose) {
835  printf("[DEBUG] Truncated prompt to %d tokens\n", n);
836  }
837  }
838 
839  g_prefill_time_ms = 0.0;
840  g_decode_time_ms = 0.0;
841  g_decode_count = 0;
842  g_prompt_tokens = n;
843 
844  if (api->kv_reset) api->kv_reset();
845 
846  if (api->embed(ids, n) != 0) {
847  fprintf(stderr, "[Model] embed failed\n");
848  free(ids);
849  return -1;
850  }
851 
852  struct timespec t0, t1;
853  clock_gettime(CLOCK_MONOTONIC, &t0);
854  if (api->forward(NULL) != 0) {
855  fprintf(stderr, "[Model] forward failed\n");
856  free(ids);
857  return -1;
858  }
859  clock_gettime(CLOCK_MONOTONIC, &t1);
860  g_prefill_time_ms = (t1.tv_sec - t0.tv_sec) * 1000.0 +
861  (t1.tv_nsec - t0.tv_nsec) / 1000000.0;
862 
863  /* Get vocab size for sampling */
864  int vocab_size = api->get_vocab_size ? api->get_vocab_size() : 0;
865 
866  /* Helper: sample next token from logits */
867  #define SAMPLE_NEXT_TOKEN() do { \
868  if (api->get_logits && vocab_size > 0) { \
869  float *logits = api->get_logits(); \
870  if (logits) { \
871  int stride = api->get_logits_stride ? api->get_logits_stride() : vocab_size; \
872  int active = api->get_active_tokens ? api->get_active_tokens() : 1; \
873  float *last_logits = logits; \
874  if (stride > 0) { \
875  if (active < 1) active = 1; \
876  last_logits = logits + (size_t)(active - 1) * (size_t)stride; \
877  } \
878  float *logits_copy = (float *)malloc(vocab_size * sizeof(float)); \
879  memcpy(logits_copy, last_logits, vocab_size * sizeof(float)); \
880  next_token = sample_top_p(logits_copy, vocab_size, opt->temperature, opt->top_p); \
881  free(logits_copy); \
882  } else if (api->sample) { \
883  next_token = api->sample(); \
884  } else { \
885  next_token = -1; \
886  } \
887  } else if (api->sample) { \
888  next_token = api->sample(); \
889  } else { \
890  next_token = -1; \
891  } \
892  } while(0)
893 
894  /* Sample first token */
895  int next_token;
897 
898  char out_buf[CK_CLI_OUTPUT_BUF_SIZE];
899  size_t out_len = 0;
900 
901  /* Initialize EOS pattern detection for this prompt */
902  eos_pattern_init(opt->chat_template);
903 
905 
906  for (int generated = 0; generated < max_tokens && !g_exit_requested && g_generation_active; generated++) {
907  if (next_token < 0) break;
908 
909  if (opt->verbose) {
910  const char *tok_str = ck_true_bpe_id_to_token(tokenizer, next_token);
911  fprintf(stderr, "[DEBUG] Token %d: %d (%s)\n", generated, next_token, tok_str ? tok_str : "NULL");
912  }
913 
914  if (is_eos_token(opt, next_token)) {
915  if (opt->verbose) {
916  fprintf(stderr, "[DEBUG] EOS detected (token ID), stopping\n");
917  }
918  break;
919  }
920 
921  const char *word = ck_true_bpe_id_to_token(tokenizer, next_token);
922 
923  /* Process token through EOS pattern detection (buffers potential EOS tokens) */
924  if (!opt->ignore_eos &&
925  eos_pattern_process(word, out_buf, &out_len, output_token, opt->chat_template)) {
926  if (opt->verbose) {
927  fprintf(stderr, "[DEBUG] EOS detected (text pattern), stopping\n");
928  }
929  break;
930  }
931 
932  if (opt->stream) {
933  output_flush(out_buf, &out_len);
934  fflush(stdout);
935  } else if (out_len > (CK_CLI_OUTPUT_BUF_SIZE / 2)) {
936  output_flush(out_buf, &out_len);
937  fflush(stdout);
938  }
939 
940  if (generated + 1 >= max_tokens) break;
941 
942  clock_gettime(CLOCK_MONOTONIC, &t0);
943  if (api->decode(next_token, NULL) != 0) {
944  fprintf(stderr, "\n[Model] decode failed\n");
945  break;
946  }
947  clock_gettime(CLOCK_MONOTONIC, &t1);
948  g_decode_time_ms += (t1.tv_sec - t0.tv_sec) * 1000.0 +
949  (t1.tv_nsec - t0.tv_nsec) / 1000000.0;
950  g_decode_count++;
951 
952  /* Sample next token */
954  }
955 
956  #undef SAMPLE_NEXT_TOKEN
958  output_flush(out_buf, &out_len);
959  printf("\n");
960 
961  if (opt->timing) {
962  double total_ms = g_prefill_time_ms + g_decode_time_ms;
963  double prefill_rate = g_prompt_tokens / (g_prefill_time_ms / 1000.0);
964  double decode_rate = g_decode_count > 0 ? g_decode_count / (g_decode_time_ms / 1000.0) : 0.0;
965  double avg_decode = g_decode_count > 0 ? g_decode_time_ms / g_decode_count : 0.0;
966 
967  printf("\033[90m"); /* Gray text */
968  printf("prompt: %3d tok / %7.1f ms (%5.1f tok/s) | ", g_prompt_tokens, g_prefill_time_ms, prefill_rate);
969  printf("decode: %3d tok / %7.1f ms (%5.1f tok/s, %5.1f ms/tok)\033[0m\n",
970  g_decode_count, g_decode_time_ms, decode_rate, avg_decode);
971  }
972  fflush(stdout);
973 
974  free(ids);
975  return 0;
976 }
977 
978 /* ============================================================================
979  * Help & Argument Parsing
980  * ============================================================================ */
981 
982 static void print_banner(void) {
983  printf("\n");
984  printf(" \033[1;36mC-Kernel-Engine v%s\033[0m\n", CK_CLI_VERSION);
985  printf(" Native inference CLI with true-BPE tokenization\n");
986  printf("\n");
987 }
988 
989 static void print_help(const char *prog) {
990  print_banner();
991  fprintf(stderr, "Usage:\n");
992  fprintf(stderr, " %s --model <name> Auto-discover model from cache\n", prog);
993  fprintf(stderr, " %s <libmodel.so> <weights.bump> Direct paths\n", prog);
994  fprintf(stderr, " %s --lib <.so> --weights <.bump> Named arguments\n", prog);
995  fprintf(stderr, "\nOptions:\n");
996  fprintf(stderr, " --model, -m NAME Model name (searches in cache)\n");
997  fprintf(stderr, " --lib PATH Path to compiled model .so\n");
998  fprintf(stderr, " --weights PATH Path to weights .bump file\n");
999  fprintf(stderr, " --prompt, -p TEXT Run single prompt (non-interactive)\n");
1000  fprintf(stderr, " --system, -S TEXT System prompt\n");
1001  fprintf(stderr, " --max-tokens, -n N Max tokens to generate (default: %d)\n", CK_CLI_DEFAULT_MAX_TOKENS);
1002  fprintf(stderr, " --context, -c N Override context/KV cache size\n");
1003  fprintf(stderr, " --temperature, -T F Sampling temperature (default: 0.0 = greedy)\n");
1004  fprintf(stderr, " --top-p F Nucleus sampling top-p (default: 0.9)\n");
1005  fprintf(stderr, " --stream, -s Stream tokens as generated\n");
1006  fprintf(stderr, " --timing, -t Show timing breakdown\n");
1007  fprintf(stderr, " --no-chat-template Disable chat template formatting\n");
1008  fprintf(stderr, " --eos IDS Comma-separated EOS token IDs\n");
1009  fprintf(stderr, " --ignore-eos Do not stop on EOS tokens\n");
1010  fprintf(stderr, " --list List available models\n");
1011  fprintf(stderr, " --verbose, -v Verbose output\n");
1012  fprintf(stderr, " --help, -h Show this help\n");
1013  fprintf(stderr, "\nREPL Commands:\n");
1014  fprintf(stderr, " /exit, /quit Exit the REPL\n");
1015  fprintf(stderr, " /reset Reset KV cache\n");
1016  fprintf(stderr, " /timing Toggle timing display\n");
1017  fprintf(stderr, " /temp <value> Set temperature\n");
1018  fprintf(stderr, " /system <text> Set system prompt\n");
1019  fprintf(stderr, " /help Show help\n");
1020 }
1021 
1022 static bool parse_args(int argc, char **argv, CLIOptions *opt) {
1023  if (!opt) return false;
1024  memset(opt, 0, sizeof(*opt));
1025  opt->max_tokens = CK_CLI_DEFAULT_MAX_TOKENS;
1026  opt->temperature = 0.0f; /* Greedy by default */
1027  opt->top_p = 0.9f;
1028  opt->stream = true; /* Stream by default */
1029  opt->timing = true; /* Show timing by default */
1030  /* Default EOS tokens for Qwen/ChatML */
1031  opt->eos_ids[0] = 151643; /* <|im_end|> */
1032  opt->eos_ids[1] = 151645; /* <|endoftext|> */
1033  opt->eos_ids[2] = 151644; /* <|im_sep|> */
1034  opt->eos_count = 3;
1035 
1036  for (int i = 1; i < argc; i++) {
1037  const char *arg = argv[i];
1038 
1039  if (!strcmp(arg, "--help") || !strcmp(arg, "-h")) {
1040  print_help(argv[0]);
1041  return false;
1042  } else if (!strcmp(arg, "--list")) {
1044  return false;
1045  } else if ((!strcmp(arg, "--model") || !strcmp(arg, "-m")) && i + 1 < argc) {
1046  opt->model_name = argv[++i];
1047  } else if (!strcmp(arg, "--lib") && i + 1 < argc) {
1048  opt->lib_path = argv[++i];
1049  } else if (!strcmp(arg, "--weights") && i + 1 < argc) {
1050  opt->weights_path = argv[++i];
1051  } else if ((!strcmp(arg, "--prompt") || !strcmp(arg, "-p")) && i + 1 < argc) {
1052  opt->prompt_once = argv[++i];
1053  } else if ((!strcmp(arg, "--system") || !strcmp(arg, "-S")) && i + 1 < argc) {
1054  opt->system_prompt = argv[++i];
1055  } else if ((!strcmp(arg, "--max-tokens") || !strcmp(arg, "-n")) && i + 1 < argc) {
1056  opt->max_tokens = atoi(argv[++i]);
1057  } else if ((!strcmp(arg, "--context") || !strcmp(arg, "-c")) && i + 1 < argc) {
1058  opt->context_override = atoi(argv[++i]);
1059  } else if ((!strcmp(arg, "--temperature") || !strcmp(arg, "-T")) && i + 1 < argc) {
1060  opt->temperature = (float)atof(argv[++i]);
1061  } else if (!strcmp(arg, "--top-p") && i + 1 < argc) {
1062  opt->top_p = (float)atof(argv[++i]);
1063  } else if (!strcmp(arg, "--stream") || !strcmp(arg, "-s")) {
1064  opt->stream = true;
1065  } else if (!strcmp(arg, "--no-stream")) {
1066  opt->stream = false;
1067  } else if (!strcmp(arg, "--timing") || !strcmp(arg, "-t")) {
1068  opt->timing = true;
1069  } else if (!strcmp(arg, "--no-timing")) {
1070  opt->timing = false;
1071  } else if (!strcmp(arg, "--no-chat-template")) {
1072  opt->no_chat_template = true;
1073  } else if (!strcmp(arg, "--eos") && i + 1 < argc) {
1074  parse_eos_ids(argv[++i], opt);
1075  } else if (!strcmp(arg, "--ignore-eos")) {
1076  opt->ignore_eos = true;
1077  } else if (!strcmp(arg, "--verbose") || !strcmp(arg, "-v")) {
1078  opt->verbose = true;
1079  } else if (arg[0] != '-') {
1080  if (!opt->lib_path) opt->lib_path = arg;
1081  else if (!opt->weights_path) opt->weights_path = arg;
1082  else {
1083  fprintf(stderr, "Unknown argument: %s\n", arg);
1084  return false;
1085  }
1086  } else {
1087  fprintf(stderr, "Unknown option: %s\n", arg);
1088  return false;
1089  }
1090  }
1091 
1092  /* Auto-discover model if --model specified */
1093  if (opt->model_name && (!opt->lib_path || !opt->weights_path)) {
1094  static char lib_buf[4096], weights_buf[4096];
1095  if (find_model_in_cache(opt->model_name, lib_buf, weights_buf, sizeof(lib_buf))) {
1096  opt->lib_path = lib_buf;
1097  opt->weights_path = weights_buf;
1098  } else {
1099  fprintf(stderr, "Error: model '%s' not found in cache\n", opt->model_name);
1100  fprintf(stderr, "Run with --list to see available models\n");
1101  return false;
1102  }
1103  }
1104 
1105  if (!opt->lib_path || !opt->weights_path) {
1106  print_help(argv[0]);
1107  return false;
1108  }
1109 
1110  /* Auto-detect chat template from model name/path */
1111  const char *name_for_template = opt->model_name ? opt->model_name : opt->lib_path;
1112  opt->chat_template = detect_chat_template(name_for_template);
1113 
1114  /* Load EOS tokens from vocab.json if available */
1115  if (load_eos_from_vocab_json(opt->weights_path, opt)) {
1116  if (opt->verbose) {
1117  printf("[DEBUG] Loaded %d EOS tokens: ", opt->eos_count);
1118  for (int i = 0; i < opt->eos_count; i++) {
1119  printf("%d ", opt->eos_ids[i]);
1120  }
1121  printf("\n");
1122  }
1123  }
1124 
1125  return true;
1126 }
1127 
1128 /* ============================================================================
1129  * REPL Command Processing
1130  * ============================================================================ */
1131 
1132 static bool process_repl_command(const char *line, CLIOptions *opt, ModelAPI *api) {
1133  if (!line || line[0] != '/') return false;
1134 
1135  if (!strncmp(line, "/exit", 5) || !strncmp(line, "/quit", 5)) {
1136  g_exit_requested = 1;
1137  return true;
1138  }
1139  if (!strncmp(line, "/help", 5)) {
1140  printf("REPL Commands:\n");
1141  printf(" /exit, /quit Exit\n");
1142  printf(" /reset Reset KV cache\n");
1143  printf(" /timing Toggle timing display\n");
1144  printf(" /temp <value> Set temperature (0 = greedy)\n");
1145  printf(" /top-p <value> Set top-p\n");
1146  printf(" /system <text> Set system prompt\n");
1147  printf(" /clear Clear system prompt\n");
1148  printf(" /verbose Toggle verbose mode\n");
1149  return true;
1150  }
1151  if (!strncmp(line, "/reset", 6)) {
1152  if (api->kv_reset) {
1153  api->kv_reset();
1154  printf("[KV cache reset]\n");
1155  }
1156  return true;
1157  }
1158  if (!strncmp(line, "/timing", 7)) {
1159  opt->timing = !opt->timing;
1160  printf("[Timing %s]\n", opt->timing ? "enabled" : "disabled");
1161  return true;
1162  }
1163  if (!strncmp(line, "/verbose", 8)) {
1164  opt->verbose = !opt->verbose;
1165  printf("[Verbose %s]\n", opt->verbose ? "enabled" : "disabled");
1166  return true;
1167  }
1168  if (!strncmp(line, "/temp ", 6)) {
1169  opt->temperature = (float)atof(line + 6);
1170  printf("[Temperature set to %.2f]\n", opt->temperature);
1171  return true;
1172  }
1173  if (!strncmp(line, "/top-p ", 7)) {
1174  opt->top_p = (float)atof(line + 7);
1175  printf("[Top-p set to %.2f]\n", opt->top_p);
1176  return true;
1177  }
1178  if (!strncmp(line, "/system ", 8)) {
1179  opt->system_prompt = strdup(line + 8);
1180  printf("[System prompt set]\n");
1181  return true;
1182  }
1183  if (!strncmp(line, "/clear", 6)) {
1184  opt->system_prompt = NULL;
1185  printf("[System prompt cleared]\n");
1186  return true;
1187  }
1188 
1189  printf("Unknown command: %s\n", line);
1190  return true;
1191 }
1192 
1193 /* ============================================================================
1194  * Main
1195  * ============================================================================ */
1196 
1197 int main(int argc, char **argv) {
1198  signal(SIGINT, handle_sigint);
1199  srand((unsigned int)time(NULL));
1200 
1201  CLIOptions opt;
1202  if (!parse_args(argc, argv, &opt)) {
1203  return 1;
1204  }
1205 
1206  print_banner();
1207  printf("Loading: %s\n", opt.lib_path);
1208 
1209  ModelAPI api;
1210  if (!load_model_api(opt.lib_path, &api)) {
1211  return 1;
1212  }
1213 
1214  printf("Initializing model...\n");
1215  if (api.init(opt.weights_path) != 0) {
1216  fprintf(stderr, "Error: ck_model_init failed\n");
1217  return 1;
1218  }
1219 
1220  int ctx = opt.context_override;
1221  if (ctx <= 0 && api.get_context) ctx = api.get_context();
1222  if (api.kv_enable && ctx > 0) {
1223  api.kv_enable(ctx);
1224  }
1225 
1226  CKTrueBPE *tokenizer = ck_true_bpe_create();
1227  if (!tokenizer) {
1228  fprintf(stderr, "[Tokenizer] failed to create\n");
1229  return 1;
1230  }
1231 
1232  int vocab_size = api.get_vocab_size ? api.get_vocab_size() : 0;
1233  int vocab_bytes = api.get_vocab_bytes ? api.get_vocab_bytes() : 0;
1234  int num_merges = api.get_num_merges ? api.get_num_merges() : 0;
1235  const int32_t *offsets = (const int32_t *)api.get_offsets();
1236  const char *strings = (const char *)api.get_strings();
1237  const int32_t *merges = api.get_merges ? (const int32_t *)api.get_merges() : NULL;
1238 
1239  if (vocab_size <= 0 || vocab_bytes <= 0 || !offsets || !strings) {
1240  fprintf(stderr, "[Tokenizer] missing vocab data in model\n");
1241  ck_true_bpe_free(tokenizer);
1242  return 1;
1243  }
1244 
1246  fprintf(stderr, "[Tokenizer] failed to load vocab\n");
1247  ck_true_bpe_free(tokenizer);
1248  return 1;
1249  }
1250 
1251  /* Register special tokens for pre-BPE matching.
1252  * This is done in the CLI (orchestrator), NOT the generated model code.
1253  * The generated model code stays "dumb" - just inference.
1254  * Model-specific token handling is the CLI's responsibility.
1255  */
1256  {
1257  /* Common special tokens across model families */
1258  static const char *special_tokens[] = {
1259  /* Qwen/ChatML */
1260  "<|im_start|>", "<|im_end|>", "<|endoftext|>",
1261  /* Llama 3 */
1262  "<|eot_id|>", "<|begin_of_text|>", "<|end_of_text|>",
1263  "<|start_header_id|>", "<|end_header_id|>",
1264  /* Generic */
1265  "</s>", "<s>", "<pad>", "<unk>",
1266  NULL
1267  };
1268  int registered = 0;
1269  for (int i = 0; special_tokens[i] != NULL; i++) {
1270  int32_t id = ck_true_bpe_lookup(tokenizer, special_tokens[i]);
1271  /* Verify it's actually this token (not unk) via round-trip */
1272  const char *check = ck_true_bpe_id_to_token(tokenizer, id);
1273  if (check && strcmp(check, special_tokens[i]) == 0) {
1274  ck_true_bpe_add_special_token(tokenizer, special_tokens[i], id);
1275  registered++;
1276  if (opt.verbose) {
1277  printf("[Tokenizer] Registered special: %s -> %d\n", special_tokens[i], id);
1278  }
1279  }
1280  }
1281  if (opt.verbose) {
1282  printf("[Tokenizer] Registered %d special tokens for pre-BPE matching\n", registered);
1283  }
1284  }
1285 
1286  printf("Ready! Vocab: %d, Context: %d, Template: %s\n",
1287  vocab_size, ctx,
1288  opt.no_chat_template ? "none" :
1289  opt.chat_template == CHAT_TEMPLATE_QWEN ? "qwen" :
1290  opt.chat_template == CHAT_TEMPLATE_LLAMA ? "llama" :
1291  opt.chat_template == CHAT_TEMPLATE_MISTRAL ? "mistral" : "chatml");
1292 
1293  /* Print CPU capability info */
1295  printf("[Hardware] %s | Vector: %d-bit | FMA: %s | AI Accel: %s | Kernel: %s\n",
1296  cap.name, cap.width, cap.has_fma ? "Yes" : "No",
1297  cap.has_ai_accel ? "Yes" : "No", cap.best_kernel);
1298 
1299  printf("Type /help for commands, Ctrl+C to stop generation\n\n");
1300 
1301  setvbuf(stdout, NULL, _IOFBF, 1 << 20);
1302 
1303  if (opt.prompt_once) {
1304  run_prompt(&api, tokenizer, &opt, opt.prompt_once);
1305  } else {
1306  /* REPL */
1307 #ifdef HAVE_READLINE
1308  char *home = getenv("HOME");
1309  char history_path[4096];
1310  if (home) {
1311  snprintf(history_path, sizeof(history_path), "%s/%s", home, CK_CLI_HISTORY_FILE);
1312  read_history(history_path);
1313  }
1314 #endif
1315 
1316  while (!g_exit_requested) {
1317 #ifdef HAVE_READLINE
1318  char *line = readline("\033[1;32mYou:\033[0m ");
1319  if (!line) break;
1320  if (*line) add_history(line);
1321 #else
1322  printf("\033[1;32mYou:\033[0m ");
1323  fflush(stdout);
1324  char line_buf[4096];
1325  if (!fgets(line_buf, sizeof(line_buf), stdin)) {
1326  if (feof(stdin) || g_exit_requested) break;
1327  if (errno == EINTR) break;
1328  continue;
1329  }
1330  /* Remove trailing newline */
1331  size_t len = strlen(line_buf);
1332  if (len > 0 && line_buf[len-1] == '\n') line_buf[len-1] = '\0';
1333  char *line = line_buf;
1334 #endif
1335 
1336  if (line[0] == '\0') {
1337 #ifdef HAVE_READLINE
1338  free(line);
1339 #endif
1340  continue;
1341  }
1342 
1343  if (line[0] == '/') {
1344  process_repl_command(line, &opt, &api);
1345 #ifdef HAVE_READLINE
1346  free(line);
1347 #endif
1348  continue;
1349  }
1350 
1351  printf("\033[1;34mAssistant:\033[0m ");
1352  fflush(stdout);
1353  run_prompt(&api, tokenizer, &opt, line);
1354 
1355 #ifdef HAVE_READLINE
1356  free(line);
1357 #endif
1358  }
1359 
1360 #ifdef HAVE_READLINE
1361  if (home) {
1362  write_history(history_path);
1363  }
1364 #endif
1365  }
1366 
1367  ck_true_bpe_free(tokenizer);
1368  if (api.free_fn) api.free_fn();
1369  if (api.handle) dlclose(api.handle);
1370 
1371  printf("\nGoodbye!\n");
1372  return 0;
1373 }
ChatTemplateType
Definition: ck_cli_v6.5.c:106
void(* kv_reset_t)(void)
Definition: ck_cli_v6.6.c:73
void *(* get_ptr_t)(void)
Definition: ck_cli_v6.6.c:78
static bool parse_eos_ids(const char *arg, CLIOptions *opt)
Definition: ck_cli_v6.6.c:775
int(* init_t)(const char *weights_path)
Definition: ck_cli_v6.6.c:69
static bool resolve_symbol(void *handle, const char *name, void **out_ptr, bool required)
Definition: ck_cli_v6.6.c:526
static double g_decode_time_ms
Definition: ck_cli_v6.6.c:52
static void handle_sigint(int sig)
Definition: ck_cli_v6.6.c:56
int(* embed_t)(const int32_t *tokens, int num_tokens)
Definition: ck_cli_v6.6.c:70
static char * apply_chat_template(const ChatTemplate *tmpl, const char *system, const char *user)
Definition: ck_cli_v6.6.c:590
int(* kv_enable_t)(int capacity)
Definition: ck_cli_v6.6.c:72
static int sample_top_p(float *logits, int vocab_size, float temperature, float top_p)
Definition: ck_cli_v6.6.c:335
static bool load_eos_from_vocab_json(const char *weights_path, CLIOptions *opt)
Definition: ck_cli_v6.6.c:242
ChatTemplateType
Definition: ck_cli_v6.6.c:107
@ CHAT_TEMPLATE_LLAMA
Definition: ck_cli_v6.6.c:110
@ CHAT_TEMPLATE_MISTRAL
Definition: ck_cli_v6.6.c:112
@ CHAT_TEMPLATE_QWEN
Definition: ck_cli_v6.6.c:109
@ CHAT_TEMPLATE_CHATML
Definition: ck_cli_v6.6.c:111
@ CHAT_TEMPLATE_NONE
Definition: ck_cli_v6.6.c:108
static double g_prefill_time_ms
Definition: ck_cli_v6.6.c:51
static bool find_model_in_cache(const char *model_name, char *lib_out, char *weights_out, size_t out_size)
Definition: ck_cli_v6.6.c:206
int main(int argc, char **argv)
Definition: ck_cli_v6.6.c:1197
static void print_help(const char *prog)
Definition: ck_cli_v6.6.c:989
#define CK_CLI_EOS_MAX
Definition: ck_cli_v6.6.c:42
static ChatTemplateType detect_chat_template(const char *model_name)
Definition: ck_cli_v6.6.c:575
void(* free_t)(void)
Definition: ck_cli_v6.6.c:79
#define CK_CLI_HISTORY_FILE
Definition: ck_cli_v6.6.c:45
static int g_decode_count
Definition: ck_cli_v6.6.c:53
static bool process_repl_command(const char *line, CLIOptions *opt, ModelAPI *api)
Definition: ck_cli_v6.6.c:1132
static bool is_eos_token(const CLIOptions *opt, int token)
Definition: ck_cli_v6.6.c:620
static bool eos_is_potential_prefix(const char *token)
Definition: ck_cli_v6.6.c:684
int(* forward_t)(float *logits_out)
Definition: ck_cli_v6.6.c:71
#define EOS_PENDING_MAX
Definition: ck_cli_v6.6.c:638
float *(* get_logits_t)(void)
Definition: ck_cli_v6.6.c:76
static void eos_pattern_init(ChatTemplateType tmpl)
Definition: ck_cli_v6.6.c:663
static void output_append(char *buf, size_t *len, const char *text)
Definition: ck_cli_v6.6.c:497
static void list_available_models(void)
Definition: ck_cli_v6.6.c:298
static volatile sig_atomic_t g_generation_active
Definition: ck_cli_v6.6.c:48
#define CK_CLI_MAX_CONTEXT
Definition: ck_cli_v6.6.c:44
static int decode_bpe_token(const char *token, char *out, int max)
Definition: ck_cli_v6.6.c:428
int(* get_int_t)(void)
Definition: ck_cli_v6.6.c:77
int(* decode_t)(int32_t token, float *logits_out)
Definition: ck_cli_v6.6.c:74
#define EOS_PATTERN_BUF_SIZE
Definition: ck_cli_v6.6.c:637
static void print_banner(void)
Definition: ck_cli_v6.6.c:982
static bool parse_args(int argc, char **argv, CLIOptions *opt)
Definition: ck_cli_v6.6.c:1022
static int run_prompt(ModelAPI *api, CKTrueBPE *tokenizer, CLIOptions *opt, const char *input)
Definition: ck_cli_v6.6.c:794
static void eos_pattern_reset(void)
Definition: ck_cli_v6.6.c:651
static volatile sig_atomic_t g_exit_requested
Definition: ck_cli_v6.6.c:47
static EOSPatternState g_eos_state
Definition: ck_cli_v6.6.c:649
#define CK_CLI_OUTPUT_BUF_SIZE
Definition: ck_cli_v6.6.c:43
#define CK_CLI_DEFAULT_MAX_TOKENS
Definition: ck_cli_v6.6.c:41
static void output_flush(char *buf, size_t *len)
Definition: ck_cli_v6.6.c:491
static const ChatTemplate g_templates[]
Definition: ck_cli_v6.6.c:125
static const char * get_cache_dir(void)
Definition: ck_cli_v6.6.c:198
static bool load_model_api(const char *lib_path, ModelAPI *api)
Definition: ck_cli_v6.6.c:536
#define SAMPLE_NEXT_TOKEN()
static void output_token(char *buf, size_t *len, const char *token)
Definition: ck_cli_v6.6.c:511
static bool eos_pattern_process(const char *token_text, char *out_buf, size_t *out_len, void(*output_fn)(char *, size_t *, const char *), ChatTemplateType tmpl)
Definition: ck_cli_v6.6.c:727
#define CK_CLI_VERSION
Definition: ck_cli_v6.6.c:40
int(* sample_argmax_t)(void)
Definition: ck_cli_v6.6.c:75
static int g_prompt_tokens
Definition: ck_cli_v6.6.c:54
CPU feature detection and dispatch macros.
static ck_capability_t ck_get_capabilities(void)
Get current platform capabilities.
Definition: ck_features.h:226
CPU capability information structure.
Definition: ck_features.h:215
const char * best_kernel
Definition: ck_features.h:220
const char * name
Definition: ck_features.h:216
const int32_t * ids
Definition: tokenizer.h:443
const char * text
Definition: tokenizer.h:563
const char * token
Definition: tokenizer.h:306
int32_t int32_t int32_t eos
Definition: tokenizer.h:231
int32_t int32_t bos
Definition: tokenizer.h:230
const int32_t int int * out_len
Definition: tokenizer.h:445
int ck_true_bpe_encode(CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: true_bpe.c:1338
void ck_true_bpe_free(CKTrueBPE *bpe)
Definition: true_bpe.c:405
CKTrueBPE * ck_true_bpe_create(void)
Definition: true_bpe.c:342
int ck_true_bpe_add_special_token(CKTrueBPE *bpe, const char *token, int32_t id)
Definition: true_bpe.c:565
int ck_true_bpe_load_binary(CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
Definition: true_bpe.c:606
int32_t ck_true_bpe_lookup(const CKTrueBPE *bpe, const char *token)
Definition: true_bpe.c:638
const char * ck_true_bpe_id_to_token(const CKTrueBPE *bpe, int32_t id)
Definition: true_bpe.c:645
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int vocab_size
Definition: true_bpe.h:185
int const int32_t * offsets
Definition: true_bpe.h:186
uint32_t end
Definition: utf8.c:215