← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ck_cli_v6.5.c
Go to the documentation of this file.
1 /*
2  * C-Kernel-Engine v6.5 Native CLI
3  *
4  * Features:
5  * - Model auto-discovery from cache
6  * - Readline support for history/editing
7  * - Chat template support (Qwen, LLaMA, etc.)
8  * - Temperature/top-p sampling
9  * - Streaming output
10  *
11  * Usage:
12  * ck-cli-v6.5 --model <name> # Auto-discover from cache
13  * ck-cli-v6.5 <libmodel.so> <weights.bump> # Direct paths
14  * ck-cli-v6.5 --lib <.so> --weights <.bump> # Named args
15  */
16 
17 #define _GNU_SOURCE
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <stdint.h>
21 #include <string.h>
22 #include <stdbool.h>
23 #include <errno.h>
24 #include <signal.h>
25 #include <dlfcn.h>
26 #include <unistd.h>
27 #include <time.h>
28 #include <math.h>
29 #include <dirent.h>
30 #include <sys/stat.h>
31 
32 #ifdef HAVE_READLINE
33 #include <readline/readline.h>
34 #include <readline/history.h>
35 #endif
36 
37 #include "tokenizer/true_bpe.h"
38 #include "ck_features.h"
39 
40 #define CK_CLI_VERSION "6.5.0"
41 #define CK_CLI_DEFAULT_MAX_TOKENS 256
42 #define CK_CLI_EOS_MAX 8
43 #define CK_CLI_OUTPUT_BUF_SIZE 4096
44 #define CK_CLI_MAX_CONTEXT 32768
45 #define CK_CLI_HISTORY_FILE ".ck_cli_history"
46 
47 static volatile sig_atomic_t g_exit_requested = 0;
48 static volatile sig_atomic_t g_generation_active = 0;
49 
50 /* Timing globals */
51 static double g_prefill_time_ms = 0.0;
52 static double g_decode_time_ms = 0.0;
53 static int g_decode_count = 0;
54 static int g_prompt_tokens = 0;
55 
56 static void handle_sigint(int sig) {
57  (void)sig;
58  if (g_generation_active) {
59  g_generation_active = 0; /* Stop generation but don't exit */
60  } else {
61  g_exit_requested = 1;
62  }
63 }
64 
65 /* ============================================================================
66  * Model API Types
67  * ============================================================================ */
68 
/* Function-pointer signatures exported by a compiled model .so. */
typedef int (*init_t)(const char *weights_path);   /* load weights from a .bump file */
typedef int (*embed_t)(const int32_t *tokens, int num_tokens); /* load prompt tokens */
typedef int (*forward_t)(float *logits_out);       /* prefill forward pass */
typedef int (*kv_enable_t)(int capacity);          /* enable KV cache (optional) */
typedef void (*kv_reset_t)(void);                  /* clear KV cache between prompts */
typedef int (*decode_t)(int32_t token, float *logits_out); /* single-token decode step */
typedef int (*sample_argmax_t)(void);              /* greedy sample from current logits */
typedef float *(*get_logits_t)(void);              /* raw logits buffer accessor */
typedef int (*get_int_t)(void);                    /* generic integer getter */
typedef void *(*get_ptr_t)(void);                  /* generic pointer getter */
typedef void (*free_t)(void);                      /* release model resources */

/*
 * Resolved entry points of a dynamically loaded model library.
 * init/embed/forward/decode/sample plus the four vocab accessors are
 * required by load_model_api; the remaining pointers may be NULL.
 */
typedef struct {
    void *handle;                /* dlopen() handle owning all symbols below */
    init_t init;
    embed_t embed;
    forward_t forward;
    kv_enable_t kv_enable;       /* optional */
    kv_reset_t kv_reset;         /* optional */
    decode_t decode;
    sample_argmax_t sample;
    get_logits_t get_logits;     /* optional; enables temperature sampling */
    get_int_t get_context;       /* context window size (optional) */
    get_int_t get_vocab_size;
    get_int_t get_num_merges;
    get_int_t get_vocab_bytes;   /* total bytes in vocab string table */
    get_int_t get_active_tokens; /* tokens currently active (used to index logits) */
    get_ptr_t get_offsets;       /* vocab string offsets table */
    get_ptr_t get_strings;       /* vocab string data */
    get_ptr_t get_merges;        /* BPE merge table */
    free_t free_fn;              /* optional destructor */
} ModelAPI;
101 
102 /* ============================================================================
103  * Chat Template Types
104  * ============================================================================ */
105 
106 typedef enum {
113 
/*
 * Prefix/suffix strings wrapped around each chat role when building the
 * prompt (consumed by apply_chat_template). All fields point at static
 * string literals in g_templates.
 */
typedef struct {
    ChatTemplateType type;
    const char *system_prefix;    /* emitted before the system prompt */
    const char *system_suffix;    /* emitted after the system prompt */
    const char *user_prefix;      /* emitted before the user message */
    const char *user_suffix;      /* emitted after the user message */
    const char *assistant_prefix; /* emitted last, to cue the assistant reply */
    const char *assistant_suffix; /* closes the assistant turn (end marker) */
} ChatTemplate;
123 
124 static const ChatTemplate g_templates[] = {
125  [CHAT_TEMPLATE_NONE] = {
126  .type = CHAT_TEMPLATE_NONE,
127  .system_prefix = "", .system_suffix = "\n",
128  .user_prefix = "", .user_suffix = "\n",
129  .assistant_prefix = "", .assistant_suffix = "",
130  },
131  [CHAT_TEMPLATE_QWEN] = {
132  .type = CHAT_TEMPLATE_QWEN,
133  .system_prefix = "<|im_start|>system\n",
134  .system_suffix = "<|im_end|>\n",
135  .user_prefix = "<|im_start|>user\n",
136  .user_suffix = "<|im_end|>\n",
137  .assistant_prefix = "<|im_start|>assistant\n",
138  .assistant_suffix = "<|im_end|>",
139  },
140  [CHAT_TEMPLATE_LLAMA] = {
141  .type = CHAT_TEMPLATE_LLAMA,
142  .system_prefix = "[INST] <<SYS>>\n",
143  .system_suffix = "\n<</SYS>>\n\n",
144  .user_prefix = "",
145  .user_suffix = " [/INST]",
146  .assistant_prefix = " ",
147  .assistant_suffix = " </s><s>[INST] ",
148  },
150  .type = CHAT_TEMPLATE_CHATML,
151  .system_prefix = "<|im_start|>system\n",
152  .system_suffix = "<|im_end|>\n",
153  .user_prefix = "<|im_start|>user\n",
154  .user_suffix = "<|im_end|>\n",
155  .assistant_prefix = "<|im_start|>assistant\n",
156  .assistant_suffix = "<|im_end|>",
157  },
159  .type = CHAT_TEMPLATE_MISTRAL,
160  .system_prefix = "",
161  .system_suffix = "\n\n",
162  .user_prefix = "[INST] ",
163  .user_suffix = " [/INST]",
164  .assistant_prefix = "",
165  .assistant_suffix = "</s> ",
166  },
167 };
168 
169 /* ============================================================================
170  * CLI Options
171  * ============================================================================ */
172 
/* Parsed command-line configuration. */
typedef struct {
    const char *model_name;    /* Model name for auto-discovery in the cache */
    const char *lib_path;      /* Explicit path to compiled model .so */
    const char *weights_path;  /* Explicit path to weights .bump file */
    const char *prompt_once;   /* Single prompt: run once, non-interactive */
    const char *system_prompt; /* Optional system prompt for the chat template */
    int max_tokens;            /* Generation cap; <=0 falls back to default */
    int context_override;      /* Context window override; <=0 uses the model's */
    float temperature;         /* <=0 selects greedy argmax sampling */
    float top_p;               /* Nucleus sampling mass (used with temperature) */
    bool ignore_eos;           /* Keep generating past EOS tokens/patterns */
    bool stream;               /* Flush output after every token */
    bool timing;               /* Print prefill/decode timing summary */
    bool verbose;              /* Debug logging to stderr/stdout */
    bool no_chat_template;     /* Send raw prompt without role markup */
    ChatTemplateType chat_template; /* Selected or auto-detected template */
    int eos_ids[CK_CLI_EOS_MAX];    /* Token IDs that terminate generation */
    int eos_count;             /* Number of valid entries in eos_ids */
} CLIOptions;
192 
193 /* ============================================================================
194  * Cache Discovery
195  * ============================================================================ */
196 
/*
 * Return the model cache directory: $HOME/.cache/ck-engine-v6.5/models,
 * falling back to /tmp when HOME is unset. The result points at a static
 * buffer that is overwritten by subsequent calls (not thread-safe).
 */
static const char *get_cache_dir(void) {
    static char cache_path[4096];
    const char *base = getenv("HOME");
    snprintf(cache_path, sizeof(cache_path),
             "%s/.cache/ck-engine-v6.5/models",
             base ? base : "/tmp");
    return cache_path;
}
204 
/*
 * Search the cache directory for a model whose directory name contains
 * model_name and that holds both the compiled kernel .so and the weights
 * file. On success writes NUL-terminated paths into lib_out/weights_out
 * (each of capacity out_size) and returns true.
 *
 * FIX: the previous strncpy(dst, src, out_size - 1) did not guarantee
 * NUL-termination when the path was longer than the caller's buffer;
 * snprintf terminates and lets us detect truncation.
 */
static bool find_model_in_cache(const char *model_name, char *lib_out, char *weights_out, size_t out_size) {
    const char *cache_dir = get_cache_dir();
    DIR *dir = opendir(cache_dir);
    if (!dir) return false;

    bool found = false;
    struct dirent *entry;
    while (!found && (entry = readdir(dir)) != NULL) {
        if (entry->d_name[0] == '.') continue;

        /* Substring match against the cache entry's directory name */
        if (strstr(entry->d_name, model_name) == NULL) continue;

        char so_path[4096], bump_path[4096];
        int n1 = snprintf(so_path, sizeof(so_path), "%s/%s/ck-kernel-inference.so",
                          cache_dir, entry->d_name);
        int n2 = snprintf(bump_path, sizeof(bump_path), "%s/%s/weights.bump",
                          cache_dir, entry->d_name);
        if (n1 < 0 || (size_t)n1 >= sizeof(so_path) ||
            n2 < 0 || (size_t)n2 >= sizeof(bump_path)) {
            continue; /* path too long for scratch buffers; skip entry */
        }

        struct stat st;
        if (stat(so_path, &st) == 0 && stat(bump_path, &st) == 0) {
            int c1 = snprintf(lib_out, out_size, "%s", so_path);
            int c2 = snprintf(weights_out, out_size, "%s", bump_path);
            /* Only report success when both paths fit untruncated */
            if (c1 >= 0 && (size_t)c1 < out_size &&
                c2 >= 0 && (size_t)c2 < out_size) {
                found = true;
            }
        }
    }
    closedir(dir);
    return found;
}
236 
237 /* ============================================================================
238  * EOS Token Loading
239  * ============================================================================ */
240 
241 static bool load_eos_from_vocab_json(const char *weights_path, CLIOptions *opt) {
242  if (!weights_path || !opt) return false;
243 
244  /* Construct vocab.json path from weights path */
245  char vocab_path[4096];
246  const char *slash = strrchr(weights_path, '/');
247  if (!slash) return false;
248 
249  size_t dir_len = (size_t)(slash - weights_path);
250  if (dir_len + 12 >= sizeof(vocab_path)) return false;
251 
252  memcpy(vocab_path, weights_path, dir_len);
253  vocab_path[dir_len] = '\0';
254  strcat(vocab_path, "/vocab.json");
255 
256  FILE *f = fopen(vocab_path, "r");
257  if (!f) return false;
258 
259  /* Simple JSON parsing for special_tokens */
260  char buf[8192];
261  size_t n = fread(buf, 1, sizeof(buf) - 1, f);
262  fclose(f);
263  buf[n] = '\0';
264 
265  /* Look for "special_tokens" section */
266  const char *st = strstr(buf, "\"special_tokens\"");
267  if (!st) return false;
268 
269  /* Extract eos token */
270  const char *eos = strstr(st, "\"eos\"");
271  if (eos) {
272  const char *colon = strchr(eos, ':');
273  if (colon) {
274  int eos_id = atoi(colon + 1);
275  if (eos_id > 0) {
276  opt->eos_ids[0] = eos_id;
277  opt->eos_count = 1;
278  }
279  }
280  }
281 
282  /* Extract bos token (often used as im_end for chat) */
283  const char *bos = strstr(st, "\"bos\"");
284  if (bos) {
285  const char *colon = strchr(bos, ':');
286  if (colon) {
287  int bos_id = atoi(colon + 1);
288  if (bos_id > 0 && bos_id != opt->eos_ids[0]) {
289  opt->eos_ids[opt->eos_count++] = bos_id;
290  }
291  }
292  }
293 
294  return opt->eos_count > 0;
295 }
296 
/*
 * Print every cache entry that contains a compiled kernel .so, or a
 * "(none found)" / "No models found" message otherwise.
 */
static void list_available_models(void) {
    const char *cache_dir = get_cache_dir();
    DIR *dir = opendir(cache_dir);
    if (!dir) {
        fprintf(stderr, "No models found in %s\n", cache_dir);
        return;
    }

    printf("Available models in %s:\n", cache_dir);
    int count = 0;
    for (struct dirent *entry = readdir(dir); entry != NULL; entry = readdir(dir)) {
        if (entry->d_name[0] == '.') continue;

        /* A valid entry must contain the compiled inference library */
        char so_path[4096];
        snprintf(so_path, sizeof(so_path), "%s/%s/ck-kernel-inference.so",
                 cache_dir, entry->d_name);

        struct stat st;
        if (stat(so_path, &st) != 0) continue;

        printf(" - %s\n", entry->d_name);
        count++;
    }
    closedir(dir);

    if (count == 0) {
        printf(" (none found)\n");
    }
}
329 
330 /* ============================================================================
331  * Sampling
332  * ============================================================================ */
333 
334 static int sample_top_p(float *logits, int vocab_size, float temperature, float top_p) {
335  if (temperature <= 0.0f || top_p <= 0.0f) {
336  /* Argmax */
337  int best = 0;
338  float best_val = logits[0];
339  for (int i = 1; i < vocab_size; i++) {
340  if (logits[i] > best_val) {
341  best_val = logits[i];
342  best = i;
343  }
344  }
345  return best;
346  }
347 
348  /* Apply temperature */
349  float max_logit = logits[0];
350  for (int i = 1; i < vocab_size; i++) {
351  if (logits[i] > max_logit) max_logit = logits[i];
352  }
353 
354  float sum = 0.0f;
355  for (int i = 0; i < vocab_size; i++) {
356  logits[i] = expf((logits[i] - max_logit) / temperature);
357  sum += logits[i];
358  }
359 
360  /* Normalize to probabilities */
361  for (int i = 0; i < vocab_size; i++) {
362  logits[i] /= sum;
363  }
364 
365  /* Sort indices by probability (simple selection for top-p) */
366  /* For efficiency, we'll do nucleus sampling with cumulative sum */
367  float cumsum = 0.0f;
368  float threshold = (float)rand() / (float)RAND_MAX * top_p;
369 
370  /* Find nucleus tokens and sample */
371  int *indices = (int *)malloc(vocab_size * sizeof(int));
372  float *probs = (float *)malloc(vocab_size * sizeof(float));
373  for (int i = 0; i < vocab_size; i++) {
374  indices[i] = i;
375  probs[i] = logits[i];
376  }
377 
378  /* Simple sort (for small vocab, bubble sort is fine; for large, use qsort) */
379  for (int i = 0; i < vocab_size - 1; i++) {
380  for (int j = i + 1; j < vocab_size; j++) {
381  if (probs[j] > probs[i]) {
382  float tmp_p = probs[i]; probs[i] = probs[j]; probs[j] = tmp_p;
383  int tmp_i = indices[i]; indices[i] = indices[j]; indices[j] = tmp_i;
384  }
385  }
386  cumsum += probs[i];
387  if (cumsum >= top_p) break;
388  }
389 
390  /* Sample from nucleus */
391  float r = (float)rand() / (float)RAND_MAX * cumsum;
392  float acc = 0.0f;
393  int result = indices[0];
394  for (int i = 0; cumsum > 0 && i < vocab_size; i++) {
395  acc += probs[i];
396  if (acc >= r) {
397  result = indices[i];
398  break;
399  }
400  if (acc >= cumsum) break;
401  }
402 
403  free(indices);
404  free(probs);
405  return result;
406 }
407 
408 /* ============================================================================
409  * Output Helpers
410  * ============================================================================ */
411 
/**
 * Decode GPT-2 byte-level BPE representation back to actual bytes.
 *
 * GPT-2's tokenizer maps certain bytes to Unicode code points:
 * - Bytes 0x00-0x20 → U+0100-U+0120 (Ā Ć ċ ... Ġ)
 * - Bytes 0x7F-0xA0 → U+017F-U+01A0
 * - Printable ASCII (0x21-0x7E) stays as-is
 *
 * This function reverses that mapping. The SentencePiece space marker
 * U+2581 (▁) is also translated to a plain space; any other non-ASCII
 * codepoint is passed through with its original UTF-8 bytes.
 *
 * FIX: the NULL-token early return previously left `out` unterminated;
 * it is now NUL-terminated on every return path where max > 0.
 *
 * @param token Input BPE token string (UTF-8)
 * @param out Output buffer for decoded bytes (NUL-terminated on return)
 * @param max Size of output buffer
 * @return Number of bytes written (not including NUL)
 */
static int decode_bpe_token(const char *token, char *out, int max) {
    if (max <= 0) return 0;
    if (!token) {
        out[0] = '\0';
        return 0;
    }

    const unsigned char *src = (const unsigned char *)token;
    int out_len = 0;

    while (*src && out_len < max - 1) {
        unsigned int codepoint;
        int bytes;

        /* Decode one UTF-8 sequence to a codepoint. The continuation-byte
         * checks are ordered so that a terminating NUL fails the test
         * before any byte beyond it is examined. */
        if ((src[0] & 0x80) == 0) {
            /* Single byte ASCII */
            codepoint = src[0];
            bytes = 1;
        } else if ((src[0] & 0xE0) == 0xC0 && (src[1] & 0xC0) == 0x80) {
            /* Two byte sequence */
            codepoint = ((src[0] & 0x1F) << 6) | (src[1] & 0x3F);
            bytes = 2;
        } else if ((src[0] & 0xF0) == 0xE0 && (src[1] & 0xC0) == 0x80 && (src[2] & 0xC0) == 0x80) {
            /* Three byte sequence */
            codepoint = ((src[0] & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F);
            bytes = 3;
        } else if ((src[0] & 0xF8) == 0xF0 && (src[1] & 0xC0) == 0x80 &&
                   (src[2] & 0xC0) == 0x80 && (src[3] & 0xC0) == 0x80) {
            /* Four byte sequence */
            codepoint = ((src[0] & 0x07) << 18) | ((src[1] & 0x3F) << 12) |
                        ((src[2] & 0x3F) << 6) | (src[3] & 0x3F);
            bytes = 4;
        } else {
            /* Invalid UTF-8: pass the byte through unchanged */
            out[out_len++] = (char)*src;
            src++;
            continue;
        }

        /* Check if this is a GPT-2 byte-encoded character */
        if (codepoint >= 0x100 && codepoint <= 0x120) {
            /* Bytes 0x00-0x20: U+0100-U+0120 → byte = codepoint - 0x100 */
            out[out_len++] = (char)(codepoint - 0x100);
        } else if (codepoint >= 0x17F && codepoint <= 0x1A0) {
            /* Bytes 0x7F-0xA0: U+017F-U+01A0 → byte = codepoint - 0x100 */
            out[out_len++] = (char)(codepoint - 0x100);
        } else if (codepoint < 0x80) {
            /* Regular ASCII - copy as-is */
            out[out_len++] = (char)codepoint;
        } else if (codepoint == 0x2581) {
            /* SentencePiece space marker ▁ (U+2581) → space */
            out[out_len++] = ' ';
        } else {
            /* Other UTF-8 characters - copy the original encoded bytes */
            for (int i = 0; i < bytes && out_len < max - 1; i++) {
                out[out_len++] = (char)src[i];
            }
        }

        src += bytes;
    }

    out[out_len] = '\0';
    return out_len;
}
489 
/* Write any buffered bytes to stdout and empty the buffer. */
static void output_flush(char *buf, size_t *len) {
    size_t pending = *len;
    if (pending > 0) {
        fwrite(buf, 1, pending, stdout);
        *len = 0;
    }
}
495 
496 static void output_append(char *buf, size_t *len, const char *text) {
497  if (!text || !*text) return;
498  size_t n = strlen(text);
499  if (*len + n >= CK_CLI_OUTPUT_BUF_SIZE) {
500  output_flush(buf, len);
501  }
502  if (n >= CK_CLI_OUTPUT_BUF_SIZE) {
503  fwrite(text, 1, n, stdout);
504  return;
505  }
506  memcpy(buf + *len, text, n);
507  *len += n;
508 }
509 
/* Decode one BPE token to raw bytes and append it to the output buffer. */
static void output_token(char *buf, size_t *len, const char *token) {
    if (!token || token[0] == '\0') return;

    char decoded[1024];
    if (decode_bpe_token(token, decoded, sizeof(decoded)) > 0) {
        output_append(buf, len, decoded);
    }
}
520 
521 /* ============================================================================
522  * Model Loading
523  * ============================================================================ */
524 
/*
 * Look up a symbol in a loaded library. A missing optional symbol leaves
 * *out_ptr NULL and succeeds; a missing required symbol is reported on
 * stderr and fails without touching *out_ptr.
 */
static bool resolve_symbol(void *handle, const char *name, void **out_ptr, bool required) {
    void *sym = dlsym(handle, name);
    bool missing_required = (sym == NULL) && required;
    if (missing_required) {
        fprintf(stderr, "Error: missing symbol %s\n", name);
        return false;
    }
    if (out_ptr != NULL) {
        *out_ptr = sym;
    }
    return true;
}
534 
535 static bool load_model_api(const char *lib_path, ModelAPI *api) {
536  if (!lib_path || !api) return false;
537  memset(api, 0, sizeof(*api));
538  api->handle = dlopen(lib_path, RTLD_NOW);
539  if (!api->handle) {
540  fprintf(stderr, "Error: dlopen failed: %s\n", dlerror());
541  return false;
542  }
543 
544  if (!resolve_symbol(api->handle, "ck_model_init", (void **)&api->init, true)) return false;
545  if (!resolve_symbol(api->handle, "ck_model_embed_tokens", (void **)&api->embed, true)) return false;
546  if (!resolve_symbol(api->handle, "ck_model_forward", (void **)&api->forward, true)) return false;
547  if (!resolve_symbol(api->handle, "ck_model_decode", (void **)&api->decode, true)) return false;
548  if (!resolve_symbol(api->handle, "ck_model_sample_argmax", (void **)&api->sample, true)) return false;
549  resolve_symbol(api->handle, "ck_model_get_logits", (void **)&api->get_logits, false);
550  resolve_symbol(api->handle, "ck_model_kv_cache_enable", (void **)&api->kv_enable, false);
551  resolve_symbol(api->handle, "ck_model_kv_cache_reset", (void **)&api->kv_reset, false);
552  resolve_symbol(api->handle, "ck_model_get_context_window", (void **)&api->get_context, false);
553  resolve_symbol(api->handle, "ck_model_get_vocab_size", (void **)&api->get_vocab_size, false);
554  resolve_symbol(api->handle, "ck_model_get_num_merges", (void **)&api->get_num_merges, false);
555  resolve_symbol(api->handle, "ck_model_get_vocab_strings_size", (void **)&api->get_vocab_bytes, false);
556  resolve_symbol(api->handle, "ck_model_get_active_tokens", (void **)&api->get_active_tokens, false);
557  resolve_symbol(api->handle, "ck_model_get_vocab_offsets", (void **)&api->get_offsets, false);
558  resolve_symbol(api->handle, "ck_model_get_vocab_strings", (void **)&api->get_strings, false);
559  resolve_symbol(api->handle, "ck_model_get_vocab_merges", (void **)&api->get_merges, false);
560  resolve_symbol(api->handle, "ck_model_free", (void **)&api->free_fn, false);
561 
562  if (!api->get_vocab_size || !api->get_vocab_bytes || !api->get_offsets || !api->get_strings) {
563  fprintf(stderr, "Error: vocab accessors missing from model\n");
564  return false;
565  }
566  return true;
567 }
568 
569 /* ============================================================================
570  * Chat Template Application
571  * ============================================================================ */
572 
573 static ChatTemplateType detect_chat_template(const char *model_name) {
574  if (!model_name) return CHAT_TEMPLATE_CHATML;
575 
576  /* Lowercase comparison */
577  char lower[256];
578  strncpy(lower, model_name, sizeof(lower) - 1);
579  for (char *p = lower; *p; p++) *p = (*p >= 'A' && *p <= 'Z') ? *p + 32 : *p;
580 
581  if (strstr(lower, "qwen")) return CHAT_TEMPLATE_QWEN;
582  if (strstr(lower, "llama")) return CHAT_TEMPLATE_LLAMA;
583  if (strstr(lower, "mistral")) return CHAT_TEMPLATE_MISTRAL;
584 
585  return CHAT_TEMPLATE_CHATML; /* Default */
586 }
587 
588 static char *apply_chat_template(const ChatTemplate *tmpl, const char *system, const char *user) {
589  size_t needed = 0;
590  if (system && *system) {
591  needed += strlen(tmpl->system_prefix) + strlen(system) + strlen(tmpl->system_suffix);
592  }
593  needed += strlen(tmpl->user_prefix) + strlen(user) + strlen(tmpl->user_suffix);
594  needed += strlen(tmpl->assistant_prefix);
595  needed += 1; /* null terminator */
596 
597  char *result = (char *)malloc(needed);
598  if (!result) return NULL;
599 
600  result[0] = '\0';
601  if (system && *system) {
602  strcat(result, tmpl->system_prefix);
603  strcat(result, system);
604  strcat(result, tmpl->system_suffix);
605  }
606  strcat(result, tmpl->user_prefix);
607  strcat(result, user);
608  strcat(result, tmpl->user_suffix);
609  strcat(result, tmpl->assistant_prefix);
610 
611  return result;
612 }
613 
614 /* ============================================================================
615  * EOS Token Handling
616  * ============================================================================ */
617 
618 static bool is_eos_token(const CLIOptions *opt, int token) {
619  if (!opt || opt->ignore_eos) return false;
620  for (int i = 0; i < opt->eos_count; i++) {
621  if (opt->eos_ids[i] == token) return true;
622  }
623  return false;
624 }
625 
/**
 * Text-based EOS pattern detection with pending output buffering.
 *
 * When special tokens like <|im_end|> are tokenized as regular text
 * (e.g., !, im, _end, !), we need to detect the pattern in the output
 * and avoid outputting the partial pattern tokens.
 *
 * This is a workaround for tokenizers that don't properly encode special tokens.
 */
#define EOS_PATTERN_BUF_SIZE 64 /* max accumulated text considered for a match */
#define EOS_PENDING_MAX 8       /* max tokens withheld while a match is still possible */

typedef struct {
    char pattern_buf[EOS_PATTERN_BUF_SIZE]; /* Accumulated text for pattern matching */
    int pattern_len;                        /* Bytes currently in pattern_buf */
    char *pending[EOS_PENDING_MAX];         /* Pending token texts (strdup'd, not yet output) */
    int pending_count;                      /* Number of withheld tokens */
    const char *target_pattern;             /* Pattern to detect (e.g. "im_end") */
    const char *partial_prefix;             /* Prefix that might start the pattern */
} EOSPatternState;

/* Single global matcher; (re)initialized per prompt before generation. */
static EOSPatternState g_eos_state = {0};
648 
649 static void eos_pattern_reset(void) {
650  g_eos_state.pattern_len = 0;
651  g_eos_state.pattern_buf[0] = '\0';
652  for (int i = 0; i < g_eos_state.pending_count; i++) {
653  free(g_eos_state.pending[i]);
654  g_eos_state.pending[i] = NULL;
655  }
656  g_eos_state.pending_count = 0;
657  g_eos_state.target_pattern = NULL;
658  g_eos_state.partial_prefix = NULL;
659 }
660 
663  switch (tmpl) {
664  case CHAT_TEMPLATE_QWEN:
666  g_eos_state.target_pattern = "im_end";
667  g_eos_state.partial_prefix = "im";
668  break;
669  case CHAT_TEMPLATE_LLAMA:
671  g_eos_state.target_pattern = "</s>";
672  g_eos_state.partial_prefix = "</";
673  break;
674  default:
675  break;
676  }
677 }
678 
/**
 * Check if token might be start of (or continue) the EOS pattern.
 *
 * Returns true when some suffix of (pattern_buf + token) is a prefix of
 * g_eos_state.target_pattern, meaning the token must be withheld from
 * output until the match is confirmed or ruled out.
 */
static bool eos_is_potential_prefix(const char *token) {
    if (!token || !g_eos_state.partial_prefix) return false;

    /* Check if current accumulated buffer + token could start the pattern */
    size_t tlen = strlen(token);
    size_t plen = g_eos_state.pattern_len;
    size_t target_len = g_eos_state.target_pattern ? strlen(g_eos_state.target_pattern) : 0;

    /* If buffer + token contains partial match of target, it's a potential prefix */
    if (target_len == 0) return false;

    /* Build temp buffer = accumulated text followed by this token */
    char temp[EOS_PATTERN_BUF_SIZE];
    if (plen + tlen >= EOS_PATTERN_BUF_SIZE) return false; /* too long to track */
    memcpy(temp, g_eos_state.pattern_buf, plen);
    memcpy(temp + plen, token, tlen);
    temp[plen + tlen] = '\0';

    /* Check if temp is a prefix of target or contains start of target */
    const char *target = g_eos_state.target_pattern;
    size_t temp_len = plen + tlen;

    /* Look for any suffix of temp that is a prefix of target */
    for (size_t i = 0; i < temp_len; i++) {
        size_t remaining = temp_len - i;
        if (remaining > target_len) remaining = target_len;
        if (strncmp(temp + i, target, remaining) == 0) {
            return true;
        }
    }

    return false;
}
715 
/**
 * Process a token for EOS pattern detection.
 *
 * Appends the token text to the accumulated pattern buffer and checks for
 * the target pattern (e.g. "im_end"). Tokens that could still be part of
 * a match are withheld in g_eos_state.pending rather than output; once a
 * match is ruled out, all withheld tokens are flushed through output_fn.
 *
 * @param token_text The token text to process
 * @param out_buf Output buffer for safe-to-output text
 * @param out_len Current length of output buffer
 * @param output_fn Callback used to emit text (e.g. output_token)
 * @param tmpl Chat template type (not read here; pattern comes from g_eos_state)
 * @return true if EOS pattern detected, false otherwise
 */
static bool eos_pattern_process(const char *token_text, char *out_buf, size_t *out_len,
 void (*output_fn)(char*, size_t*, const char*),
 ChatTemplateType tmpl) {
 if (!token_text || !g_eos_state.target_pattern) {
 /* No pattern to match - output directly */
 if (token_text && output_fn) output_fn(out_buf, out_len, token_text);
 return false;
 }

 /* Append to pattern buffer (text beyond the buffer size is not tracked) */
 size_t tlen = strlen(token_text);
 if (g_eos_state.pattern_len + (int)tlen < EOS_PATTERN_BUF_SIZE - 1) {
 memcpy(g_eos_state.pattern_buf + g_eos_state.pattern_len, token_text, tlen);
 g_eos_state.pattern_len += (int)tlen;
 g_eos_state.pattern_buf[g_eos_state.pattern_len] = '\0';
 }

 /* Check if pattern is complete */
 if (strstr(g_eos_state.pattern_buf, g_eos_state.target_pattern)) {
 /* EOS detected - don't output pending tokens */
 return true;
 }

 /* Check if this could still be part of the pattern */
 if (eos_is_potential_prefix(token_text)) {
 /* Hold this token - might be part of EOS */
 if (g_eos_state.pending_count < EOS_PENDING_MAX) {
 g_eos_state.pending[g_eos_state.pending_count] = strdup(token_text);
 g_eos_state.pending_count++;
 }
 return false;
 }

 /* Not part of pattern - flush pending tokens and this one */
 for (int i = 0; i < g_eos_state.pending_count; i++) {
 if (output_fn) output_fn(out_buf, out_len, g_eos_state.pending[i]);
 free(g_eos_state.pending[i]);
 g_eos_state.pending[i] = NULL;
 }
 g_eos_state.pending_count = 0;
 g_eos_state.pattern_len = 0;
 g_eos_state.pattern_buf[0] = '\0';

 if (output_fn) output_fn(out_buf, out_len, token_text);
 return false;
}
772 
773 static bool parse_eos_ids(const char *arg, CLIOptions *opt) {
774  if (!arg || !opt) return false;
775  opt->eos_count = 0;
776  const char *p = arg;
777  while (*p && opt->eos_count < CK_CLI_EOS_MAX) {
778  char *end = NULL;
779  long v = strtol(p, &end, 10);
780  if (end == p) break;
781  opt->eos_ids[opt->eos_count++] = (int)v;
782  p = end;
783  if (*p == ',') p++;
784  }
785  return opt->eos_count > 0;
786 }
787 
788 /* ============================================================================
789  * Prompt Execution
790  * ============================================================================ */
791 
/*
 * Run one prompt end-to-end: apply the chat template, tokenize, prefill,
 * then generate tokens until EOS / max_tokens / Ctrl-C.
 *
 * Returns 0 on success, -1 on any setup or model failure. Updates the
 * g_prefill_time_ms / g_decode_time_ms / g_decode_count / g_prompt_tokens
 * globals consumed by the optional --timing report.
 */
static int run_prompt(ModelAPI *api, CKTrueBPE *tokenizer, CLIOptions *opt, const char *input) {
    if (!api || !tokenizer || !opt || !input) return -1;
    if (g_exit_requested) return -1;

    /* Resolve the effective context window: override > model > 4096 default */
    int ctx = opt->context_override;
    if (ctx <= 0 && api->get_context) ctx = api->get_context();
    if (ctx <= 0) ctx = 4096;
    if (ctx > CK_CLI_MAX_CONTEXT) ctx = CK_CLI_MAX_CONTEXT;

    int max_tokens = opt->max_tokens > 0 ? opt->max_tokens : CK_CLI_DEFAULT_MAX_TOKENS;

    /* Apply chat template if enabled */
    const ChatTemplate *tmpl = &g_templates[opt->no_chat_template ? CHAT_TEMPLATE_NONE : opt->chat_template];
    char *formatted = apply_chat_template(tmpl, opt->system_prompt, input);
    if (!formatted) {
        fprintf(stderr, "Error: failed to format prompt\n");
        return -1;
    }

    if (opt->verbose) {
        printf("[DEBUG] Formatted prompt:\n%s\n", formatted);
    }

    /* Token id buffer sized to the full context window */
    int32_t *ids = (int32_t *)malloc((size_t)ctx * sizeof(int32_t));
    if (!ids) {
        fprintf(stderr, "Error: failed to allocate token buffer\n");
        free(formatted);
        return -1;
    }

    int n = ck_true_bpe_encode(tokenizer, formatted, -1, ids, ctx);
    free(formatted);

    if (n <= 0) {
        fprintf(stderr, "[Tokenizer] failed to encode prompt\n");
        free(ids);
        return -1;
    }
    /* Leave room in the context for the generation budget */
    if (n > ctx - max_tokens) {
        n = ctx - max_tokens;
        if (opt->verbose) {
            printf("[DEBUG] Truncated prompt to %d tokens\n", n);
        }
    }

    /* Reset per-prompt timing counters */
    g_prefill_time_ms = 0.0;
    g_decode_time_ms = 0.0;
    g_decode_count = 0;
    g_prompt_tokens = n;

    if (api->kv_reset) api->kv_reset();

    if (api->embed(ids, n) != 0) {
        fprintf(stderr, "[Model] embed failed\n");
        free(ids);
        return -1;
    }

    /* Timed prefill pass */
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    if (api->forward(NULL) != 0) {
        fprintf(stderr, "[Model] forward failed\n");
        free(ids);
        return -1;
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);
    g_prefill_time_ms = (t1.tv_sec - t0.tv_sec) * 1000.0 +
                        (t1.tv_nsec - t0.tv_nsec) / 1000000.0;

    /* Get vocab size for sampling */
    int vocab_size = api->get_vocab_size ? api->get_vocab_size() : 0;

    /* Sample first token: temperature sampling when logits are exposed,
     * otherwise fall back to the model's built-in argmax */
    int next_token;
    if (opt->temperature > 0.0f && api->get_logits && vocab_size > 0) {
        float *logits = api->get_logits();
        if (logits) {
            /* Get logits for last position */
            int active = api->get_active_tokens ? api->get_active_tokens() : 1;
            float *last_logits = logits + (size_t)(active - 1) * vocab_size;
            /* Make a copy since sampling modifies in place */
            float *logits_copy = (float *)malloc(vocab_size * sizeof(float));
            memcpy(logits_copy, last_logits, vocab_size * sizeof(float));
            next_token = sample_top_p(logits_copy, vocab_size, opt->temperature, opt->top_p);
            free(logits_copy);
        } else {
            next_token = api->sample();
        }
    } else {
        next_token = api->sample();
    }

    char out_buf[CK_CLI_OUTPUT_BUF_SIZE];
    size_t out_len = 0;

    /* Initialize EOS pattern detection for this prompt */
    eos_pattern_init(opt->chat_template);

    /* NOTE(review): the loop runs while g_generation_active is set; it is
     * cleared by the SIGINT handler. The line that sets it to 1 is not
     * visible in this view — confirm it is set before this loop. */
    for (int generated = 0; generated < max_tokens && !g_exit_requested && g_generation_active; generated++) {
        if (next_token < 0) break;

        if (opt->verbose) {
            const char *tok_str = ck_true_bpe_id_to_token(tokenizer, next_token);
            fprintf(stderr, "[DEBUG] Token %d: %d (%s)\n", generated, next_token, tok_str ? tok_str : "NULL");
        }

        if (is_eos_token(opt, next_token)) {
            if (opt->verbose) {
                fprintf(stderr, "[DEBUG] EOS detected (token ID), stopping\n");
            }
            break;
        }

        const char *word = ck_true_bpe_id_to_token(tokenizer, next_token);

        /* Process token through EOS pattern detection (buffers potential EOS tokens) */
        if (!opt->ignore_eos &&
            eos_pattern_process(word, out_buf, &out_len, output_token, opt->chat_template)) {
            if (opt->verbose) {
                fprintf(stderr, "[DEBUG] EOS detected (text pattern), stopping\n");
            }
            break;
        }

        /* Streaming mode flushes every token; otherwise flush at half-full */
        if (opt->stream) {
            output_flush(out_buf, &out_len);
            fflush(stdout);
        } else if (out_len > (CK_CLI_OUTPUT_BUF_SIZE / 2)) {
            output_flush(out_buf, &out_len);
            fflush(stdout);
        }

        /* Don't pay for a decode whose token would never be emitted */
        if (generated + 1 >= max_tokens) break;

        /* Timed single-token decode step */
        clock_gettime(CLOCK_MONOTONIC, &t0);
        if (api->decode(next_token, NULL) != 0) {
            fprintf(stderr, "\n[Model] decode failed\n");
            break;
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);
        g_decode_time_ms += (t1.tv_sec - t0.tv_sec) * 1000.0 +
                            (t1.tv_nsec - t0.tv_nsec) / 1000000.0;
        g_decode_count++;

        /* Sample next token */
        if (opt->temperature > 0.0f && api->get_logits && vocab_size > 0) {
            float *logits = api->get_logits();
            if (logits) {
                int active = api->get_active_tokens ? api->get_active_tokens() : 1;
                float *last_logits = logits + (size_t)(active - 1) * vocab_size;
                float *logits_copy = (float *)malloc(vocab_size * sizeof(float));
                memcpy(logits_copy, last_logits, vocab_size * sizeof(float));
                next_token = sample_top_p(logits_copy, vocab_size, opt->temperature, opt->top_p);
                free(logits_copy);
            } else {
                next_token = api->sample();
            }
        } else {
            next_token = api->sample();
        }
    }

    output_flush(out_buf, &out_len);
    printf("\n");

    if (opt->timing) {
        /* NOTE(review): total_ms is computed but never used; prefill_rate
         * divides by g_prefill_time_ms, which could be 0 on a degenerate run. */
        double total_ms = g_prefill_time_ms + g_decode_time_ms;
        double prefill_rate = g_prompt_tokens / (g_prefill_time_ms / 1000.0);
        double decode_rate = g_decode_count > 0 ? g_decode_count / (g_decode_time_ms / 1000.0) : 0.0;
        double avg_decode = g_decode_count > 0 ? g_decode_time_ms / g_decode_count : 0.0;

        printf("\033[90m"); /* Gray text */
        printf("prompt: %3d tok / %7.1f ms (%5.1f tok/s) | ", g_prompt_tokens, g_prefill_time_ms, prefill_rate);
        printf("decode: %3d tok / %7.1f ms (%5.1f tok/s, %5.1f ms/tok)\033[0m\n",
               g_decode_count, g_decode_time_ms, decode_rate, avg_decode);
    }
    fflush(stdout);

    free(ids);
    return 0;
}
976 
977 /* ============================================================================
978  * Help & Argument Parsing
979  * ============================================================================ */
980 
981 static void print_banner(void) {
982  printf("\n");
983  printf(" \033[1;36mC-Kernel-Engine v%s\033[0m\n", CK_CLI_VERSION);
984  printf(" Native inference CLI with true-BPE tokenization\n");
985  printf("\n");
986 }
987 
988 static void print_help(const char *prog) {
989  print_banner();
990  fprintf(stderr, "Usage:\n");
991  fprintf(stderr, " %s --model <name> Auto-discover model from cache\n", prog);
992  fprintf(stderr, " %s <libmodel.so> <weights.bump> Direct paths\n", prog);
993  fprintf(stderr, " %s --lib <.so> --weights <.bump> Named arguments\n", prog);
994  fprintf(stderr, "\nOptions:\n");
995  fprintf(stderr, " --model, -m NAME Model name (searches in cache)\n");
996  fprintf(stderr, " --lib PATH Path to compiled model .so\n");
997  fprintf(stderr, " --weights PATH Path to weights .bump file\n");
998  fprintf(stderr, " --prompt, -p TEXT Run single prompt (non-interactive)\n");
999  fprintf(stderr, " --system, -S TEXT System prompt\n");
1000  fprintf(stderr, " --max-tokens, -n N Max tokens to generate (default: %d)\n", CK_CLI_DEFAULT_MAX_TOKENS);
1001  fprintf(stderr, " --context, -c N Override context/KV cache size\n");
1002  fprintf(stderr, " --temperature, -T F Sampling temperature (default: 0.0 = greedy)\n");
1003  fprintf(stderr, " --top-p F Nucleus sampling top-p (default: 0.9)\n");
1004  fprintf(stderr, " --stream, -s Stream tokens as generated\n");
1005  fprintf(stderr, " --timing, -t Show timing breakdown\n");
1006  fprintf(stderr, " --no-chat-template Disable chat template formatting\n");
1007  fprintf(stderr, " --eos IDS Comma-separated EOS token IDs\n");
1008  fprintf(stderr, " --ignore-eos Do not stop on EOS tokens\n");
1009  fprintf(stderr, " --list List available models\n");
1010  fprintf(stderr, " --verbose, -v Verbose output\n");
1011  fprintf(stderr, " --help, -h Show this help\n");
1012  fprintf(stderr, "\nREPL Commands:\n");
1013  fprintf(stderr, " /exit, /quit Exit the REPL\n");
1014  fprintf(stderr, " /reset Reset KV cache\n");
1015  fprintf(stderr, " /timing Toggle timing display\n");
1016  fprintf(stderr, " /temp <value> Set temperature\n");
1017  fprintf(stderr, " /system <text> Set system prompt\n");
1018  fprintf(stderr, " /help Show help\n");
1019 }
1020 
1021 static bool parse_args(int argc, char **argv, CLIOptions *opt) {
1022  if (!opt) return false;
1023  memset(opt, 0, sizeof(*opt));
1024  opt->max_tokens = CK_CLI_DEFAULT_MAX_TOKENS;
1025  opt->temperature = 0.0f; /* Greedy by default */
1026  opt->top_p = 0.9f;
1027  opt->stream = true; /* Stream by default */
1028  opt->timing = true; /* Show timing by default */
1029  /* Default EOS tokens for Qwen/ChatML */
1030  opt->eos_ids[0] = 151643; /* <|im_end|> */
1031  opt->eos_ids[1] = 151645; /* <|endoftext|> */
1032  opt->eos_ids[2] = 151644; /* <|im_sep|> */
1033  opt->eos_count = 3;
1034 
1035  for (int i = 1; i < argc; i++) {
1036  const char *arg = argv[i];
1037 
1038  if (!strcmp(arg, "--help") || !strcmp(arg, "-h")) {
1039  print_help(argv[0]);
1040  return false;
1041  } else if (!strcmp(arg, "--list")) {
1043  return false;
1044  } else if ((!strcmp(arg, "--model") || !strcmp(arg, "-m")) && i + 1 < argc) {
1045  opt->model_name = argv[++i];
1046  } else if (!strcmp(arg, "--lib") && i + 1 < argc) {
1047  opt->lib_path = argv[++i];
1048  } else if (!strcmp(arg, "--weights") && i + 1 < argc) {
1049  opt->weights_path = argv[++i];
1050  } else if ((!strcmp(arg, "--prompt") || !strcmp(arg, "-p")) && i + 1 < argc) {
1051  opt->prompt_once = argv[++i];
1052  } else if ((!strcmp(arg, "--system") || !strcmp(arg, "-S")) && i + 1 < argc) {
1053  opt->system_prompt = argv[++i];
1054  } else if ((!strcmp(arg, "--max-tokens") || !strcmp(arg, "-n")) && i + 1 < argc) {
1055  opt->max_tokens = atoi(argv[++i]);
1056  } else if ((!strcmp(arg, "--context") || !strcmp(arg, "-c")) && i + 1 < argc) {
1057  opt->context_override = atoi(argv[++i]);
1058  } else if ((!strcmp(arg, "--temperature") || !strcmp(arg, "-T")) && i + 1 < argc) {
1059  opt->temperature = (float)atof(argv[++i]);
1060  } else if (!strcmp(arg, "--top-p") && i + 1 < argc) {
1061  opt->top_p = (float)atof(argv[++i]);
1062  } else if (!strcmp(arg, "--stream") || !strcmp(arg, "-s")) {
1063  opt->stream = true;
1064  } else if (!strcmp(arg, "--no-stream")) {
1065  opt->stream = false;
1066  } else if (!strcmp(arg, "--timing") || !strcmp(arg, "-t")) {
1067  opt->timing = true;
1068  } else if (!strcmp(arg, "--no-timing")) {
1069  opt->timing = false;
1070  } else if (!strcmp(arg, "--no-chat-template")) {
1071  opt->no_chat_template = true;
1072  } else if (!strcmp(arg, "--eos") && i + 1 < argc) {
1073  parse_eos_ids(argv[++i], opt);
1074  } else if (!strcmp(arg, "--ignore-eos")) {
1075  opt->ignore_eos = true;
1076  } else if (!strcmp(arg, "--verbose") || !strcmp(arg, "-v")) {
1077  opt->verbose = true;
1078  } else if (arg[0] != '-') {
1079  if (!opt->lib_path) opt->lib_path = arg;
1080  else if (!opt->weights_path) opt->weights_path = arg;
1081  else {
1082  fprintf(stderr, "Unknown argument: %s\n", arg);
1083  return false;
1084  }
1085  } else {
1086  fprintf(stderr, "Unknown option: %s\n", arg);
1087  return false;
1088  }
1089  }
1090 
1091  /* Auto-discover model if --model specified */
1092  if (opt->model_name && (!opt->lib_path || !opt->weights_path)) {
1093  static char lib_buf[4096], weights_buf[4096];
1094  if (find_model_in_cache(opt->model_name, lib_buf, weights_buf, sizeof(lib_buf))) {
1095  opt->lib_path = lib_buf;
1096  opt->weights_path = weights_buf;
1097  } else {
1098  fprintf(stderr, "Error: model '%s' not found in cache\n", opt->model_name);
1099  fprintf(stderr, "Run with --list to see available models\n");
1100  return false;
1101  }
1102  }
1103 
1104  if (!opt->lib_path || !opt->weights_path) {
1105  print_help(argv[0]);
1106  return false;
1107  }
1108 
1109  /* Auto-detect chat template from model name/path */
1110  const char *name_for_template = opt->model_name ? opt->model_name : opt->lib_path;
1111  opt->chat_template = detect_chat_template(name_for_template);
1112 
1113  /* Load EOS tokens from vocab.json if available */
1114  if (load_eos_from_vocab_json(opt->weights_path, opt)) {
1115  if (opt->verbose) {
1116  printf("[DEBUG] Loaded %d EOS tokens: ", opt->eos_count);
1117  for (int i = 0; i < opt->eos_count; i++) {
1118  printf("%d ", opt->eos_ids[i]);
1119  }
1120  printf("\n");
1121  }
1122  }
1123 
1124  return true;
1125 }
1126 
1127 /* ============================================================================
1128  * REPL Command Processing
1129  * ============================================================================ */
1130 
1131 static bool process_repl_command(const char *line, CLIOptions *opt, ModelAPI *api) {
1132  if (!line || line[0] != '/') return false;
1133 
1134  if (!strncmp(line, "/exit", 5) || !strncmp(line, "/quit", 5)) {
1135  g_exit_requested = 1;
1136  return true;
1137  }
1138  if (!strncmp(line, "/help", 5)) {
1139  printf("REPL Commands:\n");
1140  printf(" /exit, /quit Exit\n");
1141  printf(" /reset Reset KV cache\n");
1142  printf(" /timing Toggle timing display\n");
1143  printf(" /temp <value> Set temperature (0 = greedy)\n");
1144  printf(" /top-p <value> Set top-p\n");
1145  printf(" /system <text> Set system prompt\n");
1146  printf(" /clear Clear system prompt\n");
1147  printf(" /verbose Toggle verbose mode\n");
1148  return true;
1149  }
1150  if (!strncmp(line, "/reset", 6)) {
1151  if (api->kv_reset) {
1152  api->kv_reset();
1153  printf("[KV cache reset]\n");
1154  }
1155  return true;
1156  }
1157  if (!strncmp(line, "/timing", 7)) {
1158  opt->timing = !opt->timing;
1159  printf("[Timing %s]\n", opt->timing ? "enabled" : "disabled");
1160  return true;
1161  }
1162  if (!strncmp(line, "/verbose", 8)) {
1163  opt->verbose = !opt->verbose;
1164  printf("[Verbose %s]\n", opt->verbose ? "enabled" : "disabled");
1165  return true;
1166  }
1167  if (!strncmp(line, "/temp ", 6)) {
1168  opt->temperature = (float)atof(line + 6);
1169  printf("[Temperature set to %.2f]\n", opt->temperature);
1170  return true;
1171  }
1172  if (!strncmp(line, "/top-p ", 7)) {
1173  opt->top_p = (float)atof(line + 7);
1174  printf("[Top-p set to %.2f]\n", opt->top_p);
1175  return true;
1176  }
1177  if (!strncmp(line, "/system ", 8)) {
1178  opt->system_prompt = strdup(line + 8);
1179  printf("[System prompt set]\n");
1180  return true;
1181  }
1182  if (!strncmp(line, "/clear", 6)) {
1183  opt->system_prompt = NULL;
1184  printf("[System prompt cleared]\n");
1185  return true;
1186  }
1187 
1188  printf("Unknown command: %s\n", line);
1189  return true;
1190 }
1191 
1192 /* ============================================================================
1193  * Main
1194  * ============================================================================ */
1195 
1196 int main(int argc, char **argv) {
1197  signal(SIGINT, handle_sigint);
1198  srand((unsigned int)time(NULL));
1199 
1200  CLIOptions opt;
1201  if (!parse_args(argc, argv, &opt)) {
1202  return 1;
1203  }
1204 
1205  print_banner();
1206  printf("Loading: %s\n", opt.lib_path);
1207 
1208  ModelAPI api;
1209  if (!load_model_api(opt.lib_path, &api)) {
1210  return 1;
1211  }
1212 
1213  printf("Initializing model...\n");
1214  if (api.init(opt.weights_path) != 0) {
1215  fprintf(stderr, "Error: ck_model_init failed\n");
1216  return 1;
1217  }
1218 
1219  int ctx = opt.context_override;
1220  if (ctx <= 0 && api.get_context) ctx = api.get_context();
1221  if (api.kv_enable && ctx > 0) {
1222  api.kv_enable(ctx);
1223  }
1224 
1225  CKTrueBPE *tokenizer = ck_true_bpe_create();
1226  if (!tokenizer) {
1227  fprintf(stderr, "[Tokenizer] failed to create\n");
1228  return 1;
1229  }
1230 
1231  int vocab_size = api.get_vocab_size ? api.get_vocab_size() : 0;
1232  int vocab_bytes = api.get_vocab_bytes ? api.get_vocab_bytes() : 0;
1233  int num_merges = api.get_num_merges ? api.get_num_merges() : 0;
1234  const int32_t *offsets = (const int32_t *)api.get_offsets();
1235  const char *strings = (const char *)api.get_strings();
1236  const int32_t *merges = api.get_merges ? (const int32_t *)api.get_merges() : NULL;
1237 
1238  if (vocab_size <= 0 || vocab_bytes <= 0 || !offsets || !strings) {
1239  fprintf(stderr, "[Tokenizer] missing vocab data in model\n");
1240  ck_true_bpe_free(tokenizer);
1241  return 1;
1242  }
1243 
1245  fprintf(stderr, "[Tokenizer] failed to load vocab\n");
1246  ck_true_bpe_free(tokenizer);
1247  return 1;
1248  }
1249 
1250  printf("Ready! Vocab: %d, Context: %d, Template: %s\n",
1251  vocab_size, ctx,
1252  opt.no_chat_template ? "none" :
1253  opt.chat_template == CHAT_TEMPLATE_QWEN ? "qwen" :
1254  opt.chat_template == CHAT_TEMPLATE_LLAMA ? "llama" :
1255  opt.chat_template == CHAT_TEMPLATE_MISTRAL ? "mistral" : "chatml");
1256 
1257  /* Print CPU capability info */
1259  printf("[Hardware] %s | Vector: %d-bit | FMA: %s | AI Accel: %s | Kernel: %s\n",
1260  cap.name, cap.width, cap.has_fma ? "Yes" : "No",
1261  cap.has_ai_accel ? "Yes" : "No", cap.best_kernel);
1262 
1263  printf("Type /help for commands, Ctrl+C to stop generation\n\n");
1264 
1265  setvbuf(stdout, NULL, _IOFBF, 1 << 20);
1266 
1267  if (opt.prompt_once) {
1268  run_prompt(&api, tokenizer, &opt, opt.prompt_once);
1269  } else {
1270  /* REPL */
1271 #ifdef HAVE_READLINE
1272  char *home = getenv("HOME");
1273  char history_path[4096];
1274  if (home) {
1275  snprintf(history_path, sizeof(history_path), "%s/%s", home, CK_CLI_HISTORY_FILE);
1276  read_history(history_path);
1277  }
1278 #endif
1279 
1280  while (!g_exit_requested) {
1281 #ifdef HAVE_READLINE
1282  char *line = readline("\033[1;32mYou:\033[0m ");
1283  if (!line) break;
1284  if (*line) add_history(line);
1285 #else
1286  printf("\033[1;32mYou:\033[0m ");
1287  fflush(stdout);
1288  char line_buf[4096];
1289  if (!fgets(line_buf, sizeof(line_buf), stdin)) {
1290  if (feof(stdin) || g_exit_requested) break;
1291  if (errno == EINTR) break;
1292  continue;
1293  }
1294  /* Remove trailing newline */
1295  size_t len = strlen(line_buf);
1296  if (len > 0 && line_buf[len-1] == '\n') line_buf[len-1] = '\0';
1297  char *line = line_buf;
1298 #endif
1299 
1300  if (line[0] == '\0') {
1301 #ifdef HAVE_READLINE
1302  free(line);
1303 #endif
1304  continue;
1305  }
1306 
1307  if (line[0] == '/') {
1308  process_repl_command(line, &opt, &api);
1309 #ifdef HAVE_READLINE
1310  free(line);
1311 #endif
1312  continue;
1313  }
1314 
1315  printf("\033[1;34mAssistant:\033[0m ");
1316  fflush(stdout);
1317  run_prompt(&api, tokenizer, &opt, line);
1318 
1319 #ifdef HAVE_READLINE
1320  free(line);
1321 #endif
1322  }
1323 
1324 #ifdef HAVE_READLINE
1325  if (home) {
1326  write_history(history_path);
1327  }
1328 #endif
1329  }
1330 
1331  ck_true_bpe_free(tokenizer);
1332  if (api.free_fn) api.free_fn();
1333  if (api.handle) dlclose(api.handle);
1334 
1335  printf("\nGoodbye!\n");
1336  return 0;
1337 }
void(* kv_reset_t)(void)
Definition: ck_cli_v6.5.c:73
void *(* get_ptr_t)(void)
Definition: ck_cli_v6.5.c:78
static bool parse_eos_ids(const char *arg, CLIOptions *opt)
Definition: ck_cli_v6.5.c:773
int(* init_t)(const char *weights_path)
Definition: ck_cli_v6.5.c:69
static bool resolve_symbol(void *handle, const char *name, void **out_ptr, bool required)
Definition: ck_cli_v6.5.c:525
static double g_decode_time_ms
Definition: ck_cli_v6.5.c:52
static void handle_sigint(int sig)
Definition: ck_cli_v6.5.c:56
int(* embed_t)(const int32_t *tokens, int num_tokens)
Definition: ck_cli_v6.5.c:70
static char * apply_chat_template(const ChatTemplate *tmpl, const char *system, const char *user)
Definition: ck_cli_v6.5.c:588
int(* kv_enable_t)(int capacity)
Definition: ck_cli_v6.5.c:72
static int sample_top_p(float *logits, int vocab_size, float temperature, float top_p)
Definition: ck_cli_v6.5.c:334
static bool load_eos_from_vocab_json(const char *weights_path, CLIOptions *opt)
Definition: ck_cli_v6.5.c:241
ChatTemplateType
Definition: ck_cli_v6.5.c:106
@ CHAT_TEMPLATE_LLAMA
Definition: ck_cli_v6.5.c:109
@ CHAT_TEMPLATE_MISTRAL
Definition: ck_cli_v6.5.c:111
@ CHAT_TEMPLATE_QWEN
Definition: ck_cli_v6.5.c:108
@ CHAT_TEMPLATE_CHATML
Definition: ck_cli_v6.5.c:110
@ CHAT_TEMPLATE_NONE
Definition: ck_cli_v6.5.c:107
static double g_prefill_time_ms
Definition: ck_cli_v6.5.c:51
static bool find_model_in_cache(const char *model_name, char *lib_out, char *weights_out, size_t out_size)
Definition: ck_cli_v6.5.c:205
int main(int argc, char **argv)
Definition: ck_cli_v6.5.c:1196
static void print_help(const char *prog)
Definition: ck_cli_v6.5.c:988
#define CK_CLI_EOS_MAX
Definition: ck_cli_v6.5.c:42
static ChatTemplateType detect_chat_template(const char *model_name)
Definition: ck_cli_v6.5.c:573
void(* free_t)(void)
Definition: ck_cli_v6.5.c:79
#define CK_CLI_HISTORY_FILE
Definition: ck_cli_v6.5.c:45
static int g_decode_count
Definition: ck_cli_v6.5.c:53
static bool process_repl_command(const char *line, CLIOptions *opt, ModelAPI *api)
Definition: ck_cli_v6.5.c:1131
static bool is_eos_token(const CLIOptions *opt, int token)
Definition: ck_cli_v6.5.c:618
static bool eos_is_potential_prefix(const char *token)
Definition: ck_cli_v6.5.c:682
int(* forward_t)(float *logits_out)
Definition: ck_cli_v6.5.c:71
#define EOS_PENDING_MAX
Definition: ck_cli_v6.5.c:636
float *(* get_logits_t)(void)
Definition: ck_cli_v6.5.c:76
static void eos_pattern_init(ChatTemplateType tmpl)
Definition: ck_cli_v6.5.c:661
static void output_append(char *buf, size_t *len, const char *text)
Definition: ck_cli_v6.5.c:496
static void list_available_models(void)
Definition: ck_cli_v6.5.c:297
static volatile sig_atomic_t g_generation_active
Definition: ck_cli_v6.5.c:48
#define CK_CLI_MAX_CONTEXT
Definition: ck_cli_v6.5.c:44
static int decode_bpe_token(const char *token, char *out, int max)
Definition: ck_cli_v6.5.c:427
int(* get_int_t)(void)
Definition: ck_cli_v6.5.c:77
int(* decode_t)(int32_t token, float *logits_out)
Definition: ck_cli_v6.5.c:74
#define EOS_PATTERN_BUF_SIZE
Definition: ck_cli_v6.5.c:635
static void print_banner(void)
Definition: ck_cli_v6.5.c:981
static bool parse_args(int argc, char **argv, CLIOptions *opt)
Definition: ck_cli_v6.5.c:1021
static int run_prompt(ModelAPI *api, CKTrueBPE *tokenizer, CLIOptions *opt, const char *input)
Definition: ck_cli_v6.5.c:792
static void eos_pattern_reset(void)
Definition: ck_cli_v6.5.c:649
static volatile sig_atomic_t g_exit_requested
Definition: ck_cli_v6.5.c:47
static EOSPatternState g_eos_state
Definition: ck_cli_v6.5.c:647
#define CK_CLI_OUTPUT_BUF_SIZE
Definition: ck_cli_v6.5.c:43
#define CK_CLI_DEFAULT_MAX_TOKENS
Definition: ck_cli_v6.5.c:41
static void output_flush(char *buf, size_t *len)
Definition: ck_cli_v6.5.c:490
static const ChatTemplate g_templates[]
Definition: ck_cli_v6.5.c:124
static const char * get_cache_dir(void)
Definition: ck_cli_v6.5.c:197
static bool load_model_api(const char *lib_path, ModelAPI *api)
Definition: ck_cli_v6.5.c:535
static void output_token(char *buf, size_t *len, const char *token)
Definition: ck_cli_v6.5.c:510
static bool eos_pattern_process(const char *token_text, char *out_buf, size_t *out_len, void(*output_fn)(char *, size_t *, const char *), ChatTemplateType tmpl)
Definition: ck_cli_v6.5.c:725
#define CK_CLI_VERSION
Definition: ck_cli_v6.5.c:40
int(* sample_argmax_t)(void)
Definition: ck_cli_v6.5.c:75
static int g_prompt_tokens
Definition: ck_cli_v6.5.c:54
CPU feature detection and dispatch macros.
static ck_capability_t ck_get_capabilities(void)
Get current platform capabilities.
Definition: ck_features.h:226
CPU capability information structure.
Definition: ck_features.h:215
const char * best_kernel
Definition: ck_features.h:220
const char * name
Definition: ck_features.h:216
const int32_t * ids
Definition: tokenizer.h:443
const char * text
Definition: tokenizer.h:563
const char * token
Definition: tokenizer.h:306
int32_t int32_t int32_t eos
Definition: tokenizer.h:231
int32_t int32_t bos
Definition: tokenizer.h:230
const int32_t int int * out_len
Definition: tokenizer.h:445
int ck_true_bpe_encode(CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: true_bpe.c:1338
void ck_true_bpe_free(CKTrueBPE *bpe)
Definition: true_bpe.c:405
CKTrueBPE * ck_true_bpe_create(void)
Definition: true_bpe.c:342
int ck_true_bpe_load_binary(CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
Definition: true_bpe.c:606
const char * ck_true_bpe_id_to_token(const CKTrueBPE *bpe, int32_t id)
Definition: true_bpe.c:645
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int vocab_size
Definition: true_bpe.h:185
int const int32_t * offsets
Definition: true_bpe.h:186
uint32_t end
Definition: utf8.c:215