← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ck_tokenizer.c
Go to the documentation of this file.
1 /*
2  * C-Kernel-Engine BPE Tokenizer Implementation [v4 - correctness complete, optimization pending]
3  *
4  * Pure C implementation of Byte-Pair Encoding.
5  * Reads HuggingFace tokenizer.json format.
6  *
7  * Token IDs are dense indices into the embedding table:
8  * embedding[token_id] gives the vector for that token.
9  *
10  * TODO: Performance optimizations (correctness is done, these are for throughput):
11  *
12  * 1. Pre-allocate encode buffer
13  * - ck_tokenizer_encode() mallocs int32_t[] per call (line ~629)
14  * - Add encode_buf + encode_buf_cap to CKTokenizer struct
15  * - Allocate in ck_tokenizer_init(), realloc only if input exceeds cap
16  * - Eliminates malloc/free syscall per encode call
17  *
18  * 2. Linked list instead of array shift in merge loop
19  * - Merge at line ~686 shifts entire tail of tokens[] left by 1: O(n) per merge
20  * - With n initial tokens and m merges, total shifting is O(n * m)
21  * - Doubly-linked list would make each merge O(1)
22  *
23  * 3. Priority queue for best-merge scan
24  * - find_best_merge scans ALL pairs each iteration (line ~673): O(n) per merge
25  * - Min-heap of valid merge candidates: O(n log n) total instead of O(n * m)
26  *
27  * Current profile: tokenizer runs 2x per prompt (not per token). GEMM kernels
28  * dominate at 57% of compute. These optimizations matter for batch/server use.
29  *
30  * By Anthony Shivakumar
31  */
32 
33 #include "ck_tokenizer.h"
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <ctype.h>
39 
40 /* Simple JSON parser state */
41 typedef struct {
42  const char *data;
43  const char *pos;
44  const char *end;
45 } JSONParser;
46 
47 /* ========================================================================== */
48 /* Memory Pool */
49 /* ========================================================================== */
50 
51 void ck_pool_init(CKMemPool *pool) {
52  memset(pool, 0, sizeof(*pool));
53 }
54 
55 static CKPoolBlock *pool_new_block(size_t capacity) {
56  CKPoolBlock *block = (CKPoolBlock *)malloc(sizeof(CKPoolBlock));
57  if (!block) return NULL;
58  block->data = (uint8_t *)malloc(capacity);
59  if (!block->data) {
60  free(block);
61  return NULL;
62  }
63  block->used = 0;
64  block->capacity = capacity;
65  block->next = NULL;
66  return block;
67 }
68 
69 void *ck_pool_alloc(CKMemPool *pool, size_t size) {
70  /* Align to 8 bytes */
71  size = (size + 7) & ~7;
72 
73  /* Check if current block has space */
74  if (pool->current && pool->current->used + size <= pool->current->capacity) {
75  void *ptr = pool->current->data + pool->current->used;
76  pool->current->used += size;
77  pool->total_allocated += size;
78  return ptr;
79  }
80 
81  /* Need new block */
82  size_t block_size = CK_POOL_BLOCK_SIZE;
83  if (size > block_size) block_size = size;
84 
85  CKPoolBlock *block = pool_new_block(block_size);
86  if (!block) return NULL;
87 
88  block->next = pool->head;
89  pool->head = block;
90  pool->current = block;
91 
92  void *ptr = block->data;
93  block->used = size;
94  pool->total_allocated += size;
95  return ptr;
96 }
97 
98 char *ck_pool_strdup(CKMemPool *pool, const char *s, int len) {
99  if (len < 0) len = (int)strlen(s);
100  char *copy = (char *)ck_pool_alloc(pool, len + 1);
101  if (!copy) return NULL;
102  memcpy(copy, s, len);
103  copy[len] = '\0';
104  return copy;
105 }
106 
107 void ck_pool_free(CKMemPool *pool) {
108  CKPoolBlock *block = pool->head;
109  while (block) {
110  CKPoolBlock *next = block->next;
111  free(block->data);
112  free(block);
113  block = next;
114  }
115  memset(pool, 0, sizeof(*pool));
116 }
117 
118 /* ========================================================================== */
119 /* Hash Functions */
120 /* ========================================================================== */
121 
122 /* FNV-1a hash for strings */
/*
 * 32-bit FNV-1a hash over `len` bytes of `s`: start from the FNV offset
 * basis, XOR each byte in, then multiply by the FNV prime.
 */
static uint32_t hash_string(const char *s, int len) {
    const uint8_t *bytes = (const uint8_t *)s;
    uint32_t h = 2166136261u; /* FNV-1a 32-bit offset basis */
    for (int i = 0; i < len; i++) {
        h = (h ^ bytes[i]) * 16777619u; /* FNV-1a 32-bit prime */
    }
    return h;
}
131 
132 /* Hash for merge pair (left_id, right_id) */
/*
 * Hash a merge pair by packing (left, right) into one 64-bit word and
 * applying the MurmurHash3 64-bit finalizer (fmix64), truncated to 32 bits.
 */
static uint32_t hash_pair(int32_t left, int32_t right) {
    uint64_t x = ((uint64_t)left << 32) | (uint32_t)right;
    x = (x ^ (x >> 33)) * 0xff51afd7ed558ccdULL;
    x = (x ^ (x >> 33)) * 0xc4ceb9fe1a85ec53ULL;
    x ^= x >> 33;
    return (uint32_t)x;
}
143 
144 /* ========================================================================== */
145 /* Tokenizer Init/Free */
146 /* ========================================================================== */
147 
 /* --- body of ck_tokenizer_init (signature line elided in this listing;
  * per the index it is `int ck_tokenizer_init(CKTokenizer *tok)`) ---
  * Zeroes the struct, installs default special-token ids, and allocates
  * the vocab hash, reverse lookup table, and merge probe table.
  * Returns 0 on success, -1 on allocation failure (earlier tables freed). */
 memset(tok, 0, sizeof(*tok));
 ck_pool_init(&tok->pool);

 /* Default special tokens (overridden by "added_tokens" in ck_tokenizer_load) */
 tok->unk_id = 0;
 tok->bos_id = 1;
 tok->eos_id = 2;
 tok->pad_id = 3;

 /* Allocate vocab hash table (chained buckets) */
 tok->vocab_hash_size = 65536; /* 64K buckets */
 tok->vocab_hash = (CKVocabEntry **)calloc(tok->vocab_hash_size, sizeof(CKVocabEntry *));
 if (!tok->vocab_hash) return -1;

 /* Allocate reverse vocab (id -> string), sized for the hard vocab cap */
 tok->id_to_token = (char **)calloc(CK_MAX_VOCAB_SIZE, sizeof(char *));
 if (!tok->id_to_token) {
 free(tok->vocab_hash);
 return -1;
 }

 /* Allocate merge hash table (open addressing; -1 marks an empty slot) */
 tok->merge_hash_size = 262144; /* 256K buckets */
 tok->merge_hash = (int *)malloc(tok->merge_hash_size * sizeof(int));
 if (!tok->merge_hash) {
 free(tok->vocab_hash);
 free(tok->id_to_token);
 return -1;
 }
 /* memset with -1 fills every byte with 0xFF, i.e. every int slot with -1 */
 memset(tok->merge_hash, -1, tok->merge_hash_size * sizeof(int));

 return 0;
}
182 
 /* --- body of ck_tokenizer_free (signature line elided in this listing;
  * per the index it is `void ck_tokenizer_free(CKTokenizer *tok)`) ---
  * Releases everything init/load allocated. Token strings and vocab
  * entries live inside the pool, so freeing the pool frees them all; the
  * final memset leaves the struct safe to free again or re-init. */
 ck_pool_free(&tok->pool);
 free(tok->vocab_hash); /* bucket array only; entries are pool-owned */
 free(tok->id_to_token);
 free(tok->merges);
 free(tok->merge_hash);
 memset(tok, 0, sizeof(*tok));
}
191 
192 /* ========================================================================== */
193 /* Vocabulary Operations */
194 /* ========================================================================== */
195 
196 int32_t ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int len) {
197  if (len < 0) len = (int)strlen(token);
198  if (tok->vocab_size >= CK_MAX_VOCAB_SIZE) return -1;
199 
200  /* Check if already exists */
201  int32_t existing = ck_tokenizer_lookup(tok, token, len);
202  if (existing != tok->unk_id || (len == 0)) {
203  return existing;
204  }
205 
206  /* Create new entry */
207  CKVocabEntry *entry = (CKVocabEntry *)ck_pool_alloc(&tok->pool, sizeof(CKVocabEntry));
208  if (!entry) return -1;
209 
210  entry->token = ck_pool_strdup(&tok->pool, token, len);
211  if (!entry->token) return -1;
212  entry->token_len = len;
213  entry->id = tok->vocab_size;
214 
215  /* Add to hash table */
216  uint32_t bucket = hash_string(token, len) % tok->vocab_hash_size;
217  entry->next = tok->vocab_hash[bucket];
218  tok->vocab_hash[bucket] = entry;
219 
220  /* Add to reverse lookup */
221  tok->id_to_token[tok->vocab_size] = entry->token;
222 
223  tok->vocab_size++;
224  return entry->id;
225 }
226 
227 int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len) {
228  if (len < 0) len = (int)strlen(token);
229  uint32_t bucket = hash_string(token, len) % tok->vocab_hash_size;
230 
231  for (CKVocabEntry *e = tok->vocab_hash[bucket]; e; e = e->next) {
232  if (e->token_len == len && memcmp(e->token, token, len) == 0) {
233  return e->id;
234  }
235  }
236  return tok->unk_id;
237 }
238 
239 const char *ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id) {
240  if (id < 0 || id >= tok->vocab_size) return NULL;
241  return tok->id_to_token[id];
242 }
243 
244 /* ========================================================================== */
245 /* Merge Operations */
246 /* ========================================================================== */
247 
248 int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged) {
249  int idx = tok->num_merges;
250 
251  /* Grow merges array if needed */
252  if (idx % 4096 == 0) {
253  size_t new_cap = (idx + 4096) * sizeof(CKMergeRule);
254  CKMergeRule *new_merges = (CKMergeRule *)realloc(tok->merges, new_cap);
255  if (!new_merges) return -1;
256  tok->merges = new_merges;
257  }
258 
259  tok->merges[idx].left = left;
260  tok->merges[idx].right = right;
261  tok->merges[idx].merged = merged;
262  tok->merges[idx].priority = idx; /* Earlier = higher priority */
263 
264  /* Add to hash table */
265  uint32_t bucket = hash_pair(left, right) % tok->merge_hash_size;
266  /* Linear probing */
267  while (tok->merge_hash[bucket] >= 0) {
268  bucket = (bucket + 1) % tok->merge_hash_size;
269  }
270  tok->merge_hash[bucket] = idx;
271 
272  tok->num_merges++;
273  return 0;
274 }
275 
276 int ck_tokenizer_lookup_merge(const CKTokenizer *tok, int32_t left, int32_t right) {
277  uint32_t bucket = hash_pair(left, right) % tok->merge_hash_size;
278 
279  /* Linear probing */
280  int probes = 0;
281  while (tok->merge_hash[bucket] >= 0 && probes < tok->merge_hash_size) {
282  int idx = tok->merge_hash[bucket];
283  if (tok->merges[idx].left == left && tok->merges[idx].right == right) {
284  return idx;
285  }
286  bucket = (bucket + 1) % tok->merge_hash_size;
287  probes++;
288  }
289  return -1;
290 }
291 
292 /* ========================================================================== */
293 /* JSON Parser (minimal, just for tokenizer.json) */
294 /* ========================================================================== */
295 
296 static void json_skip_whitespace(JSONParser *p) {
297  while (p->pos < p->end && isspace((unsigned char)*p->pos)) {
298  p->pos++;
299  }
300 }
301 
/* Consume the single character `c` if it is next in the stream.
 * Returns 1 and advances the cursor on a match, 0 otherwise.
 * NOTE(review): one source line is elided in this listing right after the
 * opening brace — presumably json_skip_whitespace(p); confirm in the repo. */
static int json_match_char(JSONParser *p, char c) {
 if (p->pos < p->end && *p->pos == c) {
 p->pos++;
 return 1;
 }
 return 0;
}
310 
/* Parse a JSON string literal at the cursor into `buf` (capacity max_len).
 * Handles \n \r \t \\ \" and \uXXXX escapes; \u codepoints are re-encoded
 * as 1-3 byte UTF-8. Output is silently truncated (never failed) when it
 * would overflow `buf`, and is always NUL-terminated. Returns the number
 * of bytes written, or -1 if the cursor is not at an opening quote.
 * NOTE(review): UTF-16 surrogate pairs (\uD800..\uDFFF) are not combined,
 * so astral-plane characters decode as two separate 3-byte sequences.
 * NOTE(review): one source line is elided in this listing right after the
 * signature — presumably json_skip_whitespace(p); confirm in the repo. */
static int json_parse_string(JSONParser *p, char *buf, int max_len) {
 if (p->pos >= p->end || *p->pos != '"') return -1;
 p->pos++;

 int len = 0;
 while (p->pos < p->end && *p->pos != '"') {
 char c = *p->pos++;
 if (c == '\\' && p->pos < p->end) {
 c = *p->pos++;
 switch (c) {
 case 'n': c = '\n'; break;
 case 'r': c = '\r'; break;
 case 't': c = '\t'; break;
 case '\\': c = '\\'; break;
 case '"': c = '"'; break;
 case 'u': {
 /* Unicode escape \uXXXX: decode 4 hex digits, emit UTF-8 */
 if (p->pos + 4 <= p->end) {
 char hex[5] = {p->pos[0], p->pos[1], p->pos[2], p->pos[3], 0};
 unsigned int codepoint = (unsigned int)strtol(hex, NULL, 16);
 p->pos += 4;
 /* Convert to UTF-8 (1, 2, or 3 bytes by codepoint range) */
 if (codepoint < 0x80) {
 if (len < max_len - 1) buf[len++] = (char)codepoint;
 } else if (codepoint < 0x800) {
 if (len < max_len - 2) {
 buf[len++] = (char)(0xC0 | (codepoint >> 6));
 buf[len++] = (char)(0x80 | (codepoint & 0x3F));
 }
 } else {
 if (len < max_len - 3) {
 buf[len++] = (char)(0xE0 | (codepoint >> 12));
 buf[len++] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
 buf[len++] = (char)(0x80 | (codepoint & 0x3F));
 }
 }
 continue; /* bytes already appended; skip the default append below */
 }
 break;
 }
 default: break; /* unknown escape: keep the escaped char literally */
 }
 }
 if (len < max_len - 1) buf[len++] = c;
 }
 buf[len] = '\0';

 /* Consume the closing quote if present */
 if (p->pos < p->end && *p->pos == '"') p->pos++;
 return len;
}
362 
/* Parse an optionally-negative decimal integer at the cursor into *out.
 * Returns 0 on success, -1 when no digit is present.
 * NOTE(review): no overflow guard — digit strings beyond INT_MAX overflow
 * `val` (signed overflow); acceptable for token ids, not arbitrary JSON.
 * NOTE(review): one source line is elided in this listing right after the
 * signature — presumably json_skip_whitespace(p); confirm in the repo. */
static int json_parse_int(JSONParser *p, int *out) {
 if (p->pos >= p->end) return -1;

 int neg = 0;
 if (*p->pos == '-') {
 neg = 1;
 p->pos++;
 }

 if (p->pos >= p->end || !isdigit((unsigned char)*p->pos)) return -1;

 int val = 0;
 while (p->pos < p->end && isdigit((unsigned char)*p->pos)) {
 val = val * 10 + (*p->pos - '0');
 p->pos++;
 }

 *out = neg ? -val : val;
 return 0;
}
384 
/* Skip one JSON value of any type (string, object, array, or scalar).
 * Strings are skipped via json_parse_string so escaped quotes and braces
 * inside string literals cannot confuse the depth counters below.
 * NOTE(review): one source line is elided in this listing right after the
 * signature — presumably json_skip_whitespace(p); confirm in the repo. */
static void json_skip_value(JSONParser *p) {
 if (p->pos >= p->end) return;

 char c = *p->pos;
 if (c == '"') {
 char buf[1024];
 json_parse_string(p, buf, sizeof(buf));
 } else if (c == '{') {
 int depth = 1;
 p->pos++;
 while (p->pos < p->end && depth > 0) {
 if (*p->pos == '{') depth++;
 else if (*p->pos == '}') depth--;
 else if (*p->pos == '"') {
 char buf[1024];
 json_parse_string(p, buf, sizeof(buf));
 continue; /* parse_string already advanced past the closing quote */
 }
 p->pos++;
 }
 } else if (c == '[') {
 int depth = 1;
 p->pos++;
 while (p->pos < p->end && depth > 0) {
 if (*p->pos == '[') depth++;
 else if (*p->pos == ']') depth--;
 else if (*p->pos == '"') {
 char buf[1024];
 json_parse_string(p, buf, sizeof(buf));
 continue; /* same: string already consumed */
 }
 p->pos++;
 }
 } else {
 /* Number, bool, null: consume up to the next delimiter */
 while (p->pos < p->end && !isspace((unsigned char)*p->pos) &&
 *p->pos != ',' && *p->pos != '}' && *p->pos != ']') {
 p->pos++;
 }
 }
}
427 
428 /* ========================================================================== */
429 /* Load from tokenizer.json */
430 /* ========================================================================== */
431 
/*
 * Populate `tok` from a HuggingFace tokenizer.json file.
 *
 * Reads the whole file into memory, then walks the top-level object with
 * the minimal JSON parser above, consuming:
 *   model.vocab   -> {"token": id, ...}          vocab entries
 *   model.merges  -> ["left right", ...]         BPE merge rules
 *   added_tokens  -> [{content, id, special}]    special-token overrides
 * Everything else is skipped. Returns 0 on success, -1 on I/O failure or
 * when the file does not start with '{' (nested anomalies break out of
 * their loop rather than failing the load).
 */
int ck_tokenizer_load(CKTokenizer *tok, const char *path) {
 FILE *f = fopen(path, "rb");
 if (!f) {
 fprintf(stderr, "Failed to open tokenizer: %s\n", path);
 return -1;
 }

 /* Slurp the whole file into one NUL-terminated buffer */
 fseek(f, 0, SEEK_END);
 long size = ftell(f);
 fseek(f, 0, SEEK_SET);

 char *data = (char *)malloc(size + 1);
 if (!data) {
 fclose(f);
 return -1;
 }
 fread(data, 1, size, f); /* NOTE(review): short read / ftell errors unchecked */
 data[size] = '\0';
 fclose(f);

 JSONParser parser = {data, data, data + size};
 JSONParser *p = &parser;

 /* Parse top-level object */
 if (!json_match_char(p, '{')) {
 free(data);
 return -1;
 }

 char key[256];
 while (p->pos < p->end && *p->pos != '}') {
 if (json_parse_string(p, key, sizeof(key)) < 0) break;
 if (!json_match_char(p, ':')) break;

 if (strcmp(key, "model") == 0) {
 /* Parse model object */
 if (!json_match_char(p, '{')) {
 json_skip_value(p);
 json_match_char(p, ',');
 continue;
 }

 while (p->pos < p->end && *p->pos != '}') {
 if (json_parse_string(p, key, sizeof(key)) < 0) break;
 if (!json_match_char(p, ':')) break;

 if (strcmp(key, "vocab") == 0) {
 /* Parse vocab object: {"token": id, ...} */
 if (!json_match_char(p, '{')) {
 json_skip_value(p);
 json_match_char(p, ',');
 continue;
 }

 char token[CK_MAX_TOKEN_LEN];
 while (p->pos < p->end && *p->pos != '}') {
 int token_len = json_parse_string(p, token, sizeof(token));
 if (token_len < 0) break;
 if (!json_match_char(p, ':')) break;

 int id;
 if (json_parse_int(p, &id) < 0) break;

 /* Ensure we have space up to this ID.
  * NOTE(review): this loop only terminates if add_token actually
  * appends a placeholder entry for len == 0 input (advancing
  * vocab_size each call) — verify that contract holds. */
 while (tok->vocab_size <= id) {
 ck_tokenizer_add_token(tok, "", 0);
 }

 /* Add/update token: insert directly so an explicit id wins over
  * any placeholder occupying that slot.
  * NOTE(review): pool_alloc/strdup results are not NULL-checked. */
 uint32_t bucket = hash_string(token, token_len) % tok->vocab_hash_size;
 CKVocabEntry *entry = (CKVocabEntry *)ck_pool_alloc(&tok->pool, sizeof(CKVocabEntry));
 entry->token = ck_pool_strdup(&tok->pool, token, token_len);
 entry->token_len = token_len;
 entry->id = id;
 entry->next = tok->vocab_hash[bucket];
 tok->vocab_hash[bucket] = entry;
 tok->id_to_token[id] = entry->token;
 if (id >= tok->vocab_size) tok->vocab_size = id + 1;

 json_match_char(p, ',');
 }
 json_match_char(p, '}');

 } else if (strcmp(key, "merges") == 0) {
 /* Parse merges array: ["tok1 tok2", ...] */
 if (!json_match_char(p, '[')) {
 json_skip_value(p);
 json_match_char(p, ',');
 continue;
 }

 char merge_str[512];
 while (p->pos < p->end && *p->pos != ']') {
 int merge_len = json_parse_string(p, merge_str, sizeof(merge_str));
 if (merge_len < 0) break;

 /* Parse "token1 token2" — split on the first space */
 char *space = strchr(merge_str, ' ');
 if (space) {
 *space = '\0';
 char *tok1 = merge_str;
 char *tok2 = space + 1;

 int32_t id1 = ck_tokenizer_lookup(tok, tok1, -1);
 int32_t id2 = ck_tokenizer_lookup(tok, tok2, -1);

 /* Create merged token (concatenation of the pair) */
 char merged[512];
 snprintf(merged, sizeof(merged), "%s%s", tok1, tok2);
 int32_t merged_id = ck_tokenizer_lookup(tok, merged, -1);

 if (merged_id == tok->unk_id) {
 merged_id = ck_tokenizer_add_token(tok, merged, -1);
 }

 ck_tokenizer_add_merge(tok, id1, id2, merged_id);
 }

 json_match_char(p, ',');
 }
 json_match_char(p, ']');

 } else {
 json_skip_value(p);
 }

 json_match_char(p, ',');
 }
 json_match_char(p, '}');

 } else if (strcmp(key, "added_tokens") == 0) {
 /* Parse added_tokens array for special tokens */
 if (!json_match_char(p, '[')) {
 json_skip_value(p);
 json_match_char(p, ',');
 continue;
 }

 while (p->pos < p->end && *p->pos != ']') {
 if (!json_match_char(p, '{')) {
 json_skip_value(p);
 json_match_char(p, ',');
 continue;
 }

 char content[256] = "";
 int id = -1;
 bool special = false; /* parsed below but currently unused */

 while (p->pos < p->end && *p->pos != '}') {
 if (json_parse_string(p, key, sizeof(key)) < 0) break;
 if (!json_match_char(p, ':')) break;

 if (strcmp(key, "content") == 0) {
 json_parse_string(p, content, sizeof(content));
 } else if (strcmp(key, "id") == 0) {
 json_parse_int(p, &id);
 } else if (strcmp(key, "special") == 0) {
 /* Peek at 't' (true) before skipping the literal.
  * NOTE(review): a source line is elided in this listing just
  * above — presumably json_skip_whitespace(p); confirm. */
 special = (p->pos < p->end && *p->pos == 't');
 json_skip_value(p);
 } else {
 json_skip_value(p);
 }
 json_match_char(p, ',');
 }
 json_match_char(p, '}');

 if (id >= 0 && content[0]) {
 /* Identify special tokens by their conventional spellings */
 if (strcmp(content, "<unk>") == 0 || strcmp(content, "[UNK]") == 0) {
 tok->unk_id = id;
 } else if (strcmp(content, "<s>") == 0 || strcmp(content, "<bos>") == 0 ||
 strcmp(content, "[BOS]") == 0) {
 tok->bos_id = id;
 } else if (strcmp(content, "</s>") == 0 || strcmp(content, "<eos>") == 0 ||
 strcmp(content, "[EOS]") == 0 || strcmp(content, "<|endoftext|>") == 0) {
 tok->eos_id = id;
 } else if (strcmp(content, "<pad>") == 0 || strcmp(content, "[PAD]") == 0) {
 tok->pad_id = id;
 }
 }

 json_match_char(p, ',');
 }
 json_match_char(p, ']');

 } else {
 json_skip_value(p);
 }

 json_match_char(p, ',');
 }

 free(data);

 printf("Loaded tokenizer: %d tokens, %d merges\n", tok->vocab_size, tok->num_merges);
 printf(" UNK=%d BOS=%d EOS=%d PAD=%d\n", tok->unk_id, tok->bos_id, tok->eos_id, tok->pad_id);

 return 0;
}
633 
634 /* ========================================================================== */
635 /* BPE Encode */
636 /* ========================================================================== */
637 
 const char *text,
 int text_len,
 int32_t *ids,
 int max_ids) {
 /* --- body of ck_tokenizer_encode (first signature line elided in this
  * listing; per the index the full prototype is
  * int ck_tokenizer_encode(const CKTokenizer *tok, const char *text,
  *                         int text_len, int32_t *ids, int max_ids)) ---
  * Byte-level BPE: seed one token per byte (or per UTF-8 sequence when a
  * multi-byte token exists), then repeatedly apply the highest-priority
  * (earliest-listed) merge until none applies. Returns the number of ids
  * written, capped at max_ids; BOS/EOS are added when configured. */
 if (text_len < 0) text_len = (int)strlen(text);

 /* Pre-tokenize: split on whitespace, keep spaces as tokens */
 /* For simplicity, treat each byte as initial token, then apply BPE */

 /* Initial tokens: one per byte.
  * NOTE(review): malloc result is not NULL-checked; text_len == 0 makes
  * this malloc(0), whose pointer is only ever passed to free(). */
 int32_t *tokens = (int32_t *)malloc(text_len * sizeof(int32_t));
 int num_tokens = 0;

 for (int i = 0; i < text_len; i++) {
 /* Look up single-character token */
 char c[2] = {text[i], '\0'};
 int32_t id = ck_tokenizer_lookup(tok, c, 1);

 /* Handle special byte tokens like <0xXX> */
 if (id == tok->unk_id) {
 char byte_token[8];
 snprintf(byte_token, sizeof(byte_token), "<0x%02X>", (unsigned char)text[i]);
 id = ck_tokenizer_lookup(tok, byte_token, -1);
 }

 /* Try UTF-8 multi-byte sequences (lead byte tells the length) */
 if (id == tok->unk_id && (unsigned char)text[i] >= 0x80) {
 int utf8_len = 1;
 if ((text[i] & 0xE0) == 0xC0) utf8_len = 2;
 else if ((text[i] & 0xF0) == 0xE0) utf8_len = 3;
 else if ((text[i] & 0xF8) == 0xF0) utf8_len = 4;

 if (i + utf8_len <= text_len) {
 id = ck_tokenizer_lookup(tok, text + i, utf8_len);
 if (id != tok->unk_id) {
 tokens[num_tokens++] = id;
 i += utf8_len - 1; /* skip the continuation bytes */
 continue;
 }
 }
 }

 tokens[num_tokens++] = id;
 }

 /* Apply BPE merges iteratively until no adjacent pair can merge */
 bool changed = true;
 while (changed && num_tokens > 1) {
 changed = false;

 /* Find best merge (lowest priority number = earliest in merge list) */
 int best_pos = -1;
 int best_priority = tok->num_merges;

 for (int i = 0; i < num_tokens - 1; i++) {
 int merge_idx = ck_tokenizer_lookup_merge(tok, tokens[i], tokens[i + 1]);
 if (merge_idx >= 0 && tok->merges[merge_idx].priority < best_priority) {
 best_pos = i;
 best_priority = tok->merges[merge_idx].priority;
 }
 }

 if (best_pos >= 0) {
 int merge_idx = ck_tokenizer_lookup_merge(tok, tokens[best_pos], tokens[best_pos + 1]);
 tokens[best_pos] = tok->merges[merge_idx].merged;

 /* Shift remaining tokens left by one (O(n); see header TODO #2) */
 for (int i = best_pos + 1; i < num_tokens - 1; i++) {
 tokens[i] = tokens[i + 1];
 }
 num_tokens--;
 changed = true;
 }
 }

 /* Copy to output, bracketing with BOS/EOS when configured */
 int out_len = 0;

 if (tok->add_bos && out_len < max_ids) {
 ids[out_len++] = tok->bos_id;
 }

 for (int i = 0; i < num_tokens && out_len < max_ids; i++) {
 ids[out_len++] = tokens[i];
 }

 if (tok->add_eos && out_len < max_ids) {
 ids[out_len++] = tok->eos_id;
 }

 free(tokens);
 return out_len;
}
732 
733 /* ========================================================================== */
734 /* Decode */
735 /* ========================================================================== */
736 
 const int32_t *ids,
 int num_ids,
 char *text,
 int max_len) {
 /* --- body of ck_tokenizer_decode (first signature line elided in this
  * listing; per the index the full prototype is
  * int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids,
  *                         int num_ids, char *text, int max_len)) ---
  * Concatenates token strings into `text`, dropping BOS/EOS/PAD,
  * expanding <0xXX> byte tokens into raw bytes, and mapping the GPT-style
  * space marker back to ' '. Output is truncated at max_len - 1 and is
  * always NUL-terminated; returns the number of bytes written. */
 int len = 0;

 for (int i = 0; i < num_ids; i++) {
 /* Skip special tokens */
 if (ids[i] == tok->bos_id || ids[i] == tok->eos_id || ids[i] == tok->pad_id) {
 continue;
 }

 const char *token = ck_tokenizer_id_to_token(tok, ids[i]);
 if (!token) continue;

 int token_len = (int)strlen(token);

 /* Handle byte tokens <0xXX>.
  * NOTE(review): token[5] is not checked to be '>' — any 6-char token
  * beginning "<0x" is treated as a byte token. */
 if (token_len == 6 && token[0] == '<' && token[1] == '0' && token[2] == 'x') {
 char hex[3] = {token[3], token[4], 0};
 unsigned int byte = (unsigned int)strtol(hex, NULL, 16);
 if (len < max_len - 1) {
 text[len++] = (char)byte;
 }
 continue;
 }

 /* Handle GPT-style space prefix (Ġ = 0xC4 0xA0 in UTF-8).
  * Safe even for 1-byte tokens: token[1] is then the NUL terminator. */
 const char *src = token;
 if ((unsigned char)token[0] == 0xC4 && (unsigned char)token[1] == 0xA0) {
 if (len < max_len - 1) {
 text[len++] = ' ';
 }
 src = token + 2;
 token_len -= 2;
 }

 /* Copy token body, bounded by the output capacity */
 for (int j = 0; j < token_len && len < max_len - 1; j++) {
 text[len++] = src[j];
 }
 }

 text[len] = '\0';
 return len;
}
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
Definition: ck_tokenizer.c:248
void ck_pool_init(CKMemPool *pool)
Definition: ck_tokenizer.c:51
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len)
Definition: ck_tokenizer.c:227
int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
Definition: ck_tokenizer.c:737
int ck_tokenizer_init(CKTokenizer *tok)
Definition: ck_tokenizer.c:148
static uint32_t hash_string(const char *s, int len)
Definition: ck_tokenizer.c:123
static void json_skip_whitespace(JSONParser *p)
Definition: ck_tokenizer.c:296
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
Definition: ck_tokenizer.c:239
static CKPoolBlock * pool_new_block(size_t capacity)
Definition: ck_tokenizer.c:55
int ck_tokenizer_load(CKTokenizer *tok, const char *path)
Definition: ck_tokenizer.c:432
void * ck_pool_alloc(CKMemPool *pool, size_t size)
Definition: ck_tokenizer.c:69
static int json_match_char(JSONParser *p, char c)
Definition: ck_tokenizer.c:302
char * ck_pool_strdup(CKMemPool *pool, const char *s, int len)
Definition: ck_tokenizer.c:98
int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: ck_tokenizer.c:638
int32_t ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int len)
Definition: ck_tokenizer.c:196
void ck_pool_free(CKMemPool *pool)
Definition: ck_tokenizer.c:107
static int json_parse_string(JSONParser *p, char *buf, int max_len)
Definition: ck_tokenizer.c:311
void ck_tokenizer_free(CKTokenizer *tok)
Definition: ck_tokenizer.c:183
int ck_tokenizer_lookup_merge(const CKTokenizer *tok, int32_t left, int32_t right)
Definition: ck_tokenizer.c:276
static uint32_t hash_pair(int32_t left, int32_t right)
Definition: ck_tokenizer.c:133
static void json_skip_value(JSONParser *p)
Definition: ck_tokenizer.c:385
static int json_parse_int(JSONParser *p, int *out)
Definition: ck_tokenizer.c:363
#define CK_MAX_VOCAB_SIZE
Definition: ck_tokenizer.h:28
#define CK_POOL_BLOCK_SIZE
Definition: ck_tokenizer.h:31
#define CK_MAX_TOKEN_LEN
Definition: ck_tokenizer.h:25
CKPoolBlock * current
Definition: ck_tokenizer.h:46
CKPoolBlock * head
Definition: ck_tokenizer.h:45
size_t total_allocated
Definition: ck_tokenizer.h:47
int32_t left
Definition: ck_tokenizer.h:67
int32_t right
Definition: ck_tokenizer.h:68
int32_t merged
Definition: ck_tokenizer.h:69
uint8_t * data
Definition: ck_tokenizer.h:38
struct CKPoolBlock * next
Definition: ck_tokenizer.h:41
size_t used
Definition: ck_tokenizer.h:39
size_t capacity
Definition: ck_tokenizer.h:40
int32_t bos_id
Definition: ck_tokenizer.h:98
CKMemPool pool
Definition: ck_tokenizer.h:78
int32_t unk_id
Definition: ck_tokenizer.h:97
CKVocabEntry ** vocab_hash
Definition: ck_tokenizer.h:82
int32_t eos_id
Definition: ck_tokenizer.h:99
int vocab_hash_size
Definition: ck_tokenizer.h:83
int merge_hash_size
Definition: ck_tokenizer.h:94
CKMergeRule * merges
Definition: ck_tokenizer.h:89
char ** id_to_token
Definition: ck_tokenizer.h:86
int * merge_hash
Definition: ck_tokenizer.h:93
int32_t pad_id
Definition: ck_tokenizer.h:100
struct CKVocabEntry * next
Definition: ck_tokenizer.h:59
char * token
Definition: ck_tokenizer.h:56
int32_t id
Definition: ck_tokenizer.h:58
static int utf8_len(unsigned char c)
Definition: tokenizer.c:541
const int32_t * ids
Definition: tokenizer.h:443
int32_t id
Definition: tokenizer.h:315
const int32_t int num_ids
Definition: tokenizer.h:444
const char * text
Definition: tokenizer.h:563
const char * token
Definition: tokenizer.h:306
const int32_t int int * out_len
Definition: tokenizer.h:445
const int32_t int char int max_len
Definition: true_bpe.h:280
const char int text_len
Definition: true_bpe.h:262
const char * left
Definition: true_bpe.h:130
int32_t int32_t int32_t merged_id
Definition: true_bpe.h:114
const char int int32_t int max_ids
Definition: true_bpe.h:264
const char const char * right
Definition: true_bpe.h:131
uint32_t end
Definition: utf8.c:215