← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ck_tokenizer.h File Reference
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

Go to the source code of this file.

Data Structures

struct  CKMemPool
 
struct  CKMergeRule
 
struct  CKPoolBlock
 
struct  CKTokenizer
 
struct  CKVocabEntry
 

Macros

#define CK_MAX_TOKEN_LEN   256
 
#define CK_MAX_VOCAB_SIZE   256000
 
#define CK_POOL_BLOCK_SIZE   (1024 * 1024) /* 1MB */
 

Functions

void * ck_pool_alloc (CKMemPool *pool, size_t size)
 
void ck_pool_free (CKMemPool *pool)
 
void ck_pool_init (CKMemPool *pool)
 
char * ck_pool_strdup (CKMemPool *pool, const char *s, int len)
 
int ck_tokenizer_add_merge (CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
 
int32_t ck_tokenizer_add_token (CKTokenizer *tok, const char *token, int len)
 
int ck_tokenizer_decode (const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
 
int ck_tokenizer_encode (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
 
void ck_tokenizer_free (CKTokenizer *tok)
 
const char * ck_tokenizer_id_to_token (const CKTokenizer *tok, int32_t id)
 
int ck_tokenizer_init (CKTokenizer *tok)
 
int ck_tokenizer_load (CKTokenizer *tok, const char *path)
 
int32_t ck_tokenizer_lookup (const CKTokenizer *tok, const char *token, int len)
 
int ck_tokenizer_lookup_merge (const CKTokenizer *tok, int32_t left, int32_t right)
 
static int ck_tokenizer_vocab_size (const CKTokenizer *tok)
 

Macro Definition Documentation

◆ CK_MAX_TOKEN_LEN

#define CK_MAX_TOKEN_LEN   256

Definition at line 25 of file ck_tokenizer.h.

◆ CK_MAX_VOCAB_SIZE

#define CK_MAX_VOCAB_SIZE   256000

Definition at line 28 of file ck_tokenizer.h.

◆ CK_POOL_BLOCK_SIZE

#define CK_POOL_BLOCK_SIZE   (1024 * 1024) /* 1MB */

Definition at line 31 of file ck_tokenizer.h.

Function Documentation

◆ ck_pool_alloc()

/*
 * Bump-allocate `size` bytes from the pool, rounded up to 8-byte alignment.
 * Returns NULL only when a new backing block cannot be obtained.
 */
void *ck_pool_alloc(CKMemPool *pool, size_t size)
{
    /* Round the request up to an 8-byte boundary. */
    size = (size + 7) & ~7;

    CKPoolBlock *cur = pool->current;

    /* Fast path: the active block still has room. */
    if (cur && cur->used + size <= cur->capacity) {
        void *out = cur->data + cur->used;
        cur->used += size;
        pool->total_allocated += size;
        return out;
    }

    /* Slow path: allocate a fresh block, oversized for big requests. */
    size_t cap = (size > CK_POOL_BLOCK_SIZE) ? size : CK_POOL_BLOCK_SIZE;
    CKPoolBlock *fresh = pool_new_block(cap);
    if (!fresh)
        return NULL;

    /* Push the new block onto the list and make it current. */
    fresh->next = pool->head;
    pool->head = fresh;
    pool->current = fresh;

    fresh->used = size;
    pool->total_allocated += size;
    return fresh->data;
}
static CKPoolBlock * pool_new_block(size_t capacity)
Definition: ck_tokenizer.c:55
#define CK_POOL_BLOCK_SIZE
Definition: ck_tokenizer.h:31
CKPoolBlock * current
Definition: ck_tokenizer.h:46
CKPoolBlock * head
Definition: ck_tokenizer.h:45
size_t total_allocated
Definition: ck_tokenizer.h:47
uint8_t * data
Definition: ck_tokenizer.h:38
struct CKPoolBlock * next
Definition: ck_tokenizer.h:41
size_t used
Definition: ck_tokenizer.h:39

References CK_POOL_BLOCK_SIZE, CKMemPool::current, CKPoolBlock::data, CKMemPool::head, CKPoolBlock::next, pool_new_block(), CKMemPool::total_allocated, and CKPoolBlock::used.

Referenced by ck_pool_strdup(), ck_tokenizer_add_token(), ck_tokenizer_load(), and ck_tokenizer_load_binary().

◆ ck_pool_free()

/*
 * Release every block owned by the pool and reset it to the zeroed,
 * re-initializable state.
 */
void ck_pool_free(CKMemPool *pool)
{
    /* Walk the singly linked block list, freeing each block's storage. */
    for (CKPoolBlock *b = pool->head; b != NULL; ) {
        CKPoolBlock *nxt = b->next;
        free(b->data);
        free(b);
        b = nxt;
    }
    memset(pool, 0, sizeof(*pool));
}

References CKPoolBlock::data, CKMemPool::head, and CKPoolBlock::next.

Referenced by ck_tokenizer_free().

◆ ck_pool_init()

/* Initialize a memory pool to the empty state (no blocks, zero counters). */
void ck_pool_init(CKMemPool *pool) {
    memset(pool, 0, sizeof(*pool));
}

Referenced by ck_tokenizer_init().

◆ ck_pool_strdup()

/*
 * Copy `len` bytes of `s` (len < 0 means "use strlen(s)") into pool-owned
 * storage, NUL-terminated.  The returned string lives until ck_pool_free;
 * NULL on allocation failure.
 */
char *ck_pool_strdup(CKMemPool *pool, const char *s, int len)
{
    if (len < 0)
        len = (int)strlen(s);

    char *dup = (char *)ck_pool_alloc(pool, (size_t)len + 1);
    if (dup == NULL)
        return NULL;

    memcpy(dup, s, (size_t)len);
    dup[len] = '\0';
    return dup;
}
void * ck_pool_alloc(CKMemPool *pool, size_t size)
Definition: ck_tokenizer.c:69

References ck_pool_alloc().

Referenced by ck_tokenizer_add_token(), and ck_tokenizer_load().

◆ ck_tokenizer_add_merge()

/*
 * Register a BPE merge rule (left, right) -> merged.
 *
 * The rule's priority is its insertion index, so rules added earlier win
 * during encoding.  Returns 0 on success, -1 on allocation failure or when
 * the merge hash table has no free bucket left.
 */
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
{
    int idx = tok->num_merges;

    /* FIX: once every bucket of the fixed-size open-addressing table is
     * occupied, the probing loop below would spin forever.  Refuse the
     * insert instead. */
    if (idx >= tok->merge_hash_size) return -1;

    /* Grow the merges array in 4096-entry chunks (idx starts at 0, so the
     * first insert always allocates). */
    if (idx % 4096 == 0) {
        size_t new_cap = (size_t)(idx + 4096) * sizeof(CKMergeRule);
        CKMergeRule *new_merges = (CKMergeRule *)realloc(tok->merges, new_cap);
        if (!new_merges) return -1;
        tok->merges = new_merges;
    }

    tok->merges[idx].left = left;
    tok->merges[idx].right = right;
    tok->merges[idx].merged = merged;
    tok->merges[idx].priority = idx; /* Earlier = higher priority */

    /* Insert into the hash table with linear probing; -1 marks empty. */
    uint32_t bucket = hash_pair(left, right) % tok->merge_hash_size;
    while (tok->merge_hash[bucket] >= 0) {
        bucket = (bucket + 1) % tok->merge_hash_size;
    }
    tok->merge_hash[bucket] = idx;

    tok->num_merges++;
    return 0;
}
static uint32_t hash_pair(int32_t left, int32_t right)
Definition: ck_tokenizer.c:133
int32_t left
Definition: ck_tokenizer.h:67
int32_t right
Definition: ck_tokenizer.h:68
int32_t merged
Definition: ck_tokenizer.h:69
int merge_hash_size
Definition: ck_tokenizer.h:94
CKMergeRule * merges
Definition: ck_tokenizer.h:89
int * merge_hash
Definition: ck_tokenizer.h:93
const char * left
Definition: true_bpe.h:130
const char const char * right
Definition: true_bpe.h:131

References hash_pair(), CKMergeRule::left, left, CKTokenizer::merge_hash, CKTokenizer::merge_hash_size, CKMergeRule::merged, CKTokenizer::merges, CKTokenizer::num_merges, CKMergeRule::priority, CKMergeRule::right, and right.

Referenced by ck_tokenizer_load(), and ck_tokenizer_load_binary().

◆ ck_tokenizer_add_token()

/*
 * Add `token` (len < 0 means strlen) to the vocabulary and return its id.
 * If the token already exists, its existing id is returned; empty tokens
 * are never inserted.  Returns -1 when the vocab is full or allocation
 * fails.
 *
 * NOTE: "already exists" is detected by lookup returning something other
 * than unk_id, so the unk token itself is indistinguishable from
 * "not found" here.
 */
int32_t ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int len)
{
    if (len < 0) len = (int)strlen(token);
    if (tok->vocab_size >= CK_MAX_VOCAB_SIZE) return -1;

    int32_t found = ck_tokenizer_lookup(tok, token, len);
    if (found != tok->unk_id || len == 0) {
        return found;
    }

    /* Allocate the entry and its string from the pool (freed all at once). */
    CKVocabEntry *e = (CKVocabEntry *)ck_pool_alloc(&tok->pool, sizeof(CKVocabEntry));
    if (e == NULL) return -1;

    e->token = ck_pool_strdup(&tok->pool, token, len);
    if (e->token == NULL) return -1;
    e->token_len = len;
    e->id = tok->vocab_size;

    /* Chain into the string -> id hash bucket. */
    uint32_t b = hash_string(token, len) % tok->vocab_hash_size;
    e->next = tok->vocab_hash[b];
    tok->vocab_hash[b] = e;

    /* Maintain the id -> string reverse table. */
    tok->id_to_token[tok->vocab_size] = e->token;

    tok->vocab_size++;
    return e->id;
}
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len)
Definition: ck_tokenizer.c:227
static uint32_t hash_string(const char *s, int len)
Definition: ck_tokenizer.c:123
char * ck_pool_strdup(CKMemPool *pool, const char *s, int len)
Definition: ck_tokenizer.c:98
#define CK_MAX_VOCAB_SIZE
Definition: ck_tokenizer.h:28
CKMemPool pool
Definition: ck_tokenizer.h:78
int32_t unk_id
Definition: ck_tokenizer.h:97
CKVocabEntry ** vocab_hash
Definition: ck_tokenizer.h:82
int vocab_hash_size
Definition: ck_tokenizer.h:83
char ** id_to_token
Definition: ck_tokenizer.h:86
struct CKVocabEntry * next
Definition: ck_tokenizer.h:59
char * token
Definition: ck_tokenizer.h:56
int32_t id
Definition: ck_tokenizer.h:58
const char * token
Definition: tokenizer.h:306

References CK_MAX_VOCAB_SIZE, ck_pool_alloc(), ck_pool_strdup(), ck_tokenizer_lookup(), hash_string(), CKVocabEntry::id, CKTokenizer::id_to_token, CKVocabEntry::next, CKTokenizer::pool, CKVocabEntry::token, token, CKVocabEntry::token_len, CKTokenizer::unk_id, CKTokenizer::vocab_hash, CKTokenizer::vocab_hash_size, and CKTokenizer::vocab_size.

Referenced by ck_tokenizer_load(), and main().

◆ ck_tokenizer_decode()

int ck_tokenizer_decode ( const CKTokenizer tok,
const int32_t *  ids,
int  num_ids,
char *  text,
int  max_len 
)

Definition at line 737 of file ck_tokenizer.c.

741  {
742  int len = 0;
743 
744  for (int i = 0; i < num_ids; i++) {
745  /* Skip special tokens */
746  if (ids[i] == tok->bos_id || ids[i] == tok->eos_id || ids[i] == tok->pad_id) {
747  continue;
748  }
749 
750  const char *token = ck_tokenizer_id_to_token(tok, ids[i]);
751  if (!token) continue;
752 
753  int token_len = (int)strlen(token);
754 
755  /* Handle byte tokens <0xXX> */
756  if (token_len == 6 && token[0] == '<' && token[1] == '0' && token[2] == 'x') {
757  char hex[3] = {token[3], token[4], 0};
758  unsigned int byte = (unsigned int)strtol(hex, NULL, 16);
759  if (len < max_len - 1) {
760  text[len++] = (char)byte;
761  }
762  continue;
763  }
764 
765  /* Handle GPT-style space prefix (Ġ = 0xC4 0xA0 in UTF-8) */
766  const char *src = token;
767  if ((unsigned char)token[0] == 0xC4 && (unsigned char)token[1] == 0xA0) {
768  if (len < max_len - 1) {
769  text[len++] = ' ';
770  }
771  src = token + 2;
772  token_len -= 2;
773  }
774 
775  /* Copy token */
776  for (int j = 0; j < token_len && len < max_len - 1; j++) {
777  text[len++] = src[j];
778  }
779  }
780 
781  text[len] = '\0';
782  return len;
783 }
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
Definition: ck_tokenizer.c:239
int32_t bos_id
Definition: ck_tokenizer.h:98
int32_t eos_id
Definition: ck_tokenizer.h:99
int32_t pad_id
Definition: ck_tokenizer.h:100
const int32_t * ids
Definition: tokenizer.h:443
const int32_t int num_ids
Definition: tokenizer.h:444
const char * text
Definition: tokenizer.h:563
const int32_t int char int max_len
Definition: true_bpe.h:280

References CKTokenizer::bos_id, ck_tokenizer_id_to_token(), CKTokenizer::eos_id, ids, max_len, num_ids, CKTokenizer::pad_id, text, and token.

Referenced by main().

◆ ck_tokenizer_encode()

int ck_tokenizer_encode ( const CKTokenizer tok,
const char *  text,
int  text_len,
int32_t *  ids,
int  max_ids 
)

Definition at line 638 of file ck_tokenizer.c.

642  {
643  if (text_len < 0) text_len = (int)strlen(text);
644 
645  /* Pre-tokenize: split on whitespace, keep spaces as tokens */
646  /* For simplicity, treat each byte as initial token, then apply BPE */
647 
648  /* Initial tokens: one per byte */
649  int32_t *tokens = (int32_t *)malloc(text_len * sizeof(int32_t));
650  int num_tokens = 0;
651 
652  for (int i = 0; i < text_len; i++) {
653  /* Look up single-character token */
654  char c[2] = {text[i], '\0'};
655  int32_t id = ck_tokenizer_lookup(tok, c, 1);
656 
657  /* Handle special byte tokens like <0xXX> */
658  if (id == tok->unk_id) {
659  char byte_token[8];
660  snprintf(byte_token, sizeof(byte_token), "<0x%02X>", (unsigned char)text[i]);
661  id = ck_tokenizer_lookup(tok, byte_token, -1);
662  }
663 
664  /* Try UTF-8 multi-byte sequences */
665  if (id == tok->unk_id && (unsigned char)text[i] >= 0x80) {
666  int utf8_len = 1;
667  if ((text[i] & 0xE0) == 0xC0) utf8_len = 2;
668  else if ((text[i] & 0xF0) == 0xE0) utf8_len = 3;
669  else if ((text[i] & 0xF8) == 0xF0) utf8_len = 4;
670 
671  if (i + utf8_len <= text_len) {
672  id = ck_tokenizer_lookup(tok, text + i, utf8_len);
673  if (id != tok->unk_id) {
674  tokens[num_tokens++] = id;
675  i += utf8_len - 1;
676  continue;
677  }
678  }
679  }
680 
681  tokens[num_tokens++] = id;
682  }
683 
684  /* Apply BPE merges iteratively */
685  bool changed = true;
686  while (changed && num_tokens > 1) {
687  changed = false;
688 
689  /* Find best merge (lowest priority = earliest in merge list) */
690  int best_pos = -1;
691  int best_priority = tok->num_merges;
692 
693  for (int i = 0; i < num_tokens - 1; i++) {
694  int merge_idx = ck_tokenizer_lookup_merge(tok, tokens[i], tokens[i + 1]);
695  if (merge_idx >= 0 && tok->merges[merge_idx].priority < best_priority) {
696  best_pos = i;
697  best_priority = tok->merges[merge_idx].priority;
698  }
699  }
700 
701  if (best_pos >= 0) {
702  int merge_idx = ck_tokenizer_lookup_merge(tok, tokens[best_pos], tokens[best_pos + 1]);
703  tokens[best_pos] = tok->merges[merge_idx].merged;
704 
705  /* Shift remaining tokens */
706  for (int i = best_pos + 1; i < num_tokens - 1; i++) {
707  tokens[i] = tokens[i + 1];
708  }
709  num_tokens--;
710  changed = true;
711  }
712  }
713 
714  /* Copy to output */
715  int out_len = 0;
716 
717  if (tok->add_bos && out_len < max_ids) {
718  ids[out_len++] = tok->bos_id;
719  }
720 
721  for (int i = 0; i < num_tokens && out_len < max_ids; i++) {
722  ids[out_len++] = tokens[i];
723  }
724 
725  if (tok->add_eos && out_len < max_ids) {
726  ids[out_len++] = tok->eos_id;
727  }
728 
729  free(tokens);
730  return out_len;
731 }
int ck_tokenizer_lookup_merge(const CKTokenizer *tok, int32_t left, int32_t right)
Definition: ck_tokenizer.c:276
static int utf8_len(unsigned char c)
Definition: tokenizer.c:541
int32_t id
Definition: tokenizer.h:315
const int32_t int int * out_len
Definition: tokenizer.h:445
const char int text_len
Definition: true_bpe.h:262
const char int int32_t int max_ids
Definition: true_bpe.h:264

References CKTokenizer::add_bos, CKTokenizer::add_eos, CKTokenizer::bos_id, ck_tokenizer_lookup(), ck_tokenizer_lookup_merge(), CKTokenizer::eos_id, id, ids, max_ids, CKMergeRule::merged, CKTokenizer::merges, CKTokenizer::num_merges, out_len, CKMergeRule::priority, text, text_len, CKTokenizer::unk_id, and utf8_len().

Referenced by main(), and run_inference().

◆ ck_tokenizer_free()

void ck_tokenizer_free ( CKTokenizer tok)

◆ ck_tokenizer_id_to_token()

const char* ck_tokenizer_id_to_token ( const CKTokenizer tok,
int32_t  id 
)

Definition at line 239 of file ck_tokenizer.c.

239  {
240  if (id < 0 || id >= tok->vocab_size) return NULL;
241  return tok->id_to_token[id];
242 }

References id, CKTokenizer::id_to_token, and CKTokenizer::vocab_size.

Referenced by ck_tokenizer_decode(), ck_tokenizer_encode_spm_impl(), main(), and run_inference().

◆ ck_tokenizer_init()

/*
 * Initialize a tokenizer: zero all state, set default special-token ids
 * (unk=0, bos=1, eos=2, pad=3) and allocate the vocab hash table, the
 * reverse id->string table, and the merge hash table.
 * Returns 0 on success, -1 on allocation failure (no allocations leak and
 * no dangling pointers remain in *tok).
 */
int ck_tokenizer_init(CKTokenizer *tok)
{
    memset(tok, 0, sizeof(*tok));
    ck_pool_init(&tok->pool);

    /* Default special tokens */
    tok->unk_id = 0;
    tok->bos_id = 1;
    tok->eos_id = 2;
    tok->pad_id = 3;

    /* Vocab hash table: 64K buckets of entry-chain heads. */
    tok->vocab_hash_size = 65536;
    tok->vocab_hash = (CKVocabEntry **)calloc(tok->vocab_hash_size, sizeof(CKVocabEntry *));
    if (!tok->vocab_hash) return -1;

    /* Reverse vocab (id -> string). */
    tok->id_to_token = (char **)calloc(CK_MAX_VOCAB_SIZE, sizeof(char *));
    if (!tok->id_to_token) goto fail;

    /* Merge hash table: 256K buckets, -1 (all bytes 0xFF) marks empty. */
    tok->merge_hash_size = 262144;
    tok->merge_hash = (int *)malloc((size_t)tok->merge_hash_size * sizeof(int));
    if (!tok->merge_hash) goto fail;
    memset(tok->merge_hash, -1, (size_t)tok->merge_hash_size * sizeof(int));

    return 0;

fail:
    /* FIX: the original freed these but left the dangling pointers in *tok;
     * NULL them so a failed init cannot cause a later use-after-free.
     * free(NULL) is a no-op, so the shared cleanup path is safe. */
    free(tok->vocab_hash);
    tok->vocab_hash = NULL;
    free(tok->id_to_token);
    tok->id_to_token = NULL;
    return -1;
}
void ck_pool_init(CKMemPool *pool)
Definition: ck_tokenizer.c:51

References CKTokenizer::bos_id, CK_MAX_VOCAB_SIZE, ck_pool_init(), CKTokenizer::eos_id, CKTokenizer::id_to_token, CKTokenizer::merge_hash, CKTokenizer::merge_hash_size, CKTokenizer::pad_id, CKTokenizer::pool, CKTokenizer::unk_id, CKTokenizer::vocab_hash, and CKTokenizer::vocab_hash_size.

Referenced by run_inference().

◆ ck_tokenizer_load()

/*
 * Load a HuggingFace-style tokenizer.json from `path`.
 *
 * Parses (with the file-local minimal JSON parser):
 *   - "model"."vocab"   : {"token": id, ...} entries, inserted directly into
 *                         the vocab hash with their file-assigned ids;
 *   - "model"."merges"  : ["tok1 tok2", ...] BPE rules, registered via
 *                         ck_tokenizer_add_merge (the merged token is added
 *                         to the vocab if not already present);
 *   - "added_tokens"    : scanned for <unk>/<s>/</s>/<pad>-style entries to
 *                         set the special-token ids.
 * All other keys are skipped.  Returns 0 on success, -1 on open/alloc/
 * top-level-parse failure.  Parse errors inside sections are tolerated
 * (loops just break), so a malformed file can still "succeed" partially.
 */
int ck_tokenizer_load(CKTokenizer *tok, const char *path) {
    FILE *f = fopen(path, "rb");
    if (!f) {
        fprintf(stderr, "Failed to open tokenizer: %s\n", path);
        return -1;
    }

    /* Slurp the whole file into one NUL-terminated buffer. */
    fseek(f, 0, SEEK_END);
    long size = ftell(f);
    fseek(f, 0, SEEK_SET);

    char *data = (char *)malloc(size + 1);
    if (!data) {
        fclose(f);
        return -1;
    }
    /* NOTE(review): fread's return value is not checked; a short read would
     * silently truncate the JSON. */
    fread(data, 1, size, f);
    data[size] = '\0';
    fclose(f);

    /* Cursor-style parser over [data, data+size). */
    JSONParser parser = {data, data, data + size};
    JSONParser *p = &parser;

    /* Parse top-level object */
    if (!json_match_char(p, '{')) {
        free(data);
        return -1;
    }

    char key[256];
    while (p->pos < p->end && *p->pos != '}') {
        if (json_parse_string(p, key, sizeof(key)) < 0) break;
        if (!json_match_char(p, ':')) break;

        if (strcmp(key, "model") == 0) {
            /* Parse model object */
            if (!json_match_char(p, '{')) {
                json_skip_value(p);
                json_match_char(p, ',');
                continue;
            }

            while (p->pos < p->end && *p->pos != '}') {
                if (json_parse_string(p, key, sizeof(key)) < 0) break;
                if (!json_match_char(p, ':')) break;

                if (strcmp(key, "vocab") == 0) {
                    /* Parse vocab object: {"token": id, ...} */
                    if (!json_match_char(p, '{')) {
                        json_skip_value(p);
                        json_match_char(p, ',');
                        continue;
                    }

                    char token[CK_MAX_TOKEN_LEN];
                    while (p->pos < p->end && *p->pos != '}') {
                        int token_len = json_parse_string(p, token, sizeof(token));
                        if (token_len < 0) break;
                        if (!json_match_char(p, ':')) break;

                        int id;
                        if (json_parse_int(p, &id) < 0) break;

                        /* Pad the vocab with empty placeholders so the
                         * reverse table has slots up to this id. */
                        while (tok->vocab_size <= id) {
                            ck_tokenizer_add_token(tok, "", 0);
                        }

                        /* Add/update token: insert a fresh hash entry bound
                         * to the file-assigned id (bypasses add_token so the
                         * id is preserved). */
                        /* NOTE(review): the ck_pool_alloc/ck_pool_strdup
                         * results are not checked for NULL here, unlike in
                         * ck_tokenizer_add_token. */
                        uint32_t bucket = hash_string(token, token_len) % tok->vocab_hash_size;
                        CKVocabEntry *entry = (CKVocabEntry *)ck_pool_alloc(&tok->pool, sizeof(CKVocabEntry));
                        entry->token = ck_pool_strdup(&tok->pool, token, token_len);
                        entry->token_len = token_len;
                        entry->id = id;
                        entry->next = tok->vocab_hash[bucket];
                        tok->vocab_hash[bucket] = entry;
                        tok->id_to_token[id] = entry->token;
                        if (id >= tok->vocab_size) tok->vocab_size = id + 1;

                        json_match_char(p, ',');
                    }
                    json_match_char(p, '}');

                } else if (strcmp(key, "merges") == 0) {
                    /* Parse merges array: ["tok1 tok2", ...] */
                    if (!json_match_char(p, '[')) {
                        json_skip_value(p);
                        json_match_char(p, ',');
                        continue;
                    }

                    char merge_str[512];
                    while (p->pos < p->end && *p->pos != ']') {
                        int merge_len = json_parse_string(p, merge_str, sizeof(merge_str));
                        if (merge_len < 0) break;

                        /* Split "token1 token2" at the first space. */
                        char *space = strchr(merge_str, ' ');
                        if (space) {
                            *space = '\0';
                            char *tok1 = merge_str;
                            char *tok2 = space + 1;

                            int32_t id1 = ck_tokenizer_lookup(tok, tok1, -1);
                            int32_t id2 = ck_tokenizer_lookup(tok, tok2, -1);

                            /* The merge result is the concatenation; add it
                             * to the vocab when the lookup misses (an unk_id
                             * result is treated as "not found"). */
                            char merged[512];
                            snprintf(merged, sizeof(merged), "%s%s", tok1, tok2);
                            int32_t merged_id = ck_tokenizer_lookup(tok, merged, -1);

                            if (merged_id == tok->unk_id) {
                                merged_id = ck_tokenizer_add_token(tok, merged, -1);
                            }

                            ck_tokenizer_add_merge(tok, id1, id2, merged_id);
                        }

                        json_match_char(p, ',');
                    }
                    json_match_char(p, ']');

                } else {
                    json_skip_value(p);
                }

                json_match_char(p, ',');
            }
            json_match_char(p, '}');

        } else if (strcmp(key, "added_tokens") == 0) {
            /* Parse added_tokens array for special tokens */
            if (!json_match_char(p, '[')) {
                json_skip_value(p);
                json_match_char(p, ',');
                continue;
            }

            while (p->pos < p->end && *p->pos != ']') {
                if (!json_match_char(p, '{')) {
                    json_skip_value(p);
                    json_match_char(p, ',');
                    continue;
                }

                char content[256] = "";
                int id = -1;
                bool special = false;  /* NOTE(review): set but never read below */

                while (p->pos < p->end && *p->pos != '}') {
                    if (json_parse_string(p, key, sizeof(key)) < 0) break;
                    if (!json_match_char(p, ':')) break;

                    if (strcmp(key, "content") == 0) {
                        json_parse_string(p, content, sizeof(content));
                    } else if (strcmp(key, "id") == 0) {
                        json_parse_int(p, &id);
                    } else if (strcmp(key, "special") == 0) {
                        /* NOTE(review): the rendered source elides line 590
                         * here — most likely a json_skip_whitespace(p) call
                         * (it appears in this function's reference list);
                         * verify against ck_tokenizer.c. */
                        special = (p->pos < p->end && *p->pos == 't');
                        json_skip_value(p);
                    } else {
                        json_skip_value(p);
                    }
                    json_match_char(p, ',');
                }
                json_match_char(p, '}');

                if (id >= 0 && content[0]) {
                    /* Identify special tokens by their conventional names. */
                    if (strcmp(content, "<unk>") == 0 || strcmp(content, "[UNK]") == 0) {
                        tok->unk_id = id;
                    } else if (strcmp(content, "<s>") == 0 || strcmp(content, "<bos>") == 0 ||
                               strcmp(content, "[BOS]") == 0) {
                        tok->bos_id = id;
                    } else if (strcmp(content, "</s>") == 0 || strcmp(content, "<eos>") == 0 ||
                               strcmp(content, "[EOS]") == 0 || strcmp(content, "<|endoftext|>") == 0) {
                        tok->eos_id = id;
                    } else if (strcmp(content, "<pad>") == 0 || strcmp(content, "[PAD]") == 0) {
                        tok->pad_id = id;
                    }
                }

                json_match_char(p, ',');
            }
            json_match_char(p, ']');

        } else {
            json_skip_value(p);
        }

        json_match_char(p, ',');
    }

    free(data);

    printf("Loaded tokenizer: %d tokens, %d merges\n", tok->vocab_size, tok->num_merges);
    printf(" UNK=%d BOS=%d EOS=%d PAD=%d\n", tok->unk_id, tok->bos_id, tok->eos_id, tok->pad_id);

    return 0;
}
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
Definition: ck_tokenizer.c:248
static void json_skip_whitespace(JSONParser *p)
Definition: ck_tokenizer.c:296
static int json_match_char(JSONParser *p, char c)
Definition: ck_tokenizer.c:302
int32_t ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int len)
Definition: ck_tokenizer.c:196
static int json_parse_string(JSONParser *p, char *buf, int max_len)
Definition: ck_tokenizer.c:311
static void json_skip_value(JSONParser *p)
Definition: ck_tokenizer.c:385
static int json_parse_int(JSONParser *p, int *out)
Definition: ck_tokenizer.c:363
#define CK_MAX_TOKEN_LEN
Definition: ck_tokenizer.h:25
int32_t int32_t int32_t merged_id
Definition: true_bpe.h:114

References CKTokenizer::bos_id, CK_MAX_TOKEN_LEN, ck_pool_alloc(), ck_pool_strdup(), ck_tokenizer_add_merge(), ck_tokenizer_add_token(), ck_tokenizer_lookup(), CKTokenizer::eos_id, hash_string(), CKVocabEntry::id, id, CKTokenizer::id_to_token, json_match_char(), json_parse_int(), json_parse_string(), json_skip_value(), json_skip_whitespace(), merged_id, CKVocabEntry::next, CKTokenizer::num_merges, CKTokenizer::pad_id, CKTokenizer::pool, CKVocabEntry::token, token, CKVocabEntry::token_len, CKTokenizer::unk_id, CKTokenizer::vocab_hash, CKTokenizer::vocab_hash_size, and CKTokenizer::vocab_size.

Referenced by run_inference().

◆ ck_tokenizer_lookup()

int32_t ck_tokenizer_lookup ( const CKTokenizer tok,
const char *  token,
int  len 
)

Definition at line 227 of file ck_tokenizer.c.

227  {
228  if (len < 0) len = (int)strlen(token);
229  uint32_t bucket = hash_string(token, len) % tok->vocab_hash_size;
230 
231  for (CKVocabEntry *e = tok->vocab_hash[bucket]; e; e = e->next) {
232  if (e->token_len == len && memcmp(e->token, token, len) == 0) {
233  return e->id;
234  }
235  }
236  return tok->unk_id;
237 }

References hash_string(), CKVocabEntry::next, token, CKTokenizer::unk_id, CKTokenizer::vocab_hash, and CKTokenizer::vocab_hash_size.

Referenced by ck_tokenizer_add_token(), ck_tokenizer_encode(), ck_tokenizer_load(), and main().

◆ ck_tokenizer_lookup_merge()

int ck_tokenizer_lookup_merge ( const CKTokenizer tok,
int32_t  left,
int32_t  right 
)

Definition at line 276 of file ck_tokenizer.c.

276  {
277  uint32_t bucket = hash_pair(left, right) % tok->merge_hash_size;
278 
279  /* Linear probing */
280  int probes = 0;
281  while (tok->merge_hash[bucket] >= 0 && probes < tok->merge_hash_size) {
282  int idx = tok->merge_hash[bucket];
283  if (tok->merges[idx].left == left && tok->merges[idx].right == right) {
284  return idx;
285  }
286  bucket = (bucket + 1) % tok->merge_hash_size;
287  probes++;
288  }
289  return -1;
290 }

References hash_pair(), CKMergeRule::left, left, CKTokenizer::merge_hash, CKTokenizer::merge_hash_size, CKTokenizer::merges, CKMergeRule::right, and right.

Referenced by ck_tokenizer_encode().

◆ ck_tokenizer_vocab_size()

/* Return the number of tokens currently in the vocabulary. */
static inline int ck_tokenizer_vocab_size(const CKTokenizer *tok) {
    return tok->vocab_size;
}

References CKTokenizer::vocab_size.

Referenced by main(), and run_inference().