← Back to C-Kernel-Engine Docs Doxygen Source Documentation
v2_legacy/ck_tokenizer_v2.c
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <stdint.h>
4 #include <string.h>
5 #include <ctype.h>
6 
7 #include "ck_tokenizer.h"
8 
/* Compute the 32-bit FNV-1a hash of the first len bytes of s. */
static uint32_t hash_string(const char *s, int len) {
    uint32_t h = 2166136261u;                 /* FNV-1a 32-bit offset basis */
    const uint8_t *p = (const uint8_t *)s;
    const uint8_t *end = p + len;
    while (p < end) {
        h ^= *p++;                            /* FNV-1a: xor the byte first... */
        h *= 16777619u;                       /* ...then multiply by the FNV prime */
    }
    return h;
}
17 
19  int vocab_size,
20  const int32_t *offsets,
21  const char *strings,
22  int num_merges,
23  const int32_t *merges) {
24  if (!tok || !offsets || !strings) return -1;
25 
26  // We assume ck_tokenizer_init was already called to alloc hash tables
27  tok->vocab_size = 0;
28 
29  for (int i = 0; i < vocab_size; i++) {
30  const char *token = strings + offsets[i];
31  int len = (int)strlen(token);
32 
33  CKVocabEntry *entry = (CKVocabEntry *)ck_pool_alloc(&tok->pool, sizeof(CKVocabEntry));
34  entry->token = (char *)token;
35  entry->token_len = len;
36  entry->id = i;
37 
38  uint32_t bucket = hash_string(token, len) % tok->vocab_hash_size;
39  entry->next = tok->vocab_hash[bucket];
40  tok->vocab_hash[bucket] = entry;
41 
42  tok->id_to_token[i] = entry->token;
43  tok->vocab_size++;
44  }
45 
46  if (merges && num_merges > 0) {
47  for (int i = 0; i < num_merges; i++) {
48  int32_t left = merges[i*3 + 0];
49  int32_t right = merges[i*3 + 1];
50  int32_t merged = merges[i*3 + 2];
51  ck_tokenizer_add_merge(tok, left, right, merged);
52  }
53  }
54 
55  return 0;
56 }
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
Definition: ck_tokenizer.c:248
void * ck_pool_alloc(CKMemPool *pool, size_t size)
Definition: ck_tokenizer.c:69
CKMemPool pool
Definition: ck_tokenizer.h:78
CKVocabEntry ** vocab_hash
Definition: ck_tokenizer.h:82
int vocab_hash_size
Definition: ck_tokenizer.h:83
char ** id_to_token
Definition: ck_tokenizer.h:86
struct CKVocabEntry * next
Definition: ck_tokenizer.h:59
char * token
Definition: ck_tokenizer.h:56
int32_t id
Definition: ck_tokenizer.h:58
const char * token
Definition: tokenizer.h:306
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int vocab_size
Definition: true_bpe.h:185
int const int32_t * offsets
Definition: true_bpe.h:186
const char * left
Definition: true_bpe.h:130
const char const char * right
Definition: true_bpe.h:131
int ck_tokenizer_load_binary(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
static uint32_t hash_string(const char *s, int len)