← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ck_tokenizer.h
Go to the documentation of this file.
1 /*
2  * C-Kernel-Engine BPE Tokenizer
3  *
4  * Pure C implementation of Byte-Pair Encoding tokenizer
5  * compatible with HuggingFace tokenizer.json format.
6  *
7  * The tokenizer uses a memory pool for allocations and maps tokens
8  * directly to dense embedding indices (token_id == embedding row).
9  *
10  * By Anthony Shivakumar
11  */
12 
13 #ifndef CK_TOKENIZER_H
14 #define CK_TOKENIZER_H
15 
16 #include <stddef.h>
17 #include <stdint.h>
18 #include <stdbool.h>
19 
20 #ifdef __cplusplus
21 extern "C" {
22 #endif
23 
/* Maximum token length in bytes */
#define CK_MAX_TOKEN_LEN 256

/* Maximum vocabulary size (upper bound on dense embedding rows) */
#define CK_MAX_VOCAB_SIZE 256000

/* Memory pool block size */
#define CK_POOL_BLOCK_SIZE (1024 * 1024) /* 1MB */
32 
33 /*
34  * Memory pool for tokenizer allocations.
35  * Avoids malloc overhead for small allocations.
36  */
typedef struct CKPoolBlock {
    uint8_t *data;            /* Raw storage for this block */
    size_t used;              /* Bytes already handed out from `data` */
    size_t capacity;          /* Total bytes available in `data` */
    struct CKPoolBlock *next; /* Next block in the chain, or NULL for the tail */
} CKPoolBlock;
43 
typedef struct {
    /*
     * Members below were dropped by the doc extraction; reconstructed from
     * the generated cross-references (head @ line 45, current @ line 46,
     * total_allocated @ line 47 of ck_tokenizer.h).
     */
    struct CKPoolBlock *head;    /* First block in the allocation chain */
    struct CKPoolBlock *current; /* Block new allocations are served from */
    size_t total_allocated;      /* Running total of bytes allocated from the pool */
} CKMemPool;
49 
50 /*
51  * Vocabulary entry.
52  * Token string -> ID mapping.
53  * IDs are dense indices into the embedding table.
54  */
typedef struct CKVocabEntry {
    char *token;               /* Token string (UTF-8), pool-allocated */
    int token_len;             /* Length in bytes (token may contain embedded data) */
    int32_t id;                /* Dense embedding index (token_id == embedding row) */
    struct CKVocabEntry *next; /* Next entry in the same hash bucket (chaining) */
} CKVocabEntry;
61 
62 /*
63  * BPE merge rule.
64  * Pair of token IDs -> merged token ID.
65  */
typedef struct {
    int32_t left;   /* Left token ID of the pair */
    int32_t right;  /* Right token ID of the pair */
    int32_t merged; /* Resulting token ID after merging the pair */
    int priority;   /* Lower = higher priority (earlier in merges list) */
} CKMergeRule;
72 
73 /*
74  * Tokenizer state.
75  */
76 typedef struct {
77  /* Memory pool */
79 
80  /* Vocabulary: token string -> ID */
82  CKVocabEntry **vocab_hash; /* Hash table for string -> ID */
84 
85  /* Reverse vocabulary: ID -> token string */
86  char **id_to_token;
87 
88  /* BPE merge rules */
91 
92  /* Merge lookup: (left_id, right_id) -> merge index */
93  int *merge_hash;
95 
96  /* Special tokens */
97  int32_t unk_id;
98  int32_t bos_id;
99  int32_t eos_id;
100  int32_t pad_id;
101 
102  /* Config */
103  bool add_bos;
104  bool add_eos;
105 } CKTokenizer;
106 
/*
 * Initialize memory pool.
 * Must be called before any other ck_pool_* function on `pool`.
 */
void ck_pool_init(CKMemPool *pool);

/*
 * Allocate `size` bytes from the memory pool.
 * Pool allocations are freed all at once by ck_pool_free(), not individually.
 */
void *ck_pool_alloc(CKMemPool *pool, size_t size);

/*
 * Allocate and copy string.
 * Copies `len` bytes of `s` into pool-owned storage.
 */
char *ck_pool_strdup(CKMemPool *pool, const char *s, int len);

/*
 * Free memory pool.
 * Releases every block; all pointers obtained from the pool become invalid.
 */
void ck_pool_free(CKMemPool *pool);
126 
127 /*
128  * Initialize tokenizer.
129  * Returns 0 on success, -1 on error.
130  */
132 
133 /*
134  * Load tokenizer from HuggingFace tokenizer.json.
135  * Returns 0 on success, -1 on error.
136  */
137 int ck_tokenizer_load(CKTokenizer *tok, const char *path);
138 
/*
 * Add a token to the vocabulary.
 * Returns the token ID.
 */
int32_t ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int len);

/*
 * Look up token ID by string.
 * Returns token ID or unk_id if not found.
 */
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len);

/*
 * Add a BPE merge rule: (left, right) -> merged.
 */
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged);

/*
 * Look up merge rule for a pair.
 * Returns merge index or -1 if no merge.
 */
int ck_tokenizer_lookup_merge(const CKTokenizer *tok, int32_t left, int32_t right);

/*
 * Encode text to token IDs.
 * Writes at most `max_ids` IDs into `ids`.
 * Returns number of tokens written to `ids`.
 */
int ck_tokenizer_encode(const CKTokenizer *tok,
                        const char *text,
                        int text_len,
                        int32_t *ids,
                        int max_ids);

/*
 * Decode token IDs to text.
 * Writes at most `max_len` bytes into `text`.
 * Returns number of bytes written to `text`.
 */
int ck_tokenizer_decode(const CKTokenizer *tok,
                        const int32_t *ids,
                        int num_ids,
                        char *text,
                        int max_len);

/*
 * Get token string for an ID.
 * Returns NULL if ID is invalid.
 * The returned string is owned by the tokenizer; do not free it.
 */
const char *ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id);

/*
 * Free tokenizer resources (including the underlying memory pool).
 */
void ck_tokenizer_free(CKTokenizer *tok);
192 
193 /*
194  * Get vocabulary size.
195  */
196 static inline int ck_tokenizer_vocab_size(const CKTokenizer *tok) {
197  return tok->vocab_size;
198 }
199 
200 #ifdef __cplusplus
201 }
202 #endif
203 
204 #endif /* CK_TOKENIZER_H */
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
Definition: ck_tokenizer.c:248
void ck_pool_init(CKMemPool *pool)
Definition: ck_tokenizer.c:51
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len)
Definition: ck_tokenizer.c:227
int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
Definition: ck_tokenizer.c:737
int ck_tokenizer_init(CKTokenizer *tok)
Definition: ck_tokenizer.c:148
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
Definition: ck_tokenizer.c:239
int ck_tokenizer_load(CKTokenizer *tok, const char *path)
Definition: ck_tokenizer.c:432
void * ck_pool_alloc(CKMemPool *pool, size_t size)
Definition: ck_tokenizer.c:69
char * ck_pool_strdup(CKMemPool *pool, const char *s, int len)
Definition: ck_tokenizer.c:98
int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: ck_tokenizer.c:638
int32_t ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int len)
Definition: ck_tokenizer.c:196
void ck_pool_free(CKMemPool *pool)
Definition: ck_tokenizer.c:107
void ck_tokenizer_free(CKTokenizer *tok)
Definition: ck_tokenizer.c:183
int ck_tokenizer_lookup_merge(const CKTokenizer *tok, int32_t left, int32_t right)
Definition: ck_tokenizer.c:276
static int ck_tokenizer_vocab_size(const CKTokenizer *tok)
Definition: ck_tokenizer.h:196
CKPoolBlock * current
Definition: ck_tokenizer.h:46
CKPoolBlock * head
Definition: ck_tokenizer.h:45
size_t total_allocated
Definition: ck_tokenizer.h:47
int32_t left
Definition: ck_tokenizer.h:67
int32_t right
Definition: ck_tokenizer.h:68
int32_t merged
Definition: ck_tokenizer.h:69
uint8_t * data
Definition: ck_tokenizer.h:38
struct CKPoolBlock * next
Definition: ck_tokenizer.h:41
size_t used
Definition: ck_tokenizer.h:39
size_t capacity
Definition: ck_tokenizer.h:40
int32_t bos_id
Definition: ck_tokenizer.h:98
CKMemPool pool
Definition: ck_tokenizer.h:78
int32_t unk_id
Definition: ck_tokenizer.h:97
CKVocabEntry ** vocab_hash
Definition: ck_tokenizer.h:82
int32_t eos_id
Definition: ck_tokenizer.h:99
int vocab_hash_size
Definition: ck_tokenizer.h:83
int merge_hash_size
Definition: ck_tokenizer.h:94
CKMergeRule * merges
Definition: ck_tokenizer.h:89
char ** id_to_token
Definition: ck_tokenizer.h:86
int * merge_hash
Definition: ck_tokenizer.h:93
int32_t pad_id
Definition: ck_tokenizer.h:100
struct CKVocabEntry * next
Definition: ck_tokenizer.h:59
char * token
Definition: ck_tokenizer.h:56
int32_t id
Definition: ck_tokenizer.h:58
const int32_t * ids
Definition: tokenizer.h:443
const int32_t int num_ids
Definition: tokenizer.h:444
const char * text
Definition: tokenizer.h:563
const char * token
Definition: tokenizer.h:306
const int32_t int char int max_len
Definition: true_bpe.h:280
const char int text_len
Definition: true_bpe.h:262
const char * left
Definition: true_bpe.h:130
const char int int32_t int max_ids
Definition: true_bpe.h:264
const char const char * right
Definition: true_bpe.h:131