← Back to C-Kernel-Engine Docs Doxygen Source Documentation
true_bpe.h
Go to the documentation of this file.
1 /*
2  * True BPE (Byte-Pair Encoding) Tokenizer
3  *
4  * Implements the actual BPE algorithm used by GPT-2, LLaMA, Qwen, etc.
5  * Unlike greedy longest-match, this applies merge rules in priority order.
6  *
7  * Algorithm:
8  * 1. Split text into initial tokens (characters or bytes)
9  * 2. Find the highest-priority merge that can be applied
10  * 3. Apply that merge (combine two adjacent tokens into one)
11  * 4. Repeat until no more merges possible
12  * 5. Look up final tokens in vocabulary to get IDs
13  *
14  * This provides 100% parity with HuggingFace tokenizers when vocabulary
15  * and merge rules are loaded correctly.
16  *
17  * By Anthony Shivakumar
18  */
19 
20 #ifndef CK_TRUE_BPE_H
21 #define CK_TRUE_BPE_H
22 
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdbool.h>
26 
27 #include "tokenizer/hash_table.h"
28 
29 #ifdef __cplusplus
30 extern "C" {
31 #endif
32 
33 /* Export macro */
34 #ifdef _WIN32
35 #define CK_TRUE_BPE_API __declspec(dllexport)
36 #else
37 #define CK_TRUE_BPE_API __attribute__((visibility("default")))
38 #endif
39 
40 /* ═══════════════════════════════════════════════════════════════════════════════
41  * Types
42  * ═══════════════════════════════════════════════════════════════════════════════ */
43 
44 /* Space prefix style for BPE tokenizers (same as tokenizer.h for compatibility) */
45 typedef enum {
46  CK_SPACE_PREFIX_AUTO = 0, /* Auto-detect from vocabulary */
47  CK_SPACE_PREFIX_GPT2 = 1, /* GPT-2 style: Ġ (U+0120, bytes 0xC4 0xA0) */
48  CK_SPACE_PREFIX_SPM = 2 /* SentencePiece style: ▁ (U+2581, bytes 0xE2 0x96 0x81) */
50 
51 /* True BPE configuration */
52 typedef struct {
53  bool add_bos; /* Add beginning-of-sequence token */
54  bool add_eos; /* Add end-of-sequence token */
55  bool byte_fallback; /* Fall back to byte tokens for unknown chars */
56  CKSpacePrefixStyle space_prefix_style; /* Space prefix style (Ġ vs ▁) */
57 } CKBPEConfig;
58 
59 /* Opaque tokenizer handle */
60 typedef struct CKTrueBPE CKTrueBPE;
61 
62 /* ═══════════════════════════════════════════════════════════════════════════════
63  * Creation and Destruction
64  * ═══════════════════════════════════════════════════════════════════════════════ */
65 
66 /**
67  * Create a new True BPE tokenizer.
68  *
69  * @return Newly allocated tokenizer, or NULL on error
70  */
72 
73 /**
74  * Free a True BPE tokenizer.
75  *
76  * @param bpe Tokenizer to free
77  */
78 CK_TRUE_BPE_API void ck_true_bpe_free(CKTrueBPE *bpe);
79 
80 /* ═══════════════════════════════════════════════════════════════════════════════
81  * Vocabulary Management
82  * ═══════════════════════════════════════════════════════════════════════════════ */
83 
84 /**
85  * Add a token to the vocabulary.
86  *
87  * @param bpe Tokenizer
88  * @param token Token string (UTF-8)
89  * @param id Token ID
90  * @param score Token score (for unigram models, 0.0 for BPE)
91  * @return 0 on success, -1 on error
92  */
93 CK_TRUE_BPE_API int ck_true_bpe_add_token(CKTrueBPE *bpe,
94  const char *token,
95  int32_t id,
96  float score);
97 
98 /**
99  * Add a BPE merge rule by token IDs.
100  *
101  * Merge rules define how tokens are combined during encoding.
102  * Rules with lower priority numbers are applied first.
103  *
104  * @param bpe Tokenizer
105  * @param left_id Left token ID
106  * @param right_id Right token ID
107  * @param merged_id Resulting merged token ID
108  * @param priority Merge priority (lower = applied first)
109  * @return 0 on success, -1 on error
110  */
111 CK_TRUE_BPE_API int ck_true_bpe_add_merge(CKTrueBPE *bpe,
112  int32_t left_id,
113  int32_t right_id,
114  int32_t merged_id,
115  int32_t priority);
116 
117 /**
118  * Add a BPE merge rule by token strings.
119  *
120  * This looks up the token IDs automatically and determines the merged token.
121  * The merged token must already exist in the vocabulary.
122  *
123  * @param bpe Tokenizer
124  * @param left Left token string
125  * @param right Right token string
126  * @param priority Merge priority (lower = applied first)
127  * @return 0 on success, -1 on error
128  */
130  const char *left,
131  const char *right,
132  int32_t priority);
133 
134 /**
135  * Set special token IDs.
136  *
137  * @param bpe Tokenizer
138  * @param unk Unknown token ID (-1 to disable)
139  * @param bos Beginning-of-sequence token ID (-1 to disable)
140  * @param eos End-of-sequence token ID (-1 to disable)
141  * @param pad Padding token ID (-1 to disable)
142  */
143 CK_TRUE_BPE_API void ck_true_bpe_set_special_ids(CKTrueBPE *bpe,
144  int32_t unk,
145  int32_t bos,
146  int32_t eos,
147  int32_t pad);
148 
149 /**
150  * Add a special token that should be matched BEFORE BPE encoding.
151  *
152  * Special tokens like <|im_start|>, <|im_end|>, <|endoftext|> are matched
153  * literally in the input text before BPE processing. Without this, BPE would
154  * break them into individual characters.
155  *
156  * @param bpe Tokenizer
157  * @param token Token string to match literally (e.g., "<|im_end|>")
158  * @param id Token ID to output when matched
159  * @return 0 on success, -1 on error
160  */
162  const char *token,
163  int32_t id);
164 
165 /**
166  * Set tokenizer configuration.
167  *
168  * @param bpe Tokenizer
169  * @param config Configuration to apply
170  */
172 
173 /**
174  * Load vocabulary + merges from binary buffers.
175  *
176  * @param bpe Tokenizer
177  * @param vocab_size Number of tokens
178  * @param offsets Offsets array (length vocab_size)
179  * @param strings Null-terminated token strings blob
180  * @param num_merges Number of merge rules
181  * @param merges Merge triples [left_id, right_id, merged_id] (length num_merges*3)
182  * @return 0 on success, -1 on error
183  */
184 CK_TRUE_BPE_API int ck_true_bpe_load_binary(CKTrueBPE *bpe,
186  const int32_t *offsets,
187  const char *strings,
189  const int32_t *merges);
190 
191 /* ═══════════════════════════════════════════════════════════════════════════════
192  * Token Lookup
193  * ═══════════════════════════════════════════════════════════════════════════════ */
194 
195 /**
196  * Look up a token ID by string.
197  *
198  * @param bpe Tokenizer
199  * @param token Token string
200  * @return Token ID, or unk_id if not found
201  */
202 CK_TRUE_BPE_API int32_t ck_true_bpe_lookup(const CKTrueBPE *bpe, const char *token);
203 
204 /**
205  * Get a token string by ID.
206  *
207  * @param bpe Tokenizer
208  * @param id Token ID
209  * @return Token string, or NULL if invalid
210  */
211 CK_TRUE_BPE_API const char *ck_true_bpe_id_to_token(const CKTrueBPE *bpe, int32_t id);
212 
213 /**
214  * Get vocabulary size.
215  *
216  * @param bpe Tokenizer
217  * @return Number of tokens in vocabulary
218  */
219 CK_TRUE_BPE_API size_t ck_true_bpe_vocab_size(const CKTrueBPE *bpe);
220 
221 /**
222  * Get number of merge rules.
223  *
224  * @param bpe Tokenizer
225  * @return Number of merge rules
226  */
227 CK_TRUE_BPE_API int32_t ck_true_bpe_num_merges(const CKTrueBPE *bpe);
228 
229 /* ═══════════════════════════════════════════════════════════════════════════════
230  * Space Prefix Detection
231  * ═══════════════════════════════════════════════════════════════════════════════ */
232 
233 /**
234  * Auto-detect space prefix style from vocabulary.
235  *
236  * Counts tokens starting with Ġ (GPT-2) vs ▁ (SentencePiece) to determine style.
237  * The detected style is cached in the config.
238  *
239  * @param bpe Tokenizer
240  * @return Detected style (GPT2 or SPM)
241  */
243 
244 /* ═══════════════════════════════════════════════════════════════════════════════
245  * Encoding and Decoding
246  * ═══════════════════════════════════════════════════════════════════════════════ */
247 
248 /**
249  * Encode text to token IDs using true BPE algorithm.
250  *
251  * This applies merge rules in priority order (not greedy longest-match).
252  *
253  * @param bpe Tokenizer
254  * @param text Input text (UTF-8)
255  * @param text_len Text length in bytes, or -1 for null-terminated
256  * @param ids Output token IDs array
257  * @param max_ids Maximum IDs to write
258  * @return Number of tokens written
259  */
260 CK_TRUE_BPE_API int ck_true_bpe_encode(CKTrueBPE *bpe,
261  const char *text,
262  int text_len,
263  int32_t *ids,
264  int max_ids);
265 
266 /**
267  * Decode token IDs to text.
268  *
269  * @param bpe Tokenizer
270  * @param ids Input token IDs
271  * @param num_ids Number of IDs
272  * @param text Output text buffer
273  * @param max_len Maximum text length
274  * @return Number of bytes written (excluding null terminator)
275  */
276 CK_TRUE_BPE_API int ck_true_bpe_decode(const CKTrueBPE *bpe,
277  const int32_t *ids,
278  int num_ids,
279  char *text,
280  int max_len);
281 
282 #ifdef __cplusplus
283 }
284 #endif
285 
286 #endif /* CK_TRUE_BPE_H */
bool byte_fallback
Definition: true_bpe.h:55
CKSpacePrefixStyle space_prefix_style
Definition: true_bpe.h:56
bool add_bos
Definition: true_bpe.h:53
bool add_eos
Definition: true_bpe.h:54
CKSpacePrefixStyle
Definition: tokenizer.h:60
int ck_true_bpe_decode(const CKTrueBPE *bpe, const int32_t *ids, int num_ids, char *text, int max_len)
Definition: true_bpe.c:1439
int ck_true_bpe_encode(CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: true_bpe.c:1338
void ck_true_bpe_set_config(CKTrueBPE *bpe, const CKBPEConfig *config)
Definition: true_bpe.c:560
CKSpacePrefixStyle ck_true_bpe_detect_space_style(CKTrueBPE *bpe)
Definition: true_bpe.c:654
void ck_true_bpe_free(CKTrueBPE *bpe)
Definition: true_bpe.c:405
CKTrueBPE * ck_true_bpe_create(void)
Definition: true_bpe.c:342
void ck_true_bpe_set_special_ids(CKTrueBPE *bpe, int32_t unk, int32_t bos, int32_t eos, int32_t pad)
Definition: true_bpe.c:552
int ck_true_bpe_add_merge(CKTrueBPE *bpe, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority)
Definition: true_bpe.c:497
int32_t ck_true_bpe_num_merges(const CKTrueBPE *bpe)
Definition: true_bpe.c:1510
int ck_true_bpe_add_special_token(CKTrueBPE *bpe, const char *token, int32_t id)
Definition: true_bpe.c:565
int ck_true_bpe_load_binary(CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
Definition: true_bpe.c:606
size_t ck_true_bpe_vocab_size(const CKTrueBPE *bpe)
Definition: true_bpe.c:1506
int ck_true_bpe_add_token(CKTrueBPE *bpe, const char *token, int32_t id, float score)
Definition: true_bpe.c:449
int32_t ck_true_bpe_lookup(const CKTrueBPE *bpe, const char *token)
Definition: true_bpe.c:638
int ck_true_bpe_add_merge_by_tokens(CKTrueBPE *bpe, const char *left, const char *right, int32_t priority)
Definition: true_bpe.c:514
const char * ck_true_bpe_id_to_token(const CKTrueBPE *bpe, int32_t id)
Definition: true_bpe.c:645
const CKBPEConfig * config
Definition: true_bpe.h:171
const char * token
Definition: true_bpe.h:94
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
CKSpacePrefixStyle
Definition: true_bpe.h:45
@ CK_SPACE_PREFIX_AUTO
Definition: true_bpe.h:46
@ CK_SPACE_PREFIX_SPM
Definition: true_bpe.h:48
@ CK_SPACE_PREFIX_GPT2
Definition: true_bpe.h:47
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
const int32_t int num_ids
Definition: true_bpe.h:278
int32_t int32_t int32_t int32_t priority
Definition: true_bpe.h:115
const char int32_t float score
Definition: true_bpe.h:96
const char int int32_t * ids
Definition: true_bpe.h:263
const int32_t int char int max_len
Definition: true_bpe.h:280
int32_t left_id
Definition: true_bpe.h:112
const char * text
Definition: true_bpe.h:261
const char int text_len
Definition: true_bpe.h:262
int32_t unk
Definition: true_bpe.h:144
int vocab_size
Definition: true_bpe.h:185
int32_t int32_t right_id
Definition: true_bpe.h:113
#define CK_TRUE_BPE_API
Definition: true_bpe.h:37
int const int32_t * offsets
Definition: true_bpe.h:186
const char * left
Definition: true_bpe.h:130
int32_t int32_t int32_t merged_id
Definition: true_bpe.h:114
const char int int32_t int max_ids
Definition: true_bpe.h:264
int32_t int32_t int32_t eos
Definition: true_bpe.h:146
int32_t int32_t int32_t int32_t pad
Definition: true_bpe.h:147
const char const char * right
Definition: true_bpe.h:131
int32_t int32_t bos
Definition: true_bpe.h:145