← Back to C-Kernel-Engine Docs Doxygen Source Documentation
tokenizer.h
Go to the documentation of this file.
1 /*
2  * C-Kernel-Engine Tokenizer
3  *
4  * High-performance tokenizer supporting:
5  * - BPE (Byte-Pair Encoding): GPT-2, LLaMA, Qwen
6  * - WordPiece: BERT, RoBERTa
7  * - SentencePiece (unigram): LLaMA, T5
8  *
9  * Features:
10  * - MurmurHash3 hashing
11  * - AVX-512 optimized string comparison
12  * - Greedy longest-match encoding
13  * - Full UTF-8 support
14  * - GGUF vocab loading
15  *
16  * By Anthony Shivakumar
17  */
18 
19 #ifndef CK_TOKENIZER_H
20 #define CK_TOKENIZER_H
21 
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <stdbool.h>
25 
26 #include "tokenizer/murmurhash3.h"
27 #include "tokenizer/memory_pool.h"
28 #include "tokenizer/hash_table.h"
29 #include "tokenizer/utf8.h"
30 #include "data_structures/tries/trie.h"
31 
32 #ifdef __cplusplus
33 extern "C" {
34 #endif
35 
36 /* Export macro */
37 #ifdef _WIN32
38 #define CK_TOKENIZER_API __declspec(dllexport)
39 #else
40 #define CK_TOKENIZER_API __attribute__((visibility("default")))
41 #endif
42 
43 /* Maximum token length */
44 #define CK_TOKENIZER_MAX_TOKEN_LEN 256
45 
46 /* Maximum vocabulary size */
47 #define CK_TOKENIZER_MAX_VOCAB_SIZE 256000
48 
49 /* Default hash table size */
50 #define CK_TOKENIZER_DEFAULT_HT_SIZE 65536
51 
/* Tokenizer model type */
typedef enum {
    CK_TOKENIZER_BPE = 0,       /* Byte-Pair Encoding (GPT-2, LLaMA, Qwen) */
    CK_TOKENIZER_WORDPIECE = 1, /* WordPiece (BERT, RoBERTa) */
    CK_TOKENIZER_SPM = 2        /* SentencePiece (unigram) */
} CKTokenizerType;

/* Space prefix style for BPE tokenizers */
typedef enum {
    CK_SPACE_PREFIX_AUTO = 0, /* Auto-detect from vocabulary */
    CK_SPACE_PREFIX_GPT2 = 1, /* GPT-2 style: Ġ (U+0120, bytes 0xC4 0xA0) */
    CK_SPACE_PREFIX_SPM = 2   /* SentencePiece style: ▁ (U+2581, bytes 0xE2 0x96 0x81) */
} CKSpacePrefixStyle;

/* SentencePiece mode */
typedef enum {
    CK_SPM_MODE_UNIGRAM = 0, /* SentencePiece unigram/Viterbi */
    CK_SPM_MODE_LLAMA = 1    /* llama.cpp merge-style SPM */
} CKSpmMode;

72 /* Tokenizer configuration */
73 typedef struct {
74  CKTokenizerType type; /* Tokenization algorithm */
75  bool add_bos; /* Add beginning-of-sequence token */
76  bool add_eos; /* Add end-of-sequence token */
77  bool add_space_prefix; /* For SPM: add ▁ at start (SentencePiece) */
78  bool lowercase; /* Convert text to lowercase before tokenizing */
79  bool treat_whitespace_as_suffix; /* For SentencePiece */
80  float unk_score; /* Unknown token score (for SPM) */
81  bool use_trie; /* Use trie for lookups (faster), false = use hash table */
82  CKSpacePrefixStyle space_prefix_style; /* Space prefix style (GPT-2 Ġ vs SentencePiece ▁) */
83  bool space_prefix_detected; /* True if auto-detection has run */
84  CKSpmMode spm_mode; /* SPM mode: unigram or llama-style */
86 
/* Vocabulary entry (value stored in the token-string hash table).
 * NOTE(review): the typedef name on the closing brace was lost in doc
 * extraction — "CKVocabEntry" is a reconstruction; confirm against the
 * original header before relying on the name. */
typedef struct {
    int32_t id;      /* Token ID */
    float score;     /* Score (for SPM) */
    bool is_special; /* Is special token */
} CKVocabEntry;

94 /* Main tokenizer structure */
95 typedef struct CKTokenizer {
96  /* Configuration */
98 
99  /* Vocabulary: token string -> token info */
101 
102  /* Trie for fast longest-match lookups (O(k) instead of O(n*k)) */
104 
105  /* Reverse vocabulary: ID -> token string */
106  char **id_to_token;
107  size_t vocab_size;
109 
110  /* Token scores for SPM (Viterbi/DP encoding) */
111  float *scores;
112  size_t scores_size; /* Allocated size for scores array */
113  uint8_t *types; /* Token type (GGUF: 1=normal, 2=unknown, 3=control, 4=user_defined, 6=byte) */
114  size_t types_size; /* Allocated size for types array */
115 
116  /* Byte token lookup table for SPM (built during load) */
117  int32_t *byte_token_id; /* Map byte value (0-255) to token ID, -1 = not found */
118 
119  /* Special token IDs */
120  int32_t unk_id;
121  int32_t bos_id;
122  int32_t eos_id;
123  int32_t pad_id;
124  int32_t mask_id;
125 
126  /* Memory pool for allocations */
128 
129  /* For BPE: merge rules */
130  int32_t *merge_pairs; /* left_id * vocab_size + right_id -> merge priority */
132  int32_t *merge_result; /* merge priority -> merged token ID */
134  int32_t num_merges;
135 
136  /* Cache for encoding */
139 } CKTokenizer;
140 
141 /* ============================================================================
142  * Initialization and Cleanup
143  * ============================================================================ */
144 
145 /**
146  * Create a new tokenizer.
147  *
148  * @param type Tokenizer type (BPE, WordPiece, SPM)
149  * @return Newly allocated tokenizer, or NULL on error
150  */
152 
153 /**
154  * Create tokenizer with default BPE config.
155  */
156 static inline CKTokenizer *ck_tokenizer_create_bpe(void) {
158 }
159 
160 /**
161  * Create tokenizer with default WordPiece config.
162  */
165 }
166 
167 /**
168  * Create tokenizer with default SPM config.
169  */
170 static inline CKTokenizer *ck_tokenizer_create_spm(void) {
172 }
173 
174 /**
175  * Free a tokenizer.
176  *
177  * @param tok Tokenizer to free
178  */
180 
181 /**
182  * Reset tokenizer state (clear vocab but keep config).
183  *
184  * @param tok Tokenizer to reset
185  */
187 
188 /* ============================================================================
189  * Vocabulary Management
190  * ============================================================================ */
191 
192 /**
193  * Add a token to vocabulary.
194  *
195  * @param tok Tokenizer
196  * @param token Token string
197  * @param id Token ID
198  * @param score Token score (for SPM)
199  * @return 0 on success, -1 on error
200  */
202  const char *token,
203  int32_t id,
204  float score);
205 
206 /**
207  * Add special token (UNK, BOS, EOS, PAD, MASK).
208  *
209  * @param tok Tokenizer
210  * @param name Special token name ("unk", "bos", "eos", "pad", "mask")
211  * @param id Token ID
212  * @return 0 on success, -1 on error
213  */
215  const char *name,
216  int32_t id);
217 
218 /**
219  * Set special token IDs.
220  *
221  * @param tok Tokenizer
222  * @param unk Unknown token ID
223  * @param bos Beginning-of-sequence token ID
224  * @param eos End-of-sequence token ID
225  * @param pad Padding token ID
226  * @param mask Mask token ID
227  */
229  int32_t unk,
230  int32_t bos,
231  int32_t eos,
232  int32_t pad,
233  int32_t mask);
234 
235 /**
236  * Set whether to add BOS/EOS tokens during encoding.
237  *
238  * @param tok Tokenizer
239  * @param add_bos If true, prepend BOS token (if available)
240  * @param add_eos If true, append EOS token (if available)
241  */
243 
244 /**
245  * Set whether to add the SentencePiece space prefix (▁) at the start.
246  *
247  * This mirrors SentencePiece's add_dummy_prefix behavior.
248  *
249  * @param tok Tokenizer
250  * @param add_space_prefix If true, add leading ▁ when appropriate
251  */
253 
254 /**
255  * Set SentencePiece mode.
256  *
257  * @param tok Tokenizer
258  * @param spm_mode SPM mode (unigram or llama-style)
259  */
261 
262 /**
263  * Set whether to lowercase input text before tokenizing.
264  *
265  * @param tok Tokenizer
266  * @param lowercase If true, convert text to lowercase
267  */
268 CK_TOKENIZER_API void ck_tokenizer_set_lowercase(CKTokenizer *tok, bool lowercase);
269 
270 /**
271  * Set lookup method (trie vs hash table).
272  *
273  * @param tok Tokenizer
274  * @param use_trie If true, use trie (faster for longest-match), false = hash table
275  */
277 
278 /**
279  * Set space prefix style for BPE tokenizers.
280  *
281  * GPT-2/Qwen use Ġ (U+0120), LLaMA/SentencePiece use ▁ (U+2581).
282  * Default is AUTO which auto-detects from vocabulary.
283  *
284  * @param tok Tokenizer
285  * @param style Space prefix style (AUTO, GPT2, or SPM)
286  */
288 
289 /**
290  * Auto-detect space prefix style from vocabulary.
291  *
292  * Checks for presence of tokens starting with Ġ vs ▁ to determine style.
293  *
294  * @param tok Tokenizer
295  * @return Detected style (GPT2 or SPM)
296  */
298 
299 /**
300  * Look up token ID by string.
301  *
302  * @param tok Tokenizer
303  * @param token Token string
304  * @return Token ID, or unk_id if not found
305  */
306 CK_TOKENIZER_API int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token);
307 
308 /**
309  * Get token string by ID.
310  *
311  * @param tok Tokenizer
312  * @param id Token ID
313  * @return Token string, or NULL if invalid
314  */
315 CK_TOKENIZER_API const char *ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id);
316 
317 /**
318  * Get token info by ID.
319  *
320  * @param tok Tokenizer
321  * @param id Token ID
322  * @param score Output: token score
323  * @return Token string, or NULL if invalid
324  */
325 CK_TOKENIZER_API const char *ck_tokenizer_id_to_token_info(const CKTokenizer *tok,
326  int32_t id,
327  float *score);
328 
329 /**
330  * Get vocabulary size.
331  */
332 static inline size_t ck_tokenizer_vocab_size(const CKTokenizer *tok) {
333  return tok ? tok->vocab_size : 0;
334 }
335 
336 /* ============================================================================
337  * BPE Merge Rules
338  * ============================================================================ */
339 
340 /**
341  * Add a BPE merge rule.
342  *
343  * @param tok Tokenizer
344  * @param left_id Left token ID
345  * @param right_id Right token ID
346  * @param merged_id Merged token ID
347  * @param priority Lower = higher priority (applied first)
348  * @return 0 on success, -1 on error
349  */
351  int32_t left_id,
352  int32_t right_id,
353  int32_t merged_id,
354  int32_t priority);
355 
356 /* ============================================================================
357  * Encoding (Text -> Token IDs)
358  * ============================================================================ */
359 
360 /**
361  * Encode text to token IDs using greedy longest-match.
362  *
363  * For BPE: applies merge rules iteratively.
364  * For WordPiece/SPM: greedy longest-match from vocabulary.
365  *
366  * @param tok Tokenizer
367  * @param text Input text
368  * @param text_len Text length, or -1 for null-terminated
369  * @param ids Output token IDs
370  * @param max_ids Maximum IDs to write
371  * @return Number of tokens written
372  */
373 int ck_tokenizer_encode(const CKTokenizer *tok,
374  const char *text,
375  int text_len,
376  int32_t *ids,
377  int max_ids);
378 
379 /**
380  * Encode with special token handling.
381  *
382  * @param tok Tokenizer
383  * @param text Input text
384  * @param text_len Text length, or -1 for null-terminated
385  * @param ids Output token IDs
386  * @param max_ids Maximum IDs to write
387  * @param add_special Add BOS/EOS tokens
388  * @return Number of tokens written
389  */
391  const char *text,
392  int text_len,
393  int32_t *ids,
394  int max_ids,
395  bool add_special);
396 
397 /**
398  * Encode and return tokens as array of strings.
399  *
400  * @param tok Tokenizer
401  * @param text Input text
402  * @param text_len Text length
403  * @param out_tokens Output token strings (caller must free each)
404  * @param max_tokens Maximum tokens
405  * @return Number of tokens written
406  */
408  const char *text,
409  int text_len,
410  const char **out_tokens,
411  int max_tokens);
412 
413 /* ============================================================================
414  * Decoding (Token IDs -> Text)
415  * ============================================================================ */
416 
417 /**
418  * Decode token IDs to text.
419  *
420  * @param tok Tokenizer
421  * @param ids Input token IDs
422  * @param num_ids Number of IDs
423  * @param text Output text buffer
424  * @param max_len Maximum text length
425  * @return Number of bytes written
426  */
427 int ck_tokenizer_decode(const CKTokenizer *tok,
428  const int32_t *ids,
429  int num_ids,
430  char *text,
431  int max_len);
432 
433 /**
434  * Decode to buffer allocated by caller.
435  *
436  * @param tok Tokenizer
437  * @param ids Input token IDs
438  * @param num_ids Number of IDs
439  * @param out_len Output: length of decoded string
440  * @return Newly allocated string, or NULL on error
441  */
442 CK_TOKENIZER_API char *ck_tokenizer_decode_alloc(const CKTokenizer *tok,
443  const int32_t *ids,
444  int num_ids,
445  int *out_len);
446 
447 /* ============================================================================
448  * File Loading
449  * ============================================================================ */
450 
451 /**
452  * Load vocabulary from memory-mapped binary data.
453  *
454  * @param tok Tokenizer
455  * @param vocab_size Number of tokens
456  * @param offsets Array of offsets into strings pool
457  * @param strings String pool containing null-terminated tokens
458  * @param num_merges Number of BPE merges
459  * @param merges Merge rules as (left, right, merged) triplets
460  * @return 0 on success, -1 on error
461  */
463  int vocab_size,
464  const int32_t *offsets,
465  const char *strings,
466  int num_merges,
467  const int32_t *merges);
468 
469 /**
470  * Load vocabulary from memory-mapped binary data with scores and types.
471  *
472  * This extended version supports SPM (SentencePiece) tokenizers which require
473  * token scores for Viterbi/DP encoding.
474  *
475  * @param tok Tokenizer
476  * @param vocab_size Number of tokens
477  * @param offsets Array of offsets into strings pool
478  * @param strings String pool containing null-terminated tokens
479  * @param scores Array of token scores (float32), can be NULL
480  * @param types Array of token types (uint8), can be NULL
481  * @param num_merges Number of BPE merges
482  * @param merges Merge rules as (left, right, merged) triplets
483  * @return 0 on success, -1 on error
484  */
486  int vocab_size,
487  const int32_t *offsets,
488  const char *strings,
489  const float *scores,
490  const uint8_t *types,
491  int num_merges,
492  const int32_t *merges);
493 
494 /**
495  * Load vocabulary from GGUF file.
496  *
497  * @param tok Tokenizer
498  * @param path Path to GGUF file
499  * @return 0 on success, -1 on error
500  */
501 int ck_tokenizer_load_gguf(CKTokenizer *tok, const char *path);
502 
503 /**
504  * Load vocabulary from JSON file (HuggingFace format).
505  *
506  * @param tok Tokenizer
507  * @param path Path to vocab.json or tokenizer.json
508  * @return 0 on success, -1 on error
509  */
510 int ck_tokenizer_load_json(CKTokenizer *tok, const char *path);
511 
512 /**
513  * Load vocabulary from text file (one token per line).
514  *
515  * Format: token_string [id] [score]
516  * Lines starting with # are comments.
517  *
518  * @param tok Tokenizer
519  * @param path Path to vocabulary file
520  * @return 0 on success, -1 on error
521  */
522 int ck_tokenizer_load_text(CKTokenizer *tok, const char *path);
523 
524 /**
525  * Load BPE merges from text file.
526  *
527  * Format: token1 token2 (one merge per line)
528  *
529  * @param tok Tokenizer
530  * @param path Path to merges.txt
531  * @return 0 on success, -1 on error
532  */
533 int ck_tokenizer_load_merges(CKTokenizer *tok, const char *path);
534 
535 /* ============================================================================
536  * Utility Functions
537  * ============================================================================ */
538 
539 /**
540  * Get the tokenizer type name.
541  *
542  * @param tok Tokenizer
543  * @return Type name string
544  */
545 CK_TOKENIZER_API const char *ck_tokenizer_type_name(const CKTokenizer *tok);
546 
547 /**
548  * Check if token is special.
549  *
550  * @param tok Tokenizer
551  * @param id Token ID
552  * @return true if special token
553  */
554 CK_TOKENIZER_API bool ck_tokenizer_is_special(const CKTokenizer *tok, int32_t id);
555 
556 /**
557  * Estimate encoded token count.
558  *
559  * @param tok Tokenizer
560  * @param text Input text
561  * @return Estimated number of tokens
562  */
563 CK_TOKENIZER_API size_t ck_tokenizer_estimate_tokens(const CKTokenizer *tok, const char *text);
564 
565 /**
566  * Get last error message.
567  *
568  * @return Last error message, or NULL if no error
569  */
570 CK_TOKENIZER_API const char *ck_tokenizer_last_error(void);
571 
572 #ifdef __cplusplus
573 }
574 #endif
575 
576 #endif /* CK_TOKENIZER_H */
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len)
Definition: ck_tokenizer.c:227
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
Definition: ck_tokenizer.c:239
void ck_tokenizer_free(CKTokenizer *tok)
Definition: ck_tokenizer.c:183
bool treat_whitespace_as_suffix
Definition: tokenizer.h:79
bool add_space_prefix
Definition: tokenizer.h:77
CKTokenizerType type
Definition: tokenizer.h:74
CKSpmMode spm_mode
Definition: tokenizer.h:84
bool space_prefix_detected
Definition: tokenizer.h:83
CKSpacePrefixStyle space_prefix_style
Definition: tokenizer.h:82
int32_t bos_id
Definition: ck_tokenizer.h:98
float * scores
Definition: tokenizer.h:111
size_t types_size
Definition: tokenizer.h:114
int32_t * merge_result
Definition: tokenizer.h:132
size_t vocab_size
Definition: tokenizer.h:107
int32_t * byte_token_id
Definition: tokenizer.h:117
CKTokenizerMemPool pool
Definition: tokenizer.h:127
int32_t unk_id
Definition: ck_tokenizer.h:97
int32_t num_merges
Definition: tokenizer.h:134
CKTrie * vocab_trie
Definition: tokenizer.h:103
int32_t eos_id
Definition: ck_tokenizer.h:99
size_t merge_pairs_size
Definition: tokenizer.h:131
size_t merge_result_size
Definition: tokenizer.h:133
char * encode_buffer
Definition: tokenizer.h:137
int32_t * merge_pairs
Definition: tokenizer.h:130
size_t encode_buffer_size
Definition: tokenizer.h:138
CKTokenizerHashTable * vocab
Definition: tokenizer.h:100
uint8_t * types
Definition: tokenizer.h:113
size_t vocab_capacity
Definition: tokenizer.h:108
size_t scores_size
Definition: tokenizer.h:112
char ** id_to_token
Definition: ck_tokenizer.h:86
int32_t mask_id
Definition: tokenizer.h:124
CKTokenizerConfig config
Definition: tokenizer.h:97
int32_t pad_id
Definition: ck_tokenizer.h:100
void ck_tokenizer_set_add_bos_eos(CKTokenizer *tok, bool add_bos, bool add_eos)
Definition: tokenizer.c:243
CKSpacePrefixStyle ck_tokenizer_detect_space_prefix_style(CKTokenizer *tok)
Definition: tokenizer.c:276
void ck_tokenizer_set_spm_mode(CKTokenizer *tok, CKSpmMode spm_mode)
Definition: tokenizer.c:254
CKTokenizer * ck_tokenizer_create(CKTokenizerType type)
Definition: tokenizer.c:34
void ck_tokenizer_set_special_ids(CKTokenizer *tok, int32_t unk, int32_t bos, int32_t eos, int32_t pad, int32_t mask)
Definition: tokenizer.c:234
void ck_tokenizer_reset(CKTokenizer *tok)
Definition: tokenizer.c:125
void ck_tokenizer_set_use_trie(CKTokenizer *tok, bool use_trie)
Definition: tokenizer.c:260
void ck_tokenizer_set_add_space_prefix(CKTokenizer *tok, bool add_space_prefix)
Definition: tokenizer.c:249
void ck_tokenizer_set_space_prefix_style(CKTokenizer *tok, CKSpacePrefixStyle style)
Definition: tokenizer.c:266
int32_t int32_t int32_t int32_t int32_t mask
Definition: tokenizer.h:233
const int32_t * ids
Definition: tokenizer.h:443
static CKTokenizer * ck_tokenizer_create_bpe(void)
Definition: tokenizer.h:156
int ck_tokenizer_load_binary_with_scores(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, const float *scores, const uint8_t *types, int num_merges, const int32_t *merges)
Definition: tokenizer.c:1252
int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
Definition: ck_tokenizer.c:737
int ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int32_t id, float score)
Definition: tokenizer.c:157
CKSpacePrefixStyle
Definition: tokenizer.h:60
@ CK_SPACE_PREFIX_AUTO
Definition: tokenizer.h:61
@ CK_SPACE_PREFIX_SPM
Definition: tokenizer.h:63
@ CK_SPACE_PREFIX_GPT2
Definition: tokenizer.h:62
const int32_t int num_ids
Definition: tokenizer.h:444
CKTokenizerType
Definition: tokenizer.h:53
@ CK_TOKENIZER_BPE
Definition: tokenizer.h:54
@ CK_TOKENIZER_SPM
Definition: tokenizer.h:56
@ CK_TOKENIZER_WORDPIECE
Definition: tokenizer.h:55
int ck_tokenizer_load_text(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1334
int ck_tokenizer_load_gguf(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1332
int ck_tokenizer_load_json(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1333
const char * text
Definition: tokenizer.h:563
bool bool add_eos
Definition: tokenizer.h:242
bool add_space_prefix
Definition: tokenizer.h:252
int ck_tokenizer_load_binary(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
bool lowercase
Definition: tokenizer.h:268
static CKTokenizer * ck_tokenizer_create_wordpiece(void)
Definition: tokenizer.h:163
CKSpmMode spm_mode
Definition: tokenizer.h:260
static size_t ck_tokenizer_vocab_size(const CKTokenizer *tok)
Definition: tokenizer.h:332
bool add_bos
Definition: tokenizer.h:242
const char * token
Definition: tokenizer.h:306
int32_t float * score
Definition: tokenizer.h:327
CKSpmMode
Definition: tokenizer.h:67
@ CK_SPM_MODE_UNIGRAM
Definition: tokenizer.h:68
@ CK_SPM_MODE_LLAMA
Definition: tokenizer.h:69
static CKTokenizer * ck_tokenizer_create_spm(void)
Definition: tokenizer.h:170
int32_t unk
Definition: tokenizer.h:229
int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: ck_tokenizer.c:638
int ck_tokenizer_encode_tokens(const CKTokenizer *tok, const char *text, int text_len, const char **out_tokens, int max_tokens)
bool use_trie
Definition: tokenizer.h:276
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority)
Definition: tokenizer.c:1336
int ck_tokenizer_encode_with_special(CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids, bool add_special)
int ck_tokenizer_add_special_token(CKTokenizer *tok, const char *name, int32_t id)
Definition: tokenizer.c:213
int ck_tokenizer_load_merges(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1335
int32_t int32_t int32_t eos
Definition: tokenizer.h:231
#define CK_TOKENIZER_API
Definition: tokenizer.h:40
int32_t int32_t int32_t int32_t pad
Definition: tokenizer.h:232
CKSpacePrefixStyle style
Definition: tokenizer.h:287
int32_t int32_t bos
Definition: tokenizer.h:230
const int32_t int int * out_len
Definition: tokenizer.h:445
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int32_t int32_t int32_t int32_t priority
Definition: true_bpe.h:115
const int32_t int char int max_len
Definition: true_bpe.h:280
int32_t left_id
Definition: true_bpe.h:112
const char int text_len
Definition: true_bpe.h:262
int vocab_size
Definition: true_bpe.h:185
int32_t int32_t right_id
Definition: true_bpe.h:113
int const int32_t * offsets
Definition: true_bpe.h:186
int32_t int32_t int32_t merged_id
Definition: true_bpe.h:114
const char int int32_t int max_ids
Definition: true_bpe.h:264