13 #ifndef CK_TOKENIZER_H
14 #define CK_TOKENIZER_H
25 #define CK_MAX_TOKEN_LEN 256
28 #define CK_MAX_VOCAB_SIZE 256000
31 #define CK_POOL_BLOCK_SIZE (1024 * 1024)
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
void ck_pool_init(CKMemPool *pool)
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len)
int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
int ck_tokenizer_init(CKTokenizer *tok)
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
int ck_tokenizer_load(CKTokenizer *tok, const char *path)
void * ck_pool_alloc(CKMemPool *pool, size_t size)
char * ck_pool_strdup(CKMemPool *pool, const char *s, int len)
int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
int32_t ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int len)
void ck_pool_free(CKMemPool *pool)
void ck_tokenizer_free(CKTokenizer *tok)
int ck_tokenizer_lookup_merge(const CKTokenizer *tok, int32_t left, int32_t right)
static int ck_tokenizer_vocab_size(const CKTokenizer *tok)
struct CKPoolBlock * next
CKVocabEntry ** vocab_hash
struct CKVocabEntry * next
const int32_t int num_ids
const int32_t int char int max_len
const char int int32_t int max_ids
const char const char * right