19 #ifndef CK_TOKENIZER_H
20 #define CK_TOKENIZER_H
30 #include "data_structures/tries/trie.h"
38 #define CK_TOKENIZER_API __declspec(dllexport)
40 #define CK_TOKENIZER_API __attribute__((visibility("default")))
44 #define CK_TOKENIZER_MAX_TOKEN_LEN 256
47 #define CK_TOKENIZER_MAX_VOCAB_SIZE 256000
50 #define CK_TOKENIZER_DEFAULT_HT_SIZE 65536
410 const char **out_tokens,
490 const uint8_t *types,
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len)
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
void ck_tokenizer_free(CKTokenizer *tok)
bool treat_whitespace_as_suffix
bool space_prefix_detected
CKSpacePrefixStyle space_prefix_style
size_t encode_buffer_size
CKTokenizerHashTable * vocab
void ck_tokenizer_set_add_bos_eos(CKTokenizer *tok, bool add_bos, bool add_eos)
CKSpacePrefixStyle ck_tokenizer_detect_space_prefix_style(CKTokenizer *tok)
void ck_tokenizer_set_spm_mode(CKTokenizer *tok, CKSpmMode spm_mode)
CKTokenizer * ck_tokenizer_create(CKTokenizerType type)
void ck_tokenizer_set_special_ids(CKTokenizer *tok, int32_t unk, int32_t bos, int32_t eos, int32_t pad, int32_t mask)
void ck_tokenizer_reset(CKTokenizer *tok)
void ck_tokenizer_set_use_trie(CKTokenizer *tok, bool use_trie)
void ck_tokenizer_set_add_space_prefix(CKTokenizer *tok, bool add_space_prefix)
void ck_tokenizer_set_space_prefix_style(CKTokenizer *tok, CKSpacePrefixStyle style)
int32_t int32_t int32_t int32_t int32_t mask
static CKTokenizer * ck_tokenizer_create_bpe(void)
int ck_tokenizer_load_binary_with_scores(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, const float *scores, const uint8_t *types, int num_merges, const int32_t *merges)
int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
int ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int32_t id, float score)
const int32_t int num_ids
int ck_tokenizer_load_text(CKTokenizer *tok, const char *path)
int ck_tokenizer_load_gguf(CKTokenizer *tok, const char *path)
int ck_tokenizer_load_json(CKTokenizer *tok, const char *path)
int ck_tokenizer_load_binary(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
static CKTokenizer * ck_tokenizer_create_wordpiece(void)
static size_t ck_tokenizer_vocab_size(const CKTokenizer *tok)
static CKTokenizer * ck_tokenizer_create_spm(void)
int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
int ck_tokenizer_encode_tokens(const CKTokenizer *tok, const char *text, int text_len, const char **out_tokens, int max_tokens)
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority)
int ck_tokenizer_encode_with_special(CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids, bool add_special)
int ck_tokenizer_add_special_token(CKTokenizer *tok, const char *name, int32_t id)
int ck_tokenizer_load_merges(CKTokenizer *tok, const char *path)
int32_t int32_t int32_t eos
int32_t int32_t int32_t int32_t pad
const int32_t int int * out_len
int const int32_t const char int num_merges
int const int32_t const char * strings
int const int32_t const char int const int32_t * merges
int32_t int32_t int32_t int32_t priority
const int32_t int char int max_len
int const int32_t * offsets
int32_t int32_t int32_t merged_id
const char int int32_t int max_ids