#include <ck_tokenizer.h>
| Data Fields | |
| --- | --- |
| bool | add_bos |
| bool | add_eos |
| int32_t | bos_id |
| int32_t * | byte_token_id |
| CKTokenizerConfig | config |
| char * | encode_buffer |
| size_t | encode_buffer_size |
| int32_t | eos_id |
| char ** | id_to_token |
| int32_t | mask_id |
| int * | merge_hash |
| int | merge_hash_size |
| int32_t * | merge_pairs |
| size_t | merge_pairs_size |
| int32_t * | merge_result |
| size_t | merge_result_size |
| CKMergeRule * | merges |
| int | num_merges |
| int32_t | num_merges |
| int32_t | pad_id |
| CKMemPool | pool |
| CKTokenizerMemPool | pool |
| float * | scores |
| size_t | scores_size |
| uint8_t * | types |
| size_t | types_size |
| int32_t | unk_id |
| CKTokenizerHashTable * | vocab |
| size_t | vocab_capacity |
| CKVocabEntry ** | vocab_hash |
| int | vocab_hash_size |
| int | vocab_size |
| size_t | vocab_size |
| CKTrie * | vocab_trie |
Definition at line 76 of file ck_tokenizer.h.
| bool CKTokenizer::add_bos |
Definition at line 103 of file ck_tokenizer.h.
Referenced by ck_tokenizer_encode().
| bool CKTokenizer::add_eos |
Definition at line 104 of file ck_tokenizer.h.
Referenced by ck_tokenizer_encode().
| int32_t CKTokenizer::bos_id |
Definition at line 98 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_special_token(), ck_tokenizer_create(), ck_tokenizer_decode(), ck_tokenizer_encode(), ck_tokenizer_init(), ck_tokenizer_load(), and ck_tokenizer_set_special_ids().
| int32_t* CKTokenizer::byte_token_id |
Definition at line 117 of file tokenizer.h.
Referenced by ck_tokenizer_free(), ck_tokenizer_reset(), spm_build_byte_lookup(), and spm_get_byte_token().
| CKTokenizerConfig CKTokenizer::config |
Definition at line 97 of file tokenizer.h.
Referenced by ck_tokenizer_create(), ck_tokenizer_detect_space_prefix_style(), ck_tokenizer_encode(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_set_add_bos_eos(), ck_tokenizer_set_add_space_prefix(), ck_tokenizer_set_space_prefix_style(), ck_tokenizer_set_spm_mode(), ck_tokenizer_set_use_trie(), find_longest_match(), and main().
| char* CKTokenizer::encode_buffer |
Definition at line 137 of file tokenizer.h.
| size_t CKTokenizer::encode_buffer_size |
Definition at line 138 of file tokenizer.h.
| int32_t CKTokenizer::eos_id |
Definition at line 99 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_special_token(), ck_tokenizer_create(), ck_tokenizer_decode(), ck_tokenizer_encode(), ck_tokenizer_init(), ck_tokenizer_load(), and ck_tokenizer_set_special_ids().
| char ** CKTokenizer::id_to_token |
Definition at line 86 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_token(), ck_tokenizer_create(), ck_tokenizer_detect_space_prefix_style(), ck_tokenizer_free(), ck_tokenizer_id_to_token(), ck_tokenizer_init(), ck_tokenizer_load(), ck_tokenizer_load_binary(), and ck_tokenizer_reset().
| int32_t CKTokenizer::mask_id |
Definition at line 124 of file tokenizer.h.
Referenced by ck_tokenizer_create() and ck_tokenizer_set_special_ids().
| int* CKTokenizer::merge_hash |
Definition at line 93 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_merge(), ck_tokenizer_free(), ck_tokenizer_init(), and ck_tokenizer_lookup_merge().
| int CKTokenizer::merge_hash_size |
Definition at line 94 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_merge(), ck_tokenizer_init(), and ck_tokenizer_lookup_merge().
| int32_t* CKTokenizer::merge_pairs |
Definition at line 130 of file tokenizer.h.
| size_t CKTokenizer::merge_pairs_size |
Definition at line 131 of file tokenizer.h.
| int32_t* CKTokenizer::merge_result |
Definition at line 132 of file tokenizer.h.
| size_t CKTokenizer::merge_result_size |
Definition at line 133 of file tokenizer.h.
| CKMergeRule* CKTokenizer::merges |
Definition at line 89 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_merge(), ck_tokenizer_encode(), ck_tokenizer_free(), and ck_tokenizer_lookup_merge().
| int CKTokenizer::num_merges |
Definition at line 90 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_merge(), ck_tokenizer_encode(), and ck_tokenizer_load().
| int32_t CKTokenizer::num_merges |
Definition at line 134 of file tokenizer.h.
| int32_t CKTokenizer::pad_id |
Definition at line 100 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_special_token(), ck_tokenizer_create(), ck_tokenizer_decode(), ck_tokenizer_init(), ck_tokenizer_load(), and ck_tokenizer_set_special_ids().
| CKMemPool CKTokenizer::pool |
Definition at line 78 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_token(), ck_tokenizer_create(), ck_tokenizer_free(), ck_tokenizer_init(), ck_tokenizer_load(), and ck_tokenizer_load_binary().
| CKTokenizerMemPool CKTokenizer::pool |
Definition at line 127 of file tokenizer.h.
| float* CKTokenizer::scores |
Definition at line 111 of file tokenizer.h.
Referenced by ck_tokenizer_create(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_free(), ck_tokenizer_load_binary_with_scores(), and ck_tokenizer_reset().
| size_t CKTokenizer::scores_size |
Definition at line 112 of file tokenizer.h.
Referenced by ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_load_binary_with_scores(), and ck_tokenizer_reset().
| uint8_t* CKTokenizer::types |
Definition at line 113 of file tokenizer.h.
Referenced by ck_tokenizer_create(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_free(), ck_tokenizer_load_binary_with_scores(), ck_tokenizer_reset(), spm_build_byte_lookup(), spm_is_byte_token(), and spm_token_allowed_in_dp().
| size_t CKTokenizer::types_size |
Definition at line 114 of file tokenizer.h.
Referenced by ck_tokenizer_encode_spm_impl(), ck_tokenizer_load_binary_with_scores(), and ck_tokenizer_reset().
| int32_t CKTokenizer::unk_id |
Definition at line 97 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_special_token(), ck_tokenizer_add_token(), ck_tokenizer_create(), ck_tokenizer_encode(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_init(), ck_tokenizer_load(), ck_tokenizer_lookup(), ck_tokenizer_set_special_ids(), find_longest_match_hash(), find_longest_match_trie(), spm_count_unknown_run(), spm_encode_byte_fallback(), spm_find_candidates_at_pos(), spm_get_byte_token(), and spm_llama_resegment_node().
| CKTokenizerHashTable* CKTokenizer::vocab |
Definition at line 100 of file tokenizer.h.
Referenced by ck_tokenizer_add_special_token(), ck_tokenizer_add_token(), ck_tokenizer_create(), ck_tokenizer_free(), ck_tokenizer_lookup(), ck_tokenizer_lookup_exact(), ck_tokenizer_reset(), find_longest_match_hash(), spm_count_unknown_run(), and spm_find_candidates_at_pos().
| size_t CKTokenizer::vocab_capacity |
Definition at line 108 of file tokenizer.h.
Referenced by ck_tokenizer_add_token() and ck_tokenizer_create().
| CKVocabEntry** CKTokenizer::vocab_hash |
Definition at line 82 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_token(), ck_tokenizer_free(), ck_tokenizer_init(), ck_tokenizer_load(), ck_tokenizer_load_binary(), and ck_tokenizer_lookup().
| int CKTokenizer::vocab_hash_size |
Definition at line 83 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_token(), ck_tokenizer_init(), ck_tokenizer_load(), ck_tokenizer_load_binary(), and ck_tokenizer_lookup().
| int CKTokenizer::vocab_size |
Definition at line 81 of file ck_tokenizer.h.
Referenced by ck_tokenizer_add_token(), ck_tokenizer_detect_space_prefix_style(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_free(), ck_tokenizer_id_to_token(), ck_tokenizer_load(), ck_tokenizer_load_binary(), ck_tokenizer_reset(), ck_tokenizer_vocab_size(), spm_is_byte_token(), and spm_token_allowed_in_dp().
| size_t CKTokenizer::vocab_size |
Definition at line 107 of file tokenizer.h.
| CKTrie* CKTokenizer::vocab_trie |
Definition at line 103 of file tokenizer.h.
Referenced by ck_tokenizer_add_special_token(), ck_tokenizer_add_token(), ck_tokenizer_create(), ck_tokenizer_free(), ck_tokenizer_reset(), and find_longest_match_trie().