#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include "tokenizer/murmurhash3.h"
#include "tokenizer/memory_pool.h"
#include "tokenizer/hash_table.h"
#include "tokenizer/utf8.h"
#include "data_structures/tries/trie.h"

Go to the source code of this file.
Data Structures | |
| struct | CKTokenizer |
| struct | CKTokenizerConfig |
| struct | CKTokenizerToken |
Macros | |
| #define | CK_TOKENIZER_API __attribute__((visibility("default"))) |
| #define | CK_TOKENIZER_DEFAULT_HT_SIZE 65536 |
| #define | CK_TOKENIZER_MAX_TOKEN_LEN 256 |
| #define | CK_TOKENIZER_MAX_VOCAB_SIZE 256000 |
Enumerations | |
| enum | CKSpacePrefixStyle { CK_SPACE_PREFIX_AUTO = 0 , CK_SPACE_PREFIX_GPT2 = 1 , CK_SPACE_PREFIX_SPM = 2 } |
| enum | CKSpmMode { CK_SPM_MODE_UNIGRAM = 0 , CK_SPM_MODE_LLAMA = 1 } |
| enum | CKTokenizerType { CK_TOKENIZER_BPE = 0 , CK_TOKENIZER_WORDPIECE = 1 , CK_TOKENIZER_SPM = 2 } |
Variables | |
| bool | add_bos |
| bool | add_eos |
| bool | add_space_prefix |
| int32_t | bos |
| int32_t | eos |
| int32_t | id |
| const int32_t * | ids |
| bool | lowercase |
| int32_t | mask |
| int | num_ids |
| int * | out_len |
| int32_t | pad |
| float * | score |
| CKSpmMode | spm_mode |
| CKSpacePrefixStyle | style |
| const char * | text |
| const char * | token |
| int32_t | unk |
| bool | use_trie |
| #define CK_TOKENIZER_API __attribute__((visibility("default"))) |
Definition at line 40 of file tokenizer.h.
| #define CK_TOKENIZER_DEFAULT_HT_SIZE 65536 |
Definition at line 50 of file tokenizer.h.
| #define CK_TOKENIZER_MAX_TOKEN_LEN 256 |
Definition at line 44 of file tokenizer.h.
| #define CK_TOKENIZER_MAX_VOCAB_SIZE 256000 |
Definition at line 47 of file tokenizer.h.
| enum CKSpacePrefixStyle |
| Enumerator | |
|---|---|
| CK_SPACE_PREFIX_AUTO | |
| CK_SPACE_PREFIX_GPT2 | |
| CK_SPACE_PREFIX_SPM | |
Definition at line 60 of file tokenizer.h.
| enum CKSpmMode |
| enum CKTokenizerType |
| Enumerator | |
|---|---|
| CK_TOKENIZER_BPE | |
| CK_TOKENIZER_WORDPIECE | |
| CK_TOKENIZER_SPM | |
Definition at line 53 of file tokenizer.h.
| __attribute__ | ( | (visibility("default")) | ) |
Create a new tokenizer.
| type | Tokenizer type (BPE, WordPiece, SPM) |
Free a tokenizer.
| tok | Tokenizer to free |
Reset tokenizer state (clear vocab but keep config).
| tok | Tokenizer to reset |
Set special token IDs.
| tok | Tokenizer |
| unk | Unknown token ID |
| bos | Beginning-of-sequence token ID |
| eos | End-of-sequence token ID |
| pad | Padding token ID |
| mask | Mask token ID |
Set whether to add BOS/EOS tokens during encoding.
| tok | Tokenizer |
| add_bos | If true, prepend BOS token (if available) |
| add_eos | If true, append EOS token (if available) |
Set whether to add the SentencePiece space prefix (▁) at the start.
This mirrors SentencePiece's add_dummy_prefix behavior.
| tok | Tokenizer |
| add_space_prefix | If true, add leading ▁ when appropriate |
Set SentencePiece mode.
| tok | Tokenizer |
| spm_mode | SPM mode (unigram or llama-style) |
Set whether to lowercase input text before tokenizing.
| tok | Tokenizer |
| lowercase | If true, convert text to lowercase |
Set lookup method (trie vs hash table).
| tok | Tokenizer |
| use_trie | If true, use trie (faster for longest-match), false = hash table |
Set space prefix style for BPE tokenizers.
GPT-2/Qwen use Ġ (U+0120), LLaMA/SentencePiece use ▁ (U+2581). Default is AUTO which auto-detects from vocabulary.
| tok | Tokenizer |
| style | Space prefix style (AUTO, GPT2, or SPM) |
Auto-detect space prefix style from vocabulary.
Checks for presence of tokens starting with Ġ vs ▁ to determine style.
| tok | Tokenizer |
Look up token ID by string.
| tok | Tokenizer |
| token | Token string |
Get token string by ID.
| tok | Tokenizer |
| id | Token ID |
Get token info by ID.
| tok | Tokenizer |
| id | Token ID |
| score | Output: token score |
Decode to buffer allocated by caller.
| tok | Tokenizer |
| ids | Input token IDs |
| num_ids | Number of IDs |
| out_len | Output: length of decoded string |
Get the tokenizer type name.
| tok | Tokenizer |
Check if token is special.
| tok | Tokenizer |
| id | Token ID |
Estimate encoded token count.
| tok | Tokenizer |
| text | Input text |
Get last error message.
Free a True BPE tokenizer.
| bpe | Tokenizer to free |
Add a token to the vocabulary.
| bpe | Tokenizer |
| token | Token string (UTF-8) |
| id | Token ID |
| score | Token score (for unigram models, 0.0 for BPE) |
Add a BPE merge rule by token IDs.
Merge rules define how tokens are combined during encoding. Rules with lower priority numbers are applied first.
| bpe | Tokenizer |
| left_id | Left token ID |
| right_id | Right token ID |
| merged_id | Resulting merged token ID |
| priority | Merge priority (lower = applied first) |
Add a BPE merge rule by token strings.
This looks up the token IDs automatically and determines the merged token. The merged token must already exist in the vocabulary.
| bpe | Tokenizer |
| left | Left token string |
| right | Right token string |
| priority | Merge priority (lower = applied first) |
Set special token IDs.
| bpe | Tokenizer |
| unk | Unknown token ID (-1 to disable) |
| bos | Beginning-of-sequence token ID (-1 to disable) |
| eos | End-of-sequence token ID (-1 to disable) |
| pad | Padding token ID (-1 to disable) |
Add a special token that should be matched BEFORE BPE encoding.
Special tokens like <|im_start|>, <|im_end|>, <|endoftext|> are matched literally in the input text before BPE processing. Without this, BPE would break them into individual characters.
| bpe | Tokenizer |
| token | Token string to match literally (e.g., "<|im_end|>") |
| id | Token ID to output when matched |
Set tokenizer configuration.
| bpe | Tokenizer |
| config | Configuration to apply |
Load vocabulary + merges from binary buffers.
| bpe | Tokenizer |
| vocab_size | Number of tokens |
| offsets | Offsets array (length vocab_size) |
| strings | Null-terminated token strings blob |
| num_merges | Number of merge rules |
| merges | Merge triples [left_id, right_id, merged_id] (length num_merges*3) |
Look up a token ID by string.
| bpe | Tokenizer |
| token | Token string |
Get a token string by ID.
| bpe | Tokenizer |
| id | Token ID |
Get vocabulary size.
| bpe | Tokenizer |
Get number of merge rules.
| bpe | Tokenizer |
Auto-detect space prefix style from vocabulary.
Counts tokens starting with Ġ (GPT-2) vs ▁ (SentencePiece) to determine style. The detected style is cached in the config.
| bpe | Tokenizer |
Encode text to token IDs using true BPE algorithm.
This applies merge rules in priority order (not greedy longest-match).
| bpe | Tokenizer |
| text | Input text (UTF-8) |
| text_len | Text length in bytes, or -1 for null-terminated |
| ids | Output token IDs array |
| max_ids | Maximum IDs to write |
Decode token IDs to text.
| bpe | Tokenizer |
| ids | Input token IDs |
| num_ids | Number of IDs |
| text | Output text buffer |
| max_len | Maximum text length |
Referenced by fused_mlp_swiglu_decode(), fused_mlp_swiglu_decode_tiled(), fused_mlp_swiglu_decode_v2(), geglu_forward_fp32(), gelu_backward_exact(), gelu_backward_fast(), gelu_fast_inplace(), swiglu_backward(), and swiglu_forward().
| int ck_tokenizer_add_merge | ( | CKTokenizer * | tok, |
| int32_t | left_id, | ||
| int32_t | right_id, | ||
| int32_t | merged_id, | ||
| int32_t | priority | ||
| ) |
Add a BPE merge rule.
| tok | Tokenizer |
| left_id | Left token ID |
| right_id | Right token ID |
| merged_id | Merged token ID |
| priority | Lower = higher priority (applied first) |
Definition at line 1336 of file tokenizer.c.
| int ck_tokenizer_add_special_token | ( | CKTokenizer * | tok, |
| const char * | name, | ||
| int32_t | id | ||
| ) |
Add special token (UNK, BOS, EOS, PAD, MASK).
| tok | Tokenizer |
| name | Special token name ("unk", "bos", "eos", "pad", "mask") |
| id | Token ID |
Definition at line 213 of file tokenizer.c.
References CKTokenizer::bos_id, ck_tokenizer_add_token(), ck_tokenizer_hash_table_lookup(), ck_trie_insert(), CKTokenizer::eos_id, id, CKTokenizer::pad_id, CKTokenizer::unk_id, CKTokenizer::vocab, and CKTokenizer::vocab_trie.
Referenced by main().
| int ck_tokenizer_add_token | ( | CKTokenizer * | tok, |
| const char * | token, | ||
| int32_t | id, | ||
| float | score | ||
| ) |
Add a token to vocabulary.
| tok | Tokenizer |
| token | Token string |
| id | Token ID |
| score | Token score (for SPM) |
Definition at line 157 of file tokenizer.c.
References ck_tokenizer_hash_table_insert(), ck_tokenizer_hash_table_lookup(), ck_trie_insert(), id, CKTokenizer::id_to_token, score, token, CKTokenizer::vocab, CKTokenizer::vocab_capacity, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.
Referenced by ck_tokenizer_add_special_token(), and ck_tokenizer_load_binary_with_scores().
|
inlinestatic |
Create tokenizer with default BPE config.
Definition at line 156 of file tokenizer.h.
References CK_TOKENIZER_BPE, and ck_tokenizer_create().
Referenced by main().
|
inlinestatic |
Create tokenizer with default SPM config.
Definition at line 170 of file tokenizer.h.
References ck_tokenizer_create(), and CK_TOKENIZER_SPM.
|
inlinestatic |
Create tokenizer with default WordPiece config.
Definition at line 163 of file tokenizer.h.
References ck_tokenizer_create(), and CK_TOKENIZER_WORDPIECE.
| int ck_tokenizer_decode | ( | const CKTokenizer * | tok, |
| const int32_t * | ids, | ||
| int | num_ids, | ||
| char * | text, | ||
| int | max_len | ||
| ) |
Decode token IDs to text.
| tok | Tokenizer |
| ids | Input token IDs |
| num_ids | Number of IDs |
| text | Output text buffer |
| max_len | Maximum text length |
Definition at line 737 of file ck_tokenizer.c.
References ck_tokenizer_id_to_token(), ids, max_len, num_ids, text, and token.
| int ck_tokenizer_encode | ( | const CKTokenizer * | tok, |
| const char * | text, | ||
| int | text_len, | ||
| int32_t * | ids, | ||
| int | max_ids | ||
| ) |
Encode text to token IDs using greedy longest-match.
For BPE: applies merge rules iteratively. For WordPiece/SPM: greedy longest-match from vocabulary.
| tok | Tokenizer |
| text | Input text |
| text_len | Text length, or -1 for null-terminated |
| ids | Output token IDs |
| max_ids | Maximum IDs to write |
Definition at line 638 of file ck_tokenizer.c.
References CKTokenizerConfig::add_bos, CKTokenizerConfig::add_eos, CKTokenizer::bos_id, CK_SPM_MODE_LLAMA, CK_TOKENIZER_BPE, ck_tokenizer_detect_space_prefix_style(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), CK_TOKENIZER_SPM, CKTokenizer::config, config, CKTokenizer::eos_id, find_longest_match(), id, ids, max_ids, preprocess_bpe_spaces(), CKTokenizerConfig::spm_mode, style, text, text_len, CKTokenizerConfig::type, and CKTokenizer::unk_id.
| int ck_tokenizer_encode_tokens | ( | const CKTokenizer * | tok, |
| const char * | text, | ||
| int | text_len, | ||
| const char ** | out_tokens, | ||
| int | max_tokens | ||
| ) |
Encode and return tokens as array of strings.
| tok | Tokenizer |
| text | Input text |
| text_len | Text length |
| out_tokens | Output token strings (caller must free each) |
| max_tokens | Maximum tokens |
| int ck_tokenizer_encode_with_special | ( | CKTokenizer * | tok, |
| const char * | text, | ||
| int | text_len, | ||
| int32_t * | ids, | ||
| int | max_ids, | ||
| bool | add_special | ||
| ) |
Encode with special token handling.
| tok | Tokenizer |
| text | Input text |
| text_len | Text length, or -1 for null-terminated |
| ids | Output token IDs |
| max_ids | Maximum IDs to write |
| add_special | Add BOS/EOS tokens |
| int ck_tokenizer_load_binary | ( | CKTokenizer * | tok, |
| int | vocab_size, | ||
| const int32_t * | offsets, | ||
| const char * | strings, | ||
| int | num_merges, | ||
| const int32_t * | merges | ||
| ) |
Load vocabulary from memory-mapped binary data.
| tok | Tokenizer |
| vocab_size | Number of tokens |
| offsets | Array of offsets into strings pool |
| strings | String pool containing null-terminated tokens |
| num_merges | Number of BPE merges |
| merges | Merge rules as (left, right, merged) triplets |
Definition at line 18 of file ck_tokenizer_v2.c.
References ck_pool_alloc(), ck_tokenizer_add_merge(), ck_tokenizer_load_binary_with_scores(), hash_string(), CKVocabEntry::id, CKTokenizer::id_to_token, left, merges, CKVocabEntry::next, num_merges, offsets, CKTokenizer::pool, right, strings, CKVocabEntry::token, token, CKVocabEntry::token_len, CKTokenizer::vocab_hash, CKTokenizer::vocab_hash_size, CKTokenizer::vocab_size, and vocab_size.
Referenced by main().
| int ck_tokenizer_load_binary_with_scores | ( | CKTokenizer * | tok, |
| int | vocab_size, | ||
| const int32_t * | offsets, | ||
| const char * | strings, | ||
| const float * | scores, | ||
| const uint8_t * | types, | ||
| int | num_merges, | ||
| const int32_t * | merges | ||
| ) |
Load vocabulary from memory-mapped binary data with scores and types.
This extended version supports SPM (SentencePiece) tokenizers which require token scores for Viterbi/DP encoding.
| tok | Tokenizer |
| vocab_size | Number of tokens |
| offsets | Array of offsets into strings pool |
| strings | String pool containing null-terminated tokens |
| scores | Array of token scores (float32), can be NULL |
| types | Array of token types (uint8), can be NULL |
| num_merges | Number of BPE merges |
| merges | Merge rules as (left, right, merged) triplets |
Definition at line 1252 of file tokenizer.c.
References ck_tokenizer_add_token(), ck_tokenizer_reset(), GGUF_TOKEN_BYTE, GGUF_TOKEN_CONTROL, GGUF_TOKEN_NORMAL, GGUF_TOKEN_UNKNOWN, merges, num_merges, offsets, score, CKTokenizer::scores, CKTokenizer::scores_size, spm_build_byte_lookup(), strings, token, CKTokenizer::types, CKTokenizer::types_size, and vocab_size.
Referenced by ck_tokenizer_load_binary().
| int ck_tokenizer_load_gguf | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load vocabulary from GGUF file.
| tok | Tokenizer |
| path | Path to GGUF file |
Definition at line 1332 of file tokenizer.c.
| int ck_tokenizer_load_json | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load vocabulary from JSON file (HuggingFace format).
| tok | Tokenizer |
| path | Path to vocab.json or tokenizer.json |
Definition at line 1333 of file tokenizer.c.
| int ck_tokenizer_load_merges | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load BPE merges from text file.
Format: token1 token2 (one merge per line)
| tok | Tokenizer |
| path | Path to merges.txt |
Definition at line 1335 of file tokenizer.c.
| int ck_tokenizer_load_text | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load vocabulary from text file (one token per line).
Format: token_string [id] [score] Lines starting with # are comments.
| tok | Tokenizer |
| path | Path to vocabulary file |
Definition at line 1334 of file tokenizer.c.
|
inlinestatic |
Get vocabulary size.
Definition at line 332 of file tokenizer.h.
References CKTokenizer::vocab_size.
| bool add_bos |
Definition at line 242 of file tokenizer.h.
Referenced by ck_tokenizer_set_add_bos_eos().
| bool bool add_eos |
Definition at line 242 of file tokenizer.h.
Referenced by ck_tokenizer_set_add_bos_eos().
| bool add_space_prefix |
Definition at line 252 of file tokenizer.h.
Referenced by ck_tokenizer_set_add_space_prefix(), preprocess_spm_llama_text(), and preprocess_spm_text().
| int32_t int32_t bos |
Definition at line 230 of file tokenizer.h.
Referenced by ck_tokenizer_set_special_ids(), ck_true_bpe_set_special_ids(), and load_eos_from_vocab_json().
| int32_t int32_t int32_t eos |
Definition at line 231 of file tokenizer.h.
Referenced by ck_tokenizer_set_special_ids(), ck_true_bpe_set_special_ids(), and load_eos_from_vocab_json().
| int32_t id |
Definition at line 315 of file tokenizer.h.
Referenced by ck_tokenizer_add_special_token(), ck_tokenizer_add_token(), ck_tokenizer_encode(), ck_tokenizer_id_to_token(), ck_tokenizer_load(), ck_tokenizer_lookup_exact_n(), ck_true_bpe_add_special_token(), ck_true_bpe_add_token(), ck_true_bpe_id_to_token(), encode_chunk(), quantize_row_q8_0(), token_list_append(), and topology_discover_cpu().
| const int32_t * ids |
Definition at line 443 of file tokenizer.h.
Referenced by ck_tokenizer_decode(), ck_tokenizer_encode(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), ck_true_bpe_decode(), ck_true_bpe_encode(), encode_chunk(), encode_text_segment(), main(), run_prompt(), spm_encode_byte_fallback(), and spm_llama_resegment_node().
| bool lowercase |
Definition at line 268 of file tokenizer.h.
| int32_t int32_t int32_t int32_t int32_t mask |
Definition at line 233 of file tokenizer.h.
Referenced by ck_dtype_supported(), ck_tokenizer_set_special_ids(), relu_backward(), and topology_discover_affinity().
| const int32_t int num_ids |
Definition at line 444 of file tokenizer.h.
Referenced by ck_tokenizer_decode(), ck_true_bpe_decode(), and main().
| const int32_t int int* out_len |
Definition at line 445 of file tokenizer.h.
Referenced by ck_tokenizer_encode(), ck_utf8_next_char(), decode_bpe_token(), eos_pattern_process(), find_object_range(), preprocess_bpe_spaces(), preprocess_spm_llama_text(), preprocess_spm_text(), preprocess_text(), and run_prompt().
| int32_t int32_t int32_t int32_t pad |
Definition at line 232 of file tokenizer.h.
Referenced by ck_tokenizer_set_special_ids(), and ck_true_bpe_set_special_ids().
| int32_t float* score |
Definition at line 327 of file tokenizer.h.
Referenced by attention_flash_decode_scalar(), attention_flash_query_causal(), attention_flash_query_sliding(), attention_mlp_fused_fp32(), attention_mlp_fused_q4k(), attention_mlp_separate_fp32(), ck_tokenizer_add_token(), ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_load_binary_with_scores(), ck_true_bpe_add_token(), layer_fused_attn_mlp_qkv_q4k(), and simple_attention().
| CKSpmMode spm_mode |
Definition at line 260 of file tokenizer.h.
Referenced by ck_tokenizer_set_spm_mode().
| CKSpacePrefixStyle style |
Definition at line 287 of file tokenizer.h.
Referenced by ck_tokenizer_encode(), ck_tokenizer_set_space_prefix_style(), preprocess_bpe_spaces(), and preprocess_text().
| const char * text |
Definition at line 563 of file tokenizer.h.
Referenced by ck_tokenizer_decode(), ck_tokenizer_encode(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_lookup_exact_n(), ck_trie_find_longest(), ck_trie_has_prefix(), ck_true_bpe_decode(), ck_true_bpe_encode(), encode_text_segment(), find_longest_match(), find_longest_match_hash(), find_longest_match_trie(), gpt2_pretokenize(), init_tokens_from_text(), main(), match_special_token(), output_append(), preprocess_bpe_spaces(), preprocess_spm_llama_text(), preprocess_spm_text(), preprocess_text(), spm_count_unknown_run(), spm_encode_byte_fallback(), spm_find_candidates_at_pos(), and tokenize().
| const char * token |
Definition at line 306 of file tokenizer.h.
Referenced by ck_model_decode(), ck_tokenizer_add_token(), ck_tokenizer_decode(), ck_tokenizer_detect_space_prefix_style(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_load(), ck_tokenizer_load_binary(), ck_tokenizer_load_binary_with_scores(), ck_tokenizer_lookup(), ck_tokenizer_lookup_exact(), ck_trie_insert(), ck_true_bpe_add_special_token(), ck_true_bpe_add_token(), ck_true_bpe_decode(), ck_true_bpe_detect_space_style(), ck_true_bpe_load_binary(), ck_true_bpe_lookup(), decode_bpe_token(), eos_is_potential_prefix(), is_eos_token(), main(), model_decode(), model_decode_token(), output_token(), qwen2_0_5b_decode_decode(), qwen2_0_5b_decode_decode_token(), run_benchmark(), run_generation_test(), run_inference(), spm_build_byte_lookup(), spm_token_is_byte_format(), and topology_discover_numa().
| int32_t unk |
Definition at line 229 of file tokenizer.h.
Referenced by ck_tokenizer_set_special_ids(), and ck_true_bpe_set_special_ids().
| bool use_trie |
Definition at line 276 of file tokenizer.h.
Referenced by ck_tokenizer_set_use_trie().