#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <ctype.h>
#include "tokenizer/tokenizer.h"
#include "tokenizer/murmurhash3.h"
#include "tokenizer/hash_table.h"
Go to the source code of this file.
Macros | |
| #define | GGUF_TOKEN_BYTE 6 |
| #define | GGUF_TOKEN_CONTROL 3 |
| #define | GGUF_TOKEN_NORMAL 1 |
| #define | GGUF_TOKEN_UNKNOWN 2 |
| #define | GGUF_TOKEN_UNUSED 5 |
| #define | GGUF_TOKEN_USER_DEFINED 4 |
Functions | |
| int | ck_tokenizer_add_merge (CKTokenizer *tok, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority) |
| int | ck_tokenizer_add_special_token (CKTokenizer *tok, const char *name, int32_t id) |
| int | ck_tokenizer_add_token (CKTokenizer *tok, const char *token, int32_t id, float score) |
| CKTokenizer * | ck_tokenizer_create (CKTokenizerType type) |
| int | ck_tokenizer_decode (const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len) |
| CKSpacePrefixStyle | ck_tokenizer_detect_space_prefix_style (CKTokenizer *tok) |
| int | ck_tokenizer_encode (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) |
| static int | ck_tokenizer_encode_spm_impl (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) |
| static int | ck_tokenizer_encode_spm_llama_impl (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) |
| void | ck_tokenizer_free (CKTokenizer *tok) |
| const char * | ck_tokenizer_id_to_token (const CKTokenizer *tok, int32_t id) |
| int | ck_tokenizer_load_binary (CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges) |
| int | ck_tokenizer_load_binary_with_scores (CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, const float *scores, const uint8_t *types, int num_merges, const int32_t *merges) |
| int | ck_tokenizer_load_gguf (CKTokenizer *tok, const char *path) |
| int | ck_tokenizer_load_json (CKTokenizer *tok, const char *path) |
| int | ck_tokenizer_load_merges (CKTokenizer *tok, const char *path) |
| int | ck_tokenizer_load_text (CKTokenizer *tok, const char *path) |
| int32_t | ck_tokenizer_lookup (const CKTokenizer *tok, const char *token) |
| static int32_t | ck_tokenizer_lookup_exact (const CKTokenizer *tok, const char *token) |
| static int32_t | ck_tokenizer_lookup_exact_n (const CKTokenizer *tok, const char *text, int text_len) |
| void | ck_tokenizer_reset (CKTokenizer *tok) |
| void | ck_tokenizer_set_add_bos_eos (CKTokenizer *tok, bool add_bos, bool add_eos) |
| void | ck_tokenizer_set_add_space_prefix (CKTokenizer *tok, bool add_space_prefix) |
| void | ck_tokenizer_set_space_prefix_style (CKTokenizer *tok, CKSpacePrefixStyle style) |
| void | ck_tokenizer_set_special_ids (CKTokenizer *tok, int32_t unk, int32_t bos, int32_t eos, int32_t pad, int32_t mask) |
| void | ck_tokenizer_set_spm_mode (CKTokenizer *tok, CKSpmMode spm_mode) |
| void | ck_tokenizer_set_use_trie (CKTokenizer *tok, bool use_trie) |
| static int32_t | find_longest_match (const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) |
| static int32_t | find_longest_match_hash (const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) |
| static int32_t | find_longest_match_trie (const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) |
| static int | preprocess_bpe_spaces (const char *text, int text_len, char *out, int out_max, CKSpacePrefixStyle style) |
| static int | preprocess_spm_llama_text (const char *text, int text_len, char *out, int out_max, bool add_space_prefix) |
| static int | preprocess_spm_text (const char *text, int text_len, char *out, int out_max, bool add_space_prefix) |
| static void | spm_build_byte_lookup (CKTokenizer *tok, const char *strings, const int32_t *offsets, int vocab_size) |
| static int | spm_count_unknown_run (const CKTokenizer *tok, const char *text, int text_len, size_t pos) |
| static int | spm_encode_byte_fallback (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) |
| static int | spm_find_candidates_at_pos (const CKTokenizer *tok, const char *text, int text_len, size_t pos, int32_t *candidates, int max_candidates) |
| static int32_t | spm_get_byte_token (const CKTokenizer *tok, unsigned char byte_val) |
| static bool | spm_is_byte_token (const CKTokenizer *tok, int32_t token_id) |
| static int | spm_llama_resegment_node (const CKTokenizer *tok, const SpmLlamaNode *nodes, int node_id, int32_t *ids, int max_ids, int out_idx) |
| static bool | spm_token_allowed_in_dp (const CKTokenizer *tok, int32_t token_id) |
| static bool | spm_token_is_byte_format (const char *token) |
| static int | utf8_len (unsigned char c) |
| #define GGUF_TOKEN_BYTE 6 |
Definition at line 465 of file tokenizer.c.
| #define GGUF_TOKEN_CONTROL 3 |
Definition at line 462 of file tokenizer.c.
| #define GGUF_TOKEN_NORMAL 1 |
Definition at line 460 of file tokenizer.c.
| #define GGUF_TOKEN_UNKNOWN 2 |
Definition at line 461 of file tokenizer.c.
| #define GGUF_TOKEN_UNUSED 5 |
Definition at line 464 of file tokenizer.c.
| #define GGUF_TOKEN_USER_DEFINED 4 |
Definition at line 463 of file tokenizer.c.
| int ck_tokenizer_add_merge | ( | CKTokenizer * | tok, |
| int32_t | left_id, | ||
| int32_t | right_id, | ||
| int32_t | merged_id, | ||
| int32_t | priority | ||
| ) |
Add a BPE merge rule.
| tok | Tokenizer |
| left_id | Left token ID |
| right_id | Right token ID |
| merged_id | Merged token ID |
| priority | Lower = higher priority (applied first) |
Definition at line 1336 of file tokenizer.c.
| int ck_tokenizer_add_special_token | ( | CKTokenizer * | tok, |
| const char * | name, | ||
| int32_t | id | ||
| ) |
Add special token (UNK, BOS, EOS, PAD, MASK).
| tok | Tokenizer |
| name | Special token name ("unk", "bos", "eos", "pad", "mask") |
| id | Token ID |
Definition at line 213 of file tokenizer.c.
References CKTokenizer::bos_id, ck_tokenizer_add_token(), ck_tokenizer_hash_table_lookup(), ck_trie_insert(), CKTokenizer::eos_id, id, CKTokenizer::pad_id, CKTokenizer::unk_id, CKTokenizer::vocab, and CKTokenizer::vocab_trie.
Referenced by main().
| int ck_tokenizer_add_token | ( | CKTokenizer * | tok, |
| const char * | token, | ||
| int32_t | id, | ||
| float | score | ||
| ) |
Add a token to vocabulary.
| tok | Tokenizer |
| token | Token string |
| id | Token ID |
| score | Token score (for SPM) |
Definition at line 157 of file tokenizer.c.
References ck_tokenizer_hash_table_insert(), ck_tokenizer_hash_table_lookup(), ck_trie_insert(), id, CKTokenizer::id_to_token, score, token, CKTokenizer::vocab, CKTokenizer::vocab_capacity, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.
Referenced by ck_tokenizer_add_special_token(), and ck_tokenizer_load_binary_with_scores().
| CKTokenizer* ck_tokenizer_create | ( | CKTokenizerType | type | ) |
Definition at line 34 of file tokenizer.c.
References CKTokenizerConfig::add_bos, CKTokenizerConfig::add_eos, CKTokenizerConfig::add_space_prefix, CKTokenizer::bos_id, CK_SPM_MODE_UNIGRAM, ck_tokenizer_hash_table_create(), ck_tokenizer_hash_table_free(), CK_TOKENIZER_HT_BUCKETS_LARGE, ck_tokenizer_mempool_init(), ck_trie_create(), CKTokenizer::config, CKTokenizer::eos_id, CKTokenizer::id_to_token, CKTokenizer::mask_id, CKTokenizer::pad_id, CKTokenizer::pool, CKTokenizer::scores, CKTokenizerConfig::spm_mode, CKTokenizerConfig::type, CKTokenizer::types, CKTokenizer::unk_id, CKTokenizerConfig::unk_score, CKTokenizer::vocab, CKTokenizer::vocab_capacity, and CKTokenizer::vocab_trie.
Referenced by ck_tokenizer_create_bpe(), ck_tokenizer_create_spm(), ck_tokenizer_create_wordpiece(), and main().
| int ck_tokenizer_decode | ( | const CKTokenizer * | tok, |
| const int32_t * | ids, | ||
| int | num_ids, | ||
| char * | text, | ||
| int | max_len | ||
| ) |
Decode token IDs to text.
| tok | Tokenizer |
| ids | Input token IDs |
| num_ids | Number of IDs |
| text | Output text buffer |
| max_len | Maximum text length |
Definition at line 1211 of file tokenizer.c.
References CKTokenizer::bos_id, ck_tokenizer_id_to_token(), CKTokenizer::eos_id, ids, max_len, num_ids, CKTokenizer::pad_id, text, and token.
Referenced by main().
| CKSpacePrefixStyle ck_tokenizer_detect_space_prefix_style | ( | CKTokenizer * | tok | ) |
Definition at line 276 of file tokenizer.c.
References CK_SPACE_PREFIX_AUTO, CK_SPACE_PREFIX_GPT2, CK_SPACE_PREFIX_SPM, CKTokenizer::config, CKTokenizer::id_to_token, CKTokenizerConfig::space_prefix_detected, CKTokenizerConfig::space_prefix_style, token, and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_encode().
| int ck_tokenizer_encode | ( | const CKTokenizer * | tok, |
| const char * | text, | ||
| int | text_len, | ||
| int32_t * | ids, | ||
| int | max_ids | ||
| ) |
Encode text to token IDs using greedy longest-match.
For BPE: applies merge rules iteratively. For WordPiece/SPM: greedy longest-match from vocabulary.
| tok | Tokenizer |
| text | Input text |
| text_len | Text length, or -1 for null-terminated |
| ids | Output token IDs |
| max_ids | Maximum IDs to write |
Definition at line 1132 of file tokenizer.c.
References CKTokenizer::add_bos, CKTokenizerConfig::add_bos, CKTokenizer::add_eos, CKTokenizerConfig::add_eos, CKTokenizer::bos_id, CK_SPM_MODE_LLAMA, CK_TOKENIZER_BPE, ck_tokenizer_detect_space_prefix_style(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_lookup(), ck_tokenizer_lookup_merge(), CK_TOKENIZER_SPM, CKTokenizer::config, config, CKTokenizer::eos_id, find_longest_match(), id, ids, max_ids, CKMergeRule::merged, CKTokenizer::merges, CKTokenizer::num_merges, out_len, preprocess_bpe_spaces(), CKMergeRule::priority, CKTokenizerConfig::spm_mode, style, text, text_len, CKTokenizerConfig::type, CKTokenizer::unk_id, and utf8_len().
Referenced by main(), and run_inference().
|
static |
Definition at line 832 of file tokenizer.c.
References CKTokenizerConfig::add_space_prefix, ck_tokenizer_id_to_token(), CKTokenizer::config, GGUF_TOKEN_USER_DEFINED, ids, max_ids, preprocess_spm_text(), CKTokenizer::scores, spm_count_unknown_run(), spm_encode_byte_fallback(), spm_find_candidates_at_pos(), spm_token_allowed_in_dp(), text, text_len, token, CKTokenizer::types, CKTokenizer::types_size, CKTokenizer::unk_id, and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_encode().
|
static |
Definition at line 642 of file tokenizer.c.
References CKTokenizerConfig::add_space_prefix, ck_tokenizer_lookup_exact_n(), CKTokenizer::config, ids, left, max_ids, preprocess_spm_llama_text(), right, score, CKTokenizer::scores, CKTokenizer::scores_size, spm_llama_resegment_node(), text, text_len, utf8_len(), and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_encode().
| void ck_tokenizer_free | ( | CKTokenizer * | tok | ) |
Definition at line 91 of file tokenizer.c.
References CKTokenizer::byte_token_id, ck_pool_free(), ck_tokenizer_hash_table_free(), ck_tokenizer_mempool_free(), ck_trie_free(), CKTokenizer::id_to_token, CKTokenizer::merge_hash, CKTokenizer::merges, CKTokenizer::pool, CKTokenizer::scores, CKTokenizer::types, CKTokenizer::vocab, CKTokenizer::vocab_hash, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.
Referenced by main(), and run_inference().
| const char* ck_tokenizer_id_to_token | ( | const CKTokenizer * | tok, |
| int32_t | id | ||
| ) |
Definition at line 353 of file tokenizer.c.
References id, CKTokenizer::id_to_token, and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_decode(), ck_tokenizer_encode_spm_impl(), main(), and run_inference().
| int ck_tokenizer_load_binary | ( | CKTokenizer * | tok, |
| int | vocab_size, | ||
| const int32_t * | offsets, | ||
| const char * | strings, | ||
| int | num_merges, | ||
| const int32_t * | merges | ||
| ) |
Load vocabulary from memory-mapped binary data.
| tok | Tokenizer |
| vocab_size | Number of tokens |
| offsets | Array of offsets into strings pool |
| strings | String pool containing null-terminated tokens |
| num_merges | Number of BPE merges |
| merges | Merge rules as (left, right, merged) triplets |
Definition at line 1242 of file tokenizer.c.
References ck_tokenizer_load_binary_with_scores(), merges, num_merges, offsets, strings, and vocab_size.
| int ck_tokenizer_load_binary_with_scores | ( | CKTokenizer * | tok, |
| int | vocab_size, | ||
| const int32_t * | offsets, | ||
| const char * | strings, | ||
| const float * | scores, | ||
| const uint8_t * | types, | ||
| int | num_merges, | ||
| const int32_t * | merges | ||
| ) |
Load vocabulary from memory-mapped binary data with scores and types.
This extended version supports SPM (SentencePiece) tokenizers, which require token scores for Viterbi/DP encoding.
| tok | Tokenizer |
| vocab_size | Number of tokens |
| offsets | Array of offsets into strings pool |
| strings | String pool containing null-terminated tokens |
| scores | Array of token scores (float32), can be NULL |
| types | Array of token types (uint8), can be NULL |
| num_merges | Number of BPE merges |
| merges | Merge rules as (left, right, merged) triplets |
Definition at line 1252 of file tokenizer.c.
References ck_tokenizer_add_token(), ck_tokenizer_reset(), GGUF_TOKEN_BYTE, GGUF_TOKEN_CONTROL, GGUF_TOKEN_NORMAL, GGUF_TOKEN_UNKNOWN, merges, num_merges, offsets, score, CKTokenizer::scores, CKTokenizer::scores_size, spm_build_byte_lookup(), strings, token, CKTokenizer::types, CKTokenizer::types_size, and vocab_size.
Referenced by ck_tokenizer_load_binary().
| int ck_tokenizer_load_gguf | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load vocabulary from GGUF file.
| tok | Tokenizer |
| path | Path to GGUF file |
Definition at line 1332 of file tokenizer.c.
| int ck_tokenizer_load_json | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load vocabulary from JSON file (HuggingFace format).
| tok | Tokenizer |
| path | Path to vocab.json or tokenizer.json |
Definition at line 1333 of file tokenizer.c.
| int ck_tokenizer_load_merges | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load BPE merges from text file.
Format: token1 token2 (one merge per line)
| tok | Tokenizer |
| path | Path to merges.txt |
Definition at line 1335 of file tokenizer.c.
| int ck_tokenizer_load_text | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load vocabulary from text file (one token per line).
Format: token_string [id] [score]. Lines starting with # are comments.
| tok | Tokenizer |
| path | Path to vocabulary file |
Definition at line 1334 of file tokenizer.c.
| int32_t ck_tokenizer_lookup | ( | const CKTokenizer * | tok, |
| const char * | token | ||
| ) |
Definition at line 323 of file tokenizer.c.
References ck_tokenizer_hash_table_lookup(), token, CKTokenizer::unk_id, and CKTokenizer::vocab.
Referenced by spm_get_byte_token().
|
static |
Definition at line 330 of file tokenizer.c.
References ck_tokenizer_hash_table_lookup(), token, and CKTokenizer::vocab.
Referenced by ck_tokenizer_lookup_exact_n().
|
static |
Definition at line 337 of file tokenizer.c.
References ck_tokenizer_lookup_exact(), id, text, and text_len.
Referenced by ck_tokenizer_encode_spm_llama_impl(), and spm_llama_resegment_node().
| void ck_tokenizer_reset | ( | CKTokenizer * | tok | ) |
Definition at line 125 of file tokenizer.c.
References CKTokenizer::byte_token_id, ck_tokenizer_hash_table_clear(), ck_trie_clear(), CKTokenizer::id_to_token, CKTokenizer::scores, CKTokenizer::scores_size, CKTokenizer::types, CKTokenizer::types_size, CKTokenizer::vocab, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.
Referenced by ck_tokenizer_load_binary_with_scores().
| void ck_tokenizer_set_add_bos_eos | ( | CKTokenizer * | tok, |
| bool | add_bos, | ||
| bool | add_eos | ||
| ) |
Definition at line 243 of file tokenizer.c.
References CKTokenizerConfig::add_bos, add_bos, CKTokenizerConfig::add_eos, add_eos, and CKTokenizer::config.
| void ck_tokenizer_set_add_space_prefix | ( | CKTokenizer * | tok, |
| bool | add_space_prefix | ||
| ) |
Definition at line 249 of file tokenizer.c.
References CKTokenizerConfig::add_space_prefix, add_space_prefix, and CKTokenizer::config.
| void ck_tokenizer_set_space_prefix_style | ( | CKTokenizer * | tok, |
| CKSpacePrefixStyle | style | ||
| ) |
Definition at line 266 of file tokenizer.c.
References CK_SPACE_PREFIX_AUTO, CKTokenizer::config, CKTokenizerConfig::space_prefix_detected, CKTokenizerConfig::space_prefix_style, and style.
| void ck_tokenizer_set_special_ids | ( | CKTokenizer * | tok, |
| int32_t | unk, | ||
| int32_t | bos, | ||
| int32_t | eos, | ||
| int32_t | pad, | ||
| int32_t | mask | ||
| ) |
Definition at line 234 of file tokenizer.c.
References bos, CKTokenizer::bos_id, eos, CKTokenizer::eos_id, mask, CKTokenizer::mask_id, pad, CKTokenizer::pad_id, unk, and CKTokenizer::unk_id.
| void ck_tokenizer_set_spm_mode | ( | CKTokenizer * | tok, |
| CKSpmMode | spm_mode | ||
| ) |
Definition at line 254 of file tokenizer.c.
References CKTokenizer::config, CKTokenizerConfig::spm_mode, and spm_mode.
| void ck_tokenizer_set_use_trie | ( | CKTokenizer * | tok, |
| bool | use_trie | ||
| ) |
Definition at line 260 of file tokenizer.c.
References CKTokenizer::config, CKTokenizerConfig::use_trie, and use_trie.
|
static |
Definition at line 400 of file tokenizer.c.
References CKTokenizer::config, find_longest_match_hash(), find_longest_match_trie(), text, text_len, and CKTokenizerConfig::use_trie.
Referenced by ck_tokenizer_encode().
|
static |
Definition at line 370 of file tokenizer.c.
References ck_tokenizer_hash_table_lookup(), max_len, text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab.
Referenced by find_longest_match().
|
static |
Definition at line 359 of file tokenizer.c.
References ck_trie_find_longest(), text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab_trie.
Referenced by find_longest_match().
|
static |
Definition at line 412 of file tokenizer.c.
References CK_SPACE_PREFIX_SPM, out_len, style, text, and text_len.
Referenced by ck_tokenizer_encode().
|
static |
Definition at line 554 of file tokenizer.c.
References add_space_prefix, out_len, text, and text_len.
Referenced by ck_tokenizer_encode_spm_llama_impl().
|
static |
Definition at line 763 of file tokenizer.c.
References add_space_prefix, out_len, text, and text_len.
Referenced by ck_tokenizer_encode_spm_impl().
|
static |
Definition at line 507 of file tokenizer.c.
References CKTokenizer::byte_token_id, GGUF_TOKEN_BYTE, offsets, spm_token_is_byte_format(), strings, token, CKTokenizer::types, and vocab_size.
Referenced by ck_tokenizer_load_binary_with_scores().
|
static |
Definition at line 1098 of file tokenizer.c.
References ck_tokenizer_hash_table_lookup(), max_len, spm_token_allowed_in_dp(), text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab.
Referenced by ck_tokenizer_encode_spm_impl().
|
static |
Definition at line 1024 of file tokenizer.c.
References ids, max_ids, spm_get_byte_token(), text, text_len, and CKTokenizer::unk_id.
Referenced by ck_tokenizer_encode_spm_impl().
|
static |
Definition at line 1047 of file tokenizer.c.
References ck_tokenizer_hash_table_lookup(), max_len, spm_token_allowed_in_dp(), text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab.
Referenced by ck_tokenizer_encode_spm_impl().
|
inlinestatic |
Definition at line 487 of file tokenizer.c.
References CKTokenizer::byte_token_id, ck_tokenizer_lookup(), and CKTokenizer::unk_id.
Referenced by spm_encode_byte_fallback(), and spm_llama_resegment_node().
|
inlinestatic |
Definition at line 479 of file tokenizer.c.
References GGUF_TOKEN_BYTE, CKTokenizer::types, and CKTokenizer::vocab_size.
|
static |
Definition at line 611 of file tokenizer.c.
References ck_tokenizer_lookup_exact_n(), ids, max_ids, spm_get_byte_token(), and CKTokenizer::unk_id.
Referenced by ck_tokenizer_encode_spm_llama_impl().
|
inlinestatic |
Definition at line 469 of file tokenizer.c.
References GGUF_TOKEN_BYTE, GGUF_TOKEN_CONTROL, GGUF_TOKEN_UNUSED, CKTokenizer::types, and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_encode_spm_impl(), spm_count_unknown_run(), and spm_find_candidates_at_pos().
|
inlinestatic |
|
inlinestatic |
Definition at line 541 of file tokenizer.c.
Referenced by ck_tokenizer_encode(), and ck_tokenizer_encode_spm_llama_impl().