#include "ck_tokenizer.h"#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>Go to the source code of this file.
Functions
| void * | ck_pool_alloc (CKMemPool *pool, size_t size) |
| void | ck_pool_free (CKMemPool *pool) |
| void | ck_pool_init (CKMemPool *pool) |
| char * | ck_pool_strdup (CKMemPool *pool, const char *s, int len) |
| int | ck_tokenizer_add_merge (CKTokenizer *tok, int32_t left, int32_t right, int32_t merged) |
| int32_t | ck_tokenizer_add_token (CKTokenizer *tok, const char *token, int len) |
| int | ck_tokenizer_decode (const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len) |
| int | ck_tokenizer_encode (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) |
| void | ck_tokenizer_free (CKTokenizer *tok) |
| const char * | ck_tokenizer_id_to_token (const CKTokenizer *tok, int32_t id) |
| int | ck_tokenizer_init (CKTokenizer *tok) |
| int | ck_tokenizer_load (CKTokenizer *tok, const char *path) |
| int32_t | ck_tokenizer_lookup (const CKTokenizer *tok, const char *token, int len) |
| int | ck_tokenizer_lookup_merge (const CKTokenizer *tok, int32_t left, int32_t right) |
| static uint32_t | hash_pair (int32_t left, int32_t right) |
| static uint32_t | hash_string (const char *s, int len) |
| static int | json_match_char (JSONParser *p, char c) |
| static int | json_parse_int (JSONParser *p, int *out) |
| static int | json_parse_string (JSONParser *p, char *buf, int max_len) |
| static void | json_skip_value (JSONParser *p) |
| static void | json_skip_whitespace (JSONParser *p) |
| static CKPoolBlock * | pool_new_block (size_t capacity) |
| void* ck_pool_alloc | ( | CKMemPool * | pool, |
| size_t | size | ||
| ) |
Definition at line 69 of file ck_tokenizer.c.
References CK_POOL_BLOCK_SIZE, CKMemPool::current, CKPoolBlock::data, CKMemPool::head, CKPoolBlock::next, pool_new_block(), CKMemPool::total_allocated, and CKPoolBlock::used.
Referenced by ck_pool_strdup(), ck_tokenizer_add_token(), ck_tokenizer_load(), and ck_tokenizer_load_binary().
| void ck_pool_free | ( | CKMemPool * | pool | ) |
Definition at line 107 of file ck_tokenizer.c.
References CKPoolBlock::data, CKMemPool::head, and CKPoolBlock::next.
Referenced by ck_tokenizer_free().
| void ck_pool_init | ( | CKMemPool * | pool | ) |
Definition at line 51 of file ck_tokenizer.c.
Referenced by ck_tokenizer_init().
| char* ck_pool_strdup | ( | CKMemPool * | pool, |
| const char * | s, | ||
| int | len | ||
| ) |
Definition at line 98 of file ck_tokenizer.c.
References ck_pool_alloc().
Referenced by ck_tokenizer_add_token(), and ck_tokenizer_load().
| int ck_tokenizer_add_merge | ( | CKTokenizer * | tok, |
| int32_t | left, | ||
| int32_t | right, | ||
| int32_t | merged | ||
| ) |
Definition at line 248 of file ck_tokenizer.c.
References hash_pair(), CKMergeRule::left, left, CKTokenizer::merge_hash, CKTokenizer::merge_hash_size, CKMergeRule::merged, CKTokenizer::merges, CKTokenizer::num_merges, CKMergeRule::priority, CKMergeRule::right, and right.
Referenced by ck_tokenizer_load(), and ck_tokenizer_load_binary().
| int32_t ck_tokenizer_add_token | ( | CKTokenizer * | tok, |
| const char * | token, | ||
| int | len | ||
| ) |
Definition at line 196 of file ck_tokenizer.c.
References CK_MAX_VOCAB_SIZE, ck_pool_alloc(), ck_pool_strdup(), ck_tokenizer_lookup(), hash_string(), CKVocabEntry::id, CKTokenizer::id_to_token, CKVocabEntry::next, CKTokenizer::pool, CKVocabEntry::token, token, CKVocabEntry::token_len, CKTokenizer::unk_id, CKTokenizer::vocab_hash, CKTokenizer::vocab_hash_size, and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_load(), and main().
| int ck_tokenizer_decode | ( | const CKTokenizer * | tok, |
| const int32_t * | ids, | ||
| int | num_ids, | ||
| char * | text, | ||
| int | max_len | ||
| ) |
Decode token IDs to text.
| tok | Tokenizer |
| ids | Input token IDs |
| num_ids | Number of IDs |
| text | Output text buffer |
| max_len | Maximum text length |
Definition at line 737 of file ck_tokenizer.c.
| int ck_tokenizer_encode | ( | const CKTokenizer * | tok, |
| const char * | text, | ||
| int | text_len, | ||
| int32_t * | ids, | ||
| int | max_ids | ||
| ) |
Encode text to token IDs using greedy longest-match.
For BPE: applies merge rules iteratively. For WordPiece/SPM: greedy longest-match from vocabulary.
| tok | Tokenizer |
| text | Input text |
| text_len | Text length, or -1 for null-terminated |
| ids | Output token IDs |
| max_ids | Maximum IDs to write |
Definition at line 638 of file ck_tokenizer.c.
| void ck_tokenizer_free | ( | CKTokenizer * | tok | ) |
Definition at line 183 of file ck_tokenizer.c.
| const char* ck_tokenizer_id_to_token | ( | const CKTokenizer * | tok, |
| int32_t | id | ||
| ) |
Definition at line 239 of file ck_tokenizer.c.
Referenced by ck_tokenizer_decode().
| int ck_tokenizer_init | ( | CKTokenizer * | tok | ) |
Definition at line 148 of file ck_tokenizer.c.
References CKTokenizer::bos_id, CK_MAX_VOCAB_SIZE, ck_pool_init(), CKTokenizer::eos_id, CKTokenizer::id_to_token, CKTokenizer::merge_hash, CKTokenizer::merge_hash_size, CKTokenizer::pad_id, CKTokenizer::pool, CKTokenizer::unk_id, CKTokenizer::vocab_hash, and CKTokenizer::vocab_hash_size.
Referenced by run_inference().
| int ck_tokenizer_load | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Definition at line 432 of file ck_tokenizer.c.
References CKTokenizer::bos_id, CK_MAX_TOKEN_LEN, ck_pool_alloc(), ck_pool_strdup(), ck_tokenizer_add_merge(), ck_tokenizer_add_token(), ck_tokenizer_lookup(), CKTokenizer::eos_id, hash_string(), CKVocabEntry::id, id, CKTokenizer::id_to_token, json_match_char(), json_parse_int(), json_parse_string(), json_skip_value(), json_skip_whitespace(), merged_id, CKVocabEntry::next, CKTokenizer::num_merges, CKTokenizer::pad_id, CKTokenizer::pool, CKVocabEntry::token, token, CKVocabEntry::token_len, CKTokenizer::unk_id, CKTokenizer::vocab_hash, CKTokenizer::vocab_hash_size, and CKTokenizer::vocab_size.
Referenced by run_inference().
| int32_t ck_tokenizer_lookup | ( | const CKTokenizer * | tok, |
| const char * | token, | ||
| int | len | ||
| ) |
Definition at line 227 of file ck_tokenizer.c.
References hash_string(), CKVocabEntry::next, token, CKTokenizer::unk_id, CKTokenizer::vocab_hash, and CKTokenizer::vocab_hash_size.
Referenced by ck_tokenizer_add_token(), ck_tokenizer_encode(), ck_tokenizer_load(), and main().
| int ck_tokenizer_lookup_merge | ( | const CKTokenizer * | tok, |
| int32_t | left, | ||
| int32_t | right | ||
| ) |
Definition at line 276 of file ck_tokenizer.c.
References hash_pair(), CKMergeRule::left, left, CKTokenizer::merge_hash, CKTokenizer::merge_hash_size, CKTokenizer::merges, CKMergeRule::right, and right.
Referenced by ck_tokenizer_encode().
| static uint32_t hash_pair | ( | int32_t | left, | int32_t | right | ) |
Definition at line 133 of file ck_tokenizer.c.
Referenced by ck_tokenizer_add_merge(), and ck_tokenizer_lookup_merge().
| static uint32_t hash_string | ( | const char * | s, | int | len | ) |
Definition at line 123 of file ck_tokenizer.c.
Referenced by ck_tokenizer_add_token(), ck_tokenizer_load(), and ck_tokenizer_lookup().
| static int json_match_char | ( | JSONParser * | p, | char | c | ) |
Definition at line 302 of file ck_tokenizer.c.
References json_skip_whitespace().
Referenced by ck_tokenizer_load().
| static int json_parse_int | ( | JSONParser * | p, | int * | out | ) |
Definition at line 363 of file ck_tokenizer.c.
References json_skip_whitespace().
Referenced by ck_tokenizer_load().
| static int json_parse_string | ( | JSONParser * | p, | char * | buf, | int | max_len | ) |
Definition at line 311 of file ck_tokenizer.c.
References json_skip_whitespace(), and max_len.
Referenced by ck_tokenizer_load(), and json_skip_value().
| static void json_skip_value | ( | JSONParser * | p | ) |
Definition at line 385 of file ck_tokenizer.c.
References json_parse_string(), and json_skip_whitespace().
Referenced by ck_tokenizer_load().
| static void json_skip_whitespace | ( | JSONParser * | p | ) |
Definition at line 296 of file ck_tokenizer.c.
Referenced by ck_tokenizer_load(), json_match_char(), json_parse_int(), json_parse_string(), and json_skip_value().
| static CKPoolBlock * pool_new_block | ( | size_t | capacity | ) |
Definition at line 55 of file ck_tokenizer.c.
References CKPoolBlock::capacity, CKPoolBlock::data, CKPoolBlock::next, and CKPoolBlock::used.
Referenced by ck_pool_alloc().