#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <limits.h>
#include "tokenizer/true_bpe.h"

Go to the source code of this file.
Macros | |
| #define | INITIAL_TOKEN_CAPACITY 256 /* Initial capacity for token list */ |
| #define | MAX_SPECIAL_TOKENS 32 /* Maximum number of special tokens */ |
| #define | MAX_TOKEN_LEN 128 /* Maximum length of a single token string */ |
| #define | MERGE_HASH_SIZE 65536 /* Size of merge lookup hash table */ |
Enumerations | |
| enum | ChunkType { CHUNK_WORD , CHUNK_NUMBER , CHUNK_WHITESPACE , CHUNK_OTHER } |
Functions | |
| static int | apply_bpe_merges (CKTrueBPE *bpe, CKBPETokenList *list) |
| static int | byte_to_gpt2 (unsigned char byte, char *out) |
| int | ck_true_bpe_add_merge (CKTrueBPE *bpe, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority) |
| int | ck_true_bpe_add_merge_by_tokens (CKTrueBPE *bpe, const char *left, const char *right, int32_t priority) |
| int | ck_true_bpe_add_special_token (CKTrueBPE *bpe, const char *token, int32_t id) |
| int | ck_true_bpe_add_token (CKTrueBPE *bpe, const char *token, int32_t id, float score) |
| CKTrueBPE * | ck_true_bpe_create (void) |
| int | ck_true_bpe_decode (const CKTrueBPE *bpe, const int32_t *ids, int num_ids, char *text, int max_len) |
| CKSpacePrefixStyle | ck_true_bpe_detect_space_style (CKTrueBPE *bpe) |
| int | ck_true_bpe_encode (CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids) |
| void | ck_true_bpe_free (CKTrueBPE *bpe) |
| const char * | ck_true_bpe_id_to_token (const CKTrueBPE *bpe, int32_t id) |
| int | ck_true_bpe_load_binary (CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges) |
| int32_t | ck_true_bpe_lookup (const CKTrueBPE *bpe, const char *token) |
| int32_t | ck_true_bpe_num_merges (const CKTrueBPE *bpe) |
| void | ck_true_bpe_set_config (CKTrueBPE *bpe, const CKBPEConfig *config) |
| void | ck_true_bpe_set_special_ids (CKTrueBPE *bpe, int32_t unk, int32_t bos, int32_t eos, int32_t pad) |
| size_t | ck_true_bpe_vocab_size (const CKTrueBPE *bpe) |
| static int | encode_chunk (CKTrueBPE *bpe, const char *chunk, int chunk_len, int32_t *ids, int max_ids, CKBPETokenList *list) |
| static int | encode_text_segment (CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids) |
| static int | find_best_merge (const CKTrueBPE *bpe, const CKBPETokenList *list, size_t *best_pos, const CKBPEMerge **best_merge) |
| static int | gpt2_decode_byte (const unsigned char *s, int len) |
| static int | gpt2_pretokenize (const char *text, int text_len, PretokChunk *chunks, int max_chunks) |
| static int | init_tokens_from_text (CKTrueBPE *bpe, CKBPETokenList *list, const char *text, int text_len) |
| static bool | is_bpe_digit (const char *s, int len) |
| static bool | is_bpe_letter (const char *s, int len) |
| static bool | is_bpe_newline (const char *s, int len) |
| static bool | is_bpe_punct (const char *s, int len) |
| static bool | is_digit (unsigned char c) |
| static bool | is_gpt2_space (const char *s, int len) |
| static bool | is_letter (unsigned char c) |
| static bool | is_whitespace (unsigned char c) |
| static bool | is_word_prefix_char (const char *s, int len) |
| static int | match_special_token (const CKTrueBPE *bpe, const char *text, int text_len, int pos) |
| static size_t | merge_hash (uint64_t key, size_t num_buckets) |
| static uint64_t | merge_key (int32_t left_id, int32_t right_id) |
| static CKMergeTable * | merge_table_create (size_t num_buckets) |
| static void | merge_table_free (CKMergeTable *table) |
| static int | merge_table_insert (CKMergeTable *table, const CKBPEMerge *merge) |
| static const CKBPEMerge * | merge_table_lookup (const CKMergeTable *table, int32_t left_id, int32_t right_id) |
| static int | preprocess_text (const CKTrueBPE *bpe, const char *text, int text_len, char *out, int out_max) |
| static int | token_list_append (CKBPETokenList *list, const char *str, size_t len, int32_t id) |
| static void | token_list_clear (CKBPETokenList *list) |
| static CKBPETokenList * | token_list_create (size_t initial_capacity) |
| static void | token_list_free (CKBPETokenList *list) |
| static int | token_list_merge_at (CKBPETokenList *list, size_t pos, const char *merged_str, size_t merged_len, int32_t merged_id) |
| static int | utf8_char_len (unsigned char c) |
| #define INITIAL_TOKEN_CAPACITY 256 /* Initial capacity for token list */ |
Definition at line 64 of file true_bpe.c.
| #define MAX_SPECIAL_TOKENS 32 /* Maximum number of special tokens */ |
Definition at line 108 of file true_bpe.c.
| #define MAX_TOKEN_LEN 128 /* Maximum length of a single token string */ |
Definition at line 65 of file true_bpe.c.
| #define MERGE_HASH_SIZE 65536 /* Size of merge lookup hash table */ |
Definition at line 63 of file true_bpe.c.
| enum ChunkType |
| Enumerator | |
|---|---|
| CHUNK_WORD | |
| CHUNK_NUMBER | |
| CHUNK_WHITESPACE | |
| CHUNK_OTHER | |
Definition at line 851 of file true_bpe.c.
| static int apply_bpe_merges (CKTrueBPE *bpe, CKBPETokenList *list) |
Definition at line 1173 of file true_bpe.c.
References find_best_merge(), MAX_TOKEN_LEN, and token_list_merge_at().
Referenced by encode_chunk().
| static int byte_to_gpt2 (unsigned char byte, char *out) |
| int ck_true_bpe_add_merge | ( | CKTrueBPE * | bpe, |
| int32_t | left_id, | ||
| int32_t | right_id, | ||
| int32_t | merged_id, | ||
| int32_t | priority | ||
| ) |
Definition at line 497 of file true_bpe.c.
References left_id, merge_table_insert(), merged_id, priority, and right_id.
Referenced by ck_true_bpe_add_merge_by_tokens(), and ck_true_bpe_load_binary().
| int ck_true_bpe_add_merge_by_tokens | ( | CKTrueBPE * | bpe, |
| const char * | left, | ||
| const char * | right, | ||
| int32_t | priority | ||
| ) |
Definition at line 514 of file true_bpe.c.
References ck_tokenizer_hash_table_lookup(), ck_true_bpe_add_merge(), left, merged_id, priority, and right.
| int ck_true_bpe_add_special_token | ( | CKTrueBPE * | bpe, |
| const char * | token, | ||
| int32_t | id | ||
| ) |
Definition at line 565 of file true_bpe.c.
References id, MAX_SPECIAL_TOKENS, and token.
Referenced by main().
| int ck_true_bpe_add_token | ( | CKTrueBPE * | bpe, |
| const char * | token, | ||
| int32_t | id, | ||
| float | score | ||
| ) |
Definition at line 449 of file true_bpe.c.
References ck_tokenizer_hash_table_insert(), ck_tokenizer_hash_table_lookup(), id, score, and token.
Referenced by ck_true_bpe_load_binary().
| CKTrueBPE* ck_true_bpe_create | ( | void | ) |
Definition at line 342 of file true_bpe.c.
References CK_SPACE_PREFIX_AUTO, ck_tokenizer_hash_table_create(), ck_tokenizer_hash_table_free(), CK_TOKENIZER_HT_BUCKETS_LARGE, MAX_SPECIAL_TOKENS, MERGE_HASH_SIZE, merge_table_create(), and merge_table_free().
Referenced by main(), and run_inference().
| int ck_true_bpe_decode | ( | const CKTrueBPE * | bpe, |
| const int32_t * | ids, | ||
| int | num_ids, | ||
| char * | text, | ||
| int | max_len | ||
| ) |
Definition at line 1439 of file true_bpe.c.
References ck_true_bpe_id_to_token(), gpt2_decode_byte(), ids, max_len, num_ids, text, and token.
| CKSpacePrefixStyle ck_true_bpe_detect_space_style | ( | CKTrueBPE * | bpe | ) |
Definition at line 654 of file true_bpe.c.
References CK_SPACE_PREFIX_AUTO, CK_SPACE_PREFIX_GPT2, CK_SPACE_PREFIX_SPM, and token.
Referenced by ck_true_bpe_encode().
| int ck_true_bpe_encode | ( | CKTrueBPE * | bpe, |
| const char * | text, | ||
| int | text_len, | ||
| int32_t * | ids, | ||
| int | max_ids | ||
| ) |
Definition at line 1338 of file true_bpe.c.
References CK_SPACE_PREFIX_AUTO, ck_true_bpe_detect_space_style(), encode_text_segment(), ids, match_special_token(), max_ids, text, and text_len.
Referenced by main(), run_inference(), and run_prompt().
| void ck_true_bpe_free | ( | CKTrueBPE * | bpe | ) |
Definition at line 405 of file true_bpe.c.
References ck_tokenizer_hash_table_free(), and merge_table_free().
Referenced by main(), and run_inference().
| const char* ck_true_bpe_id_to_token | ( | const CKTrueBPE * | bpe, |
| int32_t | id | ||
| ) |
Definition at line 645 of file true_bpe.c.
References id.
Referenced by ck_true_bpe_decode(), main(), run_inference(), and run_prompt().
| int ck_true_bpe_load_binary | ( | CKTrueBPE * | bpe, |
| int | vocab_size, | ||
| const int32_t * | offsets, | ||
| const char * | strings, | ||
| int | num_merges, | ||
| const int32_t * | merges | ||
| ) |
Definition at line 606 of file true_bpe.c.
References ck_true_bpe_add_merge(), ck_true_bpe_add_token(), left, merges, num_merges, offsets, right, strings, token, and vocab_size.
Referenced by main(), and run_inference().
| int32_t ck_true_bpe_lookup | ( | const CKTrueBPE * | bpe, |
| const char * | token | ||
| ) |
Definition at line 638 of file true_bpe.c.
References ck_tokenizer_hash_table_lookup(), and token.
Referenced by encode_chunk(), init_tokens_from_text(), and main().
| int32_t ck_true_bpe_num_merges | ( | const CKTrueBPE * | bpe | ) |
Definition at line 1510 of file true_bpe.c.
| void ck_true_bpe_set_config | ( | CKTrueBPE * | bpe, |
| const CKBPEConfig * | config | ||
| ) |
| void ck_true_bpe_set_special_ids | ( | CKTrueBPE * | bpe, |
| int32_t | unk, | ||
| int32_t | bos, | ||
| int32_t | eos, | ||
| int32_t | pad | ||
| ) |
Definition at line 552 of file true_bpe.c.
| size_t ck_true_bpe_vocab_size | ( | const CKTrueBPE * | bpe | ) |
Definition at line 1506 of file true_bpe.c.
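| static int encode_chunk (CKTrueBPE *bpe, const char *chunk, int chunk_len, int32_t *ids, int max_ids, CKBPETokenList *list) |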
Definition at line 1213 of file true_bpe.c.
References apply_bpe_merges(), ck_true_bpe_lookup(), id, ids, init_tokens_from_text(), and max_ids.
Referenced by encode_text_segment().
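| static int encode_text_segment (CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids) |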
Definition at line 1270 of file true_bpe.c.
References CK_SPACE_PREFIX_AUTO, CK_SPACE_PREFIX_GPT2, encode_chunk(), gpt2_pretokenize(), ids, INITIAL_TOKEN_CAPACITY, max_ids, preprocess_text(), start, text, text_len, token_list_create(), and token_list_free().
Referenced by ck_true_bpe_encode().
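| static int find_best_merge (const CKTrueBPE *bpe, const CKBPETokenList *list, size_t *best_pos, const CKBPEMerge **best_merge) |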
Definition at line 1149 of file true_bpe.c.
References left_id, merge_table_lookup(), and right_id.
Referenced by apply_bpe_merges().
| static int gpt2_decode_byte (const unsigned char *s, int len) |
| static int gpt2_pretokenize (const char *text, int text_len, PretokChunk *chunks, int max_chunks) |
Definition at line 915 of file true_bpe.c.
References CHUNK_NUMBER, CHUNK_OTHER, CHUNK_WHITESPACE, CHUNK_WORD, is_bpe_digit(), is_bpe_letter(), is_bpe_newline(), is_bpe_punct(), is_gpt2_space(), text, text_len, and utf8_char_len().
Referenced by encode_text_segment().
| static int init_tokens_from_text (CKTrueBPE *bpe, CKBPETokenList *list, const char *text, int text_len) |
Definition at line 1121 of file true_bpe.c.
References ck_true_bpe_lookup(), text, text_len, token_list_append(), token_list_clear(), and utf8_char_len().
Referenced by encode_chunk().
| static bool is_bpe_digit (const char *s, int len) |
| static bool is_bpe_letter (const char *s, int len) |
Definition at line 833 of file true_bpe.c.
References is_letter().
Referenced by gpt2_pretokenize().
| static bool is_bpe_newline (const char *s, int len) |
Definition at line 866 of file true_bpe.c.
Referenced by gpt2_pretokenize(), and is_word_prefix_char().
| static bool is_bpe_punct (const char *s, int len) |
Definition at line 884 of file true_bpe.c.
References is_digit(), is_gpt2_space(), and is_letter().
Referenced by gpt2_pretokenize().
| static bool is_digit (unsigned char c) |
Definition at line 819 of file true_bpe.c.
Referenced by is_bpe_digit(), is_bpe_punct(), and is_word_prefix_char().
| static bool is_gpt2_space (const char *s, int len) |
Definition at line 828 of file true_bpe.c.
Referenced by gpt2_pretokenize(), is_bpe_punct(), and is_word_prefix_char().
| static bool is_letter (unsigned char c) |
Definition at line 815 of file true_bpe.c.
Referenced by is_bpe_letter(), is_bpe_punct(), and is_word_prefix_char().
| static bool is_whitespace (unsigned char c) |
Definition at line 823 of file true_bpe.c.
| static bool is_word_prefix_char (const char *s, int len) |
Definition at line 871 of file true_bpe.c.
References is_bpe_newline(), is_digit(), is_gpt2_space(), and is_letter().
| static int match_special_token (const CKTrueBPE *bpe, const char *text, int text_len, int pos) |
Definition at line 1323 of file true_bpe.c.
References text, and text_len.
Referenced by ck_true_bpe_encode().
| static size_t merge_hash (uint64_t key, size_t num_buckets) |
Definition at line 157 of file true_bpe.c.
Referenced by merge_table_insert(), and merge_table_lookup().
| static uint64_t merge_key (int32_t left_id, int32_t right_id) |
Definition at line 153 of file true_bpe.c.
References left_id, and right_id.
Referenced by merge_table_insert(), and merge_table_lookup().
| static CKMergeTable * merge_table_create (size_t num_buckets) |
| static void merge_table_free (CKMergeTable *table) |
Definition at line 182 of file true_bpe.c.
Referenced by ck_true_bpe_create(), and ck_true_bpe_free().
| static int merge_table_insert (CKMergeTable *table, const CKBPEMerge *merge) |
Definition at line 198 of file true_bpe.c.
References merge_hash(), and merge_key().
Referenced by ck_true_bpe_add_merge().
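| static const CKBPEMerge * merge_table_lookup (const CKMergeTable *table, int32_t left_id, int32_t right_id) |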
Definition at line 226 of file true_bpe.c.
References left_id, merge_hash(), merge_key(), and right_id.
Referenced by find_best_merge().
| static int preprocess_text (const CKTrueBPE *bpe, const char *text, int text_len, char *out, int out_max) |
Definition at line 749 of file true_bpe.c.
References byte_to_gpt2(), CK_SPACE_PREFIX_SPM, out_len, style, text, and text_len.
Referenced by encode_text_segment().
| static int token_list_append (CKBPETokenList *list, const char *str, size_t len, int32_t id) |
| static void token_list_clear (CKBPETokenList *list) |
| static CKBPETokenList * token_list_create (size_t initial_capacity) |
| static void token_list_free (CKBPETokenList *list) |
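| static int token_list_merge_at (CKBPETokenList *list, size_t pos, const char *merged_str, size_t merged_len, int32_t merged_id) |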
| static int utf8_char_len (unsigned char c) |
Definition at line 790 of file true_bpe.c.
Referenced by gpt2_pretokenize(), and init_tokens_from_text().