#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include "tokenizer/hash_table.h"

Data Structures
struct	CKBPEConfig

Macros
#define	CK_TRUE_BPE_API __attribute__((visibility("default")))

Enumerations
enum	CKSpacePrefixStyle { CK_SPACE_PREFIX_AUTO = 0 , CK_SPACE_PREFIX_GPT2 = 1 , CK_SPACE_PREFIX_SPM = 2 , CK_SPACE_PREFIX_AUTO = 0 , CK_SPACE_PREFIX_GPT2 = 1 , CK_SPACE_PREFIX_SPM = 2 }

Functions
	__attribute__ ((visibility("default"))) CKTrueBPE *ck_true_bpe_create(void)

Variables
int32_t int32_t	bos

const CKBPEConfig *	config

int32_t int32_t int32_t	eos

const char int32_t	id

const char int int32_t *	ids

const char *	left

int32_t	left_id

const char int int32_t int	max_ids

const int32_t int char int	max_len

int32_t int32_t int32_t	merged_id

int const int32_t const char int const int32_t *	merges

const int32_t int	num_ids

int const int32_t const char int	num_merges

int const int32_t *	offsets

int32_t int32_t int32_t int32_t	pad

int32_t int32_t int32_t int32_t	priority

const char const char *	right

int32_t int32_t	right_id

const char int32_t float	score

int const int32_t const char *	strings

const char *	text

const char int	text_len

const char *	token

int32_t	unk

int	vocab_size

Macro Definition Documentation

◆ CK_TRUE_BPE_API

#define CK_TRUE_BPE_API __attribute__((visibility("default")))

Definition at line 37 of file true_bpe.h.

Enumeration Type Documentation

◆ CKSpacePrefixStyle

enum CKSpacePrefixStyle

Enumerator
CK_SPACE_PREFIX_AUTO
CK_SPACE_PREFIX_GPT2
CK_SPACE_PREFIX_SPM
CK_SPACE_PREFIX_AUTO
CK_SPACE_PREFIX_GPT2
CK_SPACE_PREFIX_SPM

Definition at line 45 of file true_bpe.h.

              {
     CK_SPACE_PREFIX_AUTO = 0,    /* Auto-detect from vocabulary */
     CK_SPACE_PREFIX_GPT2 = 1,    /* GPT-2 style: Ġ (U+0120, bytes 0xC4 0xA0) */
     CK_SPACE_PREFIX_SPM = 2      /* SentencePiece style: ▁ (U+2581, bytes 0xE2 0x96 0x81) */
 } CKSpacePrefixStyle;

Function Documentation

◆ attribute()

__attribute__ ( (visibility("default")) )

Create a new True BPE tokenizer.

Returns: Newly allocated tokenizer, or NULL on error

Free a True BPE tokenizer.

Parameters

bpe	Tokenizer to free

Add a token to the vocabulary.

Parameters

bpe	Tokenizer
token	Token string (UTF-8)
id	Token ID
score	Token score (for unigram models, 0.0 for BPE)

Returns: 0 on success, -1 on error

Add a BPE merge rule by token IDs.

Merge rules define how tokens are combined during encoding. Rules with lower priority numbers are applied first.

Parameters

bpe	Tokenizer
left_id	Left token ID
right_id	Right token ID
merged_id	Resulting merged token ID
priority	Merge priority (lower = applied first)

Returns: 0 on success, -1 on error

Add a BPE merge rule by token strings.

This looks up the token IDs automatically and determines the merged token. The merged token must already exist in the vocabulary.

Parameters

bpe	Tokenizer
left	Left token string
right	Right token string
priority	Merge priority (lower = applied first)

Returns: 0 on success, -1 on error

Set special token IDs.

Parameters

bpe	Tokenizer
unk	Unknown token ID (-1 to disable)
bos	Beginning-of-sequence token ID (-1 to disable)
eos	End-of-sequence token ID (-1 to disable)
pad	Padding token ID (-1 to disable)

Add a special token that should be matched BEFORE BPE encoding.

Parameters

bpe	Tokenizer
token	Token string to match literally (e.g., "<\|im_end\|>")
id	Token ID to output when matched

Returns: 0 on success, -1 on error

Set tokenizer configuration.

Parameters

bpe	Tokenizer
config	Configuration to apply

Load vocabulary + merges from binary buffers.

Parameters

bpe	Tokenizer
vocab_size	Number of tokens
offsets	Offsets array (length vocab_size)
strings	Null-terminated token strings blob
num_merges	Number of merge rules
merges	Merge triples [left_id, right_id, merged_id] (length num_merges*3)

Returns: 0 on success, -1 on error

Look up a token ID by string.

Parameters

bpe	Tokenizer
token	Token string

Returns: Token ID, or unk_id if not found

Get a token string by ID.

Parameters

bpe	Tokenizer
id	Token ID

Returns: Token string, or NULL if invalid

Get vocabulary size.

Parameters

bpe Tokenizer

Returns: Number of tokens in vocabulary

Get number of merge rules.

Parameters

bpe Tokenizer

Returns: Number of merge rules

Auto-detect space prefix style from vocabulary.

Counts tokens starting with Ġ (GPT-2) vs ▁ (SentencePiece) to determine style. The detected style is cached in the config.

Parameters

bpe Tokenizer

Returns: Detected style (GPT2 or SPM)

Encode text to token IDs using true BPE algorithm.

This applies merge rules in priority order (not greedy longest-match).

Parameters

bpe	Tokenizer
text	Input text (UTF-8)
text_len	Text length in bytes, or -1 for null-terminated
ids	Output token IDs array
max_ids	Maximum IDs to write

Returns: Number of tokens written

Decode token IDs to text.

Parameters

bpe	Tokenizer
ids	Input token IDs
num_ids	Number of IDs
text	Output text buffer
max_len	Maximum text length

Returns: Number of bytes written (excluding null terminator)

Variable Documentation

◆ bos

int32_t int32_t bos

Definition at line 145 of file true_bpe.h.

◆ config

const CKBPEConfig* config

Definition at line 171 of file true_bpe.h.

Referenced by ck_tokenizer_encode(), and ck_true_bpe_set_config().

◆ eos

int32_t int32_t int32_t eos

Definition at line 146 of file true_bpe.h.

◆ id

int32_t id

Definition at line 95 of file true_bpe.h.

◆ ids

const int32_t* ids

Definition at line 263 of file true_bpe.h.

◆ left

const char* left

Definition at line 130 of file true_bpe.h.

Referenced by ck_tokenizer_add_merge(), ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_load_binary(), ck_tokenizer_lookup_merge(), ck_true_bpe_add_merge_by_tokens(), ck_true_bpe_load_binary(), and hash_pair().

◆ left_id

int32_t left_id

Definition at line 112 of file true_bpe.h.

Referenced by ck_true_bpe_add_merge(), find_best_merge(), merge_key(), and merge_table_lookup().

◆ max_ids

const char int int32_t int max_ids

Definition at line 264 of file true_bpe.h.

Referenced by ck_tokenizer_encode(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), ck_true_bpe_encode(), encode_chunk(), encode_text_segment(), main(), spm_encode_byte_fallback(), and spm_llama_resegment_node().

◆ max_len

const int32_t int char int max_len

Definition at line 280 of file true_bpe.h.

Referenced by ck_tokenizer_decode(), ck_true_bpe_decode(), find_longest_match_hash(), json_parse_string(), spm_count_unknown_run(), and spm_find_candidates_at_pos().

◆ merged_id

int32_t int32_t int32_t merged_id

Definition at line 114 of file true_bpe.h.

Referenced by ck_tokenizer_load(), ck_true_bpe_add_merge(), ck_true_bpe_add_merge_by_tokens(), and token_list_merge_at().

◆ merges

int const int32_t const char int const int32_t* merges

Definition at line 189 of file true_bpe.h.

Referenced by ck_tokenizer_load_binary(), ck_tokenizer_load_binary_with_scores(), ck_true_bpe_load_binary(), and main().

◆ num_ids

const int32_t int num_ids

Definition at line 278 of file true_bpe.h.

◆ num_merges

int const int32_t const char int num_merges

Definition at line 188 of file true_bpe.h.

Referenced by ck_tokenizer_load_binary(), ck_tokenizer_load_binary_with_scores(), ck_true_bpe_load_binary(), main(), and run_inference().

◆ offsets

int const int32_t* offsets

Definition at line 186 of file true_bpe.h.

Referenced by ck_tokenizer_load_binary(), ck_tokenizer_load_binary_with_scores(), ck_true_bpe_load_binary(), main(), and spm_build_byte_lookup().

◆ pad

int32_t int32_t int32_t int32_t pad

Definition at line 147 of file true_bpe.h.

◆ priority

const char const char int32_t priority

Definition at line 115 of file true_bpe.h.

Referenced by ck_tokenizer_add_merge(), ck_trie_insert(), ck_true_bpe_add_merge(), and ck_true_bpe_add_merge_by_tokens().

◆ right

const char const char* right

Definition at line 131 of file true_bpe.h.

Referenced by ck_tokenizer_add_merge(), ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_load_binary(), ck_tokenizer_lookup_merge(), ck_true_bpe_add_merge_by_tokens(), ck_true_bpe_load_binary(), and hash_pair().

◆ right_id

int32_t int32_t right_id

Definition at line 113 of file true_bpe.h.

Referenced by ck_true_bpe_add_merge(), find_best_merge(), merge_key(), and merge_table_lookup().

◆ score

const char int32_t float score

Definition at line 96 of file true_bpe.h.

◆ strings

int const int32_t const char* strings

Definition at line 187 of file true_bpe.h.

Referenced by ck_tokenizer_load_binary(), ck_tokenizer_load_binary_with_scores(), ck_true_bpe_load_binary(), main(), and spm_build_byte_lookup().

◆ text

const int32_t int char* text

Definition at line 261 of file true_bpe.h.

◆ text_len

◆ token

const char* token

Definition at line 94 of file true_bpe.h.

◆ unk

int32_t unk

Definition at line 144 of file true_bpe.h.

Data Structures

Macros

Enumerations

Functions

Variables

Macro Definition Documentation

◆ CK_TRUE_BPE_API

Enumeration Type Documentation

◆ CKSpacePrefixStyle

Function Documentation

◆ __attribute__()

Variable Documentation

◆ bos

◆ config

◆ eos

◆ id

◆ ids

◆ left

◆ left_id

◆ max_ids

◆ max_len

◆ merged_id

◆ merges

◆ num_ids

◆ num_merges

◆ offsets

◆ pad

◆ priority

◆ right

◆ right_id

◆ score

◆ strings

◆ text

◆ text_len

◆ token

◆ unk

◆ vocab_size

◆ attribute()