#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <limits.h>
#include "tokenizer/true_bpe.h"

Macros
#define	INITIAL_TOKEN_CAPACITY 256 /* Initial capacity for token list */

#define	MAX_SPECIAL_TOKENS 32 /* Maximum number of special tokens */

#define	MAX_TOKEN_LEN 128 /* Maximum length of a single token string */

#define	MERGE_HASH_SIZE 65536 /* Size of merge lookup hash table */

Enumerations
enum	ChunkType { CHUNK_WORD , CHUNK_NUMBER , CHUNK_WHITESPACE , CHUNK_OTHER }

Functions
static int	apply_bpe_merges (CKTrueBPE *bpe, CKBPETokenList *list)

static int	byte_to_gpt2 (unsigned char byte, char *out)

int	ck_true_bpe_add_merge (CKTrueBPE *bpe, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority)

int	ck_true_bpe_add_merge_by_tokens (CKTrueBPE *bpe, const char *left, const char *right, int32_t priority)

int	ck_true_bpe_add_special_token (CKTrueBPE *bpe, const char *token, int32_t id)

int	ck_true_bpe_add_token (CKTrueBPE *bpe, const char *token, int32_t id, float score)

CKTrueBPE *	ck_true_bpe_create (void)

int	ck_true_bpe_decode (const CKTrueBPE *bpe, const int32_t *ids, int num_ids, char *text, int max_len)

CKSpacePrefixStyle	ck_true_bpe_detect_space_style (CKTrueBPE *bpe)

int	ck_true_bpe_encode (CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)

void	ck_true_bpe_free (CKTrueBPE *bpe)

const char *	ck_true_bpe_id_to_token (const CKTrueBPE *bpe, int32_t id)

int	ck_true_bpe_load_binary (CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)

int32_t	ck_true_bpe_lookup (const CKTrueBPE *bpe, const char *token)

int32_t	ck_true_bpe_num_merges (const CKTrueBPE *bpe)

void	ck_true_bpe_set_config (CKTrueBPE *bpe, const CKBPEConfig *config)

void	ck_true_bpe_set_special_ids (CKTrueBPE *bpe, int32_t unk, int32_t bos, int32_t eos, int32_t pad)

size_t	ck_true_bpe_vocab_size (const CKTrueBPE *bpe)

static int	encode_chunk (CKTrueBPE *bpe, const char *chunk, int chunk_len, int32_t *ids, int max_ids, CKBPETokenList *list)

static int	encode_text_segment (CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)

static int	find_best_merge (const CKTrueBPE *bpe, const CKBPETokenList *list, size_t *best_pos, const CKBPEMerge **best_merge)

static int	gpt2_decode_byte (const unsigned char *s, int len)

static int	gpt2_pretokenize (const char *text, int text_len, PretokChunk *chunks, int max_chunks)

static int	init_tokens_from_text (CKTrueBPE *bpe, CKBPETokenList *list, const char *text, int text_len)

static bool	is_bpe_digit (const char *s, int len)

static bool	is_bpe_letter (const char *s, int len)

static bool	is_bpe_newline (const char *s, int len)

static bool	is_bpe_punct (const char *s, int len)

static bool	is_digit (unsigned char c)

static bool	is_gpt2_space (const char *s, int len)

static bool	is_letter (unsigned char c)

static bool	is_whitespace (unsigned char c)

static bool	is_word_prefix_char (const char *s, int len)

static int	match_special_token (const CKTrueBPE *bpe, const char *text, int text_len, int pos)

static size_t	merge_hash (uint64_t key, size_t num_buckets)

static uint64_t	merge_key (int32_t left_id, int32_t right_id)

static CKMergeTable *	merge_table_create (size_t num_buckets)

static void	merge_table_free (CKMergeTable *table)

static int	merge_table_insert (CKMergeTable *table, const CKBPEMerge *merge)

static const CKBPEMerge *	merge_table_lookup (const CKMergeTable *table, int32_t left_id, int32_t right_id)

static int	preprocess_text (const CKTrueBPE *bpe, const char *text, int text_len, char *out, int out_max)

static int	token_list_append (CKBPETokenList *list, const char *str, size_t len, int32_t id)

static void	token_list_clear (CKBPETokenList *list)

static CKBPETokenList *	token_list_create (size_t initial_capacity)

static void	token_list_free (CKBPETokenList *list)

static int	token_list_merge_at (CKBPETokenList *list, size_t pos, const char *merged_str, size_t merged_len, int32_t merged_id)

static int	utf8_char_len (unsigned char c)

Macro Definition Documentation

◆ INITIAL_TOKEN_CAPACITY

#define INITIAL_TOKEN_CAPACITY 256 /* Initial capacity for token list */

Definition at line 64 of file true_bpe.c.

◆ MAX_SPECIAL_TOKENS

#define MAX_SPECIAL_TOKENS 32 /* Maximum number of special tokens */

Definition at line 108 of file true_bpe.c.

◆ MAX_TOKEN_LEN

#define MAX_TOKEN_LEN 128 /* Maximum length of a single token string */

Definition at line 65 of file true_bpe.c.

◆ MERGE_HASH_SIZE

#define MERGE_HASH_SIZE 65536 /* Size of merge lookup hash table */

Definition at line 63 of file true_bpe.c.

Enumeration Type Documentation

◆ ChunkType

enum ChunkType

Enumerator
CHUNK_WORD
CHUNK_NUMBER
CHUNK_WHITESPACE
CHUNK_OTHER

Definition at line 851 of file true_bpe.c.

              {
     /* Chunk categories; the note on each enumerator says what kind of text it covers. */
     CHUNK_WORD,        /* Letters (with optional leading space) */
     CHUNK_NUMBER,      /* Digits */
     CHUNK_WHITESPACE,  /* Whitespace (not attached to word) */
     CHUNK_OTHER        /* Punctuation, etc. */
 } ChunkType;

Function Documentation

◆ apply_bpe_merges()

static int apply_bpe_merges	(	CKTrueBPE *	bpe,
		CKBPETokenList *	list
	)

static

Definition at line 1173 of file true_bpe.c.

                                                                   {
     char merged_buf[MAX_TOKEN_LEN * 2];
  
     while (list->count > 1) {
         size_t best_pos;
         const CKBPEMerge *best_merge;
  
         if (find_best_merge(bpe, list, &best_pos, &best_merge) != 0) {
             break;  /* No more merges possible */
         }
  
         /* Get merged token string */
         const char *merged_str = bpe->id_to_token[best_merge->merged_id];
         if (!merged_str) {
             /* Construct from left + right */
             size_t left_len = list->tokens[best_pos].len;
             size_t right_len = list->tokens[best_pos + 1].len;
  
             if (left_len + right_len >= sizeof(merged_buf)) {
                 break;  /* Too long */
             }
  
             memcpy(merged_buf, list->tokens[best_pos].str, left_len);
             memcpy(merged_buf + left_len, list->tokens[best_pos + 1].str, right_len);
             merged_buf[left_len + right_len] = '\0';
             merged_str = merged_buf;
         }
  
         /* Apply the merge */
         if (token_list_merge_at(list, best_pos, merged_str, strlen(merged_str), best_merge->merged_id) != 0) {
             break;
         }
     }
  
     return 0;
 }

References find_best_merge(), MAX_TOKEN_LEN, and token_list_merge_at().

Referenced by encode_chunk().

◆ byte_to_gpt2()

static int byte_to_gpt2	(	unsigned char	byte,
		char *	out
	)

static

Definition at line 705 of file true_bpe.c.

                                                        {
     if (byte >= 0x21 && byte <= 0x7E && byte != '!') {
         /* Printable ASCII (except control chars) stays as-is */
         out[0] = (char)byte;
         return 1;
     }
  
     /* Control chars and special bytes map to U+0100 + byte */
     /* This gives us characters like Ġ (U+0120), Ċ (U+010A), etc. */
     unsigned int codepoint;
  
     /* GPT-2's byte_encoder mapping */
     if (byte == '!') codepoint = byte;
     else if (byte == '"') codepoint = byte;
     else if (byte >= '#' && byte <= '~') codepoint = byte;
     else if (byte == 0x21) codepoint = '!';  /* Already handled above */
     else {
         /* Map 0x00-0x20, 0x7F-0xFF to offset range */
         if (byte <= 0x20) {
             codepoint = 0x100 + byte;  /* 0x00-0x20 → U+0100-U+0120 */
         } else if (byte >= 0x7F && byte <= 0xA0) {
             codepoint = 0x100 + byte;  /* 0x7F-0xA0 → U+017F-U+01A0 */
         } else {
             codepoint = byte;  /* Others: 0xA1-0xFF stay as-is */
         }
     }
  
     /* Encode as UTF-8 */
     if (codepoint < 0x80) {
         out[0] = (char)codepoint;
         return 1;
     } else if (codepoint < 0x800) {
         out[0] = (char)(0xC0 | (codepoint >> 6));
         out[1] = (char)(0x80 | (codepoint & 0x3F));
         return 2;
     } else {
         out[0] = (char)(0xE0 | (codepoint >> 12));
         out[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
         out[2] = (char)(0x80 | (codepoint & 0x3F));
         return 3;
     }
 }

Referenced by preprocess_text().

◆ ck_true_bpe_add_merge()

int ck_true_bpe_add_merge	(	CKTrueBPE *	bpe,
		int32_t	left_id,
		int32_t	right_id,
		int32_t	merged_id,
		int32_t	priority
	)

Definition at line 497 of file true_bpe.c.

                                                                                                                   {
     if (!bpe) return -1;
  
     CKBPEMerge merge = {
         .left_id = left_id,
         .right_id = right_id,
         .merged_id = merged_id,
         .priority = priority
     };
  
     int ret = merge_table_insert(bpe->merges, &merge);
     if (ret == 0) {
         bpe->num_merges++;
     }
     return ret;
 }

References left_id, merge_table_insert(), merged_id, priority, and right_id.

Referenced by ck_true_bpe_add_merge_by_tokens(), and ck_true_bpe_load_binary().

◆ ck_true_bpe_add_merge_by_tokens()

int ck_true_bpe_add_merge_by_tokens	(	CKTrueBPE *	bpe,
		const char *	left,
		const char *	right,
		int32_t	priority
	)

Definition at line 514 of file true_bpe.c.

                                                                                                            {
     if (!bpe || !left || !right) return -1;
  
     /* Look up token IDs */
     BPETokenInfo *left_info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, left);
     BPETokenInfo *right_info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, right);
  
     if (!left_info || !right_info) {
         return -1;  /* Tokens not in vocabulary */
     }
  
     /* Create merged token string */
     size_t left_len = strlen(left);
     size_t right_len = strlen(right);
     size_t merged_len = left_len + right_len;
  
     if (merged_len >= bpe->str_buffer_size) {
         return -1;  /* Too long */
     }
  
     memcpy(bpe->str_buffer, left, left_len);
     memcpy(bpe->str_buffer + left_len, right, right_len);
     bpe->str_buffer[merged_len] = '\0';
  
     /* Look up or create merged token */
     BPETokenInfo *merged_info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, bpe->str_buffer);
     int32_t merged_id;
  
     if (merged_info) {
         merged_id = merged_info->id;
     } else {
         /* Merged token should already exist in vocabulary */
         return -1;
     }
  
     return ck_true_bpe_add_merge(bpe, left_info->id, right_info->id, merged_id, priority);
 }

References ck_tokenizer_hash_table_lookup(), ck_true_bpe_add_merge(), left, merged_id, priority, and right.

◆ ck_true_bpe_add_special_token()

int ck_true_bpe_add_special_token	(	CKTrueBPE *	bpe,
		const char *	token,
		int32_t	id
	)

Definition at line 565 of file true_bpe.c.

                                                                                  {
     if (!bpe || !token || id < 0) return -1;
     if (bpe->num_special_tokens >= MAX_SPECIAL_TOKENS) return -1;
  
     int token_len = (int)strlen(token);
     if (token_len == 0) return -1;
  
     /* Check if already exists */
     for (int i = 0; i < bpe->num_special_tokens; i++) {
         if (bpe->special_tokens[i].token &&
             strcmp(bpe->special_tokens[i].token, token) == 0) {
             /* Update ID for existing token */
             bpe->special_tokens[i].id = id;
             return 0;
         }
     }
  
     /* Find insertion point (keep sorted by length, longest first) */
     int insert_idx = bpe->num_special_tokens;
     for (int i = 0; i < bpe->num_special_tokens; i++) {
         if (token_len > bpe->special_tokens[i].len) {
             insert_idx = i;
             break;
         }
     }
  
     /* Shift existing entries down */
     for (int i = bpe->num_special_tokens; i > insert_idx; i--) {
         bpe->special_tokens[i] = bpe->special_tokens[i - 1];
     }
  
     /* Insert new entry */
     bpe->special_tokens[insert_idx].token = strdup(token);
     if (!bpe->special_tokens[insert_idx].token) return -1;
     bpe->special_tokens[insert_idx].id = id;
     bpe->special_tokens[insert_idx].len = token_len;
     bpe->num_special_tokens++;
  
     return 0;
 }

References id, MAX_SPECIAL_TOKENS, and token.

Referenced by main().

◆ ck_true_bpe_add_token()

int ck_true_bpe_add_token	(	CKTrueBPE *	bpe,
		const char *	token,
		int32_t	id,
		float	score
	)

Definition at line 449 of file true_bpe.c.

                                                                                       {
     if (!bpe || !token) return -1;
  
     /* Ensure reverse vocab has space */
     if (id >= (int32_t)bpe->vocab_capacity) {
         size_t new_cap = bpe->vocab_capacity * 2;
         while (new_cap <= (size_t)id) new_cap *= 2;
  
         char **new_array = (char **)realloc(bpe->id_to_token, new_cap * sizeof(char *));
         if (!new_array) return -1;
  
         memset(new_array + bpe->vocab_capacity, 0, (new_cap - bpe->vocab_capacity) * sizeof(char *));
         bpe->id_to_token = new_array;
         bpe->vocab_capacity = new_cap;
     }
  
     /* Check if token exists */
     BPETokenInfo *existing = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, token);
     if (existing) {
         existing->id = id;
         existing->score = score;
         if (bpe->id_to_token[id]) free(bpe->id_to_token[id]);
         bpe->id_to_token[id] = strdup(token);
         return 0;
     }
  
     /* Create new token info */
     BPETokenInfo *info = (BPETokenInfo *)malloc(sizeof(BPETokenInfo));
     if (!info) return -1;
  
     info->id = id;
     info->score = score;
  
     if (ck_tokenizer_hash_table_insert(bpe->vocab, token, info) != 0) {
         free(info);
         return -1;
     }
  
     if (id >= (int32_t)bpe->vocab_size) {
         bpe->vocab_size = id + 1;
     }
  
     if (bpe->id_to_token[id]) free(bpe->id_to_token[id]);
     bpe->id_to_token[id] = strdup(token);
  
     return 0;
 }

References ck_tokenizer_hash_table_insert(), ck_tokenizer_hash_table_lookup(), id, score, and token.

Referenced by ck_true_bpe_load_binary().

◆ ck_true_bpe_create()

CKTrueBPE* ck_true_bpe_create ( void )

Definition at line 342 of file true_bpe.c.

                                     {
     CKTrueBPE *bpe = (CKTrueBPE *)calloc(1, sizeof(CKTrueBPE));
     if (!bpe) return NULL;
  
     /* Create vocabulary hash table */
     bpe->vocab = ck_tokenizer_hash_table_create(CK_TOKENIZER_HT_BUCKETS_LARGE);
     if (!bpe->vocab) {
         free(bpe);
         return NULL;
     }
  
     /* Create merge table */
     bpe->merges = merge_table_create(MERGE_HASH_SIZE);
     if (!bpe->merges) {
         ck_tokenizer_hash_table_free(bpe->vocab, true);
         free(bpe);
         return NULL;
     }
  
     /* Initialize reverse vocabulary */
     bpe->vocab_capacity = 4096;
     bpe->id_to_token = (char **)calloc(bpe->vocab_capacity, sizeof(char *));
     if (!bpe->id_to_token) {
         merge_table_free(bpe->merges);
         ck_tokenizer_hash_table_free(bpe->vocab, true);
         free(bpe);
         return NULL;
     }
  
     /* String buffer for token operations */
     bpe->str_buffer_size = 4096;
     bpe->str_buffer = (char *)malloc(bpe->str_buffer_size);
     if (!bpe->str_buffer) {
         free(bpe->id_to_token);
         merge_table_free(bpe->merges);
         ck_tokenizer_hash_table_free(bpe->vocab, true);
         free(bpe);
         return NULL;
     }
  
     /* Default special token IDs */
     bpe->unk_id = 0;
     bpe->bos_id = -1;
     bpe->eos_id = -1;
     bpe->pad_id = -1;
  
     /* Initialize special tokens array */
     bpe->num_special_tokens = 0;
     for (int i = 0; i < MAX_SPECIAL_TOKENS; i++) {
         bpe->special_tokens[i].token = NULL;
         bpe->special_tokens[i].id = -1;
         bpe->special_tokens[i].len = 0;
     }
  
     /* Default config */
     bpe->config.add_bos = false;
     bpe->config.add_eos = false;
     bpe->config.byte_fallback = true;
     bpe->config.space_prefix_style = CK_SPACE_PREFIX_AUTO;
  
     return bpe;
 }

References CK_SPACE_PREFIX_AUTO, ck_tokenizer_hash_table_create(), ck_tokenizer_hash_table_free(), CK_TOKENIZER_HT_BUCKETS_LARGE, MAX_SPECIAL_TOKENS, MERGE_HASH_SIZE, merge_table_create(), and merge_table_free().

Referenced by main(), and run_inference().

◆ ck_true_bpe_decode()

int ck_true_bpe_decode	(	const CKTrueBPE *	bpe,
		const int32_t *	ids,
		int	num_ids,
		char *	text,
		int	max_len
	)

Definition at line 1439 of file true_bpe.c.

                                                                                                        {
     if (!bpe || !ids || !text || max_len <= 0) return 0;
  
     int len = 0;
     for (int i = 0; i < num_ids && len < max_len - 1; i++) {
         int32_t id = ids[i];
         if (id < 0) continue;
  
         /* Skip special tokens */
         if (id == bpe->bos_id || id == bpe->eos_id || id == bpe->pad_id) {
             continue;
         }
  
         const char *token = ck_true_bpe_id_to_token(bpe, id);
         if (!token) continue;
  
         int token_len = (int)strlen(token);
  
         /* Check for SentencePiece space marker ▁ (U+2581) at start */
         if (token_len >= 3 &&
             (unsigned char)token[0] == 0xE2 &&
             (unsigned char)token[1] == 0x96 &&
             (unsigned char)token[2] == 0x81) {
             /* ▁ -> space */
             if (len < max_len - 1) text[len++] = ' ';
             token += 3;
             token_len -= 3;
         }
  
         /* Process rest of token, decoding GPT-2 byte-level encoding */
         int pos = 0;
         while (pos < token_len && len < max_len - 1) {
             unsigned char c0 = (unsigned char)token[pos];
  
             /* Try GPT-2 byte decoding for 2-byte UTF-8 sequences */
             if (pos + 1 < token_len && (c0 & 0xE0) == 0xC0) {
                 int decoded = gpt2_decode_byte((unsigned char*)token + pos, token_len - pos);
                 if (decoded >= 0) {
                     text[len++] = (char)decoded;
                     pos += 2;
                     continue;
                 }
             }
  
             /* Regular character - copy as-is */
             int char_len = 1;
             if ((c0 & 0x80) == 0) char_len = 1;       /* ASCII */
             else if ((c0 & 0xE0) == 0xC0) char_len = 2;
             else if ((c0 & 0xF0) == 0xE0) char_len = 3;
             else if ((c0 & 0xF8) == 0xF0) char_len = 4;
  
             /* Copy the character */
             for (int j = 0; j < char_len && pos + j < token_len && len < max_len - 1; j++) {
                 text[len++] = token[pos + j];
             }
             pos += char_len;
         }
     }
  
     text[len] = '\0';
     return len;
 }

References ck_true_bpe_id_to_token(), gpt2_decode_byte(), ids, max_len, num_ids, text, and token.

◆ ck_true_bpe_detect_space_style()

CKSpacePrefixStyle ck_true_bpe_detect_space_style ( CKTrueBPE * bpe )

Definition at line 654 of file true_bpe.c.

                                                                   {
     if (!bpe) return CK_SPACE_PREFIX_GPT2;
  
     if (bpe->config.space_prefix_style != CK_SPACE_PREFIX_AUTO) {
         return bpe->config.space_prefix_style;
     }
  
     /* Count tokens starting with each style */
     int gpt2_count = 0;  /* Ġ (0xC4 0xA0) */
     int spm_count = 0;   /* ▁ (0xE2 0x96 0x81) */
  
     for (size_t i = 0; i < bpe->vocab_size && i < 10000; i++) {
         const char *token = bpe->id_to_token[i];
         if (!token) continue;
  
         unsigned char c0 = (unsigned char)token[0];
         unsigned char c1 = (unsigned char)token[1];
  
         if (c0 == 0xC4 && c1 == 0xA0) {
             gpt2_count++;
         } else if (c0 == 0xE2 && c1 == 0x96 && (unsigned char)token[2] == 0x81) {
             spm_count++;
         }
     }
  
     CKSpacePrefixStyle detected = (spm_count > gpt2_count * 2) ? CK_SPACE_PREFIX_SPM : CK_SPACE_PREFIX_GPT2;
     bpe->config.space_prefix_style = detected;
  
     return detected;
 }

References CK_SPACE_PREFIX_AUTO, CK_SPACE_PREFIX_GPT2, CK_SPACE_PREFIX_SPM, and token.

Referenced by ck_true_bpe_encode().

◆ ck_true_bpe_encode()

int ck_true_bpe_encode	(	CKTrueBPE *	bpe,
		const char *	text,
		int	text_len,
		int32_t *	ids,
		int	max_ids
	)

Definition at line 1338 of file true_bpe.c.

                                                                                                   {
     if (!bpe || !text || !ids || max_ids <= 0) return 0;
     if (text_len < 0) text_len = (int)strlen(text);
     if (text_len == 0) return 0;
  
     /* Auto-detect space style if needed */
     if (bpe->config.space_prefix_style == CK_SPACE_PREFIX_AUTO) {
         ck_true_bpe_detect_space_style(bpe);
     }
  
     int out_idx = 0;
  
     /* Add BOS token if configured */
     if (bpe->config.add_bos && bpe->bos_id >= 0 && out_idx < max_ids) {
         ids[out_idx++] = bpe->bos_id;
     }
  
     /* If no special tokens registered, use fast path */
     if (bpe->num_special_tokens == 0) {
         out_idx += encode_text_segment(bpe, text, text_len, ids + out_idx, max_ids - out_idx);
     } else {
         /* Scan for special tokens and encode segments between them */
         int pos = 0;
         int segment_start = 0;
  
         while (pos < text_len && out_idx < max_ids) {
             int match = match_special_token(bpe, text, text_len, pos);
  
             if (match >= 0) {
                 /* Found special token - first encode any text before it */
                 if (pos > segment_start) {
                     int seg_len = pos - segment_start;
                     out_idx += encode_text_segment(bpe, text + segment_start, seg_len,
                                                    ids + out_idx, max_ids - out_idx);
                 }
  
                 /* Output the special token ID */
                 if (out_idx < max_ids) {
                     ids[out_idx++] = bpe->special_tokens[match].id;
                 }
  
                 /* Advance past the special token */
                 pos += bpe->special_tokens[match].len;
                 segment_start = pos;
             } else {
                 /* No special token here, advance to next character */
                 pos++;
             }
         }
  
         /* Encode any remaining text after last special token */
         if (segment_start < text_len && out_idx < max_ids) {
             out_idx += encode_text_segment(bpe, text + segment_start, text_len - segment_start,
                                            ids + out_idx, max_ids - out_idx);
         }
     }
  
     /* Add EOS token if configured */
     if (bpe->config.add_eos && bpe->eos_id >= 0 && out_idx < max_ids) {
         ids[out_idx++] = bpe->eos_id;
     }
  
     return out_idx;
 }

References CK_SPACE_PREFIX_AUTO, ck_true_bpe_detect_space_style(), encode_text_segment(), ids, match_special_token(), max_ids, text, and text_len.

Referenced by main(), run_inference(), and run_prompt().

◆ ck_true_bpe_free()

void ck_true_bpe_free ( CKTrueBPE * bpe )

Definition at line 405 of file true_bpe.c.

                                       {
     if (!bpe) return;
  
     if (bpe->vocab) {
         ck_tokenizer_hash_table_free(bpe->vocab, true);
     }
  
     if (bpe->merges) {
         merge_table_free(bpe->merges);
     }
  
     if (bpe->id_to_token) {
         for (size_t i = 0; i < bpe->vocab_size; i++) {
             if (bpe->id_to_token[i]) {
                 free(bpe->id_to_token[i]);
             }
         }
         free(bpe->id_to_token);
     }
  
     if (bpe->str_buffer) {
         free(bpe->str_buffer);
     }
  
     /* Free special tokens */
     for (int i = 0; i < bpe->num_special_tokens; i++) {
         if (bpe->special_tokens[i].token) {
             free(bpe->special_tokens[i].token);
         }
     }
  
     free(bpe);
 }

References ck_tokenizer_hash_table_free(), and merge_table_free().

Referenced by main(), and run_inference().

◆ ck_true_bpe_id_to_token()

const char* ck_true_bpe_id_to_token	(	const CKTrueBPE *	bpe,
		int32_t	id
	)

Definition at line 645 of file true_bpe.c.

                                                                       {
     if (!bpe || id < 0 || id >= (int32_t)bpe->vocab_size) return NULL;
     return bpe->id_to_token[id];
 }

References id.

Referenced by ck_true_bpe_decode(), main(), run_inference(), and run_prompt().

◆ ck_true_bpe_load_binary()

int ck_true_bpe_load_binary	(	CKTrueBPE *	bpe,
		int	vocab_size,
		const int32_t *	offsets,
		const char *	strings,
		int	num_merges,
		const int32_t *	merges
	)

Definition at line 606 of file true_bpe.c.

                                                    {
     if (!bpe || !offsets || !strings || vocab_size <= 0) return -1;
  
     for (int i = 0; i < vocab_size; i++) {
         const char *token = strings + offsets[i];
         if (ck_true_bpe_add_token(bpe, token, i, 0.0f) != 0) {
             return -1;
         }
     }
  
     if (merges && num_merges > 0) {
         for (int i = 0; i < num_merges; i++) {
             int32_t left = merges[i * 3 + 0];
             int32_t right = merges[i * 3 + 1];
             int32_t merged = merges[i * 3 + 2];
             if (left < 0 || right < 0 || merged < 0) {
                 continue;
             }
             if (ck_true_bpe_add_merge(bpe, left, right, merged, i) != 0) {
                 return -1;
             }
         }
     }
  
     return 0;
 }

References ck_true_bpe_add_merge(), ck_true_bpe_add_token(), left, merges, num_merges, offsets, right, strings, token, and vocab_size.

Referenced by main(), and run_inference().

◆ ck_true_bpe_lookup()

int32_t ck_true_bpe_lookup	(	const CKTrueBPE *	bpe,
		const char *	token
	)

Definition at line 638 of file true_bpe.c.

                                                                     {
     if (!bpe || !token) return -1;
  
     BPETokenInfo *info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, token);
     return info ? info->id : bpe->unk_id;
 }

References ck_tokenizer_hash_table_lookup(), and token.

Referenced by encode_chunk(), init_tokens_from_text(), and main().

◆ ck_true_bpe_num_merges()

int32_t ck_true_bpe_num_merges ( const CKTrueBPE * bpe )

Definition at line 1510 of file true_bpe.c.

                                                      {
     return bpe ? bpe->num_merges : 0;
 }

◆ ck_true_bpe_set_config()

void ck_true_bpe_set_config	(	CKTrueBPE *	bpe,
		const CKBPEConfig *	config
	)

Definition at line 560 of file true_bpe.c.

                                                                        {
     if (!bpe || !config) return;
     bpe->config = *config;
 }

References config.

◆ ck_true_bpe_set_special_ids()

void ck_true_bpe_set_special_ids	(	CKTrueBPE *	bpe,
		int32_t	unk,
		int32_t	bos,
		int32_t	eos,
		int32_t	pad
	)

Definition at line 552 of file true_bpe.c.

                                                                                                      {
     if (!bpe) return;
     bpe->unk_id = unk;
     bpe->bos_id = bos;
     bpe->eos_id = eos;
     bpe->pad_id = pad;
 }

References bos, eos, pad, and unk.

◆ ck_true_bpe_vocab_size()

size_t ck_true_bpe_vocab_size ( const CKTrueBPE * bpe )

Definition at line 1506 of file true_bpe.c.

                                                     {
     return bpe ? bpe->vocab_size : 0;
 }

◆ encode_chunk()

static int encode_chunk	(	CKTrueBPE *	bpe,
		const char *	chunk,
		int	chunk_len,
		int32_t *	ids,
		int	max_ids,
		CKBPETokenList *	list
	)

static

Definition at line 1213 of file true_bpe.c.

                                                                          {
     if (chunk_len <= 0) return 0;
  
     /* First, try to look up the entire chunk as a single token */
     char chunk_buf[256];
     if (chunk_len < (int)sizeof(chunk_buf)) {
         memcpy(chunk_buf, chunk, chunk_len);
         chunk_buf[chunk_len] = '\0';
         int32_t chunk_id = ck_true_bpe_lookup(bpe, chunk_buf);
         if (chunk_id >= 0) {
             /* Entire chunk is a single token */
             if (max_ids >= 1) {
                 ids[0] = chunk_id;
                 return 1;
             }
             return 0;
         }
     }
  
     /* Initialize token list from chunk characters */
     if (init_tokens_from_text(bpe, list, chunk, chunk_len) != 0) {
         return 0;
     }
  
     /* Apply BPE merges to this chunk */
     apply_bpe_merges(bpe, list);
  
     /* Extract token IDs from this chunk */
     int out_idx = 0;
     for (size_t i = 0; i < list->count && out_idx < max_ids; i++) {
         int32_t id = list->tokens[i].id;
  
         /* Handle unknown tokens */
         if (id < 0) {
             if (bpe->config.byte_fallback) {
                 /* Output each byte as separate token (byte fallback) */
                 for (size_t j = 0; j < list->tokens[i].len && out_idx < max_ids; j++) {
                     char byte_token[8];
                     snprintf(byte_token, sizeof(byte_token), "<0x%02X>", (unsigned char)list->tokens[i].str[j]);
                     int32_t byte_id = ck_true_bpe_lookup(bpe, byte_token);
                     ids[out_idx++] = (byte_id >= 0) ? byte_id : bpe->unk_id;
                 }
             } else {
                 ids[out_idx++] = bpe->unk_id;
             }
         } else {
             ids[out_idx++] = id;
         }
     }
  
     return out_idx;
 }

References apply_bpe_merges(), ck_true_bpe_lookup(), id, ids, init_tokens_from_text(), and max_ids.

Referenced by encode_text_segment().

◆ encode_text_segment()

static int encode_text_segment	(	CKTrueBPE *	bpe,
		const char *	text,
		int	text_len,
		int32_t *	ids,
		int	max_ids
	)

static

Definition at line 1270 of file true_bpe.c.

                                                            {
     if (text_len <= 0) return 0;
  
     /* Preprocess text (byte-level encoding) */
     char preprocessed[16384];
     int pp_len = preprocess_text(bpe, text, text_len, preprocessed, sizeof(preprocessed) - 1);
     if (pp_len < 0) {
         return 0;
     }
     preprocessed[pp_len] = '\0';
  
     int out_idx = 0;
  
     /* For GPT-2 style, use pretokenizer to split into chunks */
     if (bpe->config.space_prefix_style == CK_SPACE_PREFIX_GPT2 ||
         bpe->config.space_prefix_style == CK_SPACE_PREFIX_AUTO) {
  
         /* Pretokenize */
         PretokChunk chunks[1024];
         int num_chunks = gpt2_pretokenize(preprocessed, pp_len, chunks, 1024);
  
         /* Create reusable token list */
         CKBPETokenList *list = token_list_create(INITIAL_TOKEN_CAPACITY);
         if (!list) return out_idx;
  
         /* Process each chunk independently with BPE */
         for (int c = 0; c < num_chunks && out_idx < max_ids; c++) {
             int chunk_ids = encode_chunk(bpe, chunks[c].start, chunks[c].len,
                                          ids + out_idx, max_ids - out_idx, list);
             out_idx += chunk_ids;
         }
  
         token_list_free(list);
     } else {
         /* SentencePiece style: no pretokenization, process entire text */
         CKBPETokenList *list = token_list_create(INITIAL_TOKEN_CAPACITY);
         if (!list) return out_idx;
  
         int chunk_ids = encode_chunk(bpe, preprocessed, pp_len,
                                      ids + out_idx, max_ids - out_idx, list);
         out_idx += chunk_ids;
  
         token_list_free(list);
     }
  
     return out_idx;
 }

References CK_SPACE_PREFIX_AUTO, CK_SPACE_PREFIX_GPT2, encode_chunk(), gpt2_pretokenize(), ids, INITIAL_TOKEN_CAPACITY, max_ids, preprocess_text(), start, text, text_len, token_list_create(), and token_list_free().

Referenced by ck_true_bpe_encode().

◆ find_best_merge()

static int find_best_merge	(	const CKTrueBPE *	bpe,
		const CKBPETokenList *	list,
		size_t *	best_pos,
		const CKBPEMerge **	best_merge
	)

static

Definition at line 1149 of file true_bpe.c.

                                                                             {
     *best_pos = 0;
     *best_merge = NULL;
     int32_t best_priority = INT32_MAX;
  
     for (size_t i = 0; i + 1 < list->count; i++) {
         int32_t left_id = list->tokens[i].id;
         int32_t right_id = list->tokens[i + 1].id;
  
         if (left_id < 0 || right_id < 0) continue;  /* Unknown tokens can't merge */
  
         const CKBPEMerge *merge = merge_table_lookup(bpe->merges, left_id, right_id);
         if (merge && merge->priority < best_priority) {
             best_priority = merge->priority;
             *best_pos = i;
             *best_merge = merge;
         }
     }
  
     return (*best_merge != NULL) ? 0 : -1;
 }

References left_id, merge_table_lookup(), and right_id.

Referenced by apply_bpe_merges().

◆ gpt2_decode_byte()

/**
 * Decode one GPT-2 byte-level stand-in character back to the raw byte it
 * represents.
 *
 * GPT-2's bytes_to_unicode() table assigns the 68 bytes that are not
 * printable Latin-1 to consecutive code points starting at U+0100, in
 * ascending byte order:
 *
 *   U+0100..U+0120 -> 0x00..0x20
 *   U+0121..U+0142 -> 0x7F..0xA0
 *   U+0143         -> 0xAD
 *
 * The previous implementation decoded the second group as
 * U+017F..U+01A0 (code point minus 0x100), which does not match that table.
 * That range is still accepted so that text produced under the old
 * assumption keeps decoding the same way.
 * NOTE(review): confirm byte_to_gpt2() emits the standard table, then drop
 * the legacy range.
 *
 * @param s   Pointer to the first byte of a UTF-8 sequence.
 * @param len Number of readable bytes at s.
 * @return The decoded byte (0..255), or -1 when s does not start with a
 *         2-byte UTF-8 sequence that encodes one of the code points above.
 *
 * Definition at line 1417 of file true_bpe.c.
 */
static int gpt2_decode_byte(const unsigned char *s, int len)
{
    if (len < 2) {
        return -1;
    }

    /* Only 2-byte UTF-8 sequences (110xxxxx 10xxxxxx) can carry U+0100..U+01FF. */
    if ((s[0] & 0xE0) != 0xC0 || (s[1] & 0xC0) != 0x80) {
        return -1;
    }

    const unsigned int codepoint =
        ((unsigned int)(s[0] & 0x1F) << 6) | (unsigned int)(s[1] & 0x3F);

    if (codepoint >= 0x100 && codepoint <= 0x120) {
        /* Control characters and space: 0x00..0x20 */
        return (int)(codepoint - 0x100);
    }
    if (codepoint >= 0x121 && codepoint <= 0x142) {
        /* DEL, C1 controls and NBSP: 0x7F..0xA0 */
        return (int)(codepoint - 0x121 + 0x7F);
    }
    if (codepoint == 0x143) {
        /* Soft hyphen */
        return 0xAD;
    }
    if (codepoint >= 0x17F && codepoint <= 0x1A0) {
        /* Legacy range kept for backward compatibility (see header comment). */
        return (int)(codepoint - 0x100);
    }

    return -1;
}

Referenced by ck_true_bpe_decode().

◆ gpt2_pretokenize()

/**
 * Byte length of the UTF-8 character that starts at text[pos], clamped so
 * that it never extends past text_len (a truncated trailing sequence is
 * reported with the number of bytes actually available).
 */
static int pretok_char_len(const char *text, int pos, int text_len)
{
    int n = utf8_char_len((unsigned char)text[pos]);
    int remaining = text_len - pos;
    return (n > remaining) ? remaining : n;
}

/**
 * Split byte-level-encoded text into pre-tokenization chunks, approximating
 * the GPT-2 pretokenizer regex.
 *
 * The input is scanned with the helpers is_gpt2_space() (Ġ, bytes C4 A0) and
 * is_bpe_newline() (Ċ, bytes C4 8A), i.e. it is expected to have gone through
 * the byte-level preprocessing step already. At each position the rules are
 * tried in this order:
 *
 *   1. Word: letters, optionally preceded by one non-space punctuation
 *      character                                        -> CHUNK_WORD
 *   2. Single digit                                     -> CHUNK_NUMBER
 *   3. Optional Ġ + run of punctuation + trailing Ċ     -> CHUNK_OTHER
 *   4. Run of Ġ:
 *        - followed by Ċ: spaces + newlines             -> CHUNK_WHITESPACE
 *        - followed by a letter/digit: all but the last Ġ form a
 *          CHUNK_WHITESPACE, the last Ġ joins the word/digit
 *        - otherwise the whole run                      -> CHUNK_WHITESPACE
 *   5. Run of Ċ                                         -> CHUNK_OTHER
 *   6. Fallback: one character                          -> CHUNK_OTHER
 *
 * Every character length is clamped to the bytes remaining, so a truncated
 * multi-byte sequence at the end of the input can neither be read past
 * text_len nor produce a chunk that extends beyond the buffer.
 *
 * @param text       Input bytes (not copied; chunks point into this buffer).
 * @param text_len   Number of bytes in text.
 * @param chunks     Output array receiving start/len/type per chunk.
 * @param max_chunks Capacity of chunks; scanning stops once it is full.
 * @return Number of chunks written (never more than max_chunks).
 *
 * Definition at line 915 of file true_bpe.c.
 */
static int gpt2_pretokenize(const char *text, int text_len,
                            PretokChunk *chunks, int max_chunks)
{
    int num_chunks = 0;
    int pos = 0;

    while (pos < text_len && num_chunks < max_chunks) {
        int chunk_start = pos;
        int char_len = pretok_char_len(text, pos, text_len);

        /* Pattern 2: [^\r\n\p{L}\p{N}]?\p{L}+ - words with optional punctuation prefix. */
        /* This pattern MUST be checked before pattern 4 (punctuation). */
        bool is_word = false;
        int prefix_len = 0;

        if (is_bpe_letter(text + pos, char_len)) {
            /* Word without prefix */
            is_word = true;
        } else if (is_bpe_punct(text + pos, char_len) &&
                   !is_gpt2_space(text + pos, text_len - pos)) {
            /* Punctuation counts as a word prefix only when a letter follows. */
            int after = pos + char_len;
            if (after < text_len) {
                int next_len = pretok_char_len(text, after, text_len);
                if (is_bpe_letter(text + after, next_len)) {
                    is_word = true;
                    prefix_len = char_len;
                }
            }
        }

        if (is_word) {
            /* Collect the word (with optional prefix) */
            pos = chunk_start + prefix_len;
            while (pos < text_len) {
                int clen = pretok_char_len(text, pos, text_len);
                if (!is_bpe_letter(text + pos, clen)) {
                    break;
                }
                pos += clen;
            }
            chunks[num_chunks].start = text + chunk_start;
            chunks[num_chunks].len = pos - chunk_start;
            chunks[num_chunks].type = CHUNK_WORD;
            num_chunks++;
            continue;
        }

        /* Pattern 3: \p{N} - single digit */
        if (is_bpe_digit(text + pos, char_len)) {
            pos += char_len;
            chunks[num_chunks].start = text + chunk_start;
            chunks[num_chunks].len = pos - chunk_start;
            chunks[num_chunks].type = CHUNK_NUMBER;
            num_chunks++;
            continue;
        }

        /* Pattern 4:  ?[^\s\p{L}\p{N}]+[\r\n]* - optional space + punctuation + newlines. */
        /* Punctuation directly followed by a letter was already consumed by pattern 2. */
        bool has_leading_space = is_gpt2_space(text + pos, text_len - pos);
        int punct_start = has_leading_space ? pos + 2 : pos;

        if (punct_start < text_len) {
            int pchar_len = pretok_char_len(text, punct_start, text_len);
            if (is_bpe_punct(text + punct_start, pchar_len)) {
                if (has_leading_space) {
                    pos += 2;  /* Include the leading space */
                }
                /* Collect ALL consecutive punctuation; this run does not stop before letters. */
                while (pos < text_len) {
                    int clen = pretok_char_len(text, pos, text_len);
                    if (!is_bpe_punct(text + pos, clen)) {
                        break;
                    }
                    pos += clen;
                }
                /* Include trailing newlines */
                while (pos < text_len && is_bpe_newline(text + pos, text_len - pos)) {
                    pos += 2;
                }
                chunks[num_chunks].start = text + chunk_start;
                chunks[num_chunks].len = pos - chunk_start;
                chunks[num_chunks].type = CHUNK_OTHER;
                num_chunks++;
                continue;
            }
        }

        /* Pattern 5/6/7: whitespace handling */
        if (is_gpt2_space(text + pos, text_len - pos)) {
            /* Count consecutive spaces (each Ġ is two bytes) */
            int space_count = 0;
            int space_end = pos;
            while (space_end < text_len &&
                   is_gpt2_space(text + space_end, text_len - space_end)) {
                space_count++;
                space_end += 2;
            }

            /* Spaces followed by newlines form one whitespace chunk */
            if (space_end < text_len &&
                is_bpe_newline(text + space_end, text_len - space_end)) {
                while (space_end < text_len &&
                       is_bpe_newline(text + space_end, text_len - space_end)) {
                    space_end += 2;
                }
                chunks[num_chunks].start = text + pos;
                chunks[num_chunks].len = space_end - pos;
                chunks[num_chunks].type = CHUNK_WHITESPACE;
                num_chunks++;
                pos = space_end;
                continue;
            }

            /* Check what follows the spaces */
            if (space_end < text_len) {
                int next_len = pretok_char_len(text, space_end, text_len);
                if (is_bpe_letter(text + space_end, next_len)) {
                    /* Letters follow: emit (n-1) spaces, then space+word */
                    if (space_count > 1) {
                        chunks[num_chunks].start = text + pos;
                        chunks[num_chunks].len = (space_count - 1) * 2;
                        chunks[num_chunks].type = CHUNK_WHITESPACE;
                        num_chunks++;
                        pos += (space_count - 1) * 2;
                        if (num_chunks >= max_chunks) break;
                    }
                    /* Collect space + word */
                    chunk_start = pos;
                    pos += 2;  /* Skip the Ġ */
                    while (pos < text_len) {
                        int clen = pretok_char_len(text, pos, text_len);
                        if (!is_bpe_letter(text + pos, clen)) {
                            break;
                        }
                        pos += clen;
                    }
                    chunks[num_chunks].start = text + chunk_start;
                    chunks[num_chunks].len = pos - chunk_start;
                    chunks[num_chunks].type = CHUNK_WORD;
                    num_chunks++;
                    continue;
                } else if (is_bpe_digit(text + space_end, next_len)) {
                    /* Digit follows: emit (n-1) spaces, then space+digit */
                    if (space_count > 1) {
                        chunks[num_chunks].start = text + pos;
                        chunks[num_chunks].len = (space_count - 1) * 2;
                        chunks[num_chunks].type = CHUNK_WHITESPACE;
                        num_chunks++;
                        pos += (space_count - 1) * 2;
                        if (num_chunks >= max_chunks) break;
                    }
                    /* Space + single digit (is_bpe_digit only accepts 1-byte characters) */
                    chunk_start = pos;
                    pos += 2 + 1;  /* Ġ + digit */
                    chunks[num_chunks].start = text + chunk_start;
                    chunks[num_chunks].len = pos - chunk_start;
                    chunks[num_chunks].type = CHUNK_NUMBER;
                    num_chunks++;
                    continue;
                }
                /* Pattern 4 would have caught space+punct, so this is trailing space before something else */
            }

            /* Just whitespace */
            chunks[num_chunks].start = text + pos;
            chunks[num_chunks].len = space_count * 2;
            chunks[num_chunks].type = CHUNK_WHITESPACE;
            num_chunks++;
            pos = space_end;
            continue;
        }

        /* Pattern 5: newlines (Ċ) */
        if (is_bpe_newline(text + pos, text_len - pos)) {
            while (pos < text_len && is_bpe_newline(text + pos, text_len - pos)) {
                pos += 2;
            }
            chunks[num_chunks].start = text + chunk_start;
            chunks[num_chunks].len = pos - chunk_start;
            chunks[num_chunks].type = CHUNK_OTHER;
            num_chunks++;
            continue;
        }

        /* Fallback: single character chunk */
        pos += char_len;
        chunks[num_chunks].start = text + chunk_start;
        chunks[num_chunks].len = pos - chunk_start;
        chunks[num_chunks].type = CHUNK_OTHER;
        num_chunks++;
    }

    return num_chunks;
}

References CHUNK_NUMBER, CHUNK_OTHER, CHUNK_WHITESPACE, CHUNK_WORD, is_bpe_digit(), is_bpe_letter(), is_bpe_newline(), is_bpe_punct(), is_gpt2_space(), text, text_len, and utf8_char_len().

Referenced by encode_text_segment().

◆ init_tokens_from_text()

/**
 * Reset the token list and refill it with one token per UTF-8 character of
 * the given text.
 *
 * Each character is copied into a small NUL-terminated buffer, looked up in
 * the vocabulary with ck_true_bpe_lookup(), and appended together with the
 * id that lookup returned. A multi-byte sequence cut off by the end of the
 * text is shortened to the bytes that are actually present.
 *
 * @param bpe      Tokenizer used for the vocabulary lookup.
 * @param list     Token list to clear and fill.
 * @param text     Input bytes.
 * @param text_len Number of bytes in text.
 * @return 0 on success, -1 when token_list_append() fails.
 *
 * Definition at line 1121 of file true_bpe.c.
 */
static int init_tokens_from_text(CKTrueBPE *bpe, CKBPETokenList *list,
                                 const char *text, int text_len)
{
    token_list_clear(list);

    for (int offset = 0; offset < text_len; ) {
        const int remaining = text_len - offset;
        int nbytes = utf8_char_len((unsigned char)text[offset]);
        if (nbytes > remaining) {
            nbytes = remaining;  /* truncated UTF-8 at end of input */
        }

        /* A UTF-8 character is at most 4 bytes, so 8 leaves room for the NUL. */
        char piece[8];
        memcpy(piece, text + offset, nbytes);
        piece[nbytes] = '\0';

        const int32_t piece_id = ck_true_bpe_lookup(bpe, piece);
        if (token_list_append(list, piece, nbytes, piece_id) != 0) {
            return -1;
        }

        offset += nbytes;
    }

    return 0;
}

References ck_true_bpe_lookup(), text, text_len, token_list_append(), token_list_clear(), and utf8_char_len().

Referenced by encode_chunk().

◆ is_bpe_digit()

/**
 * Tell whether the character at s is an ASCII digit.
 *
 * Only single-byte characters qualify; any other length yields false.
 *
 * @param s   Pointer to the character.
 * @param len Byte length of the character.
 * @return true when len is 1 and is_digit() accepts the byte.
 *
 * Definition at line 842 of file true_bpe.c.
 */
static bool is_bpe_digit(const char *s, int len)
{
    return len == 1 && is_digit((unsigned char)s[0]);
}

References is_digit().

Referenced by gpt2_pretokenize().

◆ is_bpe_letter()

/**
 * Tell whether the character at s is an ASCII letter.
 *
 * Only single-byte characters qualify; any other length yields false.
 *
 * @param s   Pointer to the character.
 * @param len Byte length of the character.
 * @return true when len is 1 and is_letter() accepts the byte.
 *
 * Definition at line 833 of file true_bpe.c.
 */
static bool is_bpe_letter(const char *s, int len)
{
    return len == 1 && is_letter((unsigned char)s[0]);
}

References is_letter().

Referenced by gpt2_pretokenize().

◆ is_bpe_newline()

/**
 * Tell whether s starts with the two bytes C4 8A (UTF-8 for U+010A, "Ċ"),
 * the byte-level stand-in for a newline.
 *
 * @param s   Pointer to the bytes to inspect.
 * @param len Number of readable bytes at s.
 * @return true when at least two bytes are available and they are C4 8A.
 *
 * Definition at line 866 of file true_bpe.c.
 */
static bool is_bpe_newline(const char *s, int len)
{
    if (len < 2) {
        return false;
    }
    if ((unsigned char)s[0] != 0xC4) {
        return false;
    }
    return (unsigned char)s[1] == 0x8A;
}

Referenced by gpt2_pretokenize(), and is_word_prefix_char().

◆ is_bpe_punct()

/**
 * Classify a character as "punctuation" for the pretokenizer.
 *
 * A single-byte character is punctuation when it is not a letter, not a
 * digit and not one of ' ', '\t', '\n', '\r'. For any other length the
 * character is punctuation unless it is the space stand-in (Ġ, C4 A0) or one
 * of the stand-ins C4 89 (ĉ, tab), C4 8A (Ċ, newline), C4 8D (č, CR).
 *
 * @param s   Pointer to the character.
 * @param len Byte length of the character.
 * @return true when the character counts as punctuation.
 *
 * Definition at line 884 of file true_bpe.c.
 */
static bool is_bpe_punct(const char *s, int len)
{
    if (len == 1) {
        const unsigned char ch = (unsigned char)s[0];
        if (is_letter(ch) || is_digit(ch)) {
            return false;
        }
        switch (ch) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            return false;
        default:
            return true;
        }
    }

    /* Multi-byte: everything except the whitespace stand-ins is punctuation. */
    if (is_gpt2_space(s, len)) {
        return false;
    }
    if (len >= 2 && (unsigned char)s[0] == 0xC4) {
        switch ((unsigned char)s[1]) {
        case 0x89:  /* tab stand-in */
        case 0x8A:  /* newline stand-in */
        case 0x8D:  /* carriage-return stand-in */
            return false;
        default:
            break;
        }
    }
    return true;
}

References is_digit(), is_gpt2_space(), and is_letter().

Referenced by gpt2_pretokenize().

◆ is_digit()

/**
 * Tell whether c is an ASCII decimal digit ('0'..'9').
 *
 * Definition at line 819 of file true_bpe.c.
 */
static bool is_digit(unsigned char c)
{
    /* Bytes below '0' wrap to a large unsigned value and fail the comparison. */
    return (unsigned int)(c - '0') < 10u;
}

Referenced by is_bpe_digit(), is_bpe_punct(), and is_word_prefix_char().

◆ is_gpt2_space()

/**
 * Tell whether s starts with the two bytes C4 A0 (UTF-8 for U+0120, "Ġ"),
 * the byte-level stand-in for a space.
 *
 * @param s   Pointer to the bytes to inspect.
 * @param len Number of readable bytes at s.
 * @return true when at least two bytes are available and they are C4 A0.
 *
 * Definition at line 828 of file true_bpe.c.
 */
static bool is_gpt2_space(const char *s, int len)
{
    if (len < 2) {
        return false;
    }
    if ((unsigned char)s[0] != 0xC4) {
        return false;
    }
    return (unsigned char)s[1] == 0xA0;
}

Referenced by gpt2_pretokenize(), is_bpe_punct(), and is_word_prefix_char().

◆ is_letter()

/**
 * Tell whether c is an ASCII letter ('a'..'z' or 'A'..'Z').
 *
 * Definition at line 815 of file true_bpe.c.
 */
static bool is_letter(unsigned char c)
{
    /* Setting bit 5 folds 'A'..'Z' onto 'a'..'z'; no other byte lands in that range. */
    const unsigned char folded = (unsigned char)(c | 0x20);
    return folded >= 'a' && folded <= 'z';
}

Referenced by is_bpe_letter(), is_bpe_punct(), and is_word_prefix_char().

◆ is_whitespace()

/**
 * Tell whether c is one of the ASCII whitespace bytes recognised here:
 * space, tab, line feed or carriage return.
 *
 * Definition at line 823 of file true_bpe.c.
 */
static bool is_whitespace(unsigned char c)
{
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}

◆ is_word_prefix_char()

/**
 * Tell whether a character may act as the optional one-character prefix of a
 * word, i.e. the [^\r\n\p{L}\p{N}]? part of the GPT-2 word pattern.
 *
 * A single-byte character qualifies when it is not a letter, not a digit and
 * neither '\n' nor '\r'. For any other length the character qualifies unless
 * it is the newline stand-in (Ċ); the space stand-in (Ġ) explicitly qualifies.
 *
 * @param s   Pointer to the character.
 * @param len Byte length of the character.
 * @return true when the character can prefix a word.
 *
 * Definition at line 871 of file true_bpe.c.
 */
static bool is_word_prefix_char(const char *s, int len)
{
    if (len != 1) {
        /* Space may prefix; newline may not; every other multi-byte char may. */
        return is_gpt2_space(s, len) || !is_bpe_newline(s, len);
    }

    const unsigned char ch = (unsigned char)s[0];
    if (is_letter(ch) || is_digit(ch)) {
        return false;
    }
    return ch != '\n' && ch != '\r';
}

References is_bpe_newline(), is_digit(), is_gpt2_space(), and is_letter().

◆ match_special_token()

static int match_special_token	(	const CKTrueBPE *	bpe,
		const char *	text,
		int	text_len,
		int	pos
	)

static

Definition at line 1323 of file true_bpe.c.

                                                                                               {
     int remaining = text_len - pos;
     const char *cur = text + pos;
  
     /* Special tokens are sorted longest first, so first match is best */
     for (int i = 0; i < bpe->num_special_tokens; i++) {
         int tok_len = bpe->special_tokens[i].len;
         if (tok_len <= remaining &&
             memcmp(cur, bpe->special_tokens[i].token, tok_len) == 0) {
             return i;
         }
     }
     return -1;
 }

References text, and text_len.

Referenced by ck_true_bpe_encode().

◆ merge_hash()

/**
 * Map a 64-bit merge key to a bucket index.
 *
 * The key is scrambled with two xor-shift/multiply rounds plus a final
 * xor-shift and then reduced modulo the bucket count.
 *
 * @param key         Key produced by merge_key().
 * @param num_buckets Number of buckets; must be non-zero (it is the modulus).
 * @return Bucket index in the range [0, num_buckets).
 *
 * Definition at line 157 of file true_bpe.c.
 */
static size_t merge_hash(uint64_t key, size_t num_buckets)
{
    static const uint64_t multipliers[2] = {
        0xff51afd7ed558ccdULL,
        0xc4ceb9fe1a85ec53ULL
    };

    uint64_t mixed = key;
    for (int round = 0; round < 2; round++) {
        mixed ^= mixed >> 33;
        mixed *= multipliers[round];
    }
    mixed ^= mixed >> 33;

    return (size_t)(mixed % num_buckets);
}

Referenced by merge_table_insert(), and merge_table_lookup().

◆ merge_key()

/**
 * Pack a (left, right) token-id pair into one 64-bit lookup key: the left id
 * goes into the upper 32 bits, the right id into the lower 32 bits.
 *
 * @param left_id  Id of the left token.
 * @param right_id Id of the right token.
 * @return The combined key.
 *
 * Definition at line 153 of file true_bpe.c.
 */
static uint64_t merge_key(int32_t left_id, int32_t right_id)
{
    const uint64_t upper = (uint64_t)left_id;    /* shifted into bits 32..63 below */
    const uint64_t lower = (uint32_t)right_id;   /* only the low 32 bits are kept */
    return (upper << 32) | lower;
}

References left_id, and right_id.

Referenced by merge_table_insert(), and merge_table_lookup().

◆ merge_table_create()

static CKMergeTable* merge_table_create ( size_t num_buckets )

static

Definition at line 167 of file true_bpe.c.

                                                             {
     CKMergeTable *table = (CKMergeTable *)malloc(sizeof(CKMergeTable));
     if (!table) return NULL;
  
     table->buckets = (CKMergeEntry **)calloc(num_buckets, sizeof(CKMergeEntry *));
     if (!table->buckets) {
         free(table);
         return NULL;
     }
  
     table->num_buckets = num_buckets;
     table->num_entries = 0;
     return table;
 }

Referenced by ck_true_bpe_create().

◆ merge_table_free()

/**
 * Release a merge table: every chained entry, the bucket array and the table
 * structure itself. Passing NULL is a no-op.
 *
 * @param table Table to destroy (may be NULL).
 *
 * Definition at line 182 of file true_bpe.c.
 */
static void merge_table_free(CKMergeTable *table)
{
    if (table == NULL) {
        return;
    }

    for (size_t b = 0; b < table->num_buckets; b++) {
        CKMergeEntry *node = table->buckets[b];
        while (node != NULL) {
            /* Grab the successor before the node is released. */
            CKMergeEntry *after = node->next;
            free(node);
            node = after;
        }
    }

    free(table->buckets);
    free(table);
}

Referenced by ck_true_bpe_create(), and ck_true_bpe_free().

◆ merge_table_insert()

/**
 * Insert a merge rule into the table, or overwrite the rule already stored
 * for the same (left_id, right_id) pair.
 *
 * New entries are pushed onto the front of their bucket's chain, and
 * num_entries is incremented only for new entries.
 *
 * @param table Destination table.
 * @param merge Rule to copy into the table.
 * @return 0 on success, -1 when the entry allocation fails.
 *
 * Definition at line 198 of file true_bpe.c.
 */
static int merge_table_insert(CKMergeTable *table, const CKBPEMerge *merge)
{
    const uint64_t pair_key = merge_key(merge->left_id, merge->right_id);
    CKMergeEntry **head = &table->buckets[merge_hash(pair_key, table->num_buckets)];

    /* Existing pair: replace the stored rule in place. */
    for (CKMergeEntry *node = *head; node != NULL; node = node->next) {
        if (node->key == pair_key) {
            node->merge = *merge;
            return 0;
        }
    }

    CKMergeEntry *fresh = (CKMergeEntry *)malloc(sizeof(CKMergeEntry));
    if (fresh == NULL) {
        return -1;
    }

    fresh->key = pair_key;
    fresh->merge = *merge;
    fresh->next = *head;
    *head = fresh;
    table->num_entries++;

    return 0;
}

References merge_hash(), and merge_key().

Referenced by ck_true_bpe_add_merge().

◆ merge_table_lookup()

static const CKBPEMerge* merge_table_lookup	(	const CKMergeTable *	table,
		int32_t	left_id,
		int32_t	right_id
	)

static

Definition at line 226 of file true_bpe.c.

                                                                                                           {
     uint64_t key = merge_key(left_id, right_id);
     size_t bucket = merge_hash(key, table->num_buckets);
  
     CKMergeEntry *entry = table->buckets[bucket];
     while (entry) {
         if (entry->key == key) {
             return &entry->merge;
         }
         entry = entry->next;
     }
  
     return NULL;
 }

References left_id, merge_hash(), merge_key(), and right_id.

Referenced by find_best_merge().

◆ preprocess_text()

static int preprocess_text	(	const CKTrueBPE *	bpe,
		const char *	text,
		int	text_len,
		char *	out,
		int	out_max
	)

static

Definition at line 749 of file true_bpe.c.

                                                                                                          {
     CKSpacePrefixStyle style = bpe->config.space_prefix_style;
     int out_len = 0;
  
     /* For SentencePiece, add ▁ at start */
     if (style == CK_SPACE_PREFIX_SPM && text_len > 0 && text[0] != ' ') {
         if (out_len + 3 > out_max) return -1;
         out[out_len++] = (char)0xE2;
         out[out_len++] = (char)0x96;
         out[out_len++] = (char)0x81;
     }
  
     for (int i = 0; i < text_len; i++) {
         unsigned char byte = (unsigned char)text[i];
  
         if (style == CK_SPACE_PREFIX_SPM) {
             /* SentencePiece style: only convert spaces */
             if (byte == ' ') {
                 if (out_len + 3 > out_max) return -1;
                 out[out_len++] = (char)0xE2;
                 out[out_len++] = (char)0x96;
                 out[out_len++] = (char)0x81;
             } else {
                 if (out_len + 1 > out_max) return -1;
                 out[out_len++] = (char)byte;
             }
         } else {
             /* GPT-2 style: full byte-level encoding */
             char encoded[4];
             int enc_len = byte_to_gpt2(byte, encoded);
             if (out_len + enc_len > out_max) return -1;
             for (int j = 0; j < enc_len; j++) {
                 out[out_len++] = encoded[j];
             }
         }
     }
  
     return out_len;
 }

References byte_to_gpt2(), CK_SPACE_PREFIX_SPM, out_len, style, text, and text_len.

Referenced by encode_text_segment().

◆ token_list_append()

/**
 * Append a token to the end of the list, growing the token array when full.
 *
 * The string is copied into freshly allocated, NUL-terminated storage owned
 * by the list. The new token's is_merged flag is false.
 *
 * Growth doubles the capacity. A list whose capacity is 0 grows to 1 instead
 * of "doubling" to 0, which would have turned the realloc into a zero-size
 * request and left the token array unusable. The byte size of the new array
 * is checked for overflow before the realloc.
 *
 * @param list Destination list.
 * @param str  Bytes of the token (need not be NUL-terminated).
 * @param len  Number of bytes at str. The token length is stored as
 *             uint16_t, so lengths above UINT16_MAX are rejected rather than
 *             silently truncated.
 * @param id   Vocabulary id to store with the token.
 * @return 0 on success; -1 when len is too large or an allocation fails
 *         (the list's existing tokens stay valid in that case).
 *
 * Definition at line 283 of file true_bpe.c.
 */
static int token_list_append(CKBPETokenList *list, const char *str,
                             size_t len, int32_t id)
{
    if (len > UINT16_MAX) {
        return -1;
    }

    if (list->count >= list->capacity) {
        size_t new_cap = (list->capacity > 0) ? list->capacity * 2 : 1;
        if (new_cap <= list->capacity || new_cap > SIZE_MAX / sizeof(CKBPEToken)) {
            return -1;  /* capacity arithmetic would overflow */
        }

        /* Keep the old pointer until realloc succeeds so failure leaks nothing. */
        CKBPEToken *new_tokens =
            (CKBPEToken *)realloc(list->tokens, new_cap * sizeof(CKBPEToken));
        if (!new_tokens) {
            return -1;
        }
        list->tokens = new_tokens;
        list->capacity = new_cap;

        /* Zero the unused slots so their str pointers are NULL. */
        memset(list->tokens + list->count, 0,
               (new_cap - list->count) * sizeof(CKBPEToken));
    }

    CKBPEToken *tok = &list->tokens[list->count];
    tok->str = (char *)malloc(len + 1);
    if (!tok->str) {
        return -1;
    }

    memcpy(tok->str, str, len);
    tok->str[len] = '\0';
    tok->len = (uint16_t)len;
    tok->id = id;
    tok->is_merged = false;

    list->count++;
    return 0;
}

References id.

Referenced by init_tokens_from_text().

◆ token_list_clear()

/**
 * Empty the list without releasing the token array: every stored token
 * string is freed, its pointer is reset to NULL, and count becomes 0.
 * The capacity is left unchanged so the list can be refilled.
 *
 * @param list List to clear.
 *
 * Definition at line 273 of file true_bpe.c.
 */
static void token_list_clear(CKBPETokenList *list)
{
    size_t remaining = list->count;
    CKBPEToken *tok = list->tokens;

    for (; remaining > 0; remaining--, tok++) {
        free(tok->str);   /* free(NULL) is a no-op */
        tok->str = NULL;
    }

    list->count = 0;
}

Referenced by init_tokens_from_text().

◆ token_list_create()

static CKBPETokenList* token_list_create ( size_t initial_capacity )

static

Definition at line 245 of file true_bpe.c.

                                                                   {
     CKBPETokenList *list = (CKBPETokenList *)malloc(sizeof(CKBPETokenList));
     if (!list) return NULL;
  
     list->tokens = (CKBPEToken *)calloc(initial_capacity, sizeof(CKBPEToken));
     if (!list->tokens) {
         free(list);
         return NULL;
     }
  
     list->count = 0;
     list->capacity = initial_capacity;
     return list;
 }

Referenced by encode_text_segment().

◆ token_list_free()

/**
 * Destroy a token list: the strings of all stored tokens, the token array
 * and the list structure itself. Passing NULL is a no-op.
 *
 * @param list List to destroy (may be NULL).
 *
 * Definition at line 260 of file true_bpe.c.
 */
static void token_list_free(CKBPETokenList *list)
{
    if (list == NULL) {
        return;
    }

    size_t idx = list->count;
    while (idx-- > 0) {
        free(list->tokens[idx].str);   /* free(NULL) is a no-op */
    }

    free(list->tokens);
    free(list);
}

Referenced by encode_text_segment().

◆ token_list_merge_at()

/**
 * Replace the two adjacent tokens at pos and pos + 1 by one merged token.
 *
 * The merged string is copied into new storage owned by the list, the token
 * at pos takes the merged id and is flagged is_merged, and all tokens after
 * pos + 1 move one slot to the left.
 *
 * The new string is allocated and filled BEFORE the old strings are freed.
 * This keeps the list fully intact when the allocation fails (previously the
 * right-hand token was left holding a freed pointer, which a later
 * token_list_free()/token_list_clear() would free again), and it stays
 * correct even if merged_str points into one of the strings being replaced.
 *
 * @param list       List to modify.
 * @param pos        Index of the left token of the pair.
 * @param merged_str Bytes of the merged token (need not be NUL-terminated).
 * @param merged_len Number of bytes at merged_str. The token length is stored
 *                   as uint16_t, so lengths above UINT16_MAX are rejected.
 * @param merged_id  Vocabulary id of the merged token.
 * @return 0 on success; -1 when pos does not name a valid pair, merged_len is
 *         too large, or the allocation fails. The list is unchanged on -1.
 *
 * Definition at line 309 of file true_bpe.c.
 */
static int token_list_merge_at(CKBPETokenList *list, size_t pos,
                               const char *merged_str, size_t merged_len,
                               int32_t merged_id)
{
    /* Written without "pos + 1" so a huge pos cannot wrap around to 0. */
    if (list->count < 2 || pos >= list->count - 1) {
        return -1;
    }
    if (merged_len > UINT16_MAX) {
        return -1;
    }

    char *fresh = (char *)malloc(merged_len + 1);
    if (!fresh) {
        return -1;
    }
    memcpy(fresh, merged_str, merged_len);
    fresh[merged_len] = '\0';

    /* Only now is it safe to drop the strings of the two source tokens. */
    free(list->tokens[pos].str);
    free(list->tokens[pos + 1].str);

    list->tokens[pos].str = fresh;
    list->tokens[pos].len = (uint16_t)merged_len;
    list->tokens[pos].id = merged_id;
    list->tokens[pos].is_merged = true;

    /* Close the gap left by the right-hand token. */
    size_t tail = list->count - (pos + 2);
    memmove(&list->tokens[pos + 1], &list->tokens[pos + 2],
            tail * sizeof(list->tokens[0]));
    list->count--;

    /* The vacated last slot still aliases a live string; forget the pointer. */
    list->tokens[list->count].str = NULL;

    return 0;
}

References merged_id.

Referenced by apply_bpe_merges().

◆ utf8_char_len()

/**
 * Number of bytes in the UTF-8 sequence introduced by lead byte c.
 *
 *   0x00..0x7F -> 1   (0xxxxxxx)
 *   0xC0..0xDF -> 2   (110xxxxx)
 *   0xE0..0xEF -> 3   (1110xxxx)
 *   0xF0..0xF7 -> 4   (11110xxx)
 *
 * Any other byte (continuation bytes 0x80..0xBF and 0xF8..0xFF) is not a
 * valid lead byte and is treated as a single byte.
 *
 * @param c First byte of the sequence.
 * @return Sequence length in bytes, 1..4.
 *
 * Definition at line 790 of file true_bpe.c.
 */
static int utf8_char_len(unsigned char c)
{
    if (c < 0x80) {
        return 1;
    }
    if (c >= 0xC0 && c <= 0xDF) {
        return 2;
    }
    if (c >= 0xE0 && c <= 0xEF) {
        return 3;
    }
    if (c >= 0xF0 && c <= 0xF7) {
        return 4;
    }
    return 1;  /* invalid lead byte */
}

Referenced by gpt2_pretokenize(), and init_tokens_from_text().

Macros

Enumerations

Functions

Macro Definition Documentation

◆ INITIAL_TOKEN_CAPACITY

◆ MAX_SPECIAL_TOKENS

◆ MAX_TOKEN_LEN

◆ MERGE_HASH_SIZE

Enumeration Type Documentation

◆ ChunkType

Function Documentation

◆ apply_bpe_merges()

◆ byte_to_gpt2()

◆ ck_true_bpe_add_merge()

◆ ck_true_bpe_add_merge_by_tokens()

◆ ck_true_bpe_add_special_token()

◆ ck_true_bpe_add_token()

◆ ck_true_bpe_create()

◆ ck_true_bpe_decode()

◆ ck_true_bpe_detect_space_style()

◆ ck_true_bpe_encode()

◆ ck_true_bpe_free()

◆ ck_true_bpe_id_to_token()

◆ ck_true_bpe_load_binary()

◆ ck_true_bpe_lookup()

◆ ck_true_bpe_num_merges()

◆ ck_true_bpe_set_config()

◆ ck_true_bpe_set_special_ids()

◆ ck_true_bpe_vocab_size()

◆ encode_chunk()

◆ encode_text_segment()

◆ find_best_merge()

◆ gpt2_decode_byte()

◆ gpt2_pretokenize()

◆ init_tokens_from_text()

◆ is_bpe_digit()

◆ is_bpe_letter()

◆ is_bpe_newline()

◆ is_bpe_punct()

◆ is_digit()

◆ is_gpt2_space()

◆ is_letter()

◆ is_whitespace()

◆ is_word_prefix_char()

◆ match_special_token()

◆ merge_hash()

◆ merge_key()

◆ merge_table_create()

◆ merge_table_free()

◆ merge_table_insert()

◆ merge_table_lookup()

◆ preprocess_text()

◆ token_list_append()

◆ token_list_clear()

◆ token_list_create()

◆ token_list_free()

◆ token_list_merge_at()

◆ utf8_char_len()