← Back to C-Kernel-Engine Docs Doxygen Source Documentation
tokenizer.h File Reference
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include "tokenizer/murmurhash3.h"
#include "tokenizer/memory_pool.h"
#include "tokenizer/hash_table.h"
#include "tokenizer/utf8.h"
#include "data_structures/tries/trie.h"

Go to the source code of this file.

Data Structures

struct  CKTokenizer
 
struct  CKTokenizerConfig
 
struct  CKTokenizerToken
 

Macros

#define CK_TOKENIZER_API   __attribute__((visibility("default")))
 
#define CK_TOKENIZER_DEFAULT_HT_SIZE   65536
 
#define CK_TOKENIZER_MAX_TOKEN_LEN   256
 
#define CK_TOKENIZER_MAX_VOCAB_SIZE   256000
 

Enumerations

enum  CKSpacePrefixStyle { CK_SPACE_PREFIX_AUTO = 0 , CK_SPACE_PREFIX_GPT2 = 1 , CK_SPACE_PREFIX_SPM = 2 }
 
 
enum  CKSpmMode { CK_SPM_MODE_UNIGRAM = 0 , CK_SPM_MODE_LLAMA = 1 }
 
enum  CKTokenizerType { CK_TOKENIZER_BPE = 0 , CK_TOKENIZER_WORDPIECE = 1 , CK_TOKENIZER_SPM = 2 }
 

Functions

 __attribute__ ((visibility("default"))) CKTokenizer *ck_tokenizer_create(CKTokenizerType type)
 
int ck_tokenizer_add_merge (CKTokenizer *tok, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority)
 
int ck_tokenizer_add_special_token (CKTokenizer *tok, const char *name, int32_t id)
 
int ck_tokenizer_add_token (CKTokenizer *tok, const char *token, int32_t id, float score)
 
static CKTokenizer * ck_tokenizer_create_bpe (void)
 
static CKTokenizer * ck_tokenizer_create_spm (void)
 
static CKTokenizer * ck_tokenizer_create_wordpiece (void)
 
int ck_tokenizer_decode (const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
 
int ck_tokenizer_encode (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
 
int ck_tokenizer_encode_tokens (const CKTokenizer *tok, const char *text, int text_len, const char **out_tokens, int max_tokens)
 
int ck_tokenizer_encode_with_special (CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids, bool add_special)
 
int ck_tokenizer_load_binary (CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
 
int ck_tokenizer_load_binary_with_scores (CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, const float *scores, const uint8_t *types, int num_merges, const int32_t *merges)
 
int ck_tokenizer_load_gguf (CKTokenizer *tok, const char *path)
 
int ck_tokenizer_load_json (CKTokenizer *tok, const char *path)
 
int ck_tokenizer_load_merges (CKTokenizer *tok, const char *path)
 
int ck_tokenizer_load_text (CKTokenizer *tok, const char *path)
 
static size_t ck_tokenizer_vocab_size (const CKTokenizer *tok)
 

Variables

bool add_bos
 
bool add_eos
 
bool add_space_prefix
 
int32_t bos
 
int32_t eos
 
int32_t id
 
const int32_t * ids
 
bool lowercase
 
int32_t mask
 
int num_ids
 
int * out_len
 
int32_t pad
 
float * score
 
CKSpmMode spm_mode
 
CKSpacePrefixStyle style
 
const char * text
 
const char * token
 
int32_t unk
 
bool use_trie
 

Macro Definition Documentation

◆ CK_TOKENIZER_API

#define CK_TOKENIZER_API   __attribute__((visibility("default")))

Definition at line 40 of file tokenizer.h.

◆ CK_TOKENIZER_DEFAULT_HT_SIZE

#define CK_TOKENIZER_DEFAULT_HT_SIZE   65536

Definition at line 50 of file tokenizer.h.

◆ CK_TOKENIZER_MAX_TOKEN_LEN

#define CK_TOKENIZER_MAX_TOKEN_LEN   256

Definition at line 44 of file tokenizer.h.

◆ CK_TOKENIZER_MAX_VOCAB_SIZE

#define CK_TOKENIZER_MAX_VOCAB_SIZE   256000

Definition at line 47 of file tokenizer.h.

Enumeration Type Documentation

◆ CKSpacePrefixStyle

Enumerator
CK_SPACE_PREFIX_AUTO 
CK_SPACE_PREFIX_GPT2 
CK_SPACE_PREFIX_SPM 

Definition at line 60 of file tokenizer.h.

60  {
61  CK_SPACE_PREFIX_AUTO = 0, /* Auto-detect from vocabulary */
62  CK_SPACE_PREFIX_GPT2 = 1, /* GPT-2 style: Ġ (U+0120, bytes 0xC4 0xA0) */
63  CK_SPACE_PREFIX_SPM = 2 /* SentencePiece style: ▁ (U+2581, bytes 0xE2 0x96 0x81) */
CKSpacePrefixStyle
Definition: tokenizer.h:60
@ CK_SPACE_PREFIX_AUTO
Definition: tokenizer.h:61
@ CK_SPACE_PREFIX_SPM
Definition: tokenizer.h:63
@ CK_SPACE_PREFIX_GPT2
Definition: tokenizer.h:62

◆ CKSpmMode

enum CKSpmMode
Enumerator
CK_SPM_MODE_UNIGRAM 
CK_SPM_MODE_LLAMA 

Definition at line 67 of file tokenizer.h.

67  {
68  CK_SPM_MODE_UNIGRAM = 0, /* SentencePiece unigram/Viterbi */
69  CK_SPM_MODE_LLAMA = 1 /* llama.cpp merge-style SPM */
70 } CKSpmMode;
CKSpmMode
Definition: tokenizer.h:67
@ CK_SPM_MODE_UNIGRAM
Definition: tokenizer.h:68
@ CK_SPM_MODE_LLAMA
Definition: tokenizer.h:69

◆ CKTokenizerType

Enumerator
CK_TOKENIZER_BPE 
CK_TOKENIZER_WORDPIECE 
CK_TOKENIZER_SPM 

Definition at line 53 of file tokenizer.h.

53  {
54  CK_TOKENIZER_BPE = 0, /* Byte-Pair Encoding (GPT-2, LLaMA, Qwen) */
55  CK_TOKENIZER_WORDPIECE = 1, /* WordPiece (BERT, RoBERTa) */
56  CK_TOKENIZER_SPM = 2 /* SentencePiece (unigram) */
CKTokenizerType
Definition: tokenizer.h:53
@ CK_TOKENIZER_BPE
Definition: tokenizer.h:54
@ CK_TOKENIZER_SPM
Definition: tokenizer.h:56
@ CK_TOKENIZER_WORDPIECE
Definition: tokenizer.h:55

Function Documentation

◆ __attribute__()

__attribute__ ( (visibility("default"))  )

Create a new tokenizer.

Parameters
typeTokenizer type (BPE, WordPiece, SPM)
Returns
Newly allocated tokenizer, or NULL on error

Free a tokenizer.

Parameters
tokTokenizer to free

Reset tokenizer state (clear vocab but keep config).

Parameters
tokTokenizer to reset

Set special token IDs.

Parameters
tokTokenizer
unkUnknown token ID
bosBeginning-of-sequence token ID
eosEnd-of-sequence token ID
padPadding token ID
maskMask token ID

Set whether to add BOS/EOS tokens during encoding.

Parameters
tokTokenizer
add_bosIf true, prepend BOS token (if available)
add_eosIf true, append EOS token (if available)

Set whether to add the SentencePiece space prefix (▁) at the start.

This mirrors SentencePiece's add_dummy_prefix behavior.

Parameters
tokTokenizer
add_space_prefixIf true, add leading ▁ when appropriate

Set SentencePiece mode.

Parameters
tokTokenizer
spm_modeSPM mode (unigram or llama-style)

Set whether to lowercase input text before tokenizing.

Parameters
tokTokenizer
lowercaseIf true, convert text to lowercase

Set lookup method (trie vs hash table).

Parameters
tokTokenizer
use_trieIf true, use trie (faster for longest-match), false = hash table

Set space prefix style for BPE tokenizers.

GPT-2/Qwen use Ġ (U+0120), LLaMA/SentencePiece use ▁ (U+2581). Default is AUTO which auto-detects from vocabulary.

Parameters
tokTokenizer
styleSpace prefix style (AUTO, GPT2, or SPM)

Auto-detect space prefix style from vocabulary.

Checks for presence of tokens starting with Ġ vs ▁ to determine style.

Parameters
tokTokenizer
Returns
Detected style (GPT2 or SPM)

Look up token ID by string.

Parameters
tokTokenizer
tokenToken string
Returns
Token ID, or unk_id if not found

Get token string by ID.

Parameters
tokTokenizer
idToken ID
Returns
Token string, or NULL if invalid

Get token info by ID.

Parameters
tokTokenizer
idToken ID
scoreOutput: token score
Returns
Token string, or NULL if invalid

Decode to buffer allocated by caller.

Parameters
tokTokenizer
idsInput token IDs
num_idsNumber of IDs
out_lenOutput: length of decoded string
Returns
Newly allocated string, or NULL on error

Get the tokenizer type name.

Parameters
tokTokenizer
Returns
Type name string

Check if token is special.

Parameters
tokTokenizer
idToken ID
Returns
true if special token

Estimate encoded token count.

Parameters
tokTokenizer
textInput text
Returns
Estimated number of tokens

Get last error message.

Returns
Last error message, or NULL if no error

Free a True BPE tokenizer.

Parameters
bpeTokenizer to free

Add a token to the vocabulary.

Parameters
bpeTokenizer
tokenToken string (UTF-8)
idToken ID
scoreToken score (for unigram models, 0.0 for BPE)
Returns
0 on success, -1 on error

Add a BPE merge rule by token IDs.

Merge rules define how tokens are combined during encoding. Rules with lower priority numbers are applied first.

Parameters
bpeTokenizer
left_idLeft token ID
right_idRight token ID
merged_idResulting merged token ID
priorityMerge priority (lower = applied first)
Returns
0 on success, -1 on error

Add a BPE merge rule by token strings.

This looks up the token IDs automatically and determines the merged token. The merged token must already exist in the vocabulary.

Parameters
bpeTokenizer
leftLeft token string
rightRight token string
priorityMerge priority (lower = applied first)
Returns
0 on success, -1 on error

Set special token IDs.

Parameters
bpeTokenizer
unkUnknown token ID (-1 to disable)
bosBeginning-of-sequence token ID (-1 to disable)
eosEnd-of-sequence token ID (-1 to disable)
padPadding token ID (-1 to disable)

Add a special token that should be matched BEFORE BPE encoding.

Special tokens like <|im_start|>, <|im_end|>, <|endoftext|> are matched literally in the input text before BPE processing. Without this, BPE would break them into individual characters.

Parameters
bpeTokenizer
tokenToken string to match literally (e.g., "<|im_end|>")
idToken ID to output when matched
Returns
0 on success, -1 on error

Set tokenizer configuration.

Parameters
bpeTokenizer
configConfiguration to apply

Load vocabulary + merges from binary buffers.

Parameters
bpeTokenizer
vocab_sizeNumber of tokens
offsetsOffsets array (length vocab_size)
stringsNull-terminated token strings blob
num_mergesNumber of merge rules
mergesMerge triples [left_id, right_id, merged_id] (length num_merges*3)
Returns
0 on success, -1 on error

Look up a token ID by string.

Parameters
bpeTokenizer
tokenToken string
Returns
Token ID, or unk_id if not found

Get a token string by ID.

Parameters
bpeTokenizer
idToken ID
Returns
Token string, or NULL if invalid

Get vocabulary size.

Parameters
bpeTokenizer
Returns
Number of tokens in vocabulary

Get number of merge rules.

Parameters
bpeTokenizer
Returns
Number of merge rules

Auto-detect space prefix style from vocabulary.

Counts tokens starting with Ġ (GPT-2) vs ▁ (SentencePiece) to determine style. The detected style is cached in the config.

Parameters
bpeTokenizer
Returns
Detected style (GPT2 or SPM)

Encode text to token IDs using true BPE algorithm.

This applies merge rules in priority order (not greedy longest-match).

Parameters
bpeTokenizer
textInput text (UTF-8)
text_lenText length in bytes, or -1 for null-terminated
idsOutput token IDs array
max_idsMaximum IDs to write
Returns
Number of tokens written

Decode token IDs to text.

Parameters
bpeTokenizer
idsInput token IDs
num_idsNumber of IDs
textOutput text buffer
max_lenMaximum text length
Returns
Number of bytes written (excluding null terminator)

Referenced by fused_mlp_swiglu_decode(), fused_mlp_swiglu_decode_tiled(), fused_mlp_swiglu_decode_v2(), geglu_forward_fp32(), gelu_backward_exact(), gelu_backward_fast(), gelu_fast_inplace(), swiglu_backward(), and swiglu_forward().

◆ ck_tokenizer_add_merge()

int ck_tokenizer_add_merge ( CKTokenizer *  tok,
int32_t  left_id,
int32_t  right_id,
int32_t  merged_id,
int32_t  priority 
)

Add a BPE merge rule.

Parameters
tokTokenizer
left_idLeft token ID
right_idRight token ID
merged_idMerged token ID
priorityLower = higher priority (applied first)
Returns
0 on success, -1 on error

Definition at line 1336 of file tokenizer.c.

1336  {
1337  (void)tok; (void)left; (void)right; (void)merged; (void)priority; return 0;
1338 }
int32_t int32_t int32_t int32_t priority
Definition: true_bpe.h:115
const char * left
Definition: true_bpe.h:130
const char const char * right
Definition: true_bpe.h:131

References left, priority, and right.

◆ ck_tokenizer_add_special_token()

int ck_tokenizer_add_special_token ( CKTokenizer *  tok,
const char *  name,
int32_t  id 
)

Add special token (UNK, BOS, EOS, PAD, MASK).

Parameters
tokTokenizer
nameSpecial token name ("unk", "bos", "eos", "pad", "mask")
idToken ID
Returns
0 on success, -1 on error

Definition at line 213 of file tokenizer.c.

213  {
214  if (!tok || !name) return -1;
215  if (ck_tokenizer_add_token(tok, name, id, -1e10f) != 0) return -1;
216 
217  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, name);
218  if (info) info->is_special = true;
219 
220  /* Also add to trie as special */
221  if (tok->vocab_trie) {
222  ck_trie_insert(tok->vocab_trie, name, id, true, 0);
223  }
224 
225  if (strcmp(name, "<unk>") == 0 || strcmp(name, "[UNK]") == 0) tok->unk_id = id;
226  else if (strcmp(name, "<s>") == 0 || strcmp(name, "<bos>") == 0 || strcmp(name, "[BOS]") == 0) tok->bos_id = id;
227  else if (strcmp(name, "</s>") == 0 || strcmp(name, "<eos>") == 0 || strcmp(name, "[EOS]") == 0) tok->eos_id = id;
228  else if (strcmp(name, "<pad>") == 0 || strcmp(name, "[PAD]") == 0) tok->pad_id = id;
229 
230  return 0;
231 }
void * ck_tokenizer_hash_table_lookup(CKTokenizerHashTable *table, const char *key)
Definition: hash_table.c:198
int ck_trie_insert(CKTrie *trie, const char *token, int32_t token_id, bool is_special, int32_t priority)
Definition: trie.c:110
int32_t bos_id
Definition: ck_tokenizer.h:98
int32_t unk_id
Definition: ck_tokenizer.h:97
CKTrie * vocab_trie
Definition: tokenizer.h:103
int32_t eos_id
Definition: ck_tokenizer.h:99
CKTokenizerHashTable * vocab
Definition: tokenizer.h:100
int32_t pad_id
Definition: ck_tokenizer.h:100
int ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int32_t id, float score)
Definition: tokenizer.c:157
int32_t id
Definition: tokenizer.h:315

References CKTokenizer::bos_id, ck_tokenizer_add_token(), ck_tokenizer_hash_table_lookup(), ck_trie_insert(), CKTokenizer::eos_id, id, CKTokenizer::pad_id, CKTokenizer::unk_id, CKTokenizer::vocab, and CKTokenizer::vocab_trie.

Referenced by main().

◆ ck_tokenizer_add_token()

int ck_tokenizer_add_token ( CKTokenizer *  tok,
const char *  token,
int32_t  id,
float  score 
)

Add a token to vocabulary.

Parameters
tokTokenizer
tokenToken string
idToken ID
scoreToken score (for SPM)
Returns
0 on success, -1 on error

Definition at line 157 of file tokenizer.c.

157  {
158  if (!tok || !token) {
159  return -1;
160  }
161 
162  /* Ensure we have space in reverse vocab */
163  if (id >= (int32_t)tok->vocab_capacity) {
164  size_t new_cap = tok->vocab_capacity * 2;
165  while (new_cap <= (size_t)id) {
166  new_cap *= 2;
167  }
168  char **new_array = (char **)realloc(tok->id_to_token, new_cap * sizeof(char *));
169  if (!new_array) {
170  return -1;
171  }
172  memset(new_array + tok->vocab_capacity, 0, (new_cap - tok->vocab_capacity) * sizeof(char *));
173  tok->id_to_token = new_array;
174  tok->vocab_capacity = new_cap;
175  }
176 
177  /* Check if token already exists */
178  TokenInfo *existing = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, token);
179  if (existing) {
180  existing->id = id;
181  existing->score = score;
182  if (id >= (int32_t)tok->vocab_size) tok->vocab_size = id + 1;
183  if (tok->id_to_token[id]) free(tok->id_to_token[id]);
184  tok->id_to_token[id] = strdup(token);
185  return 0;
186  }
187 
188  /* Create new token info */
189  TokenInfo *info = (TokenInfo *)malloc(sizeof(TokenInfo));
190  if (!info) return -1;
191  info->id = id;
192  info->score = score;
193  info->is_special = false;
194 
195  if (ck_tokenizer_hash_table_insert(tok->vocab, token, info) != 0) {
196  free(info);
197  return -1;
198  }
199 
200  /* Also add to trie for fast longest-match lookups */
201  if (tok->vocab_trie) {
202  ck_trie_insert(tok->vocab_trie, token, id, false, 0);
203  }
204 
205  if (id >= (int32_t)tok->vocab_size) tok->vocab_size = id + 1;
206  if (tok->id_to_token[id]) free(tok->id_to_token[id]);
207  tok->id_to_token[id] = strdup(token);
208 
209  return 0;
210 }
int ck_tokenizer_hash_table_insert(CKTokenizerHashTable *table, const char *key, void *value)
Definition: hash_table.c:158
size_t vocab_capacity
Definition: tokenizer.h:108
char ** id_to_token
Definition: ck_tokenizer.h:86
const char * token
Definition: tokenizer.h:306
int32_t float * score
Definition: tokenizer.h:327

References ck_tokenizer_hash_table_insert(), ck_tokenizer_hash_table_lookup(), ck_trie_insert(), id, CKTokenizer::id_to_token, score, token, CKTokenizer::vocab, CKTokenizer::vocab_capacity, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.

Referenced by ck_tokenizer_add_special_token(), and ck_tokenizer_load_binary_with_scores().

◆ ck_tokenizer_create_bpe()

static CKTokenizer* ck_tokenizer_create_bpe ( void  )
inlinestatic

Create tokenizer with default BPE config.

Definition at line 156 of file tokenizer.h.

156  {
158 }
CKTokenizer * ck_tokenizer_create(CKTokenizerType type)
Definition: tokenizer.c:34

References CK_TOKENIZER_BPE, and ck_tokenizer_create().

Referenced by main().

◆ ck_tokenizer_create_spm()

static CKTokenizer* ck_tokenizer_create_spm ( void  )
inlinestatic

Create tokenizer with default SPM config.

Definition at line 170 of file tokenizer.h.

170  {
172 }

References ck_tokenizer_create(), and CK_TOKENIZER_SPM.

◆ ck_tokenizer_create_wordpiece()

static CKTokenizer* ck_tokenizer_create_wordpiece ( void  )
inlinestatic

Create tokenizer with default WordPiece config.

Definition at line 163 of file tokenizer.h.

163  {
165 }

References ck_tokenizer_create(), and CK_TOKENIZER_WORDPIECE.

◆ ck_tokenizer_decode()

int ck_tokenizer_decode ( const CKTokenizer *  tok,
const int32_t *  ids,
int  num_ids,
char *  text,
int  max_len 
)

Decode token IDs to text.

Parameters
tokTokenizer
idsInput token IDs
num_idsNumber of IDs
textOutput text buffer
max_lenMaximum text length
Returns
Number of bytes written

Definition at line 737 of file ck_tokenizer.c.

741  {
742  int len = 0;
743 
744  for (int i = 0; i < num_ids; i++) {
745  /* Skip special tokens */
746  if (ids[i] == tok->bos_id || ids[i] == tok->eos_id || ids[i] == tok->pad_id) {
747  continue;
748  }
749 
750  const char *token = ck_tokenizer_id_to_token(tok, ids[i]);
751  if (!token) continue;
752 
753  int token_len = (int)strlen(token);
754 
755  /* Handle byte tokens <0xXX> */
756  if (token_len == 6 && token[0] == '<' && token[1] == '0' && token[2] == 'x') {
757  char hex[3] = {token[3], token[4], 0};
758  unsigned int byte = (unsigned int)strtol(hex, NULL, 16);
759  if (len < max_len - 1) {
760  text[len++] = (char)byte;
761  }
762  continue;
763  }
764 
765  /* Handle GPT-style space prefix (Ġ = 0xC4 0xA0 in UTF-8) */
766  const char *src = token;
767  if ((unsigned char)token[0] == 0xC4 && (unsigned char)token[1] == 0xA0) {
768  if (len < max_len - 1) {
769  text[len++] = ' ';
770  }
771  src = token + 2;
772  token_len -= 2;
773  }
774 
775  /* Copy token */
776  for (int j = 0; j < token_len && len < max_len - 1; j++) {
777  text[len++] = src[j];
778  }
779  }
780 
781  text[len] = '\0';
782  return len;
783 }
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
Definition: ck_tokenizer.c:239
const int32_t * ids
Definition: tokenizer.h:443
const int32_t int num_ids
Definition: tokenizer.h:444
const char * text
Definition: tokenizer.h:563
const int32_t int char int max_len
Definition: true_bpe.h:280

References ck_tokenizer_id_to_token(), ids, max_len, num_ids, text, and token.

◆ ck_tokenizer_encode()

int ck_tokenizer_encode ( const CKTokenizer *  tok,
const char *  text,
int  text_len,
int32_t *  ids,
int  max_ids 
)

Encode text to token IDs using greedy longest-match.

For BPE: applies merge rules iteratively. For WordPiece/SPM: greedy longest-match from vocabulary.

Parameters
tokTokenizer
textInput text
text_lenText length, or -1 for null-terminated
idsOutput token IDs
max_idsMaximum IDs to write
Returns
Number of tokens written

Definition at line 638 of file ck_tokenizer.c.

642  {
643  if (text_len < 0) text_len = (int)strlen(text);
644 
645  /* Pre-tokenize: split on whitespace, keep spaces as tokens */
646  /* For simplicity, treat each byte as initial token, then apply BPE */
647 
648  /* Initial tokens: one per byte */
649  int32_t *tokens = (int32_t *)malloc(text_len * sizeof(int32_t));
650  int num_tokens = 0;
651 
652  for (int i = 0; i < text_len; i++) {
653  /* Look up single-character token */
654  char c[2] = {text[i], '\0'};
655  int32_t id = ck_tokenizer_lookup(tok, c, 1);
656 
657  /* Handle special byte tokens like <0xXX> */
658  if (id == tok->unk_id) {
659  char byte_token[8];
660  snprintf(byte_token, sizeof(byte_token), "<0x%02X>", (unsigned char)text[i]);
661  id = ck_tokenizer_lookup(tok, byte_token, -1);
662  }
663 
664  /* Try UTF-8 multi-byte sequences */
665  if (id == tok->unk_id && (unsigned char)text[i] >= 0x80) {
666  int utf8_len = 1;
667  if ((text[i] & 0xE0) == 0xC0) utf8_len = 2;
668  else if ((text[i] & 0xF0) == 0xE0) utf8_len = 3;
669  else if ((text[i] & 0xF8) == 0xF0) utf8_len = 4;
670 
671  if (i + utf8_len <= text_len) {
672  id = ck_tokenizer_lookup(tok, text + i, utf8_len);
673  if (id != tok->unk_id) {
674  tokens[num_tokens++] = id;
675  i += utf8_len - 1;
676  continue;
677  }
678  }
679  }
680 
681  tokens[num_tokens++] = id;
682  }
683 
684  /* Apply BPE merges iteratively */
685  bool changed = true;
686  while (changed && num_tokens > 1) {
687  changed = false;
688 
689  /* Find best merge (lowest priority = earliest in merge list) */
690  int best_pos = -1;
691  int best_priority = tok->num_merges;
692 
693  for (int i = 0; i < num_tokens - 1; i++) {
694  int merge_idx = ck_tokenizer_lookup_merge(tok, tokens[i], tokens[i + 1]);
695  if (merge_idx >= 0 && tok->merges[merge_idx].priority < best_priority) {
696  best_pos = i;
697  best_priority = tok->merges[merge_idx].priority;
698  }
699  }
700 
701  if (best_pos >= 0) {
702  int merge_idx = ck_tokenizer_lookup_merge(tok, tokens[best_pos], tokens[best_pos + 1]);
703  tokens[best_pos] = tok->merges[merge_idx].merged;
704 
705  /* Shift remaining tokens */
706  for (int i = best_pos + 1; i < num_tokens - 1; i++) {
707  tokens[i] = tokens[i + 1];
708  }
709  num_tokens--;
710  changed = true;
711  }
712  }
713 
714  /* Copy to output */
715  int out_len = 0;
716 
717  if (tok->add_bos && out_len < max_ids) {
718  ids[out_len++] = tok->bos_id;
719  }
720 
721  for (int i = 0; i < num_tokens && out_len < max_ids; i++) {
722  ids[out_len++] = tokens[i];
723  }
724 
725  if (tok->add_eos && out_len < max_ids) {
726  ids[out_len++] = tok->eos_id;
727  }
728 
729  free(tokens);
730  return out_len;
731 }
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len)
Definition: ck_tokenizer.c:227
int ck_tokenizer_lookup_merge(const CKTokenizer *tok, int32_t left, int32_t right)
Definition: ck_tokenizer.c:276
int32_t merged
Definition: ck_tokenizer.h:69
CKMergeRule * merges
Definition: ck_tokenizer.h:89
static int utf8_len(unsigned char c)
Definition: tokenizer.c:541
const int32_t int int * out_len
Definition: tokenizer.h:445
const char int text_len
Definition: true_bpe.h:262
const char int int32_t int max_ids
Definition: true_bpe.h:264

References CKTokenizerConfig::add_bos, CKTokenizerConfig::add_eos, CKTokenizer::bos_id, CK_SPM_MODE_LLAMA, CK_TOKENIZER_BPE, ck_tokenizer_detect_space_prefix_style(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), CK_TOKENIZER_SPM, CKTokenizer::config, config, CKTokenizer::eos_id, find_longest_match(), id, ids, max_ids, preprocess_bpe_spaces(), CKTokenizerConfig::spm_mode, style, text, text_len, CKTokenizerConfig::type, and CKTokenizer::unk_id.

◆ ck_tokenizer_encode_tokens()

int ck_tokenizer_encode_tokens ( const CKTokenizer *  tok,
const char *  text,
int  text_len,
const char **  out_tokens,
int  max_tokens 
)

Encode and return tokens as array of strings.

Parameters
tokTokenizer
textInput text
text_lenText length
out_tokensOutput token strings (caller must free each)
max_tokensMaximum tokens
Returns
Number of tokens written

◆ ck_tokenizer_encode_with_special()

int ck_tokenizer_encode_with_special ( CKTokenizer *  tok,
const char *  text,
int  text_len,
int32_t *  ids,
int  max_ids,
bool  add_special 
)

Encode with special token handling.

Parameters
tokTokenizer
textInput text
text_lenText length, or -1 for null-terminated
idsOutput token IDs
max_idsMaximum IDs to write
add_specialAdd BOS/EOS tokens
Returns
Number of tokens written

◆ ck_tokenizer_load_binary()

int ck_tokenizer_load_binary ( CKTokenizer *  tok,
int  vocab_size,
const int32_t *  offsets,
const char *  strings,
int  num_merges,
const int32_t *  merges 
)

Load vocabulary from memory-mapped binary data.

Parameters
tokTokenizer
vocab_sizeNumber of tokens
offsetsArray of offsets into strings pool
stringsString pool containing null-terminated tokens
num_mergesNumber of BPE merges
mergesMerge rules as (left, right, merged) triplets
Returns
0 on success, -1 on error

Definition at line 18 of file ck_tokenizer_v2.c.

23  {
24  if (!tok || !offsets || !strings) return -1;
25 
26  // We assume ck_tokenizer_init was already called to alloc hash tables
27  tok->vocab_size = 0;
28 
29  for (int i = 0; i < vocab_size; i++) {
30  const char *token = strings + offsets[i];
31  int len = (int)strlen(token);
32 
33  CKVocabEntry *entry = (CKVocabEntry *)ck_pool_alloc(&tok->pool, sizeof(CKVocabEntry));
34  entry->token = (char *)token;
35  entry->token_len = len;
36  entry->id = i;
37 
38  uint32_t bucket = hash_string(token, len) % tok->vocab_hash_size;
39  entry->next = tok->vocab_hash[bucket];
40  tok->vocab_hash[bucket] = entry;
41 
42  tok->id_to_token[i] = entry->token;
43  tok->vocab_size++;
44  }
45 
46  if (merges && num_merges > 0) {
47  for (int i = 0; i < num_merges; i++) {
48  int32_t left = merges[i*3 + 0];
49  int32_t right = merges[i*3 + 1];
50  int32_t merged = merges[i*3 + 2];
51  ck_tokenizer_add_merge(tok, left, right, merged);
52  }
53  }
54 
55  return 0;
56 }
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
Definition: ck_tokenizer.c:248
void * ck_pool_alloc(CKMemPool *pool, size_t size)
Definition: ck_tokenizer.c:69
static uint32_t hash_string(const char *s, int len)
CKMemPool pool
Definition: ck_tokenizer.h:78
CKVocabEntry ** vocab_hash
Definition: ck_tokenizer.h:82
int vocab_hash_size
Definition: ck_tokenizer.h:83
struct CKVocabEntry * next
Definition: ck_tokenizer.h:59
char * token
Definition: ck_tokenizer.h:56
int32_t id
Definition: ck_tokenizer.h:58
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int vocab_size
Definition: true_bpe.h:185
int const int32_t * offsets
Definition: true_bpe.h:186

References ck_pool_alloc(), ck_tokenizer_add_merge(), ck_tokenizer_load_binary_with_scores(), hash_string(), CKVocabEntry::id, CKTokenizer::id_to_token, left, merges, CKVocabEntry::next, num_merges, offsets, CKTokenizer::pool, right, strings, CKVocabEntry::token, token, CKVocabEntry::token_len, CKTokenizer::vocab_hash, CKTokenizer::vocab_hash_size, CKTokenizer::vocab_size, and vocab_size.

Referenced by main().

◆ ck_tokenizer_load_binary_with_scores()

int ck_tokenizer_load_binary_with_scores ( CKTokenizer *  tok,
int  vocab_size,
const int32_t *  offsets,
const char *  strings,
const float *  scores,
const uint8_t *  types,
int  num_merges,
const int32_t *  merges 
)

Load vocabulary from memory-mapped binary data with scores and types.

This extended version supports SPM (SentencePiece) tokenizers which require token scores for Viterbi/DP encoding.

Parameters
tokTokenizer
vocab_sizeNumber of tokens
offsetsArray of offsets into strings pool
stringsString pool containing null-terminated tokens
scoresArray of token scores (float32), can be NULL
typesArray of token types (uint8), can be NULL
num_mergesNumber of BPE merges
mergesMerge rules as (left, right, merged) triplets
Returns
0 on success, -1 on error

Definition at line 1252 of file tokenizer.c.

1259  {
1260  if (!tok || !offsets || !strings) return -1;
1261  ck_tokenizer_reset(tok);
1262 
1263  /* Free any existing scores/types arrays before reallocating */
1264  if (tok->scores) {
1265  free(tok->scores);
1266  tok->scores = NULL;
1267  tok->scores_size = 0;
1268  }
1269  if (tok->types) {
1270  free(tok->types);
1271  tok->types = NULL;
1272  tok->types_size = 0;
1273  }
1274 
1275  /* Allocate scores and types arrays if provided */
1276  if (scores && vocab_size > 0) {
1277  tok->scores = (float *)malloc(vocab_size * sizeof(float));
1278  if (!tok->scores) return -1;
1279  memcpy(tok->scores, scores, vocab_size * sizeof(float));
1280  tok->scores_size = (size_t)vocab_size;
1281  }
1282  if (types && vocab_size > 0) {
1283  tok->types = (uint8_t *)malloc(vocab_size * sizeof(uint8_t));
1284  if (!tok->types) {
1285  if (tok->scores) {
1286  free(tok->scores);
1287  tok->scores = NULL;
1288  }
1289  return -1;
1290  }
1291  memcpy(tok->types, types, vocab_size * sizeof(uint8_t));
1292  tok->types_size = (size_t)vocab_size;
1293  }
1294 
1295  for (int i = 0; i < vocab_size; i++) {
1296  const char *token = strings + offsets[i];
1297  float score = scores ? scores[i] : 0.0f;
1299  }
1300 
1301  /* Build byte token lookup table if types are available */
1302  if (types && vocab_size > 0) {
1304 
1305  /* Log token type statistics */
1306  int count_normal = 0, count_unknown = 0, count_control = 0, count_byte = 0, count_other = 0;
1307  int max_type = 0;
1308  for (int i = 0; i < vocab_size; i++) {
1309  uint8_t t = tok->types[i];
1310  if (t > max_type) max_type = t;
1311  switch (t) {
1312  case GGUF_TOKEN_NORMAL: count_normal++; break;
1313  case GGUF_TOKEN_UNKNOWN: count_unknown++; break;
1314  case GGUF_TOKEN_CONTROL: count_control++; break;
1315  case GGUF_TOKEN_BYTE: count_byte++; break;
1316  default: count_other++; break;
1317  }
1318  }
1319  fprintf(stderr, "[TOKENIZER] Loaded %d tokens: normal=%d, unknown=%d, control=%d, byte=%d, other=%d\n",
1320  vocab_size, count_normal, count_unknown, count_control, count_byte, count_other);
1321  if (max_type > GGUF_TOKEN_BYTE) {
1322  fprintf(stderr, "[TOKENIZER] Warning: Unexpected token type %d\n", max_type);
1323  }
1324  }
1325 
1326  /* TODO: Merges */
1327  (void)num_merges; (void)merges;
1328  return 0;
1329 }
float * scores
Definition: tokenizer.h:111
size_t types_size
Definition: tokenizer.h:114
uint8_t * types
Definition: tokenizer.h:113
size_t scores_size
Definition: tokenizer.h:112
#define GGUF_TOKEN_CONTROL
Definition: tokenizer.c:462
static void spm_build_byte_lookup(CKTokenizer *tok, const char *strings, const int32_t *offsets, int vocab_size)
Definition: tokenizer.c:507
void ck_tokenizer_reset(CKTokenizer *tok)
Definition: tokenizer.c:125
#define GGUF_TOKEN_BYTE
Definition: tokenizer.c:465
#define GGUF_TOKEN_UNKNOWN
Definition: tokenizer.c:461
#define GGUF_TOKEN_NORMAL
Definition: tokenizer.c:460

References ck_tokenizer_add_token(), ck_tokenizer_reset(), GGUF_TOKEN_BYTE, GGUF_TOKEN_CONTROL, GGUF_TOKEN_NORMAL, GGUF_TOKEN_UNKNOWN, merges, num_merges, offsets, score, CKTokenizer::scores, CKTokenizer::scores_size, spm_build_byte_lookup(), strings, token, CKTokenizer::types, CKTokenizer::types_size, and vocab_size.

Referenced by ck_tokenizer_load_binary().

◆ ck_tokenizer_load_gguf()

int ck_tokenizer_load_gguf ( CKTokenizer *  tok,
const char *  path 
)

Load vocabulary from GGUF file.

Parameters
tokTokenizer
pathPath to GGUF file
Returns
0 on success, -1 on error

Definition at line 1332 of file tokenizer.c.

1332 { (void)tok; (void)path; return -1; }

◆ ck_tokenizer_load_json()

int ck_tokenizer_load_json ( CKTokenizer *  tok,
const char *  path 
)

Load vocabulary from JSON file (HuggingFace format).

Parameters
tokTokenizer
pathPath to vocab.json or tokenizer.json
Returns
0 on success, -1 on error

Definition at line 1333 of file tokenizer.c.

1333 { (void)tok; (void)path; return -1; }

◆ ck_tokenizer_load_merges()

int ck_tokenizer_load_merges ( CKTokenizer *  tok,
const char *  path 
)

Load BPE merges from text file.

Format: token1 token2 (one merge per line)

Parameters
tokTokenizer
pathPath to merges.txt
Returns
0 on success, -1 on error

Definition at line 1335 of file tokenizer.c.

1335 { (void)tok; (void)path; return -1; }

◆ ck_tokenizer_load_text()

int ck_tokenizer_load_text ( CKTokenizer *  tok,
const char *  path 
)

Load vocabulary from text file (one token per line).

Format: token_string [id] [score] Lines starting with # are comments.

Parameters
tokTokenizer
pathPath to vocabulary file
Returns
0 on success, -1 on error

Definition at line 1334 of file tokenizer.c.

1334 { (void)tok; (void)path; return -1; }

◆ ck_tokenizer_vocab_size()

static size_t ck_tokenizer_vocab_size ( const CKTokenizer *  tok)
inlinestatic

Get vocabulary size.

Definition at line 332 of file tokenizer.h.

332  {
333  return tok ? tok->vocab_size : 0;
334 }

References CKTokenizer::vocab_size.

Variable Documentation

◆ add_bos

bool add_bos

Definition at line 242 of file tokenizer.h.

Referenced by ck_tokenizer_set_add_bos_eos().

◆ add_eos

bool add_eos

Definition at line 242 of file tokenizer.h.

Referenced by ck_tokenizer_set_add_bos_eos().

◆ add_space_prefix

bool add_space_prefix

◆ bos

int32_t bos

◆ eos

int32_t eos

◆ id

◆ ids

◆ lowercase

bool lowercase

Definition at line 268 of file tokenizer.h.

◆ mask

int32_t mask

◆ num_ids

int num_ids

Definition at line 444 of file tokenizer.h.

Referenced by ck_tokenizer_decode(), ck_true_bpe_decode(), and main().

◆ out_len

◆ pad

int32_t pad

Definition at line 232 of file tokenizer.h.

Referenced by ck_tokenizer_set_special_ids(), and ck_true_bpe_set_special_ids().

◆ score

◆ spm_mode

CKSpmMode spm_mode

Definition at line 260 of file tokenizer.h.

Referenced by ck_tokenizer_set_spm_mode().

◆ style

◆ text

◆ token

◆ unk

int32_t unk

Definition at line 229 of file tokenizer.h.

Referenced by ck_tokenizer_set_special_ids(), and ck_true_bpe_set_special_ids().

◆ use_trie

bool use_trie

Definition at line 276 of file tokenizer.h.

Referenced by ck_tokenizer_set_use_trie().