← Back to C-Kernel-Engine Docs Doxygen Source Documentation
tokenizer.c File Reference
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <ctype.h>
#include "tokenizer/tokenizer.h"
#include "tokenizer/murmurhash3.h"
#include "tokenizer/hash_table.h"

Go to the source code of this file.

Macros

#define GGUF_TOKEN_BYTE   6
 
#define GGUF_TOKEN_CONTROL   3
 
#define GGUF_TOKEN_NORMAL   1
 
#define GGUF_TOKEN_UNKNOWN   2
 
#define GGUF_TOKEN_UNUSED   5
 
#define GGUF_TOKEN_USER_DEFINED   4
 

Functions

int ck_tokenizer_add_merge (CKTokenizer *tok, int32_t left, int32_t right, int32_t merged, int32_t priority)
 
int ck_tokenizer_add_special_token (CKTokenizer *tok, const char *name, int32_t id)
 
int ck_tokenizer_add_token (CKTokenizer *tok, const char *token, int32_t id, float score)
 
CKTokenizer * ck_tokenizer_create (CKTokenizerType type)
 
int ck_tokenizer_decode (const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
 
CKSpacePrefixStyle ck_tokenizer_detect_space_prefix_style (CKTokenizer *tok)
 
int ck_tokenizer_encode (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
 
static int ck_tokenizer_encode_spm_impl (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
 
static int ck_tokenizer_encode_spm_llama_impl (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
 
void ck_tokenizer_free (CKTokenizer *tok)
 
const char * ck_tokenizer_id_to_token (const CKTokenizer *tok, int32_t id)
 
int ck_tokenizer_load_binary (CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
 
int ck_tokenizer_load_binary_with_scores (CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, const float *scores, const uint8_t *types, int num_merges, const int32_t *merges)
 
int ck_tokenizer_load_gguf (CKTokenizer *tok, const char *path)
 
int ck_tokenizer_load_json (CKTokenizer *tok, const char *path)
 
int ck_tokenizer_load_merges (CKTokenizer *tok, const char *path)
 
int ck_tokenizer_load_text (CKTokenizer *tok, const char *path)
 
int32_t ck_tokenizer_lookup (const CKTokenizer *tok, const char *token)
 
static int32_t ck_tokenizer_lookup_exact (const CKTokenizer *tok, const char *token)
 
static int32_t ck_tokenizer_lookup_exact_n (const CKTokenizer *tok, const char *text, int text_len)
 
void ck_tokenizer_reset (CKTokenizer *tok)
 
void ck_tokenizer_set_add_bos_eos (CKTokenizer *tok, bool add_bos, bool add_eos)
 
void ck_tokenizer_set_add_space_prefix (CKTokenizer *tok, bool add_space_prefix)
 
void ck_tokenizer_set_space_prefix_style (CKTokenizer *tok, CKSpacePrefixStyle style)
 
void ck_tokenizer_set_special_ids (CKTokenizer *tok, int32_t unk, int32_t bos, int32_t eos, int32_t pad, int32_t mask)
 
void ck_tokenizer_set_spm_mode (CKTokenizer *tok, CKSpmMode spm_mode)
 
void ck_tokenizer_set_use_trie (CKTokenizer *tok, bool use_trie)
 
static int32_t find_longest_match (const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
 
static int32_t find_longest_match_hash (const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
 
static int32_t find_longest_match_trie (const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
 
static int preprocess_bpe_spaces (const char *text, int text_len, char *out, int out_max, CKSpacePrefixStyle style)
 
static int preprocess_spm_llama_text (const char *text, int text_len, char *out, int out_max, bool add_space_prefix)
 
static int preprocess_spm_text (const char *text, int text_len, char *out, int out_max, bool add_space_prefix)
 
static void spm_build_byte_lookup (CKTokenizer *tok, const char *strings, const int32_t *offsets, int vocab_size)
 
static int spm_count_unknown_run (const CKTokenizer *tok, const char *text, int text_len, size_t pos)
 
static int spm_encode_byte_fallback (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
 
static int spm_find_candidates_at_pos (const CKTokenizer *tok, const char *text, int text_len, size_t pos, int32_t *candidates, int max_candidates)
 
static int32_t spm_get_byte_token (const CKTokenizer *tok, unsigned char byte_val)
 
static bool spm_is_byte_token (const CKTokenizer *tok, int32_t token_id)
 
static int spm_llama_resegment_node (const CKTokenizer *tok, const SpmLlamaNode *nodes, int node_id, int32_t *ids, int max_ids, int out_idx)
 
static bool spm_token_allowed_in_dp (const CKTokenizer *tok, int32_t token_id)
 
static bool spm_token_is_byte_format (const char *token)
 
static int utf8_len (unsigned char c)
 

Macro Definition Documentation

◆ GGUF_TOKEN_BYTE

#define GGUF_TOKEN_BYTE   6

Definition at line 465 of file tokenizer.c.

◆ GGUF_TOKEN_CONTROL

#define GGUF_TOKEN_CONTROL   3

Definition at line 462 of file tokenizer.c.

◆ GGUF_TOKEN_NORMAL

#define GGUF_TOKEN_NORMAL   1

Definition at line 460 of file tokenizer.c.

◆ GGUF_TOKEN_UNKNOWN

#define GGUF_TOKEN_UNKNOWN   2

Definition at line 461 of file tokenizer.c.

◆ GGUF_TOKEN_UNUSED

#define GGUF_TOKEN_UNUSED   5

Definition at line 464 of file tokenizer.c.

◆ GGUF_TOKEN_USER_DEFINED

#define GGUF_TOKEN_USER_DEFINED   4

Definition at line 463 of file tokenizer.c.

Function Documentation

◆ ck_tokenizer_add_merge()

int ck_tokenizer_add_merge ( CKTokenizer * tok,
int32_t  left_id,
int32_t  right_id,
int32_t  merged_id,
int32_t  priority 
)

Add a BPE merge rule.

Parameters
tokTokenizer
left_idLeft token ID
right_idRight token ID
merged_idMerged token ID
priorityLower = higher priority (applied first)
Returns
0 on success, -1 on error

Definition at line 1336 of file tokenizer.c.

1336  {
1337  (void)tok; (void)left; (void)right; (void)merged; (void)priority; return 0;
1338 }
int32_t int32_t int32_t int32_t priority
Definition: true_bpe.h:115
const char * left
Definition: true_bpe.h:130
const char const char * right
Definition: true_bpe.h:131

References left, priority, and right.

◆ ck_tokenizer_add_special_token()

int ck_tokenizer_add_special_token ( CKTokenizer * tok,
const char *  name,
int32_t  id 
)

Add special token (UNK, BOS, EOS, PAD, MASK).

Parameters
tokTokenizer
nameSpecial token name ("unk", "bos", "eos", "pad", "mask")
idToken ID
Returns
0 on success, -1 on error

Definition at line 213 of file tokenizer.c.

213  {
214  if (!tok || !name) return -1;
215  if (ck_tokenizer_add_token(tok, name, id, -1e10f) != 0) return -1;
216 
217  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, name);
218  if (info) info->is_special = true;
219 
220  /* Also add to trie as special */
221  if (tok->vocab_trie) {
222  ck_trie_insert(tok->vocab_trie, name, id, true, 0);
223  }
224 
225  if (strcmp(name, "<unk>") == 0 || strcmp(name, "[UNK]") == 0) tok->unk_id = id;
226  else if (strcmp(name, "<s>") == 0 || strcmp(name, "<bos>") == 0 || strcmp(name, "[BOS]") == 0) tok->bos_id = id;
227  else if (strcmp(name, "</s>") == 0 || strcmp(name, "<eos>") == 0 || strcmp(name, "[EOS]") == 0) tok->eos_id = id;
228  else if (strcmp(name, "<pad>") == 0 || strcmp(name, "[PAD]") == 0) tok->pad_id = id;
229 
230  return 0;
231 }
void * ck_tokenizer_hash_table_lookup(CKTokenizerHashTable *table, const char *key)
Definition: hash_table.c:198
int ck_trie_insert(CKTrie *trie, const char *token, int32_t token_id, bool is_special, int32_t priority)
Definition: trie.c:110
int32_t bos_id
Definition: ck_tokenizer.h:98
int32_t unk_id
Definition: ck_tokenizer.h:97
CKTrie * vocab_trie
Definition: tokenizer.h:103
int32_t eos_id
Definition: ck_tokenizer.h:99
CKTokenizerHashTable * vocab
Definition: tokenizer.h:100
int32_t pad_id
Definition: ck_tokenizer.h:100
int ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int32_t id, float score)
Definition: tokenizer.c:157
int32_t id
Definition: tokenizer.h:315

References CKTokenizer::bos_id, ck_tokenizer_add_token(), ck_tokenizer_hash_table_lookup(), ck_trie_insert(), CKTokenizer::eos_id, id, CKTokenizer::pad_id, CKTokenizer::unk_id, CKTokenizer::vocab, and CKTokenizer::vocab_trie.

Referenced by main().

◆ ck_tokenizer_add_token()

int ck_tokenizer_add_token ( CKTokenizer * tok,
const char *  token,
int32_t  id,
float  score 
)

Add a token to vocabulary.

Parameters
tokTokenizer
tokenToken string
idToken ID
scoreToken score (for SPM)
Returns
0 on success, -1 on error

Definition at line 157 of file tokenizer.c.

157  {
158  if (!tok || !token) {
159  return -1;
160  }
161 
162  /* Ensure we have space in reverse vocab */
163  if (id >= (int32_t)tok->vocab_capacity) {
164  size_t new_cap = tok->vocab_capacity * 2;
165  while (new_cap <= (size_t)id) {
166  new_cap *= 2;
167  }
168  char **new_array = (char **)realloc(tok->id_to_token, new_cap * sizeof(char *));
169  if (!new_array) {
170  return -1;
171  }
172  memset(new_array + tok->vocab_capacity, 0, (new_cap - tok->vocab_capacity) * sizeof(char *));
173  tok->id_to_token = new_array;
174  tok->vocab_capacity = new_cap;
175  }
176 
177  /* Check if token already exists */
178  TokenInfo *existing = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, token);
179  if (existing) {
180  existing->id = id;
181  existing->score = score;
182  if (id >= (int32_t)tok->vocab_size) tok->vocab_size = id + 1;
183  if (tok->id_to_token[id]) free(tok->id_to_token[id]);
184  tok->id_to_token[id] = strdup(token);
185  return 0;
186  }
187 
188  /* Create new token info */
189  TokenInfo *info = (TokenInfo *)malloc(sizeof(TokenInfo));
190  if (!info) return -1;
191  info->id = id;
192  info->score = score;
193  info->is_special = false;
194 
195  if (ck_tokenizer_hash_table_insert(tok->vocab, token, info) != 0) {
196  free(info);
197  return -1;
198  }
199 
200  /* Also add to trie for fast longest-match lookups */
201  if (tok->vocab_trie) {
202  ck_trie_insert(tok->vocab_trie, token, id, false, 0);
203  }
204 
205  if (id >= (int32_t)tok->vocab_size) tok->vocab_size = id + 1;
206  if (tok->id_to_token[id]) free(tok->id_to_token[id]);
207  tok->id_to_token[id] = strdup(token);
208 
209  return 0;
210 }
int ck_tokenizer_hash_table_insert(CKTokenizerHashTable *table, const char *key, void *value)
Definition: hash_table.c:158
size_t vocab_capacity
Definition: tokenizer.h:108
char ** id_to_token
Definition: ck_tokenizer.h:86
const char * token
Definition: tokenizer.h:306
int32_t float * score
Definition: tokenizer.h:327

References ck_tokenizer_hash_table_insert(), ck_tokenizer_hash_table_lookup(), ck_trie_insert(), id, CKTokenizer::id_to_token, score, token, CKTokenizer::vocab, CKTokenizer::vocab_capacity, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.

Referenced by ck_tokenizer_add_special_token(), and ck_tokenizer_load_binary_with_scores().

◆ ck_tokenizer_create()

CKTokenizer* ck_tokenizer_create ( CKTokenizerType  type)

Definition at line 34 of file tokenizer.c.

34  {
35  CKTokenizer *tok = (CKTokenizer *)malloc(sizeof(CKTokenizer));
36  if (!tok) {
37  return NULL;
38  }
39 
40  memset(tok, 0, sizeof(*tok));
41 
 42  /* Create hash table for vocabulary */
 43  tok->vocab = ck_tokenizer_hash_table_create(CK_TOKENIZER_HT_BUCKETS_LARGE);
 44  if (!tok->vocab) {
45  free(tok);
46  return NULL;
47  }
48 
49  /* Create trie for fast lookups (1M nodes for ~50k vocab) */
50  tok->vocab_trie = ck_trie_create(1000000);
 51  if (!tok->vocab_trie) {
 52  ck_tokenizer_hash_table_free(tok->vocab, true);
 53  free(tok);
54  return NULL;
55  }
56 
57  /* Initialize reverse vocab */
58  tok->vocab_capacity = 4096;
59  tok->id_to_token = (char **)calloc(tok->vocab_capacity, sizeof(char *));
60  if (!tok->id_to_token) {
62  free(tok);
63  return NULL;
64  }
65 
66  /* Set default special tokens */
67  tok->unk_id = 0;
68  tok->bos_id = 1;
69  tok->eos_id = 2;
70  tok->pad_id = -1;
71  tok->mask_id = -1;
72 
73  /* Initialize scores and types for SPM */
74  tok->scores = NULL;
75  tok->types = NULL;
76 
77  /* Set config */
78  tok->config.type = type;
79  tok->config.add_bos = false;
80  tok->config.add_eos = false;
81  tok->config.add_space_prefix = true;
 82  tok->config.unk_score = -1e10f;
 83  tok->config.spm_mode = CK_SPM_MODE_UNIGRAM;
 84 
85  ck_tokenizer_mempool_init(&tok->pool, 1024 * 1024);
86 
87  return tok;
88 }
#define CK_TOKENIZER_HT_BUCKETS_LARGE
Definition: hash_table.h:142
void ck_tokenizer_hash_table_free(CKTokenizerHashTable *table, bool free_values)
Definition: hash_table.c:140
CKTokenizerHashTable * ck_tokenizer_hash_table_create(size_t bucket_count)
Definition: hash_table.c:80
CKTrie * ck_trie_create(size_t max_nodes)
Definition: trie.c:29
int ck_tokenizer_mempool_init(CKTokenizerMemPool *pool, size_t size)
Definition: memory_pool.c:11
bool add_space_prefix
Definition: tokenizer.h:77
CKTokenizerType type
Definition: tokenizer.h:74
CKSpmMode spm_mode
Definition: tokenizer.h:84
float * scores
Definition: tokenizer.h:111
CKMemPool pool
Definition: ck_tokenizer.h:78
uint8_t * types
Definition: tokenizer.h:113
int32_t mask_id
Definition: tokenizer.h:124
CKTokenizerConfig config
Definition: tokenizer.h:97
@ CK_SPM_MODE_UNIGRAM
Definition: tokenizer.h:68

References CKTokenizerConfig::add_bos, CKTokenizerConfig::add_eos, CKTokenizerConfig::add_space_prefix, CKTokenizer::bos_id, CK_SPM_MODE_UNIGRAM, ck_tokenizer_hash_table_create(), ck_tokenizer_hash_table_free(), CK_TOKENIZER_HT_BUCKETS_LARGE, ck_tokenizer_mempool_init(), ck_trie_create(), CKTokenizer::config, CKTokenizer::eos_id, CKTokenizer::id_to_token, CKTokenizer::mask_id, CKTokenizer::pad_id, CKTokenizer::pool, CKTokenizer::scores, CKTokenizerConfig::spm_mode, CKTokenizerConfig::type, CKTokenizer::types, CKTokenizer::unk_id, CKTokenizerConfig::unk_score, CKTokenizer::vocab, CKTokenizer::vocab_capacity, and CKTokenizer::vocab_trie.

Referenced by ck_tokenizer_create_bpe(), ck_tokenizer_create_spm(), ck_tokenizer_create_wordpiece(), and main().

◆ ck_tokenizer_decode()

int ck_tokenizer_decode ( const CKTokenizer * tok,
const int32_t *  ids,
int  num_ids,
char *  text,
int  max_len 
)

Decode token IDs to text.

Parameters
tokTokenizer
idsInput token IDs
num_idsNumber of IDs
textOutput text buffer
max_lenMaximum text length
Returns
Number of bytes written

Definition at line 1211 of file tokenizer.c.

1211  {
1212  if (!tok || !ids || !text || max_len <= 0) return 0;
1213  int len = 0;
1214  for (int i = 0; i < num_ids; i++) {
1215  int32_t id = ids[i];
1216  if (id < 0) continue;
1217  const char *token = ck_tokenizer_id_to_token(tok, id);
1218  if (!token) continue;
1219  int token_len = (int)strlen(token);
1220 
1221  /* Check for space prefix markers and convert to ASCII space */
1222  unsigned char c0 = (unsigned char)token[0];
1223  unsigned char c1 = (unsigned char)token[1];
1224 
1225  if (c0 == 0xC4 && c1 == 0xA0) {
1226  /* Ġ (U+0120) is 2 bytes - convert to space */
1227  if (len < max_len - 1) text[len++] = ' ';
1228  token += 2; token_len -= 2;
1229  } else if (c0 == 0xE2 && c1 == 0x96 && (unsigned char)token[2] == 0x81) {
1230  /* ▁ (U+2581) is 3 bytes - convert to space */
1231  if (len < max_len - 1) text[len++] = ' ';
1232  token += 3; token_len -= 3;
1233  }
1234 
1235  for (int j = 0; j < token_len && len < max_len - 1; j++) text[len++] = token[j];
1236  }
1237  text[len] = '\0';
1238  return len;
1239 }
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
Definition: tokenizer.c:353
const int32_t * ids
Definition: tokenizer.h:443
const int32_t int num_ids
Definition: tokenizer.h:444
const char * text
Definition: tokenizer.h:563
const int32_t int char int max_len
Definition: true_bpe.h:280

References CKTokenizer::bos_id, ck_tokenizer_id_to_token(), CKTokenizer::eos_id, ids, max_len, num_ids, CKTokenizer::pad_id, text, and token.

Referenced by main().

◆ ck_tokenizer_detect_space_prefix_style()

CKSpacePrefixStyle ck_tokenizer_detect_space_prefix_style ( CKTokenizer * tok)

Definition at line 276 of file tokenizer.c.

276  {
277  if (!tok) return CK_SPACE_PREFIX_GPT2;
278 
 279  /* Already detected? */
 280  if (tok->config.space_prefix_detected && tok->config.space_prefix_style != CK_SPACE_PREFIX_AUTO) {
 281  return tok->config.space_prefix_style;
 282  }
283 
284  /* Count tokens starting with each style:
285  * Ġ (U+0120) = bytes 0xC4 0xA0
286  * ▁ (U+2581) = bytes 0xE2 0x96 0x81
287  */
288  int gpt2_count = 0;
289  int spm_count = 0;
290 
291  for (size_t i = 0; i < tok->vocab_size && i < 10000; i++) { /* Sample first 10k tokens */
292  const char *token = tok->id_to_token[i];
293  if (!token) continue;
294 
295  unsigned char c0 = (unsigned char)token[0];
296  unsigned char c1 = (unsigned char)token[1];
297 
298  /* Check for Ġ (0xC4 0xA0) */
299  if (c0 == 0xC4 && c1 == 0xA0) {
300  gpt2_count++;
301  }
302  /* Check for ▁ (0xE2 0x96 0x81) */
303  else if (c0 == 0xE2 && c1 == 0x96 && (unsigned char)token[2] == 0x81) {
304  spm_count++;
305  }
306  }
307 
308  /* Determine style based on counts */
309  CKSpacePrefixStyle detected;
310  if (spm_count > gpt2_count * 2) {
311  detected = CK_SPACE_PREFIX_SPM;
312  } else {
313  detected = CK_SPACE_PREFIX_GPT2; /* Default to GPT-2 if similar counts */
314  }
315 
316  tok->config.space_prefix_style = detected;
317  tok->config.space_prefix_detected = true;
318 
319  return detected;
320 }
bool space_prefix_detected
Definition: tokenizer.h:83
CKSpacePrefixStyle space_prefix_style
Definition: tokenizer.h:82
CKSpacePrefixStyle
Definition: tokenizer.h:60
@ CK_SPACE_PREFIX_AUTO
Definition: tokenizer.h:61
@ CK_SPACE_PREFIX_SPM
Definition: tokenizer.h:63
@ CK_SPACE_PREFIX_GPT2
Definition: tokenizer.h:62

References CK_SPACE_PREFIX_AUTO, CK_SPACE_PREFIX_GPT2, CK_SPACE_PREFIX_SPM, CKTokenizer::config, CKTokenizer::id_to_token, CKTokenizerConfig::space_prefix_detected, CKTokenizerConfig::space_prefix_style, token, and CKTokenizer::vocab_size.

Referenced by ck_tokenizer_encode().

◆ ck_tokenizer_encode()

int ck_tokenizer_encode ( const CKTokenizer * tok,
const char *  text,
int  text_len,
int32_t *  ids,
int  max_ids 
)

Encode text to token IDs using greedy longest-match.

For BPE: applies merge rules iteratively. For WordPiece/SPM: greedy longest-match from vocabulary.

Parameters
tokTokenizer
textInput text
text_lenText length, or -1 for null-terminated
idsOutput token IDs
max_idsMaximum IDs to write
Returns
Number of tokens written

Definition at line 1132 of file tokenizer.c.

1132  {
1133  if (!tok || !text || !ids || max_ids <= 0) return 0;
1134  if (text_len < 0) text_len = (int)strlen(text);
1135 
1136  /* For SPM tokenizers, use either unigram/Viterbi or llama-style SPM. */
1137  if (tok->config.type == CK_TOKENIZER_SPM) {
1138  int out_idx = 0;
1139  if (tok->config.add_bos && tok->bos_id >= 0 && out_idx < max_ids) {
1140  ids[out_idx++] = tok->bos_id;
1141  }
1142  if (text_len == 0) {
1143  if (tok->config.add_eos && tok->eos_id >= 0 && out_idx < max_ids) {
1144  ids[out_idx++] = tok->eos_id;
1145  }
1146  return out_idx;
1147  }
1148  int n = 0;
1149  if (tok->config.spm_mode == CK_SPM_MODE_LLAMA) {
1150  n = ck_tokenizer_encode_spm_llama_impl(tok, text, text_len, ids + out_idx, max_ids - out_idx);
1151  } else {
1152  n = ck_tokenizer_encode_spm_impl(tok, text, text_len, ids + out_idx, max_ids - out_idx);
1153  }
1154  if (n <= 0) return n;
1155  out_idx += n;
1156  if (tok->config.add_eos && tok->eos_id >= 0 && out_idx < max_ids) {
1157  ids[out_idx++] = tok->eos_id;
1158  }
1159  return out_idx;
1160  }
1161 
1162  /* For BPE tokenizers, convert spaces to appropriate prefix marker.
1163  * Auto-detect style from vocabulary if not already set. */
1164  char preprocessed[8192];
1165  const char *input = text;
1166  int input_len = text_len;
1167 
1168  if (tok->config.type == CK_TOKENIZER_BPE) {
1169  /* Get or detect space prefix style */
1170  CKSpacePrefixStyle style = ((CKTokenizer *)tok)->config.space_prefix_style;
 1171  if (!((CKTokenizer *)tok)->config.space_prefix_detected) {
 1172  style = ck_tokenizer_detect_space_prefix_style((CKTokenizer *)tok);
 1173  }
1174 
1175  int pp_len = preprocess_bpe_spaces(text, text_len, preprocessed, sizeof(preprocessed) - 1, style);
1176  if (pp_len > 0) {
1177  preprocessed[pp_len] = '\0';
1178  input = preprocessed;
1179  input_len = pp_len;
1180  }
1181  }
1182 
1183  int out_idx = 0;
1184  if (tok->config.add_bos && tok->bos_id >= 0 && out_idx < max_ids) {
1185  ids[out_idx++] = tok->bos_id;
1186  }
1187 
1188  size_t pos = 0;
1189  while (pos < (size_t)input_len && out_idx < max_ids) {
1190  size_t match_len = 0;
1191  int32_t id = find_longest_match(tok, input, input_len, pos, &match_len);
1192 
1193  if (match_len == 0) {
1194  /* Emit UNK for unknown characters */
1195  if (tok->unk_id >= 0) ids[out_idx++] = tok->unk_id;
1196  pos++;
1197  } else {
1198  ids[out_idx++] = id;
1199  pos += match_len;
1200  }
1201  }
1202 
1203  if (tok->config.add_eos && tok->eos_id >= 0 && out_idx < max_ids) {
1204  ids[out_idx++] = tok->eos_id;
1205  }
1206 
1207  return out_idx;
1208 }
static int ck_tokenizer_encode_spm_llama_impl(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:642
CKSpacePrefixStyle ck_tokenizer_detect_space_prefix_style(CKTokenizer *tok)
Definition: tokenizer.c:276
static int32_t find_longest_match(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
Definition: tokenizer.c:400
static int preprocess_bpe_spaces(const char *text, int text_len, char *out, int out_max, CKSpacePrefixStyle style)
Definition: tokenizer.c:412
static int ck_tokenizer_encode_spm_impl(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:832
@ CK_TOKENIZER_BPE
Definition: tokenizer.h:54
@ CK_TOKENIZER_SPM
Definition: tokenizer.h:56
@ CK_SPM_MODE_LLAMA
Definition: tokenizer.h:69
CKSpacePrefixStyle style
Definition: tokenizer.h:287
const CKBPEConfig * config
Definition: true_bpe.h:171
const char int text_len
Definition: true_bpe.h:262
const char int int32_t int max_ids
Definition: true_bpe.h:264

References CKTokenizer::add_bos, CKTokenizerConfig::add_bos, CKTokenizer::add_eos, CKTokenizerConfig::add_eos, CKTokenizer::bos_id, CK_SPM_MODE_LLAMA, CK_TOKENIZER_BPE, ck_tokenizer_detect_space_prefix_style(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_lookup(), ck_tokenizer_lookup_merge(), CK_TOKENIZER_SPM, CKTokenizer::config, config, CKTokenizer::eos_id, find_longest_match(), id, ids, max_ids, CKMergeRule::merged, CKTokenizer::merges, CKTokenizer::num_merges, out_len, preprocess_bpe_spaces(), CKMergeRule::priority, CKTokenizerConfig::spm_mode, style, text, text_len, CKTokenizerConfig::type, CKTokenizer::unk_id, and utf8_len().

Referenced by main(), and run_inference().

◆ ck_tokenizer_encode_spm_impl()

static int ck_tokenizer_encode_spm_impl ( const CKTokenizer * tok,
const char *  text,
int  text_len,
int32_t *  ids,
int  max_ids 
)
static

Definition at line 832 of file tokenizer.c.

836  {
837  if (!tok || !text || !ids || max_ids <= 0) return 0;
838  if (text_len < 0) text_len = (int)strlen(text);
839  if (text_len == 0) return 0;
840  const int dbg = getenv("CK_DEBUG_SPM_ENCODE") ? 1 : 0;
841  if (dbg) {
842  fprintf(stderr, "[SPM] encode start: text_len=%d max_ids=%d\n", text_len, max_ids);
843  }
844 
845  /* Preprocess: replace spaces with ▁ */
846  char preprocessed[8192];
847  int pp_len = preprocess_spm_text(text, text_len, preprocessed, sizeof(preprocessed) - 1,
848  tok->config.add_space_prefix);
849  if (pp_len < 0) return 0;
850  preprocessed[pp_len] = '\0';
851  if (dbg) {
852  fprintf(stderr, "[SPM] preprocessed len=%d: \"%.*s\"\n", pp_len, pp_len, preprocessed);
853  }
854 
855  /* DP arrays - use malloc for large inputs */
856  size_t n = (size_t)pp_len + 1;
857  float *best_score = (float *)malloc(n * sizeof(float));
858  int32_t *best_prev = (int32_t *)malloc(n * sizeof(int32_t));
859  int32_t *best_token = (int32_t *)malloc(n * sizeof(int32_t));
860  if (dbg) {
861  fprintf(stderr, "[SPM] DP alloc n=%zu\n", n);
862  }
863 
864  if (!best_score || !best_prev || !best_token) {
865  if (best_score) free(best_score);
866  if (best_prev) free(best_prev);
867  if (best_token) free(best_token);
868  return 0;
869  }
870 
871  /* Initialize DP */
872  const float neg_inf = -1e30f;
873  const float unknown_penalty = -10.0f; /* SentencePiece-style UNK penalty */
874  for (size_t i = 0; i < n; i++) {
875  best_score[i] = neg_inf;
876  best_prev[i] = -1;
877  best_token[i] = -1;
878  }
879  best_score[0] = 0.0f;
880 
881  /* DP: for each position, find best way to reach it */
882  for (size_t pos = 0; pos < n; pos++) {
883  if (best_score[pos] == neg_inf) continue;
884 
885  /* Find all tokens that match at this position */
886  int32_t candidates[64];
887  int num_cand = spm_find_candidates_at_pos(tok, preprocessed, pp_len, pos, candidates, 64);
888  if (dbg && pos < 8) {
889  fprintf(stderr, "[SPM] pos=%zu cand=%d\n", pos, num_cand);
890  }
891 
892  for (int c = 0; c < num_cand; c++) {
893  int32_t token_id = candidates[c];
894 
895  /* Skip disallowed token types in DP */
896  if (!spm_token_allowed_in_dp(tok, token_id)) {
897  continue;
898  }
899 
900  /* Get token string and length */
901  const char *token = ck_tokenizer_id_to_token(tok, token_id);
902  if (!token) continue;
903 
904  /* Calculate token length in bytes */
905  int token_len = (int)strlen(token);
906 
907  /* For UNK token, use the unknown run length to cover all consecutive unknown bytes */
908  if (token_id == tok->unk_id) {
909  token_len = spm_count_unknown_run(tok, preprocessed, pp_len, pos);
910  if (token_len == 0) token_len = 1; /* At least 1 byte */
911  }
912 
913  size_t next_pos = pos + token_len;
914 
915  if (next_pos >= n) continue;
916 
917  /* Get token score for Viterbi */
918  float token_score = 0.0f;
919  if (tok->scores && token_id >= 0 && token_id < (int32_t)tok->vocab_size) {
920  token_score = tok->scores[token_id];
921  }
922 
923  /* USER_DEFINED tokens get score 0 (like llama.cpp) */
924  if (tok->types && token_id >= 0 && token_id < (int32_t)tok->types_size) {
925  if (tok->types[token_id] == GGUF_TOKEN_USER_DEFINED) {
926  token_score = 0.0f;
927  }
928  }
929  /* Apply UNK penalty (SentencePiece behavior) */
930  if (token_id == tok->unk_id) {
931  token_score += unknown_penalty;
932  }
933 
934  /* Transition: score = best_score[pos] + token_score */
935  float new_score = best_score[pos] + token_score;
936 
937  if (new_score > best_score[next_pos]) {
938  best_score[next_pos] = new_score;
939  best_prev[next_pos] = (int32_t)pos;
940  best_token[next_pos] = token_id;
941  }
942  }
943  }
944 
945  /* Backtrack to find best token sequence */
946  int32_t *reverse_ids = (int32_t *)malloc(max_ids * sizeof(int32_t));
947  if (!reverse_ids) {
948  free(best_score);
949  free(best_prev);
950  free(best_token);
951  return 0;
952  }
953 
954  int num_tokens = 0;
955  int32_t curr = (int32_t)(n - 1);
956 
 957  /* Handle trailing UNK by finding valid end */
958  while (curr > 0 && best_token[curr] < 0) {
959  curr = best_prev[curr];
960  }
961 
962  /* Backtrack from end to start, collecting tokens.
963  * We track the token's start position to avoid duplicates. */
964  int last_start = -1; /* Track the start position of last added token */
965  while (curr > 0 && num_tokens < max_ids) {
966  int32_t token_id = best_token[curr];
967  if (token_id >= 0) {
968  /* Use the DP backpointer as the true token start */
969  int token_start = best_prev[curr];
970 
971  /* Only add if this is a new token (different start position) */
972  if (token_start != last_start) {
973  reverse_ids[num_tokens++] = token_id;
974  last_start = token_start;
975  }
976  }
977  curr = best_prev[curr];
978  }
979  if (dbg) {
980  fprintf(stderr, "[SPM] backtrack tokens=%d curr=%d\n", num_tokens, curr);
981  }
982 
983  /* Free DP arrays before using reverse_ids */
984  free(best_score);
985  free(best_prev);
986  free(best_token);
987 
988  if (num_tokens > max_ids) num_tokens = max_ids;
989 
990  /* Backtracking collected tokens in reverse order, so reverse once */
991  for (int i = 0; i < num_tokens / 2; i++) {
992  int32_t tmp = reverse_ids[i];
993  reverse_ids[i] = reverse_ids[num_tokens - 1 - i];
994  reverse_ids[num_tokens - 1 - i] = tmp;
995  }
996 
997  /* Copy to output and merge consecutive UNK tokens (SPM behavior) */
998  int out_idx = 0;
999  for (int i = 0; i < num_tokens && out_idx < max_ids; i++) {
1000  int32_t token_id = reverse_ids[i];
1001 
1002  /* Merge consecutive UNK tokens into one */
1003  if (token_id == tok->unk_id && out_idx > 0 && ids[out_idx - 1] == tok->unk_id) {
1004  continue; /* Skip - already have UNK */
1005  }
1006  ids[out_idx++] = token_id;
1007  }
1008  if (dbg) {
1009  fprintf(stderr, "[SPM] encode done: out=%d\n", out_idx);
1010  }
1011 
1012  free(reverse_ids);
1013 
1014  /* If DP failed to produce valid tokens, use byte-fallback */
 1015  if (num_tokens == 0) {
 1016  return spm_encode_byte_fallback(tok, text, text_len, ids, max_ids);
 1017  }
1018 
1019  return out_idx;
1020 }
size_t types_size
Definition: tokenizer.h:114
static int spm_find_candidates_at_pos(const CKTokenizer *tok, const char *text, int text_len, size_t pos, int32_t *candidates, int max_candidates)
Definition: tokenizer.c:1047
static bool spm_token_allowed_in_dp(const CKTokenizer *tok, int32_t token_id)
Definition: tokenizer.c:469
static int preprocess_spm_text(const char *text, int text_len, char *out, int out_max, bool add_space_prefix)
Definition: tokenizer.c:763
static int spm_encode_byte_fallback(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:1024
#define GGUF_TOKEN_USER_DEFINED
Definition: tokenizer.c:463
static int spm_count_unknown_run(const CKTokenizer *tok, const char *text, int text_len, size_t pos)
Definition: tokenizer.c:1098

References CKTokenizerConfig::add_space_prefix, ck_tokenizer_id_to_token(), CKTokenizer::config, GGUF_TOKEN_USER_DEFINED, ids, max_ids, preprocess_spm_text(), CKTokenizer::scores, spm_count_unknown_run(), spm_encode_byte_fallback(), spm_find_candidates_at_pos(), spm_token_allowed_in_dp(), text, text_len, token, CKTokenizer::types, CKTokenizer::types_size, CKTokenizer::unk_id, and CKTokenizer::vocab_size.

Referenced by ck_tokenizer_encode().

◆ ck_tokenizer_encode_spm_llama_impl()

static int ck_tokenizer_encode_spm_llama_impl ( const CKTokenizer * tok,
const char *  text,
int  text_len,
int32_t *  ids,
int  max_ids 
)
static

Definition at line 642 of file tokenizer.c.

646  {
/* Greedy SentencePiece encoder modeled on llama.cpp's SPM tokenizer:
 * split the preprocessed text into UTF-8 symbols, then repeatedly merge the
 * adjacent pair with the highest vocab score until no mergeable pair remains;
 * finally walk the surviving symbols and emit token ids (resegmenting via
 * spm_llama_resegment_node when a merged span is not itself a vocab entry).
 * Returns the number of ids written, or 0 on any failure. */
647  if (!tok || !text || !ids || max_ids <= 0) return 0;
648  if (text_len < 0) text_len = (int)strlen(text);
649  if (text_len == 0) return 0;
650 
/* Space/▁ normalization into a fixed-size stack buffer; inputs longer than
 * ~8 KiB after preprocessing are rejected (pp_len < 0). */
651  char preprocessed[8192];
652  int pp_len = preprocess_spm_llama_text(text, text_len, preprocessed, (int)sizeof(preprocessed) - 1,
653  tok->config.add_space_prefix);
654  if (pp_len < 0) return 0;
655  preprocessed[pp_len] = '\0';
656 
/* First pass: count UTF-8 codepoints (symbols). Malformed lead bytes are
 * treated as single-byte symbols; truncated sequences are clamped. */
657  int num_symbols = 0;
658  for (int offs = 0; offs < pp_len;) {
659  int char_len = utf8_len((unsigned char)preprocessed[offs]);
660  if (char_len <= 0) char_len = 1;
661  if (offs + char_len > pp_len) char_len = pp_len - offs;
662  offs += char_len;
663  num_symbols++;
664  }
665  if (num_symbols <= 0) return 0;
666 
/* symbols = doubly-linked list of live spans; nodes = merge-history tree used
 * later for resegmentation. Each merge adds one node, so 2n+1 bounds it. */
667  SpmLlamaSymbol *symbols = (SpmLlamaSymbol *)calloc((size_t)num_symbols, sizeof(SpmLlamaSymbol));
668  int node_cap = 2 * num_symbols + 1;
669  SpmLlamaNode *nodes = (SpmLlamaNode *)calloc((size_t)node_cap, sizeof(SpmLlamaNode));
670  if (!symbols || !nodes) {
671  if (symbols) free(symbols);
672  if (nodes) free(nodes);
673  return 0;
674  }
675 
/* Second pass: initialize one symbol + one leaf node per codepoint. */
676  int index = 0;
677  for (int offs = 0; offs < pp_len && index < num_symbols;) {
678  int char_len = utf8_len((unsigned char)preprocessed[offs]);
679  if (char_len <= 0) char_len = 1;
680  if (offs + char_len > pp_len) char_len = pp_len - offs;
681 
682  symbols[index].text = preprocessed + offs;
683  symbols[index].n = char_len;
684  symbols[index].prev = index - 1;
685  symbols[index].next = (index + 1 < num_symbols) ? (index + 1) : -1;
686  symbols[index].node_id = index;
687 
688  nodes[index].text = preprocessed + offs;
689  nodes[index].n = char_len;
690  nodes[index].left = -1;
691  nodes[index].right = -1;
692 
693  offs += char_len;
694  index++;
695  }
696 
/* Merge loop: each iteration scans all adjacent live pairs, picks the pair
 * whose concatenation is a vocab token with the best score (ties broken by
 * leftmost position), records the merge in the node tree, and splices the
 * right symbol out of the list. Stops when no pair matches the vocab. */
697  int node_count = num_symbols;
698  for (;;) {
699  int best_left = -1;
700  int best_right = -1;
701  float best_score = -1e30f;
702 
703  for (int left = 0; left != -1; left = symbols[left].next) {
704  int right = symbols[left].next;
705  if (right < 0) continue;
706 
707  int pair_len = symbols[left].n + symbols[right].n;
708  int32_t token_id = ck_tokenizer_lookup_exact_n(tok, symbols[left].text, pair_len);
709  if (token_id < 0 || token_id >= (int32_t)tok->vocab_size) continue;
710 
/* Tokens without a score entry default to 0.0 (neutral). */
711  float score = 0.0f;
712  if (tok->scores && token_id >= 0 && token_id < (int32_t)tok->scores_size) {
713  score = tok->scores[token_id];
714  }
715 
716  if (best_left < 0 || score > best_score || (score == best_score && left < best_left)) {
717  best_left = left;
718  best_right = right;
719  best_score = score;
720  }
721  }
722 
723  if (best_left < 0 || best_right < 0) break;
724  if (node_count >= node_cap) break;
725 
726  SpmLlamaSymbol *left = &symbols[best_left];
727  SpmLlamaSymbol *right = &symbols[best_right];
728 
/* Record merge: new interior node pointing at the two merged children. */
729  int new_node_id = node_count++;
730  nodes[new_node_id].text = left->text;
731  nodes[new_node_id].n = left->n + right->n;
732  nodes[new_node_id].left = left->node_id;
733  nodes[new_node_id].right = right->node_id;
734 
/* Absorb the right symbol into the left and unlink it from the list. */
735  left->n += right->n;
736  left->node_id = new_node_id;
737  left->next = right->next;
738  if (right->next >= 0) {
739  symbols[right->next].prev = best_left;
740  }
741 
742  right->n = 0;
743  right->prev = -1;
744  right->next = -1;
745  }
746 
/* Emit ids for the remaining symbols; resegment_node splits spans that are
 * not directly in the vocab back into their merge children. */
747  int out_idx = 0;
748  for (int i = 0; i != -1 && out_idx < max_ids; i = symbols[i].next) {
749  out_idx = spm_llama_resegment_node(tok, nodes, symbols[i].node_id, ids, max_ids, out_idx);
750  }
751 
752  free(symbols);
753  free(nodes);
754  return out_idx;
755 }
size_t scores_size
Definition: tokenizer.h:112
static int preprocess_spm_llama_text(const char *text, int text_len, char *out, int out_max, bool add_space_prefix)
Definition: tokenizer.c:554
static int spm_llama_resegment_node(const CKTokenizer *tok, const SpmLlamaNode *nodes, int node_id, int32_t *ids, int max_ids, int out_idx)
Definition: tokenizer.c:611
static int32_t ck_tokenizer_lookup_exact_n(const CKTokenizer *tok, const char *text, int text_len)
Definition: tokenizer.c:337
static int utf8_len(unsigned char c)
Definition: tokenizer.c:541

References CKTokenizerConfig::add_space_prefix, ck_tokenizer_lookup_exact_n(), CKTokenizer::config, ids, left, max_ids, preprocess_spm_llama_text(), right, score, CKTokenizer::scores, CKTokenizer::scores_size, spm_llama_resegment_node(), text, text_len, utf8_len(), and CKTokenizer::vocab_size.

Referenced by ck_tokenizer_encode().

◆ ck_tokenizer_free()

void ck_tokenizer_free ( CKTokenizer *  tok)

Definition at line 91 of file tokenizer.c.

91  {
/* Release every resource owned by the tokenizer, then the tokenizer itself.
 * Safe to call with NULL. */
92  if (!tok) return;
93 
94  /* Free vocabulary entries */
95  if (tok->vocab) {
/* NOTE(review): listing line 96 was lost in extraction — per the generated
 * References list it frees the vocab hash table (ck_tokenizer_hash_table_free
 * on tok->vocab); confirm against tokenizer.c:96. */
97  }
98 
99  /* Free trie */
100  if (tok->vocab_trie) {
101  ck_trie_free(tok->vocab_trie);
102  }
103 
104  /* Free reverse vocab strings */
105  if (tok->id_to_token) {
106  /* Note: strings were strdup'd in add_token */
107  for (size_t i = 0; i < tok->vocab_size; i++) {
108  if (tok->id_to_token[i]) {
109  free(tok->id_to_token[i]);
110  }
111  }
112  free(tok->id_to_token);
113  }
114 
115  /* Free SPM-related arrays */
116  if (tok->scores) free(tok->scores);
117  if (tok->types) free(tok->types);
118  if (tok->byte_token_id) free(tok->byte_token_id);
119 
/* NOTE(review): listing line 120 was lost in extraction — the References list
 * names ck_pool_free/ck_tokenizer_mempool_free and the merges/merge_hash
 * members, so the pool and merge structures are presumably released here;
 * confirm against tokenizer.c:120. */
121  free(tok);
122 }
void ck_trie_free(CKTrie *trie)
Definition: trie.c:51
void ck_tokenizer_mempool_free(CKTokenizerMemPool *pool)
Definition: memory_pool.c:28
int32_t * byte_token_id
Definition: tokenizer.h:117

References CKTokenizer::byte_token_id, ck_pool_free(), ck_tokenizer_hash_table_free(), ck_tokenizer_mempool_free(), ck_trie_free(), CKTokenizer::id_to_token, CKTokenizer::merge_hash, CKTokenizer::merges, CKTokenizer::pool, CKTokenizer::scores, CKTokenizer::types, CKTokenizer::vocab, CKTokenizer::vocab_hash, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.

Referenced by main(), and run_inference().

◆ ck_tokenizer_id_to_token()

const char* ck_tokenizer_id_to_token ( const CKTokenizer *  tok,
int32_t  id 
)

Definition at line 353 of file tokenizer.c.

353  {
354  if (!tok || id < 0 || id >= (int32_t)tok->vocab_size) return NULL;
355  return tok->id_to_token[id];
356 }

References id, CKTokenizer::id_to_token, and CKTokenizer::vocab_size.

Referenced by ck_tokenizer_decode(), ck_tokenizer_encode_spm_impl(), main(), and run_inference().

◆ ck_tokenizer_load_binary()

int ck_tokenizer_load_binary ( CKTokenizer tok,
int  vocab_size,
const int32_t *  offsets,
const char *  strings,
int  num_merges,
const int32_t *  merges 
)

Load vocabulary from memory-mapped binary data.

Parameters
tokTokenizer
vocab_sizeNumber of tokens
offsetsArray of offsets into strings pool
stringsString pool containing null-terminated tokens
num_mergesNumber of BPE merges
mergesMerge rules as (left, right, merged) triplets
Returns
0 on success, -1 on error

Definition at line 1242 of file tokenizer.c.

1247  {
/* NOTE(review): listing line 1248 was lost in extraction — per the generated
 * References list this function delegates to
 * ck_tokenizer_load_binary_with_scores() (presumably passing NULL for the
 * scores/types arrays); confirm against tokenizer.c:1248. */
1249 }
int ck_tokenizer_load_binary_with_scores(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, const float *scores, const uint8_t *types, int num_merges, const int32_t *merges)
Definition: tokenizer.c:1252
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int vocab_size
Definition: true_bpe.h:185
int const int32_t * offsets
Definition: true_bpe.h:186

References ck_tokenizer_load_binary_with_scores(), merges, num_merges, offsets, strings, and vocab_size.

◆ ck_tokenizer_load_binary_with_scores()

int ck_tokenizer_load_binary_with_scores ( CKTokenizer tok,
int  vocab_size,
const int32_t *  offsets,
const char *  strings,
const float *  scores,
const uint8_t *  types,
int  num_merges,
const int32_t *  merges 
)

Load vocabulary from memory-mapped binary data with scores and types.

This extended version supports SPM (SentencePiece) tokenizers which require token scores for Viterbi/DP encoding.

Parameters
tokTokenizer
vocab_sizeNumber of tokens
offsetsArray of offsets into strings pool
stringsString pool containing null-terminated tokens
scoresArray of token scores (float32), can be NULL
typesArray of token types (uint8), can be NULL
num_mergesNumber of BPE merges
mergesMerge rules as (left, right, merged) triplets
Returns
0 on success, -1 on error

Definition at line 1252 of file tokenizer.c.

1259  {
/* Load a vocabulary from memory-mapped binary data, including optional SPM
 * score/type side arrays. Resets any previously-loaded vocabulary first.
 * Returns 0 on success, -1 on invalid arguments or allocation failure. */
1260  if (!tok || !offsets || !strings) return -1;
1261  ck_tokenizer_reset(tok);
1262 
1263  /* Free any existing scores/types arrays before reallocating */
1264  if (tok->scores) {
1265  free(tok->scores);
1266  tok->scores = NULL;
1267  tok->scores_size = 0;
1268  }
1269  if (tok->types) {
1270  free(tok->types);
1271  tok->types = NULL;
1272  tok->types_size = 0;
1273  }
1274 
1275  /* Allocate scores and types arrays if provided */
1276  if (scores && vocab_size > 0) {
1277  tok->scores = (float *)malloc(vocab_size * sizeof(float));
1278  if (!tok->scores) return -1;
1279  memcpy(tok->scores, scores, vocab_size * sizeof(float));
1280  tok->scores_size = (size_t)vocab_size;
1281  }
1282  if (types && vocab_size > 0) {
1283  tok->types = (uint8_t *)malloc(vocab_size * sizeof(uint8_t));
1284  if (!tok->types) {
/* Undo the scores allocation so a failed load leaves no partial state. */
1285  if (tok->scores) {
1286  free(tok->scores);
1287  tok->scores = NULL;
1288  }
1289  return -1;
1290  }
1291  memcpy(tok->types, types, vocab_size * sizeof(uint8_t));
1292  tok->types_size = (size_t)vocab_size;
1293  }
1294 
/* Register every token string with its id and (optional) score. */
1295  for (int i = 0; i < vocab_size; i++) {
1296  const char *token = strings + offsets[i];
1297  float score = scores ? scores[i] : 0.0f;
/* NOTE(review): listing line 1298 was lost in extraction — per the References
 * list it calls ck_tokenizer_add_token(tok, token, i, score); confirm against
 * tokenizer.c:1298. */
1299  }
1300 
1301  /* Build byte token lookup table if types are available */
1302  if (types && vocab_size > 0) {
/* NOTE(review): listing line 1303 was lost in extraction — per the References
 * list it calls spm_build_byte_lookup(tok, strings, offsets, vocab_size);
 * confirm against tokenizer.c:1303. */
1304 
1305  /* Log token type statistics */
1306  int count_normal = 0, count_unknown = 0, count_control = 0, count_byte = 0, count_other = 0;
1307  int max_type = 0;
1308  for (int i = 0; i < vocab_size; i++) {
1309  uint8_t t = tok->types[i];
1310  if (t > max_type) max_type = t;
1311  switch (t) {
1312  case GGUF_TOKEN_NORMAL: count_normal++; break;
1313  case GGUF_TOKEN_UNKNOWN: count_unknown++; break;
1314  case GGUF_TOKEN_CONTROL: count_control++; break;
1315  case GGUF_TOKEN_BYTE: count_byte++; break;
1316  default: count_other++; break;
1317  }
1318  }
1319  fprintf(stderr, "[TOKENIZER] Loaded %d tokens: normal=%d, unknown=%d, control=%d, byte=%d, other=%d\n",
1320  vocab_size, count_normal, count_unknown, count_control, count_byte, count_other);
1321  if (max_type > GGUF_TOKEN_BYTE) {
1322  fprintf(stderr, "[TOKENIZER] Warning: Unexpected token type %d\n", max_type);
1323  }
1324  }
1325 
1326  /* TODO: Merges */
1327  (void)num_merges; (void)merges;
1328  return 0;
1329 }
#define GGUF_TOKEN_CONTROL
Definition: tokenizer.c:462
static void spm_build_byte_lookup(CKTokenizer *tok, const char *strings, const int32_t *offsets, int vocab_size)
Definition: tokenizer.c:507
void ck_tokenizer_reset(CKTokenizer *tok)
Definition: tokenizer.c:125
#define GGUF_TOKEN_BYTE
Definition: tokenizer.c:465
#define GGUF_TOKEN_UNKNOWN
Definition: tokenizer.c:461
#define GGUF_TOKEN_NORMAL
Definition: tokenizer.c:460

References ck_tokenizer_add_token(), ck_tokenizer_reset(), GGUF_TOKEN_BYTE, GGUF_TOKEN_CONTROL, GGUF_TOKEN_NORMAL, GGUF_TOKEN_UNKNOWN, merges, num_merges, offsets, score, CKTokenizer::scores, CKTokenizer::scores_size, spm_build_byte_lookup(), strings, token, CKTokenizer::types, CKTokenizer::types_size, and vocab_size.

Referenced by ck_tokenizer_load_binary().

◆ ck_tokenizer_load_gguf()

int ck_tokenizer_load_gguf ( CKTokenizer tok,
const char *  path 
)

Load vocabulary from GGUF file.

Parameters
tokTokenizer
pathPath to GGUF file
Returns
0 on success, -1 on error

Definition at line 1332 of file tokenizer.c.

1332 { (void)tok; (void)path; return -1; } /* stub: GGUF vocab loading not implemented yet — always reports failure */

◆ ck_tokenizer_load_json()

int ck_tokenizer_load_json ( CKTokenizer tok,
const char *  path 
)

Load vocabulary from JSON file (HuggingFace format).

Parameters
tokTokenizer
pathPath to vocab.json or tokenizer.json
Returns
0 on success, -1 on error

Definition at line 1333 of file tokenizer.c.

1333 { (void)tok; (void)path; return -1; } /* stub: HuggingFace JSON vocab loading not implemented — always returns -1 */

◆ ck_tokenizer_load_merges()

int ck_tokenizer_load_merges ( CKTokenizer tok,
const char *  path 
)

Load BPE merges from text file.

Format: token1 token2 (one merge per line)

Parameters
tokTokenizer
pathPath to merges.txt
Returns
0 on success, -1 on error

Definition at line 1335 of file tokenizer.c.

1335 { (void)tok; (void)path; return -1; } /* stub: merges.txt loading not implemented — always returns -1 */

◆ ck_tokenizer_load_text()

int ck_tokenizer_load_text ( CKTokenizer tok,
const char *  path 
)

Load vocabulary from text file (one token per line).

Format: token_string [id] [score] Lines starting with # are comments.

Parameters
tokTokenizer
pathPath to vocabulary file
Returns
0 on success, -1 on error

Definition at line 1334 of file tokenizer.c.

1334 { (void)tok; (void)path; return -1; } /* stub: plain-text vocab loading not implemented — always returns -1 */

◆ ck_tokenizer_lookup()

int32_t ck_tokenizer_lookup ( const CKTokenizer tok,
const char *  token 
)

Definition at line 323 of file tokenizer.c.

323  {
324  if (!tok || !token) return -1;
325  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, token);
326  return info ? info->id : tok->unk_id;
327 }

References ck_tokenizer_hash_table_lookup(), token, CKTokenizer::unk_id, and CKTokenizer::vocab.

Referenced by spm_get_byte_token().

◆ ck_tokenizer_lookup_exact()

static int32_t ck_tokenizer_lookup_exact ( const CKTokenizer tok,
const char *  token 
)
static

Definition at line 330 of file tokenizer.c.

330  {
331  if (!tok || !token) return -1;
332  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, token);
333  return info ? info->id : -1;
334 }

References ck_tokenizer_hash_table_lookup(), token, and CKTokenizer::vocab.

Referenced by ck_tokenizer_lookup_exact_n().

◆ ck_tokenizer_lookup_exact_n()

static int32_t ck_tokenizer_lookup_exact_n ( const CKTokenizer tok,
const char *  text,
int  text_len 
)
static

Definition at line 337 of file tokenizer.c.

337  {
338  if (!tok || !text || text_len <= 0) return -1;
339  char stack_buf[512];
340  char *tmp = stack_buf;
341  if (text_len >= (int)sizeof(stack_buf)) {
342  tmp = (char *)malloc((size_t)text_len + 1);
343  if (!tmp) return -1;
344  }
345  memcpy(tmp, text, (size_t)text_len);
346  tmp[text_len] = '\0';
347  int32_t id = ck_tokenizer_lookup_exact(tok, tmp);
348  if (tmp != stack_buf) free(tmp);
349  return id;
350 }
static int32_t ck_tokenizer_lookup_exact(const CKTokenizer *tok, const char *token)
Definition: tokenizer.c:330

References ck_tokenizer_lookup_exact(), id, text, and text_len.

Referenced by ck_tokenizer_encode_spm_llama_impl(), and spm_llama_resegment_node().

◆ ck_tokenizer_reset()

void ck_tokenizer_reset ( CKTokenizer tok)

Definition at line 125 of file tokenizer.c.

125  {
/* Clear all loaded vocabulary state so the tokenizer can be reloaded,
 * without freeing the container structures themselves. Safe on NULL. */
126  if (!tok) return;
127 
/* NOTE(review): listing line 128 was lost in extraction — per the References
 * list it clears the vocab hash table via ck_tokenizer_hash_table_clear();
 * confirm against tokenizer.c:128. */
129 
130  if (tok->vocab_trie) {
/* NOTE(review): listing line 131 was lost in extraction — per the References
 * list it calls ck_trie_clear(tok->vocab_trie); confirm against
 * tokenizer.c:131. */
132  }
133 
/* Free the strdup'd reverse-lookup strings but keep the array itself. */
134  for (size_t i = 0; i < tok->vocab_size; i++) {
135  if (tok->id_to_token[i]) {
136  free(tok->id_to_token[i]);
137  tok->id_to_token[i] = NULL;
138  }
139  }
140 
141  tok->vocab_size = 0;
142 
143  /* Reset SPM-related arrays using actual allocated sizes */
144  if (tok->scores && tok->scores_size > 0) {
145  memset(tok->scores, 0, tok->scores_size * sizeof(float));
146  }
147  if (tok->types && tok->types_size > 0) {
148  memset(tok->types, 0, tok->types_size * sizeof(uint8_t));
149  }
150  /* Clear byte lookup table */
151  if (tok->byte_token_id) {
152  memset(tok->byte_token_id, -1, 256 * sizeof(int32_t));
153  }
154 }
void ck_tokenizer_hash_table_clear(CKTokenizerHashTable *table, bool free_values)
Definition: hash_table.c:312
void ck_trie_clear(CKTrie *trie)
Definition: trie.c:80

References CKTokenizer::byte_token_id, ck_tokenizer_hash_table_clear(), ck_trie_clear(), CKTokenizer::id_to_token, CKTokenizer::scores, CKTokenizer::scores_size, CKTokenizer::types, CKTokenizer::types_size, CKTokenizer::vocab, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.

Referenced by ck_tokenizer_load_binary_with_scores().

◆ ck_tokenizer_set_add_bos_eos()

void ck_tokenizer_set_add_bos_eos ( CKTokenizer tok,
bool  add_bos,
bool  add_eos 
)

Definition at line 243 of file tokenizer.c.

243  {
244  if (!tok) return;
245  tok->config.add_bos = add_bos;
246  tok->config.add_eos = add_eos;
247 }
bool bool add_eos
Definition: tokenizer.h:242
bool add_bos
Definition: tokenizer.h:242

References CKTokenizerConfig::add_bos, add_bos, CKTokenizerConfig::add_eos, add_eos, and CKTokenizer::config.

◆ ck_tokenizer_set_add_space_prefix()

void ck_tokenizer_set_add_space_prefix ( CKTokenizer tok,
bool  add_space_prefix 
)

Definition at line 249 of file tokenizer.c.

249  {
/* Setter for the SPM space-prefix behavior. No-op when tok is NULL. */
250  if (!tok) return;
/* NOTE(review): listing line 251 was lost in extraction — per the References
 * list it assigns tok->config.add_space_prefix = add_space_prefix; confirm
 * against tokenizer.c:251. */
252 }
bool add_space_prefix
Definition: tokenizer.h:252

References CKTokenizerConfig::add_space_prefix, add_space_prefix, and CKTokenizer::config.

◆ ck_tokenizer_set_space_prefix_style()

void ck_tokenizer_set_space_prefix_style ( CKTokenizer tok,
CKSpacePrefixStyle  style 
)

Definition at line 266 of file tokenizer.c.

266  {
/* Setter for the space-prefix style (SPM ▁ vs GPT-2 Ġ). Any explicit choice
 * (non-AUTO) marks detection as done so auto-detection is skipped later. */
267  if (!tok) return;
/* NOTE(review): listing line 268 was lost in extraction — per the References
 * list it assigns tok->config.space_prefix_style = style; confirm against
 * tokenizer.c:268. */
269  if (style != CK_SPACE_PREFIX_AUTO) {
270  tok->config.space_prefix_detected = true;
271  }
272 }

References CK_SPACE_PREFIX_AUTO, CKTokenizer::config, CKTokenizerConfig::space_prefix_detected, CKTokenizerConfig::space_prefix_style, and style.

◆ ck_tokenizer_set_special_ids()

void ck_tokenizer_set_special_ids ( CKTokenizer tok,
int32_t  unk,
int32_t  bos,
int32_t  eos,
int32_t  pad,
int32_t  mask 
)

Definition at line 234 of file tokenizer.c.

234  {
235  if (!tok) return;
236  tok->unk_id = unk;
237  tok->bos_id = bos;
238  tok->eos_id = eos;
239  tok->pad_id = pad;
240  tok->mask_id = mask;
241 }
int32_t int32_t int32_t int32_t int32_t mask
Definition: tokenizer.h:233
int32_t unk
Definition: tokenizer.h:229
int32_t int32_t int32_t eos
Definition: tokenizer.h:231
int32_t int32_t int32_t int32_t pad
Definition: tokenizer.h:232
int32_t int32_t bos
Definition: tokenizer.h:230

References bos, CKTokenizer::bos_id, eos, CKTokenizer::eos_id, mask, CKTokenizer::mask_id, pad, CKTokenizer::pad_id, unk, and CKTokenizer::unk_id.

◆ ck_tokenizer_set_spm_mode()

void ck_tokenizer_set_spm_mode ( CKTokenizer tok,
CKSpmMode  spm_mode 
)

Definition at line 254 of file tokenizer.c.

254  {
255  if (!tok) return;
256  tok->config.spm_mode = spm_mode;
257 }
CKSpmMode spm_mode
Definition: tokenizer.h:260

References CKTokenizer::config, CKTokenizerConfig::spm_mode, and spm_mode.

◆ ck_tokenizer_set_use_trie()

void ck_tokenizer_set_use_trie ( CKTokenizer tok,
bool  use_trie 
)

Definition at line 260 of file tokenizer.c.

260  {
261  if (!tok) return;
262  tok->config.use_trie = use_trie;
263 }
bool use_trie
Definition: tokenizer.h:276

References CKTokenizer::config, CKTokenizerConfig::use_trie, and use_trie.

◆ find_longest_match()

static int32_t find_longest_match ( const CKTokenizer tok,
const char *  text,
size_t  text_len,
size_t  pos,
size_t *  match_len 
)
static

Definition at line 400 of file tokenizer.c.

400  {
401  if (tok->config.use_trie) {
402  return find_longest_match_trie(tok, text, text_len, pos, match_len);
403  } else {
404  return find_longest_match_hash(tok, text, text_len, pos, match_len);
405  }
406 }
static int32_t find_longest_match_trie(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
Definition: tokenizer.c:359
static int32_t find_longest_match_hash(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
Definition: tokenizer.c:370

References CKTokenizer::config, find_longest_match_hash(), find_longest_match_trie(), text, text_len, and CKTokenizerConfig::use_trie.

Referenced by ck_tokenizer_encode().

◆ find_longest_match_hash()

static int32_t find_longest_match_hash ( const CKTokenizer tok,
const char *  text,
size_t  text_len,
size_t  pos,
size_t *  match_len 
)
static

Definition at line 370 of file tokenizer.c.

370  {
371  if (!tok || !text || pos >= text_len) {
372  *match_len = 0;
373  return tok ? tok->unk_id : -1;
374  }
375 
376  size_t max_len = 64;
377  if (pos + max_len > text_len) max_len = text_len - pos;
378 
379  int32_t best_id = tok->unk_id;
380  size_t best_len = 0;
381 
382  for (size_t len = max_len; len >= 1; len--) {
383  char tmp[65];
384  memcpy(tmp, text + pos, len);
385  tmp[len] = '\0';
386 
387  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, tmp);
388  if (info) {
389  best_id = info->id;
390  best_len = len;
391  break;
392  }
393  }
394 
395  *match_len = best_len;
396  return best_id;
397 }

References ck_tokenizer_hash_table_lookup(), max_len, text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab.

Referenced by find_longest_match().

◆ find_longest_match_trie()

static int32_t find_longest_match_trie ( const CKTokenizer tok,
const char *  text,
size_t  text_len,
size_t  pos,
size_t *  match_len 
)
static

Definition at line 359 of file tokenizer.c.

359  {
360  if (!tok || !tok->vocab_trie || !text || pos >= text_len) {
361  *match_len = 0;
362  return tok ? tok->unk_id : -1;
363  }
364 
365  int32_t token_id = ck_trie_find_longest(tok->vocab_trie, text, text_len, pos, match_len);
366  return token_id >= 0 ? token_id : tok->unk_id;
367 }
int32_t ck_trie_find_longest(const CKTrie *trie, const char *text, size_t text_len, size_t start_pos, size_t *match_len)
Definition: trie.c:142

References ck_trie_find_longest(), text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab_trie.

Referenced by find_longest_match().

◆ preprocess_bpe_spaces()

static int preprocess_bpe_spaces ( const char *  text,
int  text_len,
char *  out,
int  out_max,
CKSpacePrefixStyle  style 
)
static

Definition at line 412 of file tokenizer.c.

412  {
413  int out_len = 0;
414 
415  /* For SentencePiece, add ▁ at the start of text (unless text starts with space) */
416  if (style == CK_SPACE_PREFIX_SPM && text_len > 0 && text[0] != ' ') {
417  if (out_len + 3 > out_max) return -1;
418  out[out_len++] = (char)0xE2;
419  out[out_len++] = (char)0x96;
420  out[out_len++] = (char)0x81;
421  }
422 
423  for (int i = 0; i < text_len; i++) {
424  if (text[i] == ' ') {
425  if (style == CK_SPACE_PREFIX_SPM) {
426  /* SentencePiece style: ▁ (3 bytes: 0xE2 0x96 0x81) */
427  if (out_len + 3 > out_max) return -1;
428  out[out_len++] = (char)0xE2;
429  out[out_len++] = (char)0x96;
430  out[out_len++] = (char)0x81;
431  } else {
432  /* GPT-2 style: Ġ (2 bytes: 0xC4 0xA0) */
433  if (out_len + 2 > out_max) return -1;
434  out[out_len++] = (char)0xC4;
435  out[out_len++] = (char)0xA0;
436  }
437  } else {
438  if (out_len + 1 > out_max) return -1;
439  out[out_len++] = text[i];
440  }
441  }
442  return out_len;
443 }
const int32_t int int * out_len
Definition: tokenizer.h:445

References CK_SPACE_PREFIX_SPM, out_len, style, text, and text_len.

Referenced by ck_tokenizer_encode().

◆ preprocess_spm_llama_text()

static int preprocess_spm_llama_text ( const char *  text,
int  text_len,
char *  out,
int  out_max,
bool  add_space_prefix 
)
static

Definition at line 554 of file tokenizer.c.

/* llama.cpp-compatible SPM preprocessing: optionally prepend ▁ (U+2581),
 * replace each SINGLE space with ▁, but keep runs of two or more spaces
 * literal. A negative text_len means "use strlen(text)". Returns the output
 * length, or -1 if the result would not fit in out_max bytes. */
static int preprocess_spm_llama_text(const char *text, int text_len, char *out,
                                     int out_max, bool add_space_prefix) {
    static const char spm_mark[3] = { (char)0xE2, (char)0x96, (char)0x81 }; /* ▁ */
    int n = 0;

    if (text_len < 0) {
        text_len = (int)strlen(text);
    }

    if (add_space_prefix && text_len > 0) {
        if (n + 3 > out_max) return -1;
        memcpy(out + n, spm_mark, 3);
        n += 3;
    }

    int i = 0;
    while (i < text_len) {
        if (text[i] != ' ') {
            if (n + 1 > out_max) return -1;
            out[n++] = text[i++];
            continue;
        }

        /* Measure the run of consecutive spaces starting here. */
        int run = 0;
        while (i + run < text_len && text[i + run] == ' ') {
            run++;
        }

        if (run == 1) {
            /* Single separator -> ▁ (matches llama.cpp for this GGUF family). */
            if (n + 3 > out_max) return -1;
            memcpy(out + n, spm_mark, 3);
            n += 3;
        } else {
            /* Multi-space runs stay literal. */
            if (n + run > out_max) return -1;
            memset(out + n, ' ', (size_t)run);
            n += run;
        }
        i += run;
    }

    return n;
}

References add_space_prefix, out_len, text, and text_len.

Referenced by ck_tokenizer_encode_spm_llama_impl().

◆ preprocess_spm_text()

static int preprocess_spm_text ( const char *  text,
int  text_len,
char *  out,
int  out_max,
bool  add_space_prefix 
)
static

Definition at line 763 of file tokenizer.c.

/* SPM preprocessing for the DP encoder: strip leading and trailing spaces,
 * optionally prepend ▁ (unless the text already starts with ▁), and collapse
 * every interior run of spaces to a single ▁. Returns the output length, or
 * -1 if the result would not fit in out_max bytes. */
static int preprocess_spm_text(const char *text, int text_len, char *out,
                               int out_max, bool add_space_prefix) {
    /* Trim: [first, last) is the content without leading/trailing spaces. */
    int first = 0;
    while (first < text_len && text[first] == ' ') {
        first++;
    }
    int last = text_len;
    while (last > first && text[last - 1] == ' ') {
        last--;
    }

    bool already_prefixed =
        text_len >= 3 &&
        (unsigned char)text[0] == 0xE2 &&
        (unsigned char)text[1] == 0x96 &&
        (unsigned char)text[2] == 0x81;

    int n = 0;
    bool prev_space = already_prefixed;

    if (last > first && !already_prefixed && add_space_prefix) {
        if (n + 3 > out_max) return -1;
        out[n++] = (char)0xE2;
        out[n++] = (char)0x96;
        out[n++] = (char)0x81;
        prev_space = true;
    }

    for (int i = first; i < last; i++) {
        if (text[i] == ' ') {
            /* Only the first space of a run emits ▁; the rest are dropped. */
            if (prev_space) {
                continue;
            }
            if (n + 3 > out_max) return -1;
            out[n++] = (char)0xE2;
            out[n++] = (char)0x96;
            out[n++] = (char)0x81;
            prev_space = true;
        } else {
            if (n + 1 > out_max) return -1;
            out[n++] = text[i];
            prev_space = false;
        }
    }

    return n;
}

References add_space_prefix, out_len, text, and text_len.

Referenced by ck_tokenizer_encode_spm_impl().

◆ spm_build_byte_lookup()

static void spm_build_byte_lookup ( CKTokenizer tok,
const char *  strings,
const int32_t *  offsets,
int  vocab_size 
)
static

Definition at line 507 of file tokenizer.c.

507  {
508  /* Reuse existing array or allocate new one */
509  if (!tok->byte_token_id) {
510  tok->byte_token_id = (int32_t *)malloc(256 * sizeof(int32_t));
511  if (!tok->byte_token_id) return;
512  }
513 
514  /* Initialize all entries to -1 */
515  for (int i = 0; i < 256; i++) {
516  tok->byte_token_id[i] = -1;
517  }
518 
519  /* Scan vocab for byte tokens */
520  for (int i = 0; i < vocab_size; i++) {
521  if (!tok->types || tok->types[i] != GGUF_TOKEN_BYTE) continue;
522 
523  const char *token = strings + offsets[i];
524  size_t len = strlen(token);
525 
526  if (len == 1) {
527  /* Raw byte token (single byte) */
528  unsigned char byte_val = (unsigned char)token[0];
529  tok->byte_token_id[byte_val] = i;
530  } else if (spm_token_is_byte_format(token)) {
531  /* <0xXX> format - parse the hex value */
532  unsigned int byte_val;
533  if (sscanf(token, "<0x%02X>", &byte_val) == 1 && byte_val < 256) {
534  tok->byte_token_id[byte_val] = i;
535  }
536  }
537  }
538 }
static bool spm_token_is_byte_format(const char *token)
Definition: tokenizer.c:500

References CKTokenizer::byte_token_id, GGUF_TOKEN_BYTE, offsets, spm_token_is_byte_format(), strings, token, CKTokenizer::types, and vocab_size.

Referenced by ck_tokenizer_load_binary_with_scores().

◆ spm_count_unknown_run()

static int spm_count_unknown_run ( const CKTokenizer tok,
const char *  text,
int  text_len,
size_t  pos 
)
static

Definition at line 1098 of file tokenizer.c.

1098  {
1099  int run = 0;
1100  while (pos + run < (size_t)text_len) {
1101  /* Stop at '▁' (U+2581 = 0xE2 0x96 0x81) since that's a known token */
1102  if (pos + run + 3 <= (size_t)text_len &&
1103  (unsigned char)text[pos + run] == 0xE2 &&
1104  (unsigned char)text[pos + run + 1] == 0x96 &&
1105  (unsigned char)text[pos + run + 2] == 0x81) {
1106  break;
1107  }
1108 
1109  /* Check if any vocab token matches at this position */
1110  int max_len = 64;
1111  if (pos + run + max_len > (size_t)text_len) {
1112  max_len = (int)(text_len - pos - run);
1113  }
1114  int found = 0;
1115  for (int len = max_len; len >= 1; len--) {
1116  char tmp[65];
1117  memcpy(tmp, text + pos + run, len);
1118  tmp[len] = '\0';
1119  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, tmp);
1120  if (info && info->id >= 0 && info->id != tok->unk_id && spm_token_allowed_in_dp(tok, info->id)) {
1121  found = 1;
1122  break;
1123  }
1124  }
1125  if (found) break;
1126  run++;
1127  }
1128  return run;
1129 }

References ck_tokenizer_hash_table_lookup(), max_len, spm_token_allowed_in_dp(), text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab.

Referenced by ck_tokenizer_encode_spm_impl().

◆ spm_encode_byte_fallback()

static int spm_encode_byte_fallback ( const CKTokenizer tok,
const char *  text,
int  text_len,
int32_t *  ids,
int  max_ids 
)
static

Definition at line 1024 of file tokenizer.c.

1026  {
1027  if (!tok || !text || !ids || max_ids <= 0) return 0;
1028  if (text_len < 0) text_len = (int)strlen(text);
1029  if (text_len == 0) return 0;
1030 
1031  int count = 0;
1032  for (int i = 0; i < text_len && count < max_ids; i++) {
1033  unsigned char byte_val = (unsigned char)text[i];
1034  int32_t byte_token = spm_get_byte_token(tok, byte_val);
1035 
1036  /* If we have a byte token, use it; otherwise use UNK */
1037  if (byte_token >= 0 && byte_token != tok->unk_id) {
1038  ids[count++] = byte_token;
1039  } else {
1040  ids[count++] = tok->unk_id;
1041  }
1042  }
1043  return count;
1044 }
static int32_t spm_get_byte_token(const CKTokenizer *tok, unsigned char byte_val)
Definition: tokenizer.c:487

References ids, max_ids, spm_get_byte_token(), text, text_len, and CKTokenizer::unk_id.

Referenced by ck_tokenizer_encode_spm_impl().

◆ spm_find_candidates_at_pos()

static int spm_find_candidates_at_pos ( const CKTokenizer tok,
const char *  text,
int  text_len,
size_t  pos,
int32_t *  candidates,
int  max_candidates 
)
static

Definition at line 1047 of file tokenizer.c.

1048  {
/* Collect candidate token ids whose strings match the text starting at `pos`,
 * probing substrings from 64 bytes down to 1. Results are deduplicated and
 * filtered through spm_token_allowed_in_dp(). Falls back to a single UNK
 * candidate when nothing matches. Returns the number of candidates stored. */
1049  if (!tok || !text || pos >= (size_t)text_len) return 0;
1050 
1051  int num_found = 0;
1052  int max_len = 64;
1053  if (pos + max_len > (size_t)text_len) max_len = (int)(text_len - pos);
1054 
1055  /* Iterate from longest to shortest to find all matches */
1056  char tmp[65];
1057  for (int len = max_len; len >= 1 && num_found < max_candidates; len--) {
1058  memcpy(tmp, text + pos, len);
1059  tmp[len] = '\0';
1060 
1061  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, tmp);
1062  if (info && info->id >= 0 && info->id != tok->unk_id) {
1063  /* Skip disallowed token types */
1064  if (!spm_token_allowed_in_dp(tok, info->id)) {
1065  continue;
1066  }
1067 
/* Deduplicate: distinct lengths can map to the same id. */
1068  /* Check if already added */
1069  int dup = 0;
1070  for (int j = 0; j < num_found; j++) {
1071  if (candidates[j] == info->id) {
1072  dup = 1;
1073  break;
1074  }
1075  }
1076  if (!dup) {
1077  candidates[num_found++] = info->id;
1078  }
1079  }
1080  }
1081 
1082  /* If no candidates found, add UNK token as fallback.
1083  * For SPM, UNK should cover all consecutive unknown bytes until a known token or end.
1084  * We handle this by adding UNK with a special marker - we'll check at runtime
1085  * how far we can extend it. */
1086  if (num_found == 0 && tok->unk_id >= 0 && max_candidates > 0) {
1087  /* Only add UNK if it's allowed in DP */
1088  if (spm_token_allowed_in_dp(tok, tok->unk_id)) {
1089  candidates[num_found++] = tok->unk_id;
1090  }
1091  }
1092 
1093  return num_found;
1094 }

References ck_tokenizer_hash_table_lookup(), max_len, spm_token_allowed_in_dp(), text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab.

Referenced by ck_tokenizer_encode_spm_impl().

◆ spm_get_byte_token()

static int32_t spm_get_byte_token ( const CKTokenizer tok,
unsigned char  byte_val 
)
inlinestatic

Definition at line 487 of file tokenizer.c.

487  {
488  /* Try fast lookup table first */
489  if (tok->byte_token_id && tok->byte_token_id[byte_val] >= 0) {
490  return tok->byte_token_id[byte_val];
491  }
492  /* Fallback to <0xXX> format */
493  char byte_token[16];
494  int len = snprintf(byte_token, sizeof(byte_token), "<0x%02X>", byte_val);
495  if (len <= 0) return tok->unk_id;
496  return ck_tokenizer_lookup(tok, byte_token);
497 }
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token)
Definition: tokenizer.c:323

References CKTokenizer::byte_token_id, ck_tokenizer_lookup(), and CKTokenizer::unk_id.

Referenced by spm_encode_byte_fallback(), and spm_llama_resegment_node().

◆ spm_is_byte_token()

static bool spm_is_byte_token ( const CKTokenizer tok,
int32_t  token_id 
)
inlinestatic

Definition at line 479 of file tokenizer.c.

479  {
480  if (!tok->types || token_id < 0 || token_id >= (int32_t)tok->vocab_size) {
481  return false;
482  }
483  return tok->types[token_id] == GGUF_TOKEN_BYTE;
484 }

References GGUF_TOKEN_BYTE, CKTokenizer::types, and CKTokenizer::vocab_size.

◆ spm_llama_resegment_node()

static int spm_llama_resegment_node ( const CKTokenizer tok,
const SpmLlamaNode *  nodes,
int  node_id,
int32_t *  ids,
int  max_ids,
int  out_idx 
)
static

Definition at line 611 of file tokenizer.c.

616  {
617  if (!tok || !nodes || node_id < 0 || !ids || out_idx >= max_ids) {
618  return out_idx;
619  }
620 
621  const SpmLlamaNode *node = &nodes[node_id];
622  int32_t token_id = ck_tokenizer_lookup_exact_n(tok, node->text, node->n);
623  if (token_id >= 0) {
624  ids[out_idx++] = token_id;
625  return out_idx;
626  }
627 
628  if (node->left >= 0 && node->right >= 0) {
629  out_idx = spm_llama_resegment_node(tok, nodes, node->left, ids, max_ids, out_idx);
630  out_idx = spm_llama_resegment_node(tok, nodes, node->right, ids, max_ids, out_idx);
631  return out_idx;
632  }
633 
634  for (int i = 0; i < node->n && out_idx < max_ids; i++) {
635  int32_t byte_token = spm_get_byte_token(tok, (unsigned char)node->text[i]);
636  ids[out_idx++] = (byte_token >= 0) ? byte_token : tok->unk_id;
637  }
638  return out_idx;
639 }

References ck_tokenizer_lookup_exact_n(), ids, max_ids, spm_get_byte_token(), and CKTokenizer::unk_id.

Referenced by ck_tokenizer_encode_spm_llama_impl().

◆ spm_token_allowed_in_dp()

static bool spm_token_allowed_in_dp ( const CKTokenizer tok,
int32_t  token_id 
)
inlinestatic

Definition at line 469 of file tokenizer.c.

469  {
470  if (!tok->types || token_id < 0 || token_id >= (int32_t)tok->vocab_size) {
471  return true; /* No type info, allow all */
472  }
473  uint8_t t = tok->types[token_id];
474  /* Reject CONTROL, UNUSED, and BYTE tokens (but allow UNKNOWN for fallback) */
475  return t != GGUF_TOKEN_CONTROL && t != GGUF_TOKEN_UNUSED && t != GGUF_TOKEN_BYTE;
476 }
#define GGUF_TOKEN_UNUSED
Definition: tokenizer.c:464

References GGUF_TOKEN_BYTE, GGUF_TOKEN_CONTROL, GGUF_TOKEN_UNUSED, CKTokenizer::types, and CKTokenizer::vocab_size.

Referenced by ck_tokenizer_encode_spm_impl(), spm_count_unknown_run(), and spm_find_candidates_at_pos().

◆ spm_token_is_byte_format()

static bool spm_token_is_byte_format ( const char *  token)
inlinestatic

Definition at line 500 of file tokenizer.c.

/* True iff `c` is an uppercase hexadecimal digit, matching the "<0x%02X>"
 * spelling used for byte tokens. */
static inline bool spm_is_upper_hex(char c)
{
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F');
}

/* Check whether `token` begins with a byte-token spelling: "<0xHH>" with two
 * uppercase hex digits. Fix: the previous range test (c >= '0' && c <= 'F')
 * wrongly accepted the ASCII punctuation ':' ';' '<' '=' '>' '?' '@' that
 * sits between '9' and 'A', so e.g. "<0x:A>" was misclassified; hex digits
 * are now validated exactly. Trailing characters after '>' are still
 * tolerated, as before. */
static inline bool spm_token_is_byte_format(const char *token)
{
    return token != NULL && token[0] == '<' && token[1] == '0' &&
           token[2] == 'x' && spm_is_upper_hex(token[3]) &&
           spm_is_upper_hex(token[4]) && token[5] == '>';
}

References token.

Referenced by spm_build_byte_lookup().

◆ utf8_len()

static int utf8_len ( unsigned char  c)
inlinestatic

Definition at line 541 of file tokenizer.c.

static inline int utf8_len(unsigned char c)
{
    /* Length of a UTF-8 sequence as announced by its lead byte. Malformed
     * leads (continuation bytes 0x80-0xBF, or 0xF8 and above) are treated
     * as single bytes so decoding always makes progress. */
    if (c < 0x80) {
        return 1;           /* 0xxxxxxx: ASCII */
    }
    if ((c >> 5) == 0x06) {
        return 2;           /* 110xxxxx */
    }
    if ((c >> 4) == 0x0E) {
        return 3;           /* 1110xxxx */
    }
    if ((c >> 3) == 0x1E) {
        return 4;           /* 11110xxx */
    }
    return 1;               /* invalid lead byte */
}

Referenced by ck_tokenizer_encode(), and ck_tokenizer_encode_spm_llama_impl().