← Back to C-Kernel-Engine Docs Doxygen Source Documentation
tokenizer.c
Go to the documentation of this file.
1 /*
2  * C-Kernel-Engine Greedy Tokenizer
3  *
4  * High-performance tokenizer with:
5  * - Greedy longest-match encoding
6  * - BPE and WordPiece support
7  * - MurmurHash3 for fast lookups
8  * - AVX-512 string comparison (when available)
9  *
10  * By Anthony Shivakumar
11  */
12 
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <stdint.h>
17 #include <stdbool.h>
18 #include <ctype.h>
19 
20 #include "tokenizer/tokenizer.h"
21 #include "tokenizer/murmurhash3.h"
22 #include "tokenizer/hash_table.h"
23 
/* Token info stored in hash table value.
 * One TokenInfo is heap-allocated per vocabulary entry by
 * ck_tokenizer_add_token and owned by the hash table. */
typedef struct {
    int32_t id;       /* token id; also the index into tok->id_to_token */
    float score;      /* score passed to ck_tokenizer_add_token (SPM merge score) */
    bool is_special;  /* set true by ck_tokenizer_add_special_token */
} TokenInfo;
30 
31 /* Tokenizer structure - defined in tokenizer.h via typedef */
32 
33 /* Create a new tokenizer */
35  CKTokenizer *tok = (CKTokenizer *)malloc(sizeof(CKTokenizer));
36  if (!tok) {
37  return NULL;
38  }
39 
40  memset(tok, 0, sizeof(*tok));
41 
42  /* Create hash table for vocabulary */
44  if (!tok->vocab) {
45  free(tok);
46  return NULL;
47  }
48 
49  /* Create trie for fast lookups (1M nodes for ~50k vocab) */
50  tok->vocab_trie = ck_trie_create(1000000);
51  if (!tok->vocab_trie) {
53  free(tok);
54  return NULL;
55  }
56 
57  /* Initialize reverse vocab */
58  tok->vocab_capacity = 4096;
59  tok->id_to_token = (char **)calloc(tok->vocab_capacity, sizeof(char *));
60  if (!tok->id_to_token) {
62  free(tok);
63  return NULL;
64  }
65 
66  /* Set default special tokens */
67  tok->unk_id = 0;
68  tok->bos_id = 1;
69  tok->eos_id = 2;
70  tok->pad_id = -1;
71  tok->mask_id = -1;
72 
73  /* Initialize scores and types for SPM */
74  tok->scores = NULL;
75  tok->types = NULL;
76 
77  /* Set config */
78  tok->config.type = type;
79  tok->config.add_bos = false;
80  tok->config.add_eos = false;
81  tok->config.add_space_prefix = true;
82  tok->config.unk_score = -1e10f;
84 
85  ck_tokenizer_mempool_init(&tok->pool, 1024 * 1024);
86 
87  return tok;
88 }
89 
90 /* Free a tokenizer */
92  if (!tok) return;
93 
94  /* Free vocabulary entries */
95  if (tok->vocab) {
97  }
98 
99  /* Free trie */
100  if (tok->vocab_trie) {
101  ck_trie_free(tok->vocab_trie);
102  }
103 
104  /* Free reverse vocab strings */
105  if (tok->id_to_token) {
106  /* Note: strings were strdup'd in add_token */
107  for (size_t i = 0; i < tok->vocab_size; i++) {
108  if (tok->id_to_token[i]) {
109  free(tok->id_to_token[i]);
110  }
111  }
112  free(tok->id_to_token);
113  }
114 
115  /* Free SPM-related arrays */
116  if (tok->scores) free(tok->scores);
117  if (tok->types) free(tok->types);
118  if (tok->byte_token_id) free(tok->byte_token_id);
119 
121  free(tok);
122 }
123 
124 /* Reset tokenizer state */
126  if (!tok) return;
127 
129 
130  if (tok->vocab_trie) {
132  }
133 
134  for (size_t i = 0; i < tok->vocab_size; i++) {
135  if (tok->id_to_token[i]) {
136  free(tok->id_to_token[i]);
137  tok->id_to_token[i] = NULL;
138  }
139  }
140 
141  tok->vocab_size = 0;
142 
143  /* Reset SPM-related arrays using actual allocated sizes */
144  if (tok->scores && tok->scores_size > 0) {
145  memset(tok->scores, 0, tok->scores_size * sizeof(float));
146  }
147  if (tok->types && tok->types_size > 0) {
148  memset(tok->types, 0, tok->types_size * sizeof(uint8_t));
149  }
150  /* Clear byte lookup table */
151  if (tok->byte_token_id) {
152  memset(tok->byte_token_id, -1, 256 * sizeof(int32_t));
153  }
154 }
155 
156 /* Add a token to vocabulary */
157 int ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int32_t id, float score) {
158  if (!tok || !token) {
159  return -1;
160  }
161 
162  /* Ensure we have space in reverse vocab */
163  if (id >= (int32_t)tok->vocab_capacity) {
164  size_t new_cap = tok->vocab_capacity * 2;
165  while (new_cap <= (size_t)id) {
166  new_cap *= 2;
167  }
168  char **new_array = (char **)realloc(tok->id_to_token, new_cap * sizeof(char *));
169  if (!new_array) {
170  return -1;
171  }
172  memset(new_array + tok->vocab_capacity, 0, (new_cap - tok->vocab_capacity) * sizeof(char *));
173  tok->id_to_token = new_array;
174  tok->vocab_capacity = new_cap;
175  }
176 
177  /* Check if token already exists */
178  TokenInfo *existing = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, token);
179  if (existing) {
180  existing->id = id;
181  existing->score = score;
182  if (id >= (int32_t)tok->vocab_size) tok->vocab_size = id + 1;
183  if (tok->id_to_token[id]) free(tok->id_to_token[id]);
184  tok->id_to_token[id] = strdup(token);
185  return 0;
186  }
187 
188  /* Create new token info */
189  TokenInfo *info = (TokenInfo *)malloc(sizeof(TokenInfo));
190  if (!info) return -1;
191  info->id = id;
192  info->score = score;
193  info->is_special = false;
194 
195  if (ck_tokenizer_hash_table_insert(tok->vocab, token, info) != 0) {
196  free(info);
197  return -1;
198  }
199 
200  /* Also add to trie for fast longest-match lookups */
201  if (tok->vocab_trie) {
202  ck_trie_insert(tok->vocab_trie, token, id, false, 0);
203  }
204 
205  if (id >= (int32_t)tok->vocab_size) tok->vocab_size = id + 1;
206  if (tok->id_to_token[id]) free(tok->id_to_token[id]);
207  tok->id_to_token[id] = strdup(token);
208 
209  return 0;
210 }
211 
212 /* Add special token */
213 int ck_tokenizer_add_special_token(CKTokenizer *tok, const char *name, int32_t id) {
214  if (!tok || !name) return -1;
215  if (ck_tokenizer_add_token(tok, name, id, -1e10f) != 0) return -1;
216 
217  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, name);
218  if (info) info->is_special = true;
219 
220  /* Also add to trie as special */
221  if (tok->vocab_trie) {
222  ck_trie_insert(tok->vocab_trie, name, id, true, 0);
223  }
224 
225  if (strcmp(name, "<unk>") == 0 || strcmp(name, "[UNK]") == 0) tok->unk_id = id;
226  else if (strcmp(name, "<s>") == 0 || strcmp(name, "<bos>") == 0 || strcmp(name, "[BOS]") == 0) tok->bos_id = id;
227  else if (strcmp(name, "</s>") == 0 || strcmp(name, "<eos>") == 0 || strcmp(name, "[EOS]") == 0) tok->eos_id = id;
228  else if (strcmp(name, "<pad>") == 0 || strcmp(name, "[PAD]") == 0) tok->pad_id = id;
229 
230  return 0;
231 }
232 
233 /* Set special token IDs */
234 void ck_tokenizer_set_special_ids(CKTokenizer *tok, int32_t unk, int32_t bos, int32_t eos, int32_t pad, int32_t mask) {
235  if (!tok) return;
236  tok->unk_id = unk;
237  tok->bos_id = bos;
238  tok->eos_id = eos;
239  tok->pad_id = pad;
240  tok->mask_id = mask;
241 }
242 
244  if (!tok) return;
245  tok->config.add_bos = add_bos;
246  tok->config.add_eos = add_eos;
247 }
248 
250  if (!tok) return;
252 }
253 
255  if (!tok) return;
256  tok->config.spm_mode = spm_mode;
257 }
258 
259 /* Set whether to use trie for lookups */
261  if (!tok) return;
262  tok->config.use_trie = use_trie;
263 }
264 
265 /* Set space prefix style for BPE tokenizers */
267  if (!tok) return;
269  if (style != CK_SPACE_PREFIX_AUTO) {
270  tok->config.space_prefix_detected = true;
271  }
272 }
273 
274 /* Auto-detect space prefix style from vocabulary.
275  * Checks for presence of tokens starting with Ġ (GPT-2) vs ▁ (SentencePiece). */
277  if (!tok) return CK_SPACE_PREFIX_GPT2;
278 
279  /* Already detected? */
281  return tok->config.space_prefix_style;
282  }
283 
284  /* Count tokens starting with each style:
285  * Ġ (U+0120) = bytes 0xC4 0xA0
286  * ▁ (U+2581) = bytes 0xE2 0x96 0x81
287  */
288  int gpt2_count = 0;
289  int spm_count = 0;
290 
291  for (size_t i = 0; i < tok->vocab_size && i < 10000; i++) { /* Sample first 10k tokens */
292  const char *token = tok->id_to_token[i];
293  if (!token) continue;
294 
295  unsigned char c0 = (unsigned char)token[0];
296  unsigned char c1 = (unsigned char)token[1];
297 
298  /* Check for Ġ (0xC4 0xA0) */
299  if (c0 == 0xC4 && c1 == 0xA0) {
300  gpt2_count++;
301  }
302  /* Check for ▁ (0xE2 0x96 0x81) */
303  else if (c0 == 0xE2 && c1 == 0x96 && (unsigned char)token[2] == 0x81) {
304  spm_count++;
305  }
306  }
307 
308  /* Determine style based on counts */
309  CKSpacePrefixStyle detected;
310  if (spm_count > gpt2_count * 2) {
311  detected = CK_SPACE_PREFIX_SPM;
312  } else {
313  detected = CK_SPACE_PREFIX_GPT2; /* Default to GPT-2 if similar counts */
314  }
315 
316  tok->config.space_prefix_style = detected;
317  tok->config.space_prefix_detected = true;
318 
319  return detected;
320 }
321 
322 /* Look up token ID */
323 int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token) {
324  if (!tok || !token) return -1;
325  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, token);
326  return info ? info->id : tok->unk_id;
327 }
328 
329 /* Internal exact lookup (returns -1 if token string is not in vocab). */
330 static int32_t ck_tokenizer_lookup_exact(const CKTokenizer *tok, const char *token) {
331  if (!tok || !token) return -1;
332  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, token);
333  return info ? info->id : -1;
334 }
335 
336 /* Internal exact lookup for non-null-terminated text slices. */
337 static int32_t ck_tokenizer_lookup_exact_n(const CKTokenizer *tok, const char *text, int text_len) {
338  if (!tok || !text || text_len <= 0) return -1;
339  char stack_buf[512];
340  char *tmp = stack_buf;
341  if (text_len >= (int)sizeof(stack_buf)) {
342  tmp = (char *)malloc((size_t)text_len + 1);
343  if (!tmp) return -1;
344  }
345  memcpy(tmp, text, (size_t)text_len);
346  tmp[text_len] = '\0';
347  int32_t id = ck_tokenizer_lookup_exact(tok, tmp);
348  if (tmp != stack_buf) free(tmp);
349  return id;
350 }
351 
352 /* Get token string by ID */
353 const char *ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id) {
354  if (!tok || id < 0 || id >= (int32_t)tok->vocab_size) return NULL;
355  return tok->id_to_token[id];
356 }
357 
358 /* Find longest matching token at position using trie (O(k) where k = token length) */
359 static int32_t find_longest_match_trie(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) {
360  if (!tok || !tok->vocab_trie || !text || pos >= text_len) {
361  *match_len = 0;
362  return tok ? tok->unk_id : -1;
363  }
364 
365  int32_t token_id = ck_trie_find_longest(tok->vocab_trie, text, text_len, pos, match_len);
366  return token_id >= 0 ? token_id : tok->unk_id;
367 }
368 
369 /* Find longest matching token at position using hash table (O(n*k) worst case) */
370 static int32_t find_longest_match_hash(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) {
371  if (!tok || !text || pos >= text_len) {
372  *match_len = 0;
373  return tok ? tok->unk_id : -1;
374  }
375 
376  size_t max_len = 64;
377  if (pos + max_len > text_len) max_len = text_len - pos;
378 
379  int32_t best_id = tok->unk_id;
380  size_t best_len = 0;
381 
382  for (size_t len = max_len; len >= 1; len--) {
383  char tmp[65];
384  memcpy(tmp, text + pos, len);
385  tmp[len] = '\0';
386 
387  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, tmp);
388  if (info) {
389  best_id = info->id;
390  best_len = len;
391  break;
392  }
393  }
394 
395  *match_len = best_len;
396  return best_id;
397 }
398 
399 /* Find longest matching token at position - dispatches to trie or hash table */
400 static int32_t find_longest_match(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) {
401  if (tok->config.use_trie) {
402  return find_longest_match_trie(tok, text, text_len, pos, match_len);
403  } else {
404  return find_longest_match_hash(tok, text, text_len, pos, match_len);
405  }
406 }
407 
408 /* Convert ASCII spaces to space prefix marker.
409  * GPT-2/Qwen use Ġ (U+0120, bytes 0xC4 0xA0) - replaces spaces only
410  * LLaMA/SentencePiece use ▁ (U+2581, bytes 0xE2 0x96 0x81) - adds prefix at start AND replaces spaces
411  * Returns new length, or -1 if buffer too small. */
412 static int preprocess_bpe_spaces(const char *text, int text_len, char *out, int out_max, CKSpacePrefixStyle style) {
413  int out_len = 0;
414 
415  /* For SentencePiece, add ▁ at the start of text (unless text starts with space) */
416  if (style == CK_SPACE_PREFIX_SPM && text_len > 0 && text[0] != ' ') {
417  if (out_len + 3 > out_max) return -1;
418  out[out_len++] = (char)0xE2;
419  out[out_len++] = (char)0x96;
420  out[out_len++] = (char)0x81;
421  }
422 
423  for (int i = 0; i < text_len; i++) {
424  if (text[i] == ' ') {
425  if (style == CK_SPACE_PREFIX_SPM) {
426  /* SentencePiece style: ▁ (3 bytes: 0xE2 0x96 0x81) */
427  if (out_len + 3 > out_max) return -1;
428  out[out_len++] = (char)0xE2;
429  out[out_len++] = (char)0x96;
430  out[out_len++] = (char)0x81;
431  } else {
432  /* GPT-2 style: Ġ (2 bytes: 0xC4 0xA0) */
433  if (out_len + 2 > out_max) return -1;
434  out[out_len++] = (char)0xC4;
435  out[out_len++] = (char)0xA0;
436  }
437  } else {
438  if (out_len + 1 > out_max) return -1;
439  out[out_len++] = text[i];
440  }
441  }
442  return out_len;
443 }
444 
445 /* ============================================================================
446  * SPM (SentencePiece) Tokenization with Viterbi/DP
447  * ============================================================================ */
448 
449 /*
450  * GGUF Token Type enum values (from llama.cpp gguf/constants.py):
451  * NORMAL = 1
452  * UNKNOWN = 2
453  * CONTROL = 3
454  * USER_DEFINED = 4
455  * UNUSED = 5
456  * BYTE = 6
457  *
458  * IMPORTANT: These must match exactly or token filtering will be incorrect!
459  */
460 #define GGUF_TOKEN_NORMAL 1
461 #define GGUF_TOKEN_UNKNOWN 2
462 #define GGUF_TOKEN_CONTROL 3
463 #define GGUF_TOKEN_USER_DEFINED 4
464 #define GGUF_TOKEN_UNUSED 5
465 #define GGUF_TOKEN_BYTE 6
466 
467 /* Check if token type allows inclusion in DP path (exclude CONTROL, UNUSED, BYTE)
468  * Note: UNKNOWN tokens are allowed because they're needed for unknown content */
469 static inline bool spm_token_allowed_in_dp(const CKTokenizer *tok, int32_t token_id) {
470  if (!tok->types || token_id < 0 || token_id >= (int32_t)tok->vocab_size) {
471  return true; /* No type info, allow all */
472  }
473  uint8_t t = tok->types[token_id];
474  /* Reject CONTROL, UNUSED, and BYTE tokens (but allow UNKNOWN for fallback) */
475  return t != GGUF_TOKEN_CONTROL && t != GGUF_TOKEN_UNUSED && t != GGUF_TOKEN_BYTE;
476 }
477 
478 /* Check if token is a byte token (for identification) */
479 static inline bool spm_is_byte_token(const CKTokenizer *tok, int32_t token_id) {
480  if (!tok->types || token_id < 0 || token_id >= (int32_t)tok->vocab_size) {
481  return false;
482  }
483  return tok->types[token_id] == GGUF_TOKEN_BYTE;
484 }
485 
486 /* Find byte token ID using fast lookup table (primary) or <0xXX> fallback */
487 static inline int32_t spm_get_byte_token(const CKTokenizer *tok, unsigned char byte_val) {
488  /* Try fast lookup table first */
489  if (tok->byte_token_id && tok->byte_token_id[byte_val] >= 0) {
490  return tok->byte_token_id[byte_val];
491  }
492  /* Fallback to <0xXX> format */
493  char byte_token[16];
494  int len = snprintf(byte_token, sizeof(byte_token), "<0x%02X>", byte_val);
495  if (len <= 0) return tok->unk_id;
496  return ck_tokenizer_lookup(tok, byte_token);
497 }
498 
/* Return true if `c` is an uppercase hexadecimal digit (0-9 or A-F). */
static inline bool spm_is_upper_hex(char c) {
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F');
}

/* Check if a token string is spelled in the canonical byte-token form
 * "<0xXX>" where XX are two uppercase hex digits.
 *
 * Bug fix: the previous test `c >= '0' && c <= 'F'` also accepted the ASCII
 * punctuation ':' ';' '<' '=' '>' '?' '@' sitting between '9' and 'A', so
 * strings such as "<0x@:>" were misclassified as byte tokens. */
static inline bool spm_token_is_byte_format(const char *token) {
    return token && token[0] == '<' && token[1] == '0' &&
           token[2] == 'x' && spm_is_upper_hex(token[3]) &&
           spm_is_upper_hex(token[4]) && token[5] == '>';
}
505 
506 /* Build byte token lookup table from vocab (called during load) */
507 static void spm_build_byte_lookup(CKTokenizer *tok, const char *strings, const int32_t *offsets, int vocab_size) {
508  /* Reuse existing array or allocate new one */
509  if (!tok->byte_token_id) {
510  tok->byte_token_id = (int32_t *)malloc(256 * sizeof(int32_t));
511  if (!tok->byte_token_id) return;
512  }
513 
514  /* Initialize all entries to -1 */
515  for (int i = 0; i < 256; i++) {
516  tok->byte_token_id[i] = -1;
517  }
518 
519  /* Scan vocab for byte tokens */
520  for (int i = 0; i < vocab_size; i++) {
521  if (!tok->types || tok->types[i] != GGUF_TOKEN_BYTE) continue;
522 
523  const char *token = strings + offsets[i];
524  size_t len = strlen(token);
525 
526  if (len == 1) {
527  /* Raw byte token (single byte) */
528  unsigned char byte_val = (unsigned char)token[0];
529  tok->byte_token_id[byte_val] = i;
530  } else if (spm_token_is_byte_format(token)) {
531  /* <0xXX> format - parse the hex value */
532  unsigned int byte_val;
533  if (sscanf(token, "<0x%02X>", &byte_val) == 1 && byte_val < 256) {
534  tok->byte_token_id[byte_val] = i;
535  }
536  }
537  }
538 }
539 
/* Byte length of the UTF-8 sequence whose lead byte is `c`.
 * Continuation bytes and invalid leads (0xF8 and above) count as 1 so the
 * caller always makes forward progress. */
static inline int utf8_len(unsigned char c) {
    if (c < 0x80)           return 1; /* 0xxxxxxx: ASCII */
    if ((c & 0xE0) == 0xC0) return 2; /* 110xxxxx */
    if ((c & 0xF0) == 0xE0) return 3; /* 1110xxxx */
    if ((c & 0xF8) == 0xF0) return 4; /* 11110xxx */
    return 1;                         /* invalid lead byte */
}
548 
/* llama.cpp-style SPM whitespace handling:
 * - optionally emit a dummy prefix as ▁ (U+2581) for non-empty input
 * - replace each SINGLE ASCII space with ▁
 * - keep multi-space runs literal (llama.cpp parity for this GGUF family)
 * - never trim or collapse whitespace
 * Returns bytes written, or -1 if `out` is too small.
 * A negative text_len means "use strlen(text)". */
static int preprocess_spm_llama_text(const char *text, int text_len, char *out, int out_max, bool add_space_prefix) {
    static const char underscore[3] = { (char)0xE2, (char)0x96, (char)0x81 }; /* ▁ */
    int n = 0;

    if (text_len < 0) {
        text_len = (int)strlen(text);
    }

    if (add_space_prefix && text_len > 0) {
        if (n + 3 > out_max) return -1;
        memcpy(out + n, underscore, 3);
        n += 3;
    }

    int i = 0;
    while (i < text_len) {
        if (text[i] != ' ') {
            if (n + 1 > out_max) return -1;
            out[n++] = text[i++];
            continue;
        }

        /* Measure the run of consecutive spaces starting at i. */
        int run_end = i;
        while (run_end < text_len && text[run_end] == ' ') {
            run_end++;
        }
        int run = run_end - i;

        if (run == 1) {
            /* Single separator becomes ▁. */
            if (n + 3 > out_max) return -1;
            memcpy(out + n, underscore, 3);
            n += 3;
        } else {
            /* Multi-space runs stay literal. */
            if (n + run > out_max) return -1;
            memset(out + n, ' ', (size_t)run);
            n += run;
        }
        i = run_end;
    }

    return n;
}
595 
/* Linked-list element used during llama.cpp-style SPM merging.
 * Symbols start as one UTF-8 codepoint each; when a pair merges, the left
 * symbol absorbs the right one's bytes and the right is unlinked. */
typedef struct {
    int prev;          /* index of previous live symbol, -1 at head */
    int next;          /* index of next live symbol, -1 at tail */
    const char *text;  /* start of this symbol's bytes (into preprocessed buffer) */
    int n;             /* byte length (set to 0 when merged away) */
    int node_id;       /* merge-tree node currently representing this symbol */
} SpmLlamaSymbol;

/* Node of the binary merge tree built during SPM merging: leaves are single
 * codepoints; interior nodes record which two children formed the span
 * (used by spm_llama_resegment_node to re-split unmatched spans). */
typedef struct {
    const char *text;  /* start of the span's bytes */
    int n;             /* byte length of the span */
    int left;          /* left child node index, -1 for leaves */
    int right;         /* right child node index, -1 for leaves */
} SpmLlamaNode;
610 
611 static int spm_llama_resegment_node(const CKTokenizer *tok,
612  const SpmLlamaNode *nodes,
613  int node_id,
614  int32_t *ids,
615  int max_ids,
616  int out_idx) {
617  if (!tok || !nodes || node_id < 0 || !ids || out_idx >= max_ids) {
618  return out_idx;
619  }
620 
621  const SpmLlamaNode *node = &nodes[node_id];
622  int32_t token_id = ck_tokenizer_lookup_exact_n(tok, node->text, node->n);
623  if (token_id >= 0) {
624  ids[out_idx++] = token_id;
625  return out_idx;
626  }
627 
628  if (node->left >= 0 && node->right >= 0) {
629  out_idx = spm_llama_resegment_node(tok, nodes, node->left, ids, max_ids, out_idx);
630  out_idx = spm_llama_resegment_node(tok, nodes, node->right, ids, max_ids, out_idx);
631  return out_idx;
632  }
633 
634  for (int i = 0; i < node->n && out_idx < max_ids; i++) {
635  int32_t byte_token = spm_get_byte_token(tok, (unsigned char)node->text[i]);
636  ids[out_idx++] = (byte_token >= 0) ? byte_token : tok->unk_id;
637  }
638  return out_idx;
639 }
640 
641 /* llama.cpp merge-style SPM path (LLAMA_VOCAB_TYPE_SPM). */
643  const char *text,
644  int text_len,
645  int32_t *ids,
646  int max_ids) {
647  if (!tok || !text || !ids || max_ids <= 0) return 0;
648  if (text_len < 0) text_len = (int)strlen(text);
649  if (text_len == 0) return 0;
650 
651  char preprocessed[8192];
652  int pp_len = preprocess_spm_llama_text(text, text_len, preprocessed, (int)sizeof(preprocessed) - 1,
653  tok->config.add_space_prefix);
654  if (pp_len < 0) return 0;
655  preprocessed[pp_len] = '\0';
656 
657  int num_symbols = 0;
658  for (int offs = 0; offs < pp_len;) {
659  int char_len = utf8_len((unsigned char)preprocessed[offs]);
660  if (char_len <= 0) char_len = 1;
661  if (offs + char_len > pp_len) char_len = pp_len - offs;
662  offs += char_len;
663  num_symbols++;
664  }
665  if (num_symbols <= 0) return 0;
666 
667  SpmLlamaSymbol *symbols = (SpmLlamaSymbol *)calloc((size_t)num_symbols, sizeof(SpmLlamaSymbol));
668  int node_cap = 2 * num_symbols + 1;
669  SpmLlamaNode *nodes = (SpmLlamaNode *)calloc((size_t)node_cap, sizeof(SpmLlamaNode));
670  if (!symbols || !nodes) {
671  if (symbols) free(symbols);
672  if (nodes) free(nodes);
673  return 0;
674  }
675 
676  int index = 0;
677  for (int offs = 0; offs < pp_len && index < num_symbols;) {
678  int char_len = utf8_len((unsigned char)preprocessed[offs]);
679  if (char_len <= 0) char_len = 1;
680  if (offs + char_len > pp_len) char_len = pp_len - offs;
681 
682  symbols[index].text = preprocessed + offs;
683  symbols[index].n = char_len;
684  symbols[index].prev = index - 1;
685  symbols[index].next = (index + 1 < num_symbols) ? (index + 1) : -1;
686  symbols[index].node_id = index;
687 
688  nodes[index].text = preprocessed + offs;
689  nodes[index].n = char_len;
690  nodes[index].left = -1;
691  nodes[index].right = -1;
692 
693  offs += char_len;
694  index++;
695  }
696 
697  int node_count = num_symbols;
698  for (;;) {
699  int best_left = -1;
700  int best_right = -1;
701  float best_score = -1e30f;
702 
703  for (int left = 0; left != -1; left = symbols[left].next) {
704  int right = symbols[left].next;
705  if (right < 0) continue;
706 
707  int pair_len = symbols[left].n + symbols[right].n;
708  int32_t token_id = ck_tokenizer_lookup_exact_n(tok, symbols[left].text, pair_len);
709  if (token_id < 0 || token_id >= (int32_t)tok->vocab_size) continue;
710 
711  float score = 0.0f;
712  if (tok->scores && token_id >= 0 && token_id < (int32_t)tok->scores_size) {
713  score = tok->scores[token_id];
714  }
715 
716  if (best_left < 0 || score > best_score || (score == best_score && left < best_left)) {
717  best_left = left;
718  best_right = right;
719  best_score = score;
720  }
721  }
722 
723  if (best_left < 0 || best_right < 0) break;
724  if (node_count >= node_cap) break;
725 
726  SpmLlamaSymbol *left = &symbols[best_left];
727  SpmLlamaSymbol *right = &symbols[best_right];
728 
729  int new_node_id = node_count++;
730  nodes[new_node_id].text = left->text;
731  nodes[new_node_id].n = left->n + right->n;
732  nodes[new_node_id].left = left->node_id;
733  nodes[new_node_id].right = right->node_id;
734 
735  left->n += right->n;
736  left->node_id = new_node_id;
737  left->next = right->next;
738  if (right->next >= 0) {
739  symbols[right->next].prev = best_left;
740  }
741 
742  right->n = 0;
743  right->prev = -1;
744  right->next = -1;
745  }
746 
747  int out_idx = 0;
748  for (int i = 0; i != -1 && out_idx < max_ids; i = symbols[i].next) {
749  out_idx = spm_llama_resegment_node(tok, nodes, symbols[i].node_id, ids, max_ids, out_idx);
750  }
751 
752  free(symbols);
753  free(nodes);
754  return out_idx;
755 }
756 
/* Replace spaces with the SentencePiece underscore ▁ (U+2581), with
 * SPM-style whitespace normalization:
 * - leading spaces are consumed (a dummy ▁ prefix may be added instead)
 * - interior runs of spaces collapse to a single ▁
 * - trailing spaces are consumed
 * The prefix is skipped if the text already starts with a literal ▁ or if
 * add_space_prefix is false. Returns bytes written, or -1 on overflow. */
static int preprocess_spm_text(const char *text, int text_len, char *out, int out_max, bool add_space_prefix) {
    static const char underscore[3] = { (char)0xE2, (char)0x96, (char)0x81 }; /* ▁ */
    int n = 0;

    /* Consume leading spaces. */
    int start = 0;
    while (start < text_len && text[start] == ' ') {
        start++;
    }

    /* Consume trailing spaces. */
    int end = text_len;
    while (end > start && text[end - 1] == ' ') {
        end--;
    }

    /* Does the input already begin with a literal ▁? */
    bool has_prefix = text_len >= 3 &&
                      (unsigned char)text[0] == 0xE2 &&
                      (unsigned char)text[1] == 0x96 &&
                      (unsigned char)text[2] == 0x81;

    /* Emit the dummy prefix only when there is actual content. */
    bool emitted_prefix = false;
    if (end > start && !has_prefix && add_space_prefix) {
        if (n + 3 > out_max) return -1;
        memcpy(out + n, underscore, 3);
        n += 3;
        emitted_prefix = true;
    }

    /* Copy content, collapsing interior space runs to a single ▁. */
    bool prev_space = has_prefix || emitted_prefix;
    for (int i = start; i < end; i++) {
        if (text[i] == ' ') {
            if (!prev_space) {
                if (n + 3 > out_max) return -1;
                memcpy(out + n, underscore, 3);
                n += 3;
                prev_space = true;
            }
            /* further consecutive spaces are dropped */
        } else {
            if (n + 1 > out_max) return -1;
            out[n++] = text[i];
            prev_space = false;
        }
    }

    return n;
}
818 
819 /* Forward declaration for SPM Viterbi */
820 static int spm_find_candidates_at_pos(const CKTokenizer *tok, const char *text, int text_len,
821  size_t pos, int32_t *candidates, int max_candidates);
822 
823 /* Forward declaration for unknown run counting */
824 static int spm_count_unknown_run(const CKTokenizer *tok, const char *text, int text_len, size_t pos);
825 
826 /* Forward declaration for byte fallback */
827 static int spm_encode_byte_fallback(const CKTokenizer *tok,
828  const char *text, int text_len,
829  int32_t *ids, int max_ids);
830 
831 /* SPM Viterbi/DP encoding - finds best token sequence using token scores */
833  const char *text,
834  int text_len,
835  int32_t *ids,
836  int max_ids) {
837  if (!tok || !text || !ids || max_ids <= 0) return 0;
838  if (text_len < 0) text_len = (int)strlen(text);
839  if (text_len == 0) return 0;
840  const int dbg = getenv("CK_DEBUG_SPM_ENCODE") ? 1 : 0;
841  if (dbg) {
842  fprintf(stderr, "[SPM] encode start: text_len=%d max_ids=%d\n", text_len, max_ids);
843  }
844 
845  /* Preprocess: replace spaces with ▁ */
846  char preprocessed[8192];
847  int pp_len = preprocess_spm_text(text, text_len, preprocessed, sizeof(preprocessed) - 1,
848  tok->config.add_space_prefix);
849  if (pp_len < 0) return 0;
850  preprocessed[pp_len] = '\0';
851  if (dbg) {
852  fprintf(stderr, "[SPM] preprocessed len=%d: \"%.*s\"\n", pp_len, pp_len, preprocessed);
853  }
854 
855  /* DP arrays - use malloc for large inputs */
856  size_t n = (size_t)pp_len + 1;
857  float *best_score = (float *)malloc(n * sizeof(float));
858  int32_t *best_prev = (int32_t *)malloc(n * sizeof(int32_t));
859  int32_t *best_token = (int32_t *)malloc(n * sizeof(int32_t));
860  if (dbg) {
861  fprintf(stderr, "[SPM] DP alloc n=%zu\n", n);
862  }
863 
864  if (!best_score || !best_prev || !best_token) {
865  if (best_score) free(best_score);
866  if (best_prev) free(best_prev);
867  if (best_token) free(best_token);
868  return 0;
869  }
870 
871  /* Initialize DP */
872  const float neg_inf = -1e30f;
873  const float unknown_penalty = -10.0f; /* SentencePiece-style UNK penalty */
874  for (size_t i = 0; i < n; i++) {
875  best_score[i] = neg_inf;
876  best_prev[i] = -1;
877  best_token[i] = -1;
878  }
879  best_score[0] = 0.0f;
880 
881  /* DP: for each position, find best way to reach it */
882  for (size_t pos = 0; pos < n; pos++) {
883  if (best_score[pos] == neg_inf) continue;
884 
885  /* Find all tokens that match at this position */
886  int32_t candidates[64];
887  int num_cand = spm_find_candidates_at_pos(tok, preprocessed, pp_len, pos, candidates, 64);
888  if (dbg && pos < 8) {
889  fprintf(stderr, "[SPM] pos=%zu cand=%d\n", pos, num_cand);
890  }
891 
892  for (int c = 0; c < num_cand; c++) {
893  int32_t token_id = candidates[c];
894 
895  /* Skip disallowed token types in DP */
896  if (!spm_token_allowed_in_dp(tok, token_id)) {
897  continue;
898  }
899 
900  /* Get token string and length */
901  const char *token = ck_tokenizer_id_to_token(tok, token_id);
902  if (!token) continue;
903 
904  /* Calculate token length in bytes */
905  int token_len = (int)strlen(token);
906 
907  /* For UNK token, use the unknown run length to cover all consecutive unknown bytes */
908  if (token_id == tok->unk_id) {
909  token_len = spm_count_unknown_run(tok, preprocessed, pp_len, pos);
910  if (token_len == 0) token_len = 1; /* At least 1 byte */
911  }
912 
913  size_t next_pos = pos + token_len;
914 
915  if (next_pos >= n) continue;
916 
917  /* Get token score for Viterbi */
918  float token_score = 0.0f;
919  if (tok->scores && token_id >= 0 && token_id < (int32_t)tok->vocab_size) {
920  token_score = tok->scores[token_id];
921  }
922 
923  /* USER_DEFINED tokens get score 0 (like llama.cpp) */
924  if (tok->types && token_id >= 0 && token_id < (int32_t)tok->types_size) {
925  if (tok->types[token_id] == GGUF_TOKEN_USER_DEFINED) {
926  token_score = 0.0f;
927  }
928  }
929  /* Apply UNK penalty (SentencePiece behavior) */
930  if (token_id == tok->unk_id) {
931  token_score += unknown_penalty;
932  }
933 
934  /* Transition: score = best_score[pos] + token_score */
935  float new_score = best_score[pos] + token_score;
936 
937  if (new_score > best_score[next_pos]) {
938  best_score[next_pos] = new_score;
939  best_prev[next_pos] = (int32_t)pos;
940  best_token[next_pos] = token_id;
941  }
942  }
943  }
944 
945  /* Backtrack to find best token sequence */
946  int32_t *reverse_ids = (int32_t *)malloc(max_ids * sizeof(int32_t));
947  if (!reverse_ids) {
948  free(best_score);
949  free(best_prev);
950  free(best_token);
951  return 0;
952  }
953 
954  int num_tokens = 0;
955  int32_t curr = (int32_t)(n - 1);
956 
 962  /* Handle trailing UNK tokens by backtracking to the last position with a valid token */
958  while (curr > 0 && best_token[curr] < 0) {
959  curr = best_prev[curr];
960  }
961 
962  /* Backtrack from end to start, collecting tokens.
963  * We track the token's start position to avoid duplicates. */
964  int last_start = -1; /* Track the start position of last added token */
965  while (curr > 0 && num_tokens < max_ids) {
966  int32_t token_id = best_token[curr];
967  if (token_id >= 0) {
968  /* Use the DP backpointer as the true token start */
969  int token_start = best_prev[curr];
970 
971  /* Only add if this is a new token (different start position) */
972  if (token_start != last_start) {
973  reverse_ids[num_tokens++] = token_id;
974  last_start = token_start;
975  }
976  }
977  curr = best_prev[curr];
978  }
979  if (dbg) {
980  fprintf(stderr, "[SPM] backtrack tokens=%d curr=%d\n", num_tokens, curr);
981  }
982 
983  /* Free DP arrays before using reverse_ids */
984  free(best_score);
985  free(best_prev);
986  free(best_token);
987 
988  if (num_tokens > max_ids) num_tokens = max_ids;
989 
990  /* Backtracking collected tokens in reverse order, so reverse once */
991  for (int i = 0; i < num_tokens / 2; i++) {
992  int32_t tmp = reverse_ids[i];
993  reverse_ids[i] = reverse_ids[num_tokens - 1 - i];
994  reverse_ids[num_tokens - 1 - i] = tmp;
995  }
996 
997  /* Copy to output and merge consecutive UNK tokens (SPM behavior) */
998  int out_idx = 0;
999  for (int i = 0; i < num_tokens && out_idx < max_ids; i++) {
1000  int32_t token_id = reverse_ids[i];
1001 
1002  /* Merge consecutive UNK tokens into one */
1003  if (token_id == tok->unk_id && out_idx > 0 && ids[out_idx - 1] == tok->unk_id) {
1004  continue; /* Skip - already have UNK */
1005  }
1006  ids[out_idx++] = token_id;
1007  }
1008  if (dbg) {
1009  fprintf(stderr, "[SPM] encode done: out=%d\n", out_idx);
1010  }
1011 
1012  free(reverse_ids);
1013 
1014  /* If DP failed to produce valid tokens, use byte-fallback */
1015  if (num_tokens == 0) {
1017  }
1018 
1019  return out_idx;
1020 }
1021 
1022 /* Fallback: encode using byte tokens for any unmatched content.
1023  * Uses the ORIGINAL text (not preprocessed), mapping each byte to a byte token. */
1025  const char *text, int text_len,
1026  int32_t *ids, int max_ids) {
1027  if (!tok || !text || !ids || max_ids <= 0) return 0;
1028  if (text_len < 0) text_len = (int)strlen(text);
1029  if (text_len == 0) return 0;
1030 
1031  int count = 0;
1032  for (int i = 0; i < text_len && count < max_ids; i++) {
1033  unsigned char byte_val = (unsigned char)text[i];
1034  int32_t byte_token = spm_get_byte_token(tok, byte_val);
1035 
1036  /* If we have a byte token, use it; otherwise use UNK */
1037  if (byte_token >= 0 && byte_token != tok->unk_id) {
1038  ids[count++] = byte_token;
1039  } else {
1040  ids[count++] = tok->unk_id;
1041  }
1042  }
1043  return count;
1044 }
1045 
1046 /* Find all candidate tokens matching at position */
1047 static int spm_find_candidates_at_pos(const CKTokenizer *tok, const char *text, int text_len,
1048  size_t pos, int32_t *candidates, int max_candidates) {
1049  if (!tok || !text || pos >= (size_t)text_len) return 0;
1050 
1051  int num_found = 0;
1052  int max_len = 64;
1053  if (pos + max_len > (size_t)text_len) max_len = (int)(text_len - pos);
1054 
1055  /* Iterate from longest to shortest to find all matches */
1056  char tmp[65];
1057  for (int len = max_len; len >= 1 && num_found < max_candidates; len--) {
1058  memcpy(tmp, text + pos, len);
1059  tmp[len] = '\0';
1060 
1061  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, tmp);
1062  if (info && info->id >= 0 && info->id != tok->unk_id) {
1063  /* Skip disallowed token types */
1064  if (!spm_token_allowed_in_dp(tok, info->id)) {
1065  continue;
1066  }
1067 
1068  /* Check if already added */
1069  int dup = 0;
1070  for (int j = 0; j < num_found; j++) {
1071  if (candidates[j] == info->id) {
1072  dup = 1;
1073  break;
1074  }
1075  }
1076  if (!dup) {
1077  candidates[num_found++] = info->id;
1078  }
1079  }
1080  }
1081 
1082  /* If no candidates found, add UNK token as fallback.
1083  * For SPM, UNK should cover all consecutive unknown bytes until a known token or end.
1084  * We handle this by adding UNK with a special marker - we'll check at runtime
1085  * how far we can extend it. */
1086  if (num_found == 0 && tok->unk_id >= 0 && max_candidates > 0) {
1087  /* Only add UNK if it's allowed in DP */
1088  if (spm_token_allowed_in_dp(tok, tok->unk_id)) {
1089  candidates[num_found++] = tok->unk_id;
1090  }
1091  }
1092 
1093  return num_found;
1094 }
1095 
1096 /* Count how many consecutive bytes at text[pos] are not start of any vocab token.
1097  * Also stop at UTF-8 encoded '▁' (U+2581 = 0xE2 0x96 0x81) since that's a known token. */
1098 static int spm_count_unknown_run(const CKTokenizer *tok, const char *text, int text_len, size_t pos) {
1099  int run = 0;
1100  while (pos + run < (size_t)text_len) {
1101  /* Stop at '▁' (U+2581 = 0xE2 0x96 0x81) since that's a known token */
1102  if (pos + run + 3 <= (size_t)text_len &&
1103  (unsigned char)text[pos + run] == 0xE2 &&
1104  (unsigned char)text[pos + run + 1] == 0x96 &&
1105  (unsigned char)text[pos + run + 2] == 0x81) {
1106  break;
1107  }
1108 
1109  /* Check if any vocab token matches at this position */
1110  int max_len = 64;
1111  if (pos + run + max_len > (size_t)text_len) {
1112  max_len = (int)(text_len - pos - run);
1113  }
1114  int found = 0;
1115  for (int len = max_len; len >= 1; len--) {
1116  char tmp[65];
1117  memcpy(tmp, text + pos + run, len);
1118  tmp[len] = '\0';
1119  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, tmp);
1120  if (info && info->id >= 0 && info->id != tok->unk_id && spm_token_allowed_in_dp(tok, info->id)) {
1121  found = 1;
1122  break;
1123  }
1124  }
1125  if (found) break;
1126  run++;
1127  }
1128  return run;
1129 }
1130 
1131 /* Encode text to token IDs using greedy longest-match or Viterbi for SPM */
1132 int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) {
1133  if (!tok || !text || !ids || max_ids <= 0) return 0;
1134  if (text_len < 0) text_len = (int)strlen(text);
1135 
1136  /* For SPM tokenizers, use either unigram/Viterbi or llama-style SPM. */
1137  if (tok->config.type == CK_TOKENIZER_SPM) {
1138  int out_idx = 0;
1139  if (tok->config.add_bos && tok->bos_id >= 0 && out_idx < max_ids) {
1140  ids[out_idx++] = tok->bos_id;
1141  }
1142  if (text_len == 0) {
1143  if (tok->config.add_eos && tok->eos_id >= 0 && out_idx < max_ids) {
1144  ids[out_idx++] = tok->eos_id;
1145  }
1146  return out_idx;
1147  }
1148  int n = 0;
1149  if (tok->config.spm_mode == CK_SPM_MODE_LLAMA) {
1150  n = ck_tokenizer_encode_spm_llama_impl(tok, text, text_len, ids + out_idx, max_ids - out_idx);
1151  } else {
1152  n = ck_tokenizer_encode_spm_impl(tok, text, text_len, ids + out_idx, max_ids - out_idx);
1153  }
1154  if (n <= 0) return n;
1155  out_idx += n;
1156  if (tok->config.add_eos && tok->eos_id >= 0 && out_idx < max_ids) {
1157  ids[out_idx++] = tok->eos_id;
1158  }
1159  return out_idx;
1160  }
1161 
1162  /* For BPE tokenizers, convert spaces to appropriate prefix marker.
1163  * Auto-detect style from vocabulary if not already set. */
1164  char preprocessed[8192];
1165  const char *input = text;
1166  int input_len = text_len;
1167 
1168  if (tok->config.type == CK_TOKENIZER_BPE) {
1169  /* Get or detect space prefix style */
1170  CKSpacePrefixStyle style = ((CKTokenizer *)tok)->config.space_prefix_style;
1171  if (!((CKTokenizer *)tok)->config.space_prefix_detected) {
1173  }
1174 
1175  int pp_len = preprocess_bpe_spaces(text, text_len, preprocessed, sizeof(preprocessed) - 1, style);
1176  if (pp_len > 0) {
1177  preprocessed[pp_len] = '\0';
1178  input = preprocessed;
1179  input_len = pp_len;
1180  }
1181  }
1182 
1183  int out_idx = 0;
1184  if (tok->config.add_bos && tok->bos_id >= 0 && out_idx < max_ids) {
1185  ids[out_idx++] = tok->bos_id;
1186  }
1187 
1188  size_t pos = 0;
1189  while (pos < (size_t)input_len && out_idx < max_ids) {
1190  size_t match_len = 0;
1191  int32_t id = find_longest_match(tok, input, input_len, pos, &match_len);
1192 
1193  if (match_len == 0) {
1194  /* Emit UNK for unknown characters */
1195  if (tok->unk_id >= 0) ids[out_idx++] = tok->unk_id;
1196  pos++;
1197  } else {
1198  ids[out_idx++] = id;
1199  pos += match_len;
1200  }
1201  }
1202 
1203  if (tok->config.add_eos && tok->eos_id >= 0 && out_idx < max_ids) {
1204  ids[out_idx++] = tok->eos_id;
1205  }
1206 
1207  return out_idx;
1208 }
1209 
1210 /* Decode token IDs to text */
1211 int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len) {
1212  if (!tok || !ids || !text || max_len <= 0) return 0;
1213  int len = 0;
1214  for (int i = 0; i < num_ids; i++) {
1215  int32_t id = ids[i];
1216  if (id < 0) continue;
1217  const char *token = ck_tokenizer_id_to_token(tok, id);
1218  if (!token) continue;
1219  int token_len = (int)strlen(token);
1220 
1221  /* Check for space prefix markers and convert to ASCII space */
1222  unsigned char c0 = (unsigned char)token[0];
1223  unsigned char c1 = (unsigned char)token[1];
1224 
1225  if (c0 == 0xC4 && c1 == 0xA0) {
1226  /* Ġ (U+0120) is 2 bytes - convert to space */
1227  if (len < max_len - 1) text[len++] = ' ';
1228  token += 2; token_len -= 2;
1229  } else if (c0 == 0xE2 && c1 == 0x96 && (unsigned char)token[2] == 0x81) {
1230  /* ▁ (U+2581) is 3 bytes - convert to space */
1231  if (len < max_len - 1) text[len++] = ' ';
1232  token += 3; token_len -= 3;
1233  }
1234 
1235  for (int j = 0; j < token_len && len < max_len - 1; j++) text[len++] = token[j];
1236  }
1237  text[len] = '\0';
1238  return len;
1239 }
1240 
1241 /* Load vocabulary from memory-mapped binary data */
1243  int vocab_size,
1244  const int32_t *offsets,
1245  const char *strings,
1246  int num_merges,
1247  const int32_t *merges) {
1249 }
1250 
1251 /* Load vocabulary from memory-mapped binary data with scores and types */
1253  int vocab_size,
1254  const int32_t *offsets,
1255  const char *strings,
1256  const float *scores,
1257  const uint8_t *types,
1258  int num_merges,
1259  const int32_t *merges) {
1260  if (!tok || !offsets || !strings) return -1;
1261  ck_tokenizer_reset(tok);
1262 
1263  /* Free any existing scores/types arrays before reallocating */
1264  if (tok->scores) {
1265  free(tok->scores);
1266  tok->scores = NULL;
1267  tok->scores_size = 0;
1268  }
1269  if (tok->types) {
1270  free(tok->types);
1271  tok->types = NULL;
1272  tok->types_size = 0;
1273  }
1274 
1275  /* Allocate scores and types arrays if provided */
1276  if (scores && vocab_size > 0) {
1277  tok->scores = (float *)malloc(vocab_size * sizeof(float));
1278  if (!tok->scores) return -1;
1279  memcpy(tok->scores, scores, vocab_size * sizeof(float));
1280  tok->scores_size = (size_t)vocab_size;
1281  }
1282  if (types && vocab_size > 0) {
1283  tok->types = (uint8_t *)malloc(vocab_size * sizeof(uint8_t));
1284  if (!tok->types) {
1285  if (tok->scores) {
1286  free(tok->scores);
1287  tok->scores = NULL;
1288  }
1289  return -1;
1290  }
1291  memcpy(tok->types, types, vocab_size * sizeof(uint8_t));
1292  tok->types_size = (size_t)vocab_size;
1293  }
1294 
1295  for (int i = 0; i < vocab_size; i++) {
1296  const char *token = strings + offsets[i];
1297  float score = scores ? scores[i] : 0.0f;
1299  }
1300 
1301  /* Build byte token lookup table if types are available */
1302  if (types && vocab_size > 0) {
1304 
1305  /* Log token type statistics */
1306  int count_normal = 0, count_unknown = 0, count_control = 0, count_byte = 0, count_other = 0;
1307  int max_type = 0;
1308  for (int i = 0; i < vocab_size; i++) {
1309  uint8_t t = tok->types[i];
1310  if (t > max_type) max_type = t;
1311  switch (t) {
1312  case GGUF_TOKEN_NORMAL: count_normal++; break;
1313  case GGUF_TOKEN_UNKNOWN: count_unknown++; break;
1314  case GGUF_TOKEN_CONTROL: count_control++; break;
1315  case GGUF_TOKEN_BYTE: count_byte++; break;
1316  default: count_other++; break;
1317  }
1318  }
1319  fprintf(stderr, "[TOKENIZER] Loaded %d tokens: normal=%d, unknown=%d, control=%d, byte=%d, other=%d\n",
1320  vocab_size, count_normal, count_unknown, count_control, count_byte, count_other);
1321  if (max_type > GGUF_TOKEN_BYTE) {
1322  fprintf(stderr, "[TOKENIZER] Warning: Unexpected token type %d\n", max_type);
1323  }
1324  }
1325 
1326  /* TODO: Merges */
1327  (void)num_merges; (void)merges;
1328  return 0;
1329 }
1330 
1331 /* Placeholders for header compliance */
1332 int ck_tokenizer_load_gguf(CKTokenizer *tok, const char *path) { (void)tok; (void)path; return -1; }
1333 int ck_tokenizer_load_json(CKTokenizer *tok, const char *path) { (void)tok; (void)path; return -1; }
1334 int ck_tokenizer_load_text(CKTokenizer *tok, const char *path) { (void)tok; (void)path; return -1; }
1335 int ck_tokenizer_load_merges(CKTokenizer *tok, const char *path) { (void)tok; (void)path; return -1; }
1336 int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged, int32_t priority) {
1337  (void)tok; (void)left; (void)right; (void)merged; (void)priority; return 0;
1338 }
#define CK_TOKENIZER_HT_BUCKETS_LARGE
Definition: hash_table.h:142
void ck_tokenizer_hash_table_free(CKTokenizerHashTable *table, bool free_values)
Definition: hash_table.c:140
CKTokenizerHashTable * ck_tokenizer_hash_table_create(size_t bucket_count)
Definition: hash_table.c:80
int ck_tokenizer_hash_table_insert(CKTokenizerHashTable *table, const char *key, void *value)
Definition: hash_table.c:158
void * ck_tokenizer_hash_table_lookup(CKTokenizerHashTable *table, const char *key)
Definition: hash_table.c:198
void ck_tokenizer_hash_table_clear(CKTokenizerHashTable *table, bool free_values)
Definition: hash_table.c:312
void ck_trie_clear(CKTrie *trie)
Definition: trie.c:80
int32_t ck_trie_find_longest(const CKTrie *trie, const char *text, size_t text_len, size_t start_pos, size_t *match_len)
Definition: trie.c:142
int ck_trie_insert(CKTrie *trie, const char *token, int32_t token_id, bool is_special, int32_t priority)
Definition: trie.c:110
void ck_trie_free(CKTrie *trie)
Definition: trie.c:51
CKTrie * ck_trie_create(size_t max_nodes)
Definition: trie.c:29
int ck_tokenizer_mempool_init(CKTokenizerMemPool *pool, size_t size)
Definition: memory_pool.c:11
void ck_tokenizer_mempool_free(CKTokenizerMemPool *pool)
Definition: memory_pool.c:28
bool add_space_prefix
Definition: tokenizer.h:77
CKTokenizerType type
Definition: tokenizer.h:74
CKSpmMode spm_mode
Definition: tokenizer.h:84
bool space_prefix_detected
Definition: tokenizer.h:83
CKSpacePrefixStyle space_prefix_style
Definition: tokenizer.h:82
int32_t bos_id
Definition: ck_tokenizer.h:98
float * scores
Definition: tokenizer.h:111
size_t types_size
Definition: tokenizer.h:114
int32_t * byte_token_id
Definition: tokenizer.h:117
CKMemPool pool
Definition: ck_tokenizer.h:78
int32_t unk_id
Definition: ck_tokenizer.h:97
CKTrie * vocab_trie
Definition: tokenizer.h:103
int32_t eos_id
Definition: ck_tokenizer.h:99
CKTokenizerHashTable * vocab
Definition: tokenizer.h:100
uint8_t * types
Definition: tokenizer.h:113
size_t vocab_capacity
Definition: tokenizer.h:108
size_t scores_size
Definition: tokenizer.h:112
char ** id_to_token
Definition: ck_tokenizer.h:86
int32_t mask_id
Definition: tokenizer.h:124
CKTokenizerConfig config
Definition: tokenizer.h:97
int32_t pad_id
Definition: ck_tokenizer.h:100
static int spm_find_candidates_at_pos(const CKTokenizer *tok, const char *text, int text_len, size_t pos, int32_t *candidates, int max_candidates)
Definition: tokenizer.c:1047
static int32_t ck_tokenizer_lookup_exact(const CKTokenizer *tok, const char *token)
Definition: tokenizer.c:330
void ck_tokenizer_set_add_bos_eos(CKTokenizer *tok, bool add_bos, bool add_eos)
Definition: tokenizer.c:243
static int preprocess_spm_llama_text(const char *text, int text_len, char *out, int out_max, bool add_space_prefix)
Definition: tokenizer.c:554
static bool spm_token_is_byte_format(const char *token)
Definition: tokenizer.c:500
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token)
Definition: tokenizer.c:323
int ck_tokenizer_load_binary_with_scores(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, const float *scores, const uint8_t *types, int num_merges, const int32_t *merges)
Definition: tokenizer.c:1252
int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
Definition: tokenizer.c:1211
int ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int32_t id, float score)
Definition: tokenizer.c:157
static int ck_tokenizer_encode_spm_llama_impl(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:642
CKSpacePrefixStyle ck_tokenizer_detect_space_prefix_style(CKTokenizer *tok)
Definition: tokenizer.c:276
static bool spm_token_allowed_in_dp(const CKTokenizer *tok, int32_t token_id)
Definition: tokenizer.c:469
#define GGUF_TOKEN_CONTROL
Definition: tokenizer.c:462
int ck_tokenizer_load_text(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1334
int ck_tokenizer_load_gguf(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1332
int ck_tokenizer_load_json(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1333
void ck_tokenizer_set_spm_mode(CKTokenizer *tok, CKSpmMode spm_mode)
Definition: tokenizer.c:254
static void spm_build_byte_lookup(CKTokenizer *tok, const char *strings, const int32_t *offsets, int vocab_size)
Definition: tokenizer.c:507
CKTokenizer * ck_tokenizer_create(CKTokenizerType type)
Definition: tokenizer.c:34
static int32_t find_longest_match(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
Definition: tokenizer.c:400
int ck_tokenizer_load_binary(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
Definition: tokenizer.c:1242
static int preprocess_spm_text(const char *text, int text_len, char *out, int out_max, bool add_space_prefix)
Definition: tokenizer.c:763
static int spm_encode_byte_fallback(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:1024
static int spm_llama_resegment_node(const CKTokenizer *tok, const SpmLlamaNode *nodes, int node_id, int32_t *ids, int max_ids, int out_idx)
Definition: tokenizer.c:611
static int32_t ck_tokenizer_lookup_exact_n(const CKTokenizer *tok, const char *text, int text_len)
Definition: tokenizer.c:337
#define GGUF_TOKEN_USER_DEFINED
Definition: tokenizer.c:463
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
Definition: tokenizer.c:353
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged, int32_t priority)
Definition: tokenizer.c:1336
static int utf8_len(unsigned char c)
Definition: tokenizer.c:541
int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:1132
void ck_tokenizer_set_special_ids(CKTokenizer *tok, int32_t unk, int32_t bos, int32_t eos, int32_t pad, int32_t mask)
Definition: tokenizer.c:234
#define GGUF_TOKEN_UNUSED
Definition: tokenizer.c:464
static int32_t find_longest_match_trie(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
Definition: tokenizer.c:359
void ck_tokenizer_reset(CKTokenizer *tok)
Definition: tokenizer.c:125
static bool spm_is_byte_token(const CKTokenizer *tok, int32_t token_id)
Definition: tokenizer.c:479
static int32_t spm_get_byte_token(const CKTokenizer *tok, unsigned char byte_val)
Definition: tokenizer.c:487
#define GGUF_TOKEN_BYTE
Definition: tokenizer.c:465
int ck_tokenizer_add_special_token(CKTokenizer *tok, const char *name, int32_t id)
Definition: tokenizer.c:213
void ck_tokenizer_free(CKTokenizer *tok)
Definition: tokenizer.c:91
int ck_tokenizer_load_merges(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1335
static int spm_count_unknown_run(const CKTokenizer *tok, const char *text, int text_len, size_t pos)
Definition: tokenizer.c:1098
static int preprocess_bpe_spaces(const char *text, int text_len, char *out, int out_max, CKSpacePrefixStyle style)
Definition: tokenizer.c:412
#define GGUF_TOKEN_UNKNOWN
Definition: tokenizer.c:461
void ck_tokenizer_set_use_trie(CKTokenizer *tok, bool use_trie)
Definition: tokenizer.c:260
void ck_tokenizer_set_add_space_prefix(CKTokenizer *tok, bool add_space_prefix)
Definition: tokenizer.c:249
static int32_t find_longest_match_hash(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
Definition: tokenizer.c:370
void ck_tokenizer_set_space_prefix_style(CKTokenizer *tok, CKSpacePrefixStyle style)
Definition: tokenizer.c:266
static int ck_tokenizer_encode_spm_impl(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:832
#define GGUF_TOKEN_NORMAL
Definition: tokenizer.c:460
int32_t int32_t int32_t int32_t int32_t mask
Definition: tokenizer.h:233
const int32_t * ids
Definition: tokenizer.h:443
int32_t id
Definition: tokenizer.h:315
CKSpacePrefixStyle
Definition: tokenizer.h:60
@ CK_SPACE_PREFIX_AUTO
Definition: tokenizer.h:61
@ CK_SPACE_PREFIX_SPM
Definition: tokenizer.h:63
@ CK_SPACE_PREFIX_GPT2
Definition: tokenizer.h:62
const int32_t int num_ids
Definition: tokenizer.h:444
CKTokenizerType
Definition: tokenizer.h:53
@ CK_TOKENIZER_BPE
Definition: tokenizer.h:54
@ CK_TOKENIZER_SPM
Definition: tokenizer.h:56
const char * text
Definition: tokenizer.h:563
bool bool add_eos
Definition: tokenizer.h:242
bool add_space_prefix
Definition: tokenizer.h:252
CKSpmMode spm_mode
Definition: tokenizer.h:260
bool add_bos
Definition: tokenizer.h:242
const char * token
Definition: tokenizer.h:306
int32_t float * score
Definition: tokenizer.h:327
CKSpmMode
Definition: tokenizer.h:67
@ CK_SPM_MODE_UNIGRAM
Definition: tokenizer.h:68
@ CK_SPM_MODE_LLAMA
Definition: tokenizer.h:69
int32_t unk
Definition: tokenizer.h:229
bool use_trie
Definition: tokenizer.h:276
int32_t int32_t int32_t eos
Definition: tokenizer.h:231
int32_t int32_t int32_t int32_t pad
Definition: tokenizer.h:232
CKSpacePrefixStyle style
Definition: tokenizer.h:287
int32_t int32_t bos
Definition: tokenizer.h:230
const int32_t int int * out_len
Definition: tokenizer.h:445
const CKBPEConfig * config
Definition: true_bpe.h:171
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int32_t int32_t int32_t int32_t priority
Definition: true_bpe.h:115
const int32_t int char int max_len
Definition: true_bpe.h:280
const char int text_len
Definition: true_bpe.h:262
int vocab_size
Definition: true_bpe.h:185
int const int32_t * offsets
Definition: true_bpe.h:186
const char * left
Definition: true_bpe.h:130
const char int int32_t int max_ids
Definition: true_bpe.h:264
const char const char * right
Definition: true_bpe.h:131