← Back to C-Kernel-Engine Docs Doxygen Source Documentation
true_bpe.c
Go to the documentation of this file.
1 /*
2  * True BPE (Byte-Pair Encoding) Tokenizer [v4 - correctness complete, optimization pending]
3  *
4  * Implements the actual BPE algorithm used by GPT-2, LLaMA, Qwen, etc.
5  * Unlike greedy longest-match, this applies merge rules in priority order.
6  * Achieves 100% token-exact parity with HuggingFace (8/8 tests passing).
7  *
8  * Algorithm:
9  * 1. Split text into initial tokens (characters or bytes)
10  * 2. Find the highest-priority merge that can be applied
11  * 3. Apply that merge (combine two adjacent tokens into one)
12  * 4. Repeat until no more merges possible
13  * 5. Look up final tokens in vocabulary to get IDs
14  *
15  * Data Structures:
16  * - Vocabulary: token string -> token ID (hash table, FNV-1a, 65K buckets)
17  * - Merge rules: (left_id, right_id) -> (merged_id, priority) (hash table, MurmurHash3, 65K buckets)
18  * - Token list: dynamic array for the working token sequence
19  *
20  * TODO: Performance optimizations (correctness is done, these are for throughput):
21  *
22  * 1. Arena allocator for token strings
23  * - token_list_append() and token_list_merge_at() malloc/free per token string
24  * - Replace with scratch arena (~64KB) on CKTrueBPE struct, reset per encode_chunk()
25  * - Strings only live for one encode call, so arena lifetime matches perfectly
26  *
27  * 2. Linked list instead of array shift in merge loop
28  * - token_list_merge_at() shifts entire tail array left by 1: O(n) per merge
29  * - Use doubly-linked list of nodes (allocated from arena): O(1) per merge
30  * - Store only token ID per node, use id_to_token[] for string when needed
31  *
32  * 3. Priority queue for find_best_merge()
33  * - Currently rescans ALL adjacent pairs every iteration: O(n) per merge
34  * - Total merge loop is O(n * m) where m = merges applied
35  * - Use min-heap of candidate merges, pop best, re-insert affected neighbors
36  * - Would reduce to O(n log n) total
37  *
38  * 4. Eliminate redundant string storage
39  * - CKBPEToken stores both .str (malloc'd) and .id
40  * - After initial char->ID lookup, only IDs needed for merge lookup
41  * - Use id_to_token[merged_id] to get string when constructing merged tokens
42  *
43  * Current profile: tokenizer runs 2x per prompt (not per token), so these
44  * matter for batch tokenization / server use / 32K+ context, not for
45  * interactive chat where GEMM kernels dominate at 57% of compute.
46  *
47  * By Anthony Shivakumar
48  */
49 
50 #include <stdio.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <stdint.h>
54 #include <stdbool.h>
55 #include <limits.h>
56 
57 #include "tokenizer/true_bpe.h"
58 
59 /* ═══════════════════════════════════════════════════════════════════════════════
60  * Constants
61  * ═══════════════════════════════════════════════════════════════════════════════ */
62 
63 #define MERGE_HASH_SIZE 65536 /* Size of merge lookup hash table */
64 #define INITIAL_TOKEN_CAPACITY 256 /* Initial capacity for token list */
65 #define MAX_TOKEN_LEN 128 /* Maximum length of a single token string */
66 
67 /* ═══════════════════════════════════════════════════════════════════════════════
68  * Data Structures
69  * ═══════════════════════════════════════════════════════════════════════════════ */
70 
/* A single merge rule */
typedef struct {
    int32_t left_id;   /* Left token ID */
    int32_t right_id;  /* Right token ID */
    int32_t merged_id; /* Resulting merged token ID */
    int32_t priority;  /* Lower = higher priority (applied first) */
} CKBPEMerge;

/* Hash table entry for merge lookup (separate chaining, one node per rule) */
typedef struct CKMergeEntry {
    uint64_t key;              /* Hash key: (left_id << 32) | right_id */
    CKBPEMerge merge;          /* The merge rule (stored by value) */
    struct CKMergeEntry *next; /* Chain for collision handling */
} CKMergeEntry;

/* Merge lookup hash table (see merge_table_* helpers below) */
typedef struct {
    CKMergeEntry **buckets; /* Array of chain heads; NULL = empty bucket */
    size_t num_buckets;     /* Fixed at creation time */
    size_t num_entries;     /* Live entries across all chains */
} CKMergeTable;

/* A working token during BPE encoding.
 * .str is heap-owned by the containing list (freed in token_list_free/clear). */
typedef struct {
    char *str;      /* Token string (NUL-terminated, malloc'd) */
    int32_t id;     /* Token ID (-1 if not yet looked up) */
    uint16_t len;   /* String length in bytes (excludes the NUL) */
    bool is_merged; /* True if this is result of a merge */
} CKBPEToken;

/* Dynamic token list for BPE processing (grows by doubling; spare slots zeroed) */
typedef struct {
    CKBPEToken *tokens; /* count live slots followed by zeroed spares */
    size_t count;
    size_t capacity;
} CKBPETokenList;
107 
#define MAX_SPECIAL_TOKENS 32 /* Maximum number of special tokens */

/* Special token entry for pre-BPE matching.
 * .token is heap-owned (strdup'd in ck_true_bpe_add_special_token,
 * freed in ck_true_bpe_free). */
typedef struct {
    char *token; /* Token string to match */
    int32_t id;  /* Token ID to output */
    int len;     /* Length of token string (for faster matching) */
} CKSpecialToken;
116 
117 /* Main True BPE tokenizer structure */
118 struct CKTrueBPE {
119  /* Vocabulary: token string -> token ID */
120  CKTokenizerHashTable *vocab;
121 
122  /* Reverse vocabulary: token ID -> token string */
123  char **id_to_token;
124  size_t vocab_size;
125  size_t vocab_capacity;
126 
127  /* Merge rules: (left_id, right_id) -> merged_id with priority */
128  CKMergeTable *merges;
129  int32_t num_merges;
130 
131  /* Special token IDs */
132  int32_t unk_id;
133  int32_t bos_id;
134  int32_t eos_id;
135  int32_t pad_id;
136 
137  /* Special tokens to match before BPE (sorted by length, longest first) */
138  CKSpecialToken special_tokens[MAX_SPECIAL_TOKENS];
139  int num_special_tokens;
140 
141  /* Configuration */
143 
144  /* String buffer for token operations */
145  char *str_buffer;
146  size_t str_buffer_size;
147 };
148 
149 /* ═══════════════════════════════════════════════════════════════════════════════
150  * Merge Table Operations
151  * ═══════════════════════════════════════════════════════════════════════════════ */
152 
/* Pack a (left, right) token-ID pair into one 64-bit lookup key. */
static uint64_t merge_key(int32_t left_id, int32_t right_id) {
    uint64_t hi = (uint64_t)left_id << 32;
    uint64_t lo = (uint32_t)right_id;
    return hi | lo;
}
156 
/* Mix the 64-bit key (MurmurHash3 finalizer constants) and reduce it
 * to a bucket index in [0, num_buckets). */
static size_t merge_hash(uint64_t key, size_t num_buckets) {
    uint64_t h = key;
    h ^= h >> 33;
    h *= 0xff51afd7ed558ccdULL;
    h ^= h >> 33;
    h *= 0xc4ceb9fe1a85ec53ULL;
    h ^= h >> 33;
    return (size_t)(h % num_buckets);
}
166 
167 static CKMergeTable *merge_table_create(size_t num_buckets) {
168  CKMergeTable *table = (CKMergeTable *)malloc(sizeof(CKMergeTable));
169  if (!table) return NULL;
170 
171  table->buckets = (CKMergeEntry **)calloc(num_buckets, sizeof(CKMergeEntry *));
172  if (!table->buckets) {
173  free(table);
174  return NULL;
175  }
176 
177  table->num_buckets = num_buckets;
178  table->num_entries = 0;
179  return table;
180 }
181 
182 static void merge_table_free(CKMergeTable *table) {
183  if (!table) return;
184 
185  for (size_t i = 0; i < table->num_buckets; i++) {
186  CKMergeEntry *entry = table->buckets[i];
187  while (entry) {
188  CKMergeEntry *next = entry->next;
189  free(entry);
190  entry = next;
191  }
192  }
193 
194  free(table->buckets);
195  free(table);
196 }
197 
198 static int merge_table_insert(CKMergeTable *table, const CKBPEMerge *merge) {
199  uint64_t key = merge_key(merge->left_id, merge->right_id);
200  size_t bucket = merge_hash(key, table->num_buckets);
201 
202  /* Check if already exists */
203  CKMergeEntry *entry = table->buckets[bucket];
204  while (entry) {
205  if (entry->key == key) {
206  /* Update existing */
207  entry->merge = *merge;
208  return 0;
209  }
210  entry = entry->next;
211  }
212 
213  /* Create new entry */
214  entry = (CKMergeEntry *)malloc(sizeof(CKMergeEntry));
215  if (!entry) return -1;
216 
217  entry->key = key;
218  entry->merge = *merge;
219  entry->next = table->buckets[bucket];
220  table->buckets[bucket] = entry;
221  table->num_entries++;
222 
223  return 0;
224 }
225 
226 static const CKBPEMerge *merge_table_lookup(const CKMergeTable *table, int32_t left_id, int32_t right_id) {
227  uint64_t key = merge_key(left_id, right_id);
228  size_t bucket = merge_hash(key, table->num_buckets);
229 
230  CKMergeEntry *entry = table->buckets[bucket];
231  while (entry) {
232  if (entry->key == key) {
233  return &entry->merge;
234  }
235  entry = entry->next;
236  }
237 
238  return NULL;
239 }
240 
241 /* ═══════════════════════════════════════════════════════════════════════════════
242  * Token List Operations
243  * ═══════════════════════════════════════════════════════════════════════════════ */
244 
245 static CKBPETokenList *token_list_create(size_t initial_capacity) {
246  CKBPETokenList *list = (CKBPETokenList *)malloc(sizeof(CKBPETokenList));
247  if (!list) return NULL;
248 
249  list->tokens = (CKBPEToken *)calloc(initial_capacity, sizeof(CKBPEToken));
250  if (!list->tokens) {
251  free(list);
252  return NULL;
253  }
254 
255  list->count = 0;
256  list->capacity = initial_capacity;
257  return list;
258 }
259 
260 static void token_list_free(CKBPETokenList *list) {
261  if (!list) return;
262 
263  for (size_t i = 0; i < list->count; i++) {
264  if (list->tokens[i].str) {
265  free(list->tokens[i].str);
266  }
267  }
268 
269  free(list->tokens);
270  free(list);
271 }
272 
273 static void token_list_clear(CKBPETokenList *list) {
274  for (size_t i = 0; i < list->count; i++) {
275  if (list->tokens[i].str) {
276  free(list->tokens[i].str);
277  list->tokens[i].str = NULL;
278  }
279  }
280  list->count = 0;
281 }
282 
283 static int token_list_append(CKBPETokenList *list, const char *str, size_t len, int32_t id) {
284  if (list->count >= list->capacity) {
285  size_t new_cap = list->capacity * 2;
286  CKBPEToken *new_tokens = (CKBPEToken *)realloc(list->tokens, new_cap * sizeof(CKBPEToken));
287  if (!new_tokens) return -1;
288  list->tokens = new_tokens;
289  list->capacity = new_cap;
290  /* Zero new entries */
291  memset(list->tokens + list->count, 0, (new_cap - list->count) * sizeof(CKBPEToken));
292  }
293 
294  CKBPEToken *tok = &list->tokens[list->count];
295  tok->str = (char *)malloc(len + 1);
296  if (!tok->str) return -1;
297 
298  memcpy(tok->str, str, len);
299  tok->str[len] = '\0';
300  tok->len = (uint16_t)len;
301  tok->id = id;
302  tok->is_merged = false;
303 
304  list->count++;
305  return 0;
306 }
307 
308 /* Merge tokens at positions i and i+1 into a single token */
309 static int token_list_merge_at(CKBPETokenList *list, size_t pos, const char *merged_str, size_t merged_len, int32_t merged_id) {
310  if (pos + 1 >= list->count) return -1;
311 
312  /* Free old strings */
313  free(list->tokens[pos].str);
314  free(list->tokens[pos + 1].str);
315 
316  /* Create merged token */
317  list->tokens[pos].str = (char *)malloc(merged_len + 1);
318  if (!list->tokens[pos].str) return -1;
319 
320  memcpy(list->tokens[pos].str, merged_str, merged_len);
321  list->tokens[pos].str[merged_len] = '\0';
322  list->tokens[pos].len = (uint16_t)merged_len;
323  list->tokens[pos].id = merged_id;
324  list->tokens[pos].is_merged = true;
325 
326  /* Shift remaining tokens left */
327  for (size_t i = pos + 1; i < list->count - 1; i++) {
328  list->tokens[i] = list->tokens[i + 1];
329  }
330  list->count--;
331 
332  /* Clear the now-unused last slot */
333  list->tokens[list->count].str = NULL;
334 
335  return 0;
336 }
337 
338 /* ═══════════════════════════════════════════════════════════════════════════════
339  * True BPE Tokenizer API
340  * ═══════════════════════════════════════════════════════════════════════════════ */
341 
342 CKTrueBPE *ck_true_bpe_create(void) {
343  CKTrueBPE *bpe = (CKTrueBPE *)calloc(1, sizeof(CKTrueBPE));
344  if (!bpe) return NULL;
345 
346  /* Create vocabulary hash table */
348  if (!bpe->vocab) {
349  free(bpe);
350  return NULL;
351  }
352 
353  /* Create merge table */
354  bpe->merges = merge_table_create(MERGE_HASH_SIZE);
355  if (!bpe->merges) {
356  ck_tokenizer_hash_table_free(bpe->vocab, true);
357  free(bpe);
358  return NULL;
359  }
360 
361  /* Initialize reverse vocabulary */
362  bpe->vocab_capacity = 4096;
363  bpe->id_to_token = (char **)calloc(bpe->vocab_capacity, sizeof(char *));
364  if (!bpe->id_to_token) {
365  merge_table_free(bpe->merges);
366  ck_tokenizer_hash_table_free(bpe->vocab, true);
367  free(bpe);
368  return NULL;
369  }
370 
371  /* String buffer for token operations */
372  bpe->str_buffer_size = 4096;
373  bpe->str_buffer = (char *)malloc(bpe->str_buffer_size);
374  if (!bpe->str_buffer) {
375  free(bpe->id_to_token);
376  merge_table_free(bpe->merges);
377  ck_tokenizer_hash_table_free(bpe->vocab, true);
378  free(bpe);
379  return NULL;
380  }
381 
382  /* Default special token IDs */
383  bpe->unk_id = 0;
384  bpe->bos_id = -1;
385  bpe->eos_id = -1;
386  bpe->pad_id = -1;
387 
388  /* Initialize special tokens array */
389  bpe->num_special_tokens = 0;
390  for (int i = 0; i < MAX_SPECIAL_TOKENS; i++) {
391  bpe->special_tokens[i].token = NULL;
392  bpe->special_tokens[i].id = -1;
393  bpe->special_tokens[i].len = 0;
394  }
395 
396  /* Default config */
397  bpe->config.add_bos = false;
398  bpe->config.add_eos = false;
399  bpe->config.byte_fallback = true;
400  bpe->config.space_prefix_style = CK_SPACE_PREFIX_AUTO;
401 
402  return bpe;
403 }
404 
405 void ck_true_bpe_free(CKTrueBPE *bpe) {
406  if (!bpe) return;
407 
408  if (bpe->vocab) {
409  ck_tokenizer_hash_table_free(bpe->vocab, true);
410  }
411 
412  if (bpe->merges) {
413  merge_table_free(bpe->merges);
414  }
415 
416  if (bpe->id_to_token) {
417  for (size_t i = 0; i < bpe->vocab_size; i++) {
418  if (bpe->id_to_token[i]) {
419  free(bpe->id_to_token[i]);
420  }
421  }
422  free(bpe->id_to_token);
423  }
424 
425  if (bpe->str_buffer) {
426  free(bpe->str_buffer);
427  }
428 
429  /* Free special tokens */
430  for (int i = 0; i < bpe->num_special_tokens; i++) {
431  if (bpe->special_tokens[i].token) {
432  free(bpe->special_tokens[i].token);
433  }
434  }
435 
436  free(bpe);
437 }
438 
439 /* ═══════════════════════════════════════════════════════════════════════════════
440  * Vocabulary Management
441  * ═══════════════════════════════════════════════════════════════════════════════ */
442 
/* Token info stored in vocab hash table.
 * Heap-allocated per token in ck_true_bpe_add_token; presumably freed by
 * ck_tokenizer_hash_table_free(..., true) — confirm against the hash table's
 * ownership contract in its header. */
typedef struct {
    int32_t id;  /* Token ID emitted for this vocabulary string */
    float score; /* Score from the model file (stored; not read in this file's visible code) */
} BPETokenInfo;
448 
449 int ck_true_bpe_add_token(CKTrueBPE *bpe, const char *token, int32_t id, float score) {
450  if (!bpe || !token) return -1;
451 
452  /* Ensure reverse vocab has space */
453  if (id >= (int32_t)bpe->vocab_capacity) {
454  size_t new_cap = bpe->vocab_capacity * 2;
455  while (new_cap <= (size_t)id) new_cap *= 2;
456 
457  char **new_array = (char **)realloc(bpe->id_to_token, new_cap * sizeof(char *));
458  if (!new_array) return -1;
459 
460  memset(new_array + bpe->vocab_capacity, 0, (new_cap - bpe->vocab_capacity) * sizeof(char *));
461  bpe->id_to_token = new_array;
462  bpe->vocab_capacity = new_cap;
463  }
464 
465  /* Check if token exists */
466  BPETokenInfo *existing = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, token);
467  if (existing) {
468  existing->id = id;
469  existing->score = score;
470  if (bpe->id_to_token[id]) free(bpe->id_to_token[id]);
471  bpe->id_to_token[id] = strdup(token);
472  return 0;
473  }
474 
475  /* Create new token info */
476  BPETokenInfo *info = (BPETokenInfo *)malloc(sizeof(BPETokenInfo));
477  if (!info) return -1;
478 
479  info->id = id;
480  info->score = score;
481 
482  if (ck_tokenizer_hash_table_insert(bpe->vocab, token, info) != 0) {
483  free(info);
484  return -1;
485  }
486 
487  if (id >= (int32_t)bpe->vocab_size) {
488  bpe->vocab_size = id + 1;
489  }
490 
491  if (bpe->id_to_token[id]) free(bpe->id_to_token[id]);
492  bpe->id_to_token[id] = strdup(token);
493 
494  return 0;
495 }
496 
497 int ck_true_bpe_add_merge(CKTrueBPE *bpe, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority) {
498  if (!bpe) return -1;
499 
500  CKBPEMerge merge = {
501  .left_id = left_id,
502  .right_id = right_id,
503  .merged_id = merged_id,
504  .priority = priority
505  };
506 
507  int ret = merge_table_insert(bpe->merges, &merge);
508  if (ret == 0) {
509  bpe->num_merges++;
510  }
511  return ret;
512 }
513 
514 int ck_true_bpe_add_merge_by_tokens(CKTrueBPE *bpe, const char *left, const char *right, int32_t priority) {
515  if (!bpe || !left || !right) return -1;
516 
517  /* Look up token IDs */
518  BPETokenInfo *left_info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, left);
519  BPETokenInfo *right_info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, right);
520 
521  if (!left_info || !right_info) {
522  return -1; /* Tokens not in vocabulary */
523  }
524 
525  /* Create merged token string */
526  size_t left_len = strlen(left);
527  size_t right_len = strlen(right);
528  size_t merged_len = left_len + right_len;
529 
530  if (merged_len >= bpe->str_buffer_size) {
531  return -1; /* Too long */
532  }
533 
534  memcpy(bpe->str_buffer, left, left_len);
535  memcpy(bpe->str_buffer + left_len, right, right_len);
536  bpe->str_buffer[merged_len] = '\0';
537 
538  /* Look up or create merged token */
539  BPETokenInfo *merged_info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, bpe->str_buffer);
540  int32_t merged_id;
541 
542  if (merged_info) {
543  merged_id = merged_info->id;
544  } else {
545  /* Merged token should already exist in vocabulary */
546  return -1;
547  }
548 
549  return ck_true_bpe_add_merge(bpe, left_info->id, right_info->id, merged_id, priority);
550 }
551 
552 void ck_true_bpe_set_special_ids(CKTrueBPE *bpe, int32_t unk, int32_t bos, int32_t eos, int32_t pad) {
553  if (!bpe) return;
554  bpe->unk_id = unk;
555  bpe->bos_id = bos;
556  bpe->eos_id = eos;
557  bpe->pad_id = pad;
558 }
559 
560 void ck_true_bpe_set_config(CKTrueBPE *bpe, const CKBPEConfig *config) {
561  if (!bpe || !config) return;
562  bpe->config = *config;
563 }
564 
565 int ck_true_bpe_add_special_token(CKTrueBPE *bpe, const char *token, int32_t id) {
566  if (!bpe || !token || id < 0) return -1;
567  if (bpe->num_special_tokens >= MAX_SPECIAL_TOKENS) return -1;
568 
569  int token_len = (int)strlen(token);
570  if (token_len == 0) return -1;
571 
572  /* Check if already exists */
573  for (int i = 0; i < bpe->num_special_tokens; i++) {
574  if (bpe->special_tokens[i].token &&
575  strcmp(bpe->special_tokens[i].token, token) == 0) {
576  /* Update ID for existing token */
577  bpe->special_tokens[i].id = id;
578  return 0;
579  }
580  }
581 
582  /* Find insertion point (keep sorted by length, longest first) */
583  int insert_idx = bpe->num_special_tokens;
584  for (int i = 0; i < bpe->num_special_tokens; i++) {
585  if (token_len > bpe->special_tokens[i].len) {
586  insert_idx = i;
587  break;
588  }
589  }
590 
591  /* Shift existing entries down */
592  for (int i = bpe->num_special_tokens; i > insert_idx; i--) {
593  bpe->special_tokens[i] = bpe->special_tokens[i - 1];
594  }
595 
596  /* Insert new entry */
597  bpe->special_tokens[insert_idx].token = strdup(token);
598  if (!bpe->special_tokens[insert_idx].token) return -1;
599  bpe->special_tokens[insert_idx].id = id;
600  bpe->special_tokens[insert_idx].len = token_len;
601  bpe->num_special_tokens++;
602 
603  return 0;
604 }
605 
606 int ck_true_bpe_load_binary(CKTrueBPE *bpe,
607  int vocab_size,
608  const int32_t *offsets,
609  const char *strings,
610  int num_merges,
611  const int32_t *merges) {
612  if (!bpe || !offsets || !strings || vocab_size <= 0) return -1;
613 
614  for (int i = 0; i < vocab_size; i++) {
615  const char *token = strings + offsets[i];
616  if (ck_true_bpe_add_token(bpe, token, i, 0.0f) != 0) {
617  return -1;
618  }
619  }
620 
621  if (merges && num_merges > 0) {
622  for (int i = 0; i < num_merges; i++) {
623  int32_t left = merges[i * 3 + 0];
624  int32_t right = merges[i * 3 + 1];
625  int32_t merged = merges[i * 3 + 2];
626  if (left < 0 || right < 0 || merged < 0) {
627  continue;
628  }
629  if (ck_true_bpe_add_merge(bpe, left, right, merged, i) != 0) {
630  return -1;
631  }
632  }
633  }
634 
635  return 0;
636 }
637 
638 int32_t ck_true_bpe_lookup(const CKTrueBPE *bpe, const char *token) {
639  if (!bpe || !token) return -1;
640 
641  BPETokenInfo *info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, token);
642  return info ? info->id : bpe->unk_id;
643 }
644 
645 const char *ck_true_bpe_id_to_token(const CKTrueBPE *bpe, int32_t id) {
646  if (!bpe || id < 0 || id >= (int32_t)bpe->vocab_size) return NULL;
647  return bpe->id_to_token[id];
648 }
649 
650 /* ═══════════════════════════════════════════════════════════════════════════════
651  * Space Prefix Detection
652  * ═══════════════════════════════════════════════════════════════════════════════ */
653 
655  if (!bpe) return CK_SPACE_PREFIX_GPT2;
656 
657  if (bpe->config.space_prefix_style != CK_SPACE_PREFIX_AUTO) {
658  return bpe->config.space_prefix_style;
659  }
660 
661  /* Count tokens starting with each style */
662  int gpt2_count = 0; /* Ġ (0xC4 0xA0) */
663  int spm_count = 0; /* ▁ (0xE2 0x96 0x81) */
664 
665  for (size_t i = 0; i < bpe->vocab_size && i < 10000; i++) {
666  const char *token = bpe->id_to_token[i];
667  if (!token) continue;
668 
669  unsigned char c0 = (unsigned char)token[0];
670  unsigned char c1 = (unsigned char)token[1];
671 
672  if (c0 == 0xC4 && c1 == 0xA0) {
673  gpt2_count++;
674  } else if (c0 == 0xE2 && c1 == 0x96 && (unsigned char)token[2] == 0x81) {
675  spm_count++;
676  }
677  }
678 
679  CKSpacePrefixStyle detected = (spm_count > gpt2_count * 2) ? CK_SPACE_PREFIX_SPM : CK_SPACE_PREFIX_GPT2;
680  bpe->config.space_prefix_style = detected;
681 
682  return detected;
683 }
684 
685 /* ═══════════════════════════════════════════════════════════════════════════════
686  * True BPE Encoding
687  * ═══════════════════════════════════════════════════════════════════════════════ */
688 
689 /*
690  * GPT-2 Byte-Level BPE Character Mapping
691  *
692  * GPT-2 uses a byte-level encoding where certain bytes are mapped to
693  * special Unicode characters to avoid issues with control characters:
694  *
695  * - Space (0x20) → Ġ (U+0120, bytes 0xC4 0xA0)
696  * - Newline (0x0A) → Ċ (U+010A, bytes 0xC4 0x8A)
697  * - Tab (0x09) → ĉ (U+0109, bytes 0xC4 0x89)
698  * - Carriage return (0x0D) → č (U+010D, bytes 0xC4 0x8D)
699  *
700  * The mapping is: byte 0x00-0x20 → U+0100 + byte
701  * Regular printable ASCII (0x21-0x7E) stays as-is
702  */
703 
/* Convert one raw byte to its GPT-2 byte-level BPE surface form (UTF-8).
 *
 * GPT-2's byte_encoder (encoder.py bytes_to_unicode) keeps "printable"
 * bytes as themselves (interpreted as Latin-1 codepoints):
 *   '!'..'~' (0x21-0x7E), 0xA1-0xAC, 0xAE-0xFF
 * and remaps the remaining 68 bytes to U+0100 + rank, where rank is the
 * byte's position among the excluded bytes in ascending order:
 *   0x00-0x20 -> U+0100..U+0120  (rank 0..32;  space 0x20 -> Ġ U+0120)
 *   0x7F-0xA0 -> U+0121..U+0142  (rank 33..66)
 *   0xAD      -> U+0143          (rank 67)
 *
 * FIX: the previous version mapped 0x7F-0xA0 to U+0100+byte (U+017F..U+01A0)
 * and left 0xAD as-is, which disagrees with GPT-2's table. UTF-8
 * continuation bytes 0x80-0xA0 occur in virtually all non-ASCII input, so
 * those bytes failed vocabulary lookup. Also removed the unreachable
 * '!' / '"' / '#'..'~' special cases of the original.
 *
 * Writes the UTF-8 encoding into `out` (max codepoint is U+0143, so at most
 * 2 bytes) and returns the number of bytes written.
 */
static int byte_to_gpt2(unsigned char byte, char *out) {
    if (byte >= 0x21 && byte <= 0x7E) {
        /* Printable ASCII stays as itself (single byte). */
        out[0] = (char)byte;
        return 1;
    }

    unsigned int codepoint;
    if (byte <= 0x20) {
        codepoint = 0x100 + byte;          /* rank 0..32 */
    } else if (byte >= 0x7F && byte <= 0xA0) {
        codepoint = 0x121 + (byte - 0x7F); /* rank 33..66 */
    } else if (byte == 0xAD) {
        codepoint = 0x143;                 /* rank 67 */
    } else {
        codepoint = byte;                  /* 0xA1-0xAC, 0xAE-0xFF: Latin-1 codepoint */
    }

    /* Every codepoint here is in [0x80, 0x143]: exactly two UTF-8 bytes. */
    out[0] = (char)(0xC0 | (codepoint >> 6));
    out[1] = (char)(0x80 | (codepoint & 0x3F));
    return 2;
}
747 
748 /* Preprocess text: convert to byte-level BPE representation */
749 static int preprocess_text(const CKTrueBPE *bpe, const char *text, int text_len, char *out, int out_max) {
750  CKSpacePrefixStyle style = bpe->config.space_prefix_style;
751  int out_len = 0;
752 
753  /* For SentencePiece, add ▁ at start */
754  if (style == CK_SPACE_PREFIX_SPM && text_len > 0 && text[0] != ' ') {
755  if (out_len + 3 > out_max) return -1;
756  out[out_len++] = (char)0xE2;
757  out[out_len++] = (char)0x96;
758  out[out_len++] = (char)0x81;
759  }
760 
761  for (int i = 0; i < text_len; i++) {
762  unsigned char byte = (unsigned char)text[i];
763 
764  if (style == CK_SPACE_PREFIX_SPM) {
765  /* SentencePiece style: only convert spaces */
766  if (byte == ' ') {
767  if (out_len + 3 > out_max) return -1;
768  out[out_len++] = (char)0xE2;
769  out[out_len++] = (char)0x96;
770  out[out_len++] = (char)0x81;
771  } else {
772  if (out_len + 1 > out_max) return -1;
773  out[out_len++] = (char)byte;
774  }
775  } else {
776  /* GPT-2 style: full byte-level encoding */
777  char encoded[4];
778  int enc_len = byte_to_gpt2(byte, encoded);
779  if (out_len + enc_len > out_max) return -1;
780  for (int j = 0; j < enc_len; j++) {
781  out[out_len++] = encoded[j];
782  }
783  }
784  }
785 
786  return out_len;
787 }
788 
/* Number of bytes in the UTF-8 sequence introduced by lead byte c. */
static int utf8_char_len(unsigned char c) {
    if (c < 0x80) return 1;          /* 0xxxxxxx */
    if ((c >> 5) == 0x06) return 2;  /* 110xxxxx */
    if ((c >> 4) == 0x0E) return 3;  /* 1110xxxx */
    if ((c >> 3) == 0x1E) return 4;  /* 11110xxx */
    return 1; /* stray continuation / invalid lead: treat as one byte */
}
797 
798 /* ═══════════════════════════════════════════════════════════════════════════════
799  * GPT-2 Style Pretokenizer
800  *
801  * The GPT-2 pretokenizer uses a regex to split text into chunks BEFORE BPE:
802  * (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}|
803  * ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
804  *
805  * Key behaviors:
806  * - Words get an optional leading space attached: "hello world" -> ["hello", " world"]
807  * - Multiple spaces before a word: " hello" -> [" ", " hello"] (n-1 spaces, then space+word)
808  * - Numbers stay together: "123" -> ["123"]
809  * - Punctuation may get leading space: " ," -> [" ,"]
810  *
811  * This implementation provides a simplified version that handles common cases.
812  * ═══════════════════════════════════════════════════════════════════════════════ */
813 
814 /* Character classification helpers */
/* ASCII letter test (folds case by setting bit 5). */
static bool is_letter(unsigned char c) {
    unsigned char folded = (unsigned char)(c | 0x20);
    return folded >= 'a' && folded <= 'z';
}
818 
/* ASCII decimal digit test. */
static bool is_digit(unsigned char c) {
    return (unsigned)(c - '0') < 10u;
}
822 
/* ASCII whitespace: space, tab, LF, CR. */
static bool is_whitespace(unsigned char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
826 
/* Check whether s begins with the GPT-2 space marker Ġ (UTF-8 0xC4 0xA0). */
static bool is_gpt2_space(const char *s, int len) {
    if (len < 2) return false;
    return (unsigned char)s[0] == 0xC4 && (unsigned char)s[1] == 0xA0;
}
831 
/* Check if this is a GPT-2 encoded letter: a single-byte char that is an
 * ASCII letter (multi-byte sequences are never letters here). */
static bool is_bpe_letter(const char *s, int len) {
    if (len != 1) return false;
    unsigned char c = (unsigned char)s[0];
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
840 
/* Check if this is a GPT-2 encoded digit: a single-byte ASCII digit. */
static bool is_bpe_digit(const char *s, int len) {
    if (len != 1) return false;
    unsigned char c = (unsigned char)s[0];
    return c >= '0' && c <= '9';
}
849 
/* Pretokenizer chunk types */
typedef enum {
    CHUNK_WORD,       /* Letters (with optional leading space) */
    CHUNK_NUMBER,     /* Digits */
    CHUNK_WHITESPACE, /* Whitespace (not attached to word) */
    CHUNK_OTHER       /* Punctuation, etc. */
} ChunkType;
/* FIX: restored the missing "} ChunkType;" terminator — the enum was left
 * unterminated, which made every later use of ChunkType fail to compile.
 * The name is established by the PretokChunk member below. */

/* A chunk from pretokenization: a [start, start+len) view into the
 * byte-level encoded text plus its classification. */
typedef struct {
    const char *start; /* points into the caller's buffer (not owned) */
    int len;           /* chunk length in bytes */
    ChunkType type;    /* classification per the GPT-2 regex patterns */
} PretokChunk;
864 
/* Check whether s begins with the GPT-2 newline marker Ċ (UTF-8 0xC4 0x8A). */
static bool is_bpe_newline(const char *s, int len) {
    if (len < 2) return false;
    return (unsigned char)s[0] == 0xC4 && (unsigned char)s[1] == 0x8A;
}
869 
/* Can this character act as the optional one-char prefix of a word?
 * Mirrors [^\r\n\p{L}\p{N}]? in the GPT-2 regex: anything except letters,
 * digits, and newlines qualifies. In byte-level form, Ġ (space) may prefix
 * while Ċ (newline) may not; other multi-byte sequences may. */
static bool is_word_prefix_char(const char *s, int len) {
    if (len == 1) {
        unsigned char c = (unsigned char)s[0];
        if (c == '\n' || c == '\r') return false;
        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) return false;
        if (c >= '0' && c <= '9') return false;
        return true;
    }
    /* Multi-byte: only the Ċ newline marker is excluded. */
    if (len >= 2 && (unsigned char)s[0] == 0xC4 && (unsigned char)s[1] == 0x8A) {
        return false;
    }
    return true;
}
882 
/* Punctuation in byte-level form: not a letter, digit, or (encoded)
 * whitespace. Single-byte whitespace and the GPT-2 markers Ġ (space),
 * Ċ (newline), ĉ (tab), č (CR) are excluded; any other sequence counts. */
static bool is_bpe_punct(const char *s, int len) {
    if (len == 1) {
        unsigned char c = (unsigned char)s[0];
        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) return false;
        if (c >= '0' && c <= '9') return false;
        if (c == ' ' || c == '\t' || c == '\n' || c == '\r') return false;
        return true;
    }
    if (len >= 2 && (unsigned char)s[0] == 0xC4) {
        unsigned char c1 = (unsigned char)s[1];
        /* Ġ (space) and Ċ/ĉ/č (newline/tab/CR) are whitespace mappings */
        if (c1 == 0xA0 || c1 == 0x8A || c1 == 0x89 || c1 == 0x8D) return false;
    }
    return true;
}
898 
899 /*
900  * GPT-2 Pretokenizer
901  *
902  * Splits byte-level encoded text into chunks for independent BPE processing.
903  *
904  * The GPT-2 regex pattern (in order of matching):
905  * 1. (?i:'s|'t|'re|'ve|'m|'ll|'d) - Contractions
906  * 2. [^\r\n\p{L}\p{N}]?\p{L}+ - Words with optional prefix
907  * 3. \p{N} - Single digit
908  * 4. ?[^\s\p{L}\p{N}]+[\r\n]* - Optional space + punctuation + newlines
909  * 5. \s*[\r\n]+ - Whitespace + newlines
910  * 6. \s+(?!\S) - Trailing whitespace
911  * 7. \s+ - Whitespace
912  *
913  * Returns array of chunks (caller provides buffer).
914  */
static int gpt2_pretokenize(const char *text, int text_len, PretokChunk *chunks, int max_chunks) {
    int num_chunks = 0;
    int pos = 0;

    /* NOTE: `text` is the byte-level PREPROCESSED form: a space has already
     * been rewritten to the 2-byte UTF-8 sequence for Ġ and a newline to the
     * 2-byte Ċ, which is why space/newline advances below use `pos += 2`. */
    while (pos < text_len && num_chunks < max_chunks) {
        int chunk_start = pos;
        int char_len = utf8_char_len((unsigned char)text[pos]);
        if (pos + char_len > text_len) char_len = text_len - pos; /* clamp truncated UTF-8 tail */

        /* Pattern 2: [^\r\n\p{L}\p{N}]?\p{L}+ - Words with optional punctuation prefix */
        /* This pattern MUST be checked before pattern 4 (punctuation) */
        /* Check if we have: letter, or punctuation followed by letter */
        bool is_word = false;
        int word_start = pos;
        int prefix_len = 0;

        if (is_bpe_letter(text + pos, char_len)) {
            /* Word without prefix */
            is_word = true;
        } else if (is_bpe_punct(text + pos, char_len) && !is_gpt2_space(text + pos, text_len - pos)) {
            /* Check if punctuation is followed by a letter */
            int after = pos + char_len;
            if (after < text_len) {
                int next_len = utf8_char_len((unsigned char)text[after]);
                if (is_bpe_letter(text + after, next_len)) {
                    /* This is pattern 2: punctuation + letters */
                    is_word = true;
                    prefix_len = char_len;
                }
            }
        }

        if (is_word) {
            /* Collect the word (with optional prefix) */
            pos = word_start + prefix_len; /* Skip prefix if any */
            while (pos < text_len) {
                int clen = utf8_char_len((unsigned char)text[pos]);
                if (is_bpe_letter(text + pos, clen)) {
                    pos += clen;
                } else {
                    break;
                }
            }
            chunks[num_chunks].start = text + chunk_start;
            chunks[num_chunks].len = pos - chunk_start;
            chunks[num_chunks].type = CHUNK_WORD;
            num_chunks++;
            continue;
        }

        /* Pattern 3: \p{N} - Single digit */
        if (is_bpe_digit(text + pos, char_len)) {
            pos += char_len;
            chunks[num_chunks].start = text + chunk_start;
            chunks[num_chunks].len = pos - chunk_start;
            chunks[num_chunks].type = CHUNK_NUMBER;
            num_chunks++;
            continue;
        }

        /* Pattern 4: ?[^\s\p{L}\p{N}]+[\r\n]* - Optional space + punctuation + newlines */
        /* At this point, we know the current char is NOT followed by letters (checked above) */
        /* Check for space (Ġ) followed by punctuation, OR just punctuation */
        bool has_leading_space = is_gpt2_space(text + pos, text_len - pos);
        int punct_start = has_leading_space ? pos + 2 : pos; /* Ġ is 2 bytes */

        if (punct_start < text_len) {
            int pchar_len = utf8_char_len((unsigned char)text[punct_start]);
            if (is_bpe_punct(text + punct_start, pchar_len)) {
                /* This matches pattern 4: space? + punctuation + newlines? */
                if (has_leading_space) {
                    pos += 2; /* Include the leading space */
                }
                /* Collect punctuation characters */
                /* Pattern 4 collects ALL consecutive punctuation, NOT stopping before letters */
                /* The key insight: if we have " (" before "int", the " (" is pattern 4, */
                /* and "(int" would be pattern 2, but pattern 4 matches first since " (" */
                /* is a complete match for pattern 4 (space + one punctuation). */
                while (pos < text_len) {
                    int clen = utf8_char_len((unsigned char)text[pos]);
                    if (is_bpe_punct(text + pos, clen)) {
                        pos += clen;
                    } else {
                        break;
                    }
                }
                /* Include trailing newlines (Ċ, 2 bytes each) */
                while (pos < text_len && is_bpe_newline(text + pos, text_len - pos)) {
                    pos += 2;
                }
                chunks[num_chunks].start = text + chunk_start;
                chunks[num_chunks].len = pos - chunk_start;
                chunks[num_chunks].type = CHUNK_OTHER;
                num_chunks++;
                continue;
            }
        }

        /* Pattern 5/6/7: Whitespace handling */
        if (is_gpt2_space(text + pos, text_len - pos)) {
            /* Count consecutive spaces */
            int space_count = 0;
            int space_end = pos;
            while (space_end < text_len && is_gpt2_space(text + space_end, text_len - space_end)) {
                space_count++;
                space_end += 2;
            }

            /* Check if spaces are followed by newlines (pattern 5) */
            if (space_end < text_len && is_bpe_newline(text + space_end, text_len - space_end)) {
                /* Whitespace + newlines */
                while (space_end < text_len && is_bpe_newline(text + space_end, text_len - space_end)) {
                    space_end += 2;
                }
                chunks[num_chunks].start = text + pos;
                chunks[num_chunks].len = space_end - pos;
                chunks[num_chunks].type = CHUNK_WHITESPACE;
                num_chunks++;
                pos = space_end;
                continue;
            }

            /* Check what follows the spaces */
            if (space_end < text_len) {
                int next_len = utf8_char_len((unsigned char)text[space_end]);
                if (is_bpe_letter(text + space_end, next_len)) {
                    /* Letters follow - output (n-1) spaces, then space+word (pattern 2) */
                    /* This mirrors the regex lookahead \s+(?!\S): the final space
                     * attaches to the word that follows it. */
                    if (space_count > 1) {
                        chunks[num_chunks].start = text + pos;
                        chunks[num_chunks].len = (space_count - 1) * 2;
                        chunks[num_chunks].type = CHUNK_WHITESPACE;
                        num_chunks++;
                        pos += (space_count - 1) * 2;
                        if (num_chunks >= max_chunks) break; /* out of chunk slots */
                    }
                    /* Collect space + word */
                    chunk_start = pos;
                    pos += 2; /* Skip the Ġ */
                    while (pos < text_len) {
                        int clen = utf8_char_len((unsigned char)text[pos]);
                        if (is_bpe_letter(text + pos, clen)) {
                            pos += clen;
                        } else {
                            break;
                        }
                    }
                    chunks[num_chunks].start = text + chunk_start;
                    chunks[num_chunks].len = pos - chunk_start;
                    chunks[num_chunks].type = CHUNK_WORD;
                    num_chunks++;
                    continue;
                } else if (is_bpe_digit(text + space_end, next_len)) {
                    /* Digit follows - output (n-1) spaces, then space+digit */
                    if (space_count > 1) {
                        chunks[num_chunks].start = text + pos;
                        chunks[num_chunks].len = (space_count - 1) * 2;
                        chunks[num_chunks].type = CHUNK_WHITESPACE;
                        num_chunks++;
                        pos += (space_count - 1) * 2;
                        if (num_chunks >= max_chunks) break; /* out of chunk slots */
                    }
                    /* Space + single digit */
                    /* NOTE(review): assumes the digit is 1 byte (ASCII); a
                     * multi-byte \p{N} would need next_len here — confirm. */
                    chunk_start = pos;
                    pos += 2 + 1; /* Ġ + digit */
                    chunks[num_chunks].start = text + chunk_start;
                    chunks[num_chunks].len = pos - chunk_start;
                    chunks[num_chunks].type = CHUNK_NUMBER;
                    num_chunks++;
                    continue;
                }
                /* Pattern 4 would have caught space+punct, so this is trailing space before something else */
            }

            /* Just whitespace */
            chunks[num_chunks].start = text + pos;
            chunks[num_chunks].len = space_count * 2;
            chunks[num_chunks].type = CHUNK_WHITESPACE;
            num_chunks++;
            pos = space_end;
            continue;
        }

        /* Pattern 5: Newlines (Ċ) */
        if (is_bpe_newline(text + pos, text_len - pos)) {
            while (pos < text_len && is_bpe_newline(text + pos, text_len - pos)) {
                pos += 2;
            }
            chunks[num_chunks].start = text + chunk_start;
            chunks[num_chunks].len = pos - chunk_start;
            chunks[num_chunks].type = CHUNK_OTHER;
            num_chunks++;
            continue;
        }

        /* Fallback: single character chunk */
        pos += char_len;
        chunks[num_chunks].start = text + chunk_start;
        chunks[num_chunks].len = pos - chunk_start;
        chunks[num_chunks].type = CHUNK_OTHER;
        num_chunks++;
    }

    return num_chunks;
}
1119 
1120 /* Initialize token list from preprocessed text */
1121 static int init_tokens_from_text(CKTrueBPE *bpe, CKBPETokenList *list, const char *text, int text_len) {
1122  token_list_clear(list);
1123 
1124  int pos = 0;
1125  while (pos < text_len) {
1126  int char_len = utf8_char_len((unsigned char)text[pos]);
1127  if (pos + char_len > text_len) {
1128  char_len = text_len - pos; /* Truncated UTF-8 */
1129  }
1130 
1131  /* Look up this character/byte in vocabulary */
1132  char char_buf[8];
1133  memcpy(char_buf, text + pos, char_len);
1134  char_buf[char_len] = '\0';
1135 
1136  int32_t id = ck_true_bpe_lookup(bpe, char_buf);
1137 
1138  if (token_list_append(list, char_buf, char_len, id) != 0) {
1139  return -1;
1140  }
1141 
1142  pos += char_len;
1143  }
1144 
1145  return 0;
1146 }
1147 
1148 /* Find the best (highest priority = lowest number) merge in the token list */
1149 static int find_best_merge(const CKTrueBPE *bpe, const CKBPETokenList *list,
1150  size_t *best_pos, const CKBPEMerge **best_merge) {
1151  *best_pos = 0;
1152  *best_merge = NULL;
1153  int32_t best_priority = INT32_MAX;
1154 
1155  for (size_t i = 0; i + 1 < list->count; i++) {
1156  int32_t left_id = list->tokens[i].id;
1157  int32_t right_id = list->tokens[i + 1].id;
1158 
1159  if (left_id < 0 || right_id < 0) continue; /* Unknown tokens can't merge */
1160 
1161  const CKBPEMerge *merge = merge_table_lookup(bpe->merges, left_id, right_id);
1162  if (merge && merge->priority < best_priority) {
1163  best_priority = merge->priority;
1164  *best_pos = i;
1165  *best_merge = merge;
1166  }
1167  }
1168 
1169  return (*best_merge != NULL) ? 0 : -1;
1170 }
1171 
1172 /* Apply BPE merges until no more possible */
1173 static int apply_bpe_merges(CKTrueBPE *bpe, CKBPETokenList *list) {
1174  char merged_buf[MAX_TOKEN_LEN * 2];
1175 
1176  while (list->count > 1) {
1177  size_t best_pos;
1178  const CKBPEMerge *best_merge;
1179 
1180  if (find_best_merge(bpe, list, &best_pos, &best_merge) != 0) {
1181  break; /* No more merges possible */
1182  }
1183 
1184  /* Get merged token string */
1185  const char *merged_str = bpe->id_to_token[best_merge->merged_id];
1186  if (!merged_str) {
1187  /* Construct from left + right */
1188  size_t left_len = list->tokens[best_pos].len;
1189  size_t right_len = list->tokens[best_pos + 1].len;
1190 
1191  if (left_len + right_len >= sizeof(merged_buf)) {
1192  break; /* Too long */
1193  }
1194 
1195  memcpy(merged_buf, list->tokens[best_pos].str, left_len);
1196  memcpy(merged_buf + left_len, list->tokens[best_pos + 1].str, right_len);
1197  merged_buf[left_len + right_len] = '\0';
1198  merged_str = merged_buf;
1199  }
1200 
1201  /* Apply the merge */
1202  if (token_list_merge_at(list, best_pos, merged_str, strlen(merged_str), best_merge->merged_id) != 0) {
1203  break;
1204  }
1205  }
1206 
1207  return 0;
1208 }
1209 
1210 /*
1211  * Encode a single chunk (after pretokenization) using BPE
1212  */
1213 static int encode_chunk(CKTrueBPE *bpe, const char *chunk, int chunk_len,
1214  int32_t *ids, int max_ids, CKBPETokenList *list) {
1215  if (chunk_len <= 0) return 0;
1216 
1217  /* First, try to look up the entire chunk as a single token */
1218  char chunk_buf[256];
1219  if (chunk_len < (int)sizeof(chunk_buf)) {
1220  memcpy(chunk_buf, chunk, chunk_len);
1221  chunk_buf[chunk_len] = '\0';
1222  int32_t chunk_id = ck_true_bpe_lookup(bpe, chunk_buf);
1223  if (chunk_id >= 0) {
1224  /* Entire chunk is a single token */
1225  if (max_ids >= 1) {
1226  ids[0] = chunk_id;
1227  return 1;
1228  }
1229  return 0;
1230  }
1231  }
1232 
1233  /* Initialize token list from chunk characters */
1234  if (init_tokens_from_text(bpe, list, chunk, chunk_len) != 0) {
1235  return 0;
1236  }
1237 
1238  /* Apply BPE merges to this chunk */
1239  apply_bpe_merges(bpe, list);
1240 
1241  /* Extract token IDs from this chunk */
1242  int out_idx = 0;
1243  for (size_t i = 0; i < list->count && out_idx < max_ids; i++) {
1244  int32_t id = list->tokens[i].id;
1245 
1246  /* Handle unknown tokens */
1247  if (id < 0) {
1248  if (bpe->config.byte_fallback) {
1249  /* Output each byte as separate token (byte fallback) */
1250  for (size_t j = 0; j < list->tokens[i].len && out_idx < max_ids; j++) {
1251  char byte_token[8];
1252  snprintf(byte_token, sizeof(byte_token), "<0x%02X>", (unsigned char)list->tokens[i].str[j]);
1253  int32_t byte_id = ck_true_bpe_lookup(bpe, byte_token);
1254  ids[out_idx++] = (byte_id >= 0) ? byte_id : bpe->unk_id;
1255  }
1256  } else {
1257  ids[out_idx++] = bpe->unk_id;
1258  }
1259  } else {
1260  ids[out_idx++] = id;
1261  }
1262  }
1263 
1264  return out_idx;
1265 }
1266 
1267 /*
1268  * Helper: Encode a segment of text (no special tokens) using BPE
1269  */
1270 static int encode_text_segment(CKTrueBPE *bpe, const char *text, int text_len,
1271  int32_t *ids, int max_ids) {
1272  if (text_len <= 0) return 0;
1273 
1274  /* Preprocess text (byte-level encoding) */
1275  char preprocessed[16384];
1276  int pp_len = preprocess_text(bpe, text, text_len, preprocessed, sizeof(preprocessed) - 1);
1277  if (pp_len < 0) {
1278  return 0;
1279  }
1280  preprocessed[pp_len] = '\0';
1281 
1282  int out_idx = 0;
1283 
1284  /* For GPT-2 style, use pretokenizer to split into chunks */
1285  if (bpe->config.space_prefix_style == CK_SPACE_PREFIX_GPT2 ||
1286  bpe->config.space_prefix_style == CK_SPACE_PREFIX_AUTO) {
1287 
1288  /* Pretokenize */
1289  PretokChunk chunks[1024];
1290  int num_chunks = gpt2_pretokenize(preprocessed, pp_len, chunks, 1024);
1291 
1292  /* Create reusable token list */
1293  CKBPETokenList *list = token_list_create(INITIAL_TOKEN_CAPACITY);
1294  if (!list) return out_idx;
1295 
1296  /* Process each chunk independently with BPE */
1297  for (int c = 0; c < num_chunks && out_idx < max_ids; c++) {
1298  int chunk_ids = encode_chunk(bpe, chunks[c].start, chunks[c].len,
1299  ids + out_idx, max_ids - out_idx, list);
1300  out_idx += chunk_ids;
1301  }
1302 
1303  token_list_free(list);
1304  } else {
1305  /* SentencePiece style: no pretokenization, process entire text */
1306  CKBPETokenList *list = token_list_create(INITIAL_TOKEN_CAPACITY);
1307  if (!list) return out_idx;
1308 
1309  int chunk_ids = encode_chunk(bpe, preprocessed, pp_len,
1310  ids + out_idx, max_ids - out_idx, list);
1311  out_idx += chunk_ids;
1312 
1313  token_list_free(list);
1314  }
1315 
1316  return out_idx;
1317 }
1318 
1319 /*
1320  * Check if text at position matches a special token
1321  * Returns: matched special token index, or -1 if no match
1322  */
1323 static int match_special_token(const CKTrueBPE *bpe, const char *text, int text_len, int pos) {
1324  int remaining = text_len - pos;
1325  const char *cur = text + pos;
1326 
1327  /* Special tokens are sorted longest first, so first match is best */
1328  for (int i = 0; i < bpe->num_special_tokens; i++) {
1329  int tok_len = bpe->special_tokens[i].len;
1330  if (tok_len <= remaining &&
1331  memcmp(cur, bpe->special_tokens[i].token, tok_len) == 0) {
1332  return i;
1333  }
1334  }
1335  return -1;
1336 }
1337 
1338 int ck_true_bpe_encode(CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids) {
1339  if (!bpe || !text || !ids || max_ids <= 0) return 0;
1340  if (text_len < 0) text_len = (int)strlen(text);
1341  if (text_len == 0) return 0;
1342 
1343  /* Auto-detect space style if needed */
1344  if (bpe->config.space_prefix_style == CK_SPACE_PREFIX_AUTO) {
1346  }
1347 
1348  int out_idx = 0;
1349 
1350  /* Add BOS token if configured */
1351  if (bpe->config.add_bos && bpe->bos_id >= 0 && out_idx < max_ids) {
1352  ids[out_idx++] = bpe->bos_id;
1353  }
1354 
1355  /* If no special tokens registered, use fast path */
1356  if (bpe->num_special_tokens == 0) {
1357  out_idx += encode_text_segment(bpe, text, text_len, ids + out_idx, max_ids - out_idx);
1358  } else {
1359  /* Scan for special tokens and encode segments between them */
1360  int pos = 0;
1361  int segment_start = 0;
1362 
1363  while (pos < text_len && out_idx < max_ids) {
1364  int match = match_special_token(bpe, text, text_len, pos);
1365 
1366  if (match >= 0) {
1367  /* Found special token - first encode any text before it */
1368  if (pos > segment_start) {
1369  int seg_len = pos - segment_start;
1370  out_idx += encode_text_segment(bpe, text + segment_start, seg_len,
1371  ids + out_idx, max_ids - out_idx);
1372  }
1373 
1374  /* Output the special token ID */
1375  if (out_idx < max_ids) {
1376  ids[out_idx++] = bpe->special_tokens[match].id;
1377  }
1378 
1379  /* Advance past the special token */
1380  pos += bpe->special_tokens[match].len;
1381  segment_start = pos;
1382  } else {
1383  /* No special token here, advance to next character */
1384  pos++;
1385  }
1386  }
1387 
1388  /* Encode any remaining text after last special token */
1389  if (segment_start < text_len && out_idx < max_ids) {
1390  out_idx += encode_text_segment(bpe, text + segment_start, text_len - segment_start,
1391  ids + out_idx, max_ids - out_idx);
1392  }
1393  }
1394 
1395  /* Add EOS token if configured */
1396  if (bpe->config.add_eos && bpe->eos_id >= 0 && out_idx < max_ids) {
1397  ids[out_idx++] = bpe->eos_id;
1398  }
1399 
1400  return out_idx;
1401 }
1402 
1403 /* ═══════════════════════════════════════════════════════════════════════════════
1404  * Decoding
1405  * ═══════════════════════════════════════════════════════════════════════════════ */
1406 
/*
 * Map a GPT-2 byte-level codepoint back to its original byte.
 *
 * `s` points at the start of a candidate 2-byte UTF-8 sequence, `len` is the
 * number of bytes available. Two codepoint windows decode (both via
 * byte = codepoint - 0x100):
 *   U+0100..U+0120 -> bytes 0x00..0x20 (control chars + space)
 *   U+017F..U+01A0 -> bytes 0x7F..0xA0
 * Returns the decoded byte (0..255), or -1 if the input is not a 2-byte
 * sequence in those windows.
 *
 * NOTE(review): this scheme must stay in sync with byte_to_gpt2() in this
 * file; it is not the canonical HuggingFace bytes_to_unicode table (which
 * maps unprintable bytes to 256+n sequentially) — confirm against the
 * encoder if loading externally produced vocabularies.
 */
static int gpt2_decode_byte(const unsigned char *s, int len) {
    /* Need a complete 2-byte UTF-8 sequence: 110xxxxx 10xxxxxx. */
    if (len < 2) return -1;
    if ((s[0] & 0xE0) != 0xC0 || (s[1] & 0xC0) != 0x80) return -1;

    unsigned int cp = ((unsigned int)(s[0] & 0x1F) << 6) | (unsigned int)(s[1] & 0x3F);

    if (cp >= 0x100 && cp <= 0x120) {
        return (int)(cp - 0x100); /* control chars and space */
    }
    if (cp >= 0x17F && cp <= 0x1A0) {
        return (int)(cp - 0x100); /* DEL through NBSP range */
    }
    return -1;
}
1438 
/*
 * Decode token IDs back into UTF-8 text.
 *
 * Writes at most max_len-1 bytes plus a terminating NUL into `text` and
 * returns the byte count written (excluding the NUL). Negative IDs, unknown
 * IDs, and the configured BOS/EOS/PAD specials are skipped. Undoes both
 * space-marker conventions: a leading SentencePiece ▁ becomes ' ', and
 * GPT-2 byte-level codepoints are mapped back to raw bytes.
 */
int ck_true_bpe_decode(const CKTrueBPE *bpe, const int32_t *ids, int num_ids, char *text, int max_len) {
    if (!bpe || !ids || !text || max_len <= 0) return 0;

    int len = 0;
    for (int i = 0; i < num_ids && len < max_len - 1; i++) {
        int32_t id = ids[i];
        if (id < 0) continue;

        /* Skip special tokens */
        if (id == bpe->bos_id || id == bpe->eos_id || id == bpe->pad_id) {
            continue;
        }

        const char *token = ck_true_bpe_id_to_token(bpe, id);
        if (!token) continue; /* ID not in vocabulary: drop silently */

        int token_len = (int)strlen(token);

        /* Check for SentencePiece space marker ▁ (U+2581 = E2 96 81) at start */
        if (token_len >= 3 &&
            (unsigned char)token[0] == 0xE2 &&
            (unsigned char)token[1] == 0x96 &&
            (unsigned char)token[2] == 0x81) {
            /* ▁ -> space */
            if (len < max_len - 1) text[len++] = ' ';
            token += 3;
            token_len -= 3;
        }

        /* Process rest of token, decoding GPT-2 byte-level encoding */
        int pos = 0;
        while (pos < token_len && len < max_len - 1) {
            unsigned char c0 = (unsigned char)token[pos];

            /* Try GPT-2 byte decoding for 2-byte UTF-8 sequences.
             * Non-mapped codepoints fall through to the plain copy below. */
            if (pos + 1 < token_len && (c0 & 0xE0) == 0xC0) {
                int decoded = gpt2_decode_byte((unsigned char*)token + pos, token_len - pos);
                if (decoded >= 0) {
                    text[len++] = (char)decoded;
                    pos += 2;
                    continue;
                }
            }

            /* Regular character - copy as-is, sized by its UTF-8 lead byte */
            int char_len = 1;
            if ((c0 & 0x80) == 0) char_len = 1; /* ASCII */
            else if ((c0 & 0xE0) == 0xC0) char_len = 2;
            else if ((c0 & 0xF0) == 0xE0) char_len = 3;
            else if ((c0 & 0xF8) == 0xF0) char_len = 4;

            /* Copy the character, bounded by both the token and the output. */
            for (int j = 0; j < char_len && pos + j < token_len && len < max_len - 1; j++) {
                text[len++] = token[pos + j];
            }
            pos += char_len;
        }
    }

    text[len] = '\0';
    return len;
}
1501 
1502 /* ═══════════════════════════════════════════════════════════════════════════════
1503  * Statistics
1504  * ═══════════════════════════════════════════════════════════════════════════════ */
1505 
1506 size_t ck_true_bpe_vocab_size(const CKTrueBPE *bpe) {
1507  return bpe ? bpe->vocab_size : 0;
1508 }
1509 
1510 int32_t ck_true_bpe_num_merges(const CKTrueBPE *bpe) {
1511  return bpe ? bpe->num_merges : 0;
1512 }
#define CK_TOKENIZER_HT_BUCKETS_LARGE
Definition: hash_table.h:142
void ck_tokenizer_hash_table_free(CKTokenizerHashTable *table, bool free_values)
Definition: hash_table.c:140
CKTokenizerHashTable * ck_tokenizer_hash_table_create(size_t bucket_count)
Definition: hash_table.c:80
int ck_tokenizer_hash_table_insert(CKTokenizerHashTable *table, const char *key, void *value)
Definition: hash_table.c:158
void * ck_tokenizer_hash_table_lookup(CKTokenizerHashTable *table, const char *key)
Definition: hash_table.c:198
const int32_t * ids
Definition: tokenizer.h:443
int32_t id
Definition: tokenizer.h:315
CKSpacePrefixStyle
Definition: tokenizer.h:60
@ CK_SPACE_PREFIX_AUTO
Definition: tokenizer.h:61
@ CK_SPACE_PREFIX_SPM
Definition: tokenizer.h:63
@ CK_SPACE_PREFIX_GPT2
Definition: tokenizer.h:62
const int32_t int num_ids
Definition: tokenizer.h:444
const char * text
Definition: tokenizer.h:563
const char * token
Definition: tokenizer.h:306
int32_t float * score
Definition: tokenizer.h:327
int32_t unk
Definition: tokenizer.h:229
int32_t int32_t int32_t eos
Definition: tokenizer.h:231
int32_t int32_t int32_t int32_t pad
Definition: tokenizer.h:232
CKSpacePrefixStyle style
Definition: tokenizer.h:287
int32_t int32_t bos
Definition: tokenizer.h:230
const int32_t int int * out_len
Definition: tokenizer.h:445
static bool is_bpe_punct(const char *s, int len)
Definition: true_bpe.c:884
int ck_true_bpe_decode(const CKTrueBPE *bpe, const int32_t *ids, int num_ids, char *text, int max_len)
Definition: true_bpe.c:1439
static bool is_gpt2_space(const char *s, int len)
Definition: true_bpe.c:828
static const CKBPEMerge * merge_table_lookup(const CKMergeTable *table, int32_t left_id, int32_t right_id)
Definition: true_bpe.c:226
int ck_true_bpe_encode(CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: true_bpe.c:1338
static bool is_letter(unsigned char c)
Definition: true_bpe.c:815
static CKBPETokenList * token_list_create(size_t initial_capacity)
Definition: true_bpe.c:245
static int encode_text_segment(CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: true_bpe.c:1270
void ck_true_bpe_set_config(CKTrueBPE *bpe, const CKBPEConfig *config)
Definition: true_bpe.c:560
static void merge_table_free(CKMergeTable *table)
Definition: true_bpe.c:182
#define MAX_TOKEN_LEN
Definition: true_bpe.c:65
CKSpacePrefixStyle ck_true_bpe_detect_space_style(CKTrueBPE *bpe)
Definition: true_bpe.c:654
#define MAX_SPECIAL_TOKENS
Definition: true_bpe.c:108
void ck_true_bpe_free(CKTrueBPE *bpe)
Definition: true_bpe.c:405
CKTrueBPE * ck_true_bpe_create(void)
Definition: true_bpe.c:342
void ck_true_bpe_set_special_ids(CKTrueBPE *bpe, int32_t unk, int32_t bos, int32_t eos, int32_t pad)
Definition: true_bpe.c:552
static bool is_bpe_letter(const char *s, int len)
Definition: true_bpe.c:833
static int init_tokens_from_text(CKTrueBPE *bpe, CKBPETokenList *list, const char *text, int text_len)
Definition: true_bpe.c:1121
ChunkType
Definition: true_bpe.c:851
@ CHUNK_NUMBER
Definition: true_bpe.c:853
@ CHUNK_OTHER
Definition: true_bpe.c:855
@ CHUNK_WHITESPACE
Definition: true_bpe.c:854
@ CHUNK_WORD
Definition: true_bpe.c:852
static bool is_bpe_newline(const char *s, int len)
Definition: true_bpe.c:866
static bool is_bpe_digit(const char *s, int len)
Definition: true_bpe.c:842
static int gpt2_decode_byte(const unsigned char *s, int len)
Definition: true_bpe.c:1417
int ck_true_bpe_add_merge(CKTrueBPE *bpe, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority)
Definition: true_bpe.c:497
static void token_list_clear(CKBPETokenList *list)
Definition: true_bpe.c:273
static int encode_chunk(CKTrueBPE *bpe, const char *chunk, int chunk_len, int32_t *ids, int max_ids, CKBPETokenList *list)
Definition: true_bpe.c:1213
static int gpt2_pretokenize(const char *text, int text_len, PretokChunk *chunks, int max_chunks)
Definition: true_bpe.c:915
static int utf8_char_len(unsigned char c)
Definition: true_bpe.c:790
static void token_list_free(CKBPETokenList *list)
Definition: true_bpe.c:260
static int preprocess_text(const CKTrueBPE *bpe, const char *text, int text_len, char *out, int out_max)
Definition: true_bpe.c:749
static int token_list_append(CKBPETokenList *list, const char *str, size_t len, int32_t id)
Definition: true_bpe.c:283
static bool is_whitespace(unsigned char c)
Definition: true_bpe.c:823
int32_t ck_true_bpe_num_merges(const CKTrueBPE *bpe)
Definition: true_bpe.c:1510
int ck_true_bpe_add_special_token(CKTrueBPE *bpe, const char *token, int32_t id)
Definition: true_bpe.c:565
#define MERGE_HASH_SIZE
Definition: true_bpe.c:63
static int match_special_token(const CKTrueBPE *bpe, const char *text, int text_len, int pos)
Definition: true_bpe.c:1323
int ck_true_bpe_load_binary(CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
Definition: true_bpe.c:606
static size_t merge_hash(uint64_t key, size_t num_buckets)
Definition: true_bpe.c:157
static int apply_bpe_merges(CKTrueBPE *bpe, CKBPETokenList *list)
Definition: true_bpe.c:1173
static int find_best_merge(const CKTrueBPE *bpe, const CKBPETokenList *list, size_t *best_pos, const CKBPEMerge **best_merge)
Definition: true_bpe.c:1149
static bool is_word_prefix_char(const char *s, int len)
Definition: true_bpe.c:871
static int merge_table_insert(CKMergeTable *table, const CKBPEMerge *merge)
Definition: true_bpe.c:198
size_t ck_true_bpe_vocab_size(const CKTrueBPE *bpe)
Definition: true_bpe.c:1506
int ck_true_bpe_add_token(CKTrueBPE *bpe, const char *token, int32_t id, float score)
Definition: true_bpe.c:449
static CKMergeTable * merge_table_create(size_t num_buckets)
Definition: true_bpe.c:167
static uint64_t merge_key(int32_t left_id, int32_t right_id)
Definition: true_bpe.c:153
static bool is_digit(unsigned char c)
Definition: true_bpe.c:819
static int byte_to_gpt2(unsigned char byte, char *out)
Definition: true_bpe.c:705
static int token_list_merge_at(CKBPETokenList *list, size_t pos, const char *merged_str, size_t merged_len, int32_t merged_id)
Definition: true_bpe.c:309
int32_t ck_true_bpe_lookup(const CKTrueBPE *bpe, const char *token)
Definition: true_bpe.c:638
int ck_true_bpe_add_merge_by_tokens(CKTrueBPE *bpe, const char *left, const char *right, int32_t priority)
Definition: true_bpe.c:514
const char * ck_true_bpe_id_to_token(const CKTrueBPE *bpe, int32_t id)
Definition: true_bpe.c:645
#define INITIAL_TOKEN_CAPACITY
Definition: true_bpe.c:64
const CKBPEConfig * config
Definition: true_bpe.h:171
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int32_t int32_t int32_t int32_t priority
Definition: true_bpe.h:115
const int32_t int char int max_len
Definition: true_bpe.h:280
int32_t left_id
Definition: true_bpe.h:112
const char int text_len
Definition: true_bpe.h:262
int vocab_size
Definition: true_bpe.h:185
int32_t int32_t right_id
Definition: true_bpe.h:113
int const int32_t * offsets
Definition: true_bpe.h:186
const char * left
Definition: true_bpe.h:130
int32_t int32_t int32_t merged_id
Definition: true_bpe.h:114
const char int int32_t int max_ids
Definition: true_bpe.h:264
const char const char * right
Definition: true_bpe.h:131
uint32_t start
Definition: utf8.c:214