← Back to C-Kernel-Engine Docs Doxygen Source Documentation
true_bpe.c File Reference
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <limits.h>
#include "tokenizer/true_bpe.h"

Go to the source code of this file.

Macros

#define INITIAL_TOKEN_CAPACITY   256 /* Initial capacity for token list */
 
#define MAX_SPECIAL_TOKENS   32 /* Maximum number of special tokens */
 
#define MAX_TOKEN_LEN   128 /* Maximum length of a single token string */
 
#define MERGE_HASH_SIZE   65536 /* Size of merge lookup hash table */
 

Enumerations

enum  ChunkType { CHUNK_WORD , CHUNK_NUMBER , CHUNK_WHITESPACE , CHUNK_OTHER }
 

Functions

static int apply_bpe_merges (CKTrueBPE *bpe, CKBPETokenList *list)
 
static int byte_to_gpt2 (unsigned char byte, char *out)
 
int ck_true_bpe_add_merge (CKTrueBPE *bpe, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority)
 
int ck_true_bpe_add_merge_by_tokens (CKTrueBPE *bpe, const char *left, const char *right, int32_t priority)
 
int ck_true_bpe_add_special_token (CKTrueBPE *bpe, const char *token, int32_t id)
 
int ck_true_bpe_add_token (CKTrueBPE *bpe, const char *token, int32_t id, float score)
 
CKTrueBPE * ck_true_bpe_create (void)
 
int ck_true_bpe_decode (const CKTrueBPE *bpe, const int32_t *ids, int num_ids, char *text, int max_len)
 
CKSpacePrefixStyle ck_true_bpe_detect_space_style (CKTrueBPE *bpe)
 
int ck_true_bpe_encode (CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
 
void ck_true_bpe_free (CKTrueBPE *bpe)
 
const char * ck_true_bpe_id_to_token (const CKTrueBPE *bpe, int32_t id)
 
int ck_true_bpe_load_binary (CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
 
int32_t ck_true_bpe_lookup (const CKTrueBPE *bpe, const char *token)
 
int32_t ck_true_bpe_num_merges (const CKTrueBPE *bpe)
 
void ck_true_bpe_set_config (CKTrueBPE *bpe, const CKBPEConfig *config)
 
void ck_true_bpe_set_special_ids (CKTrueBPE *bpe, int32_t unk, int32_t bos, int32_t eos, int32_t pad)
 
size_t ck_true_bpe_vocab_size (const CKTrueBPE *bpe)
 
static int encode_chunk (CKTrueBPE *bpe, const char *chunk, int chunk_len, int32_t *ids, int max_ids, CKBPETokenList *list)
 
static int encode_text_segment (CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
 
static int find_best_merge (const CKTrueBPE *bpe, const CKBPETokenList *list, size_t *best_pos, const CKBPEMerge **best_merge)
 
static int gpt2_decode_byte (const unsigned char *s, int len)
 
static int gpt2_pretokenize (const char *text, int text_len, PretokChunk *chunks, int max_chunks)
 
static int init_tokens_from_text (CKTrueBPE *bpe, CKBPETokenList *list, const char *text, int text_len)
 
static bool is_bpe_digit (const char *s, int len)
 
static bool is_bpe_letter (const char *s, int len)
 
static bool is_bpe_newline (const char *s, int len)
 
static bool is_bpe_punct (const char *s, int len)
 
static bool is_digit (unsigned char c)
 
static bool is_gpt2_space (const char *s, int len)
 
static bool is_letter (unsigned char c)
 
static bool is_whitespace (unsigned char c)
 
static bool is_word_prefix_char (const char *s, int len)
 
static int match_special_token (const CKTrueBPE *bpe, const char *text, int text_len, int pos)
 
static size_t merge_hash (uint64_t key, size_t num_buckets)
 
static uint64_t merge_key (int32_t left_id, int32_t right_id)
 
static CKMergeTable * merge_table_create (size_t num_buckets)
 
static void merge_table_free (CKMergeTable *table)
 
static int merge_table_insert (CKMergeTable *table, const CKBPEMerge *merge)
 
static const CKBPEMerge * merge_table_lookup (const CKMergeTable *table, int32_t left_id, int32_t right_id)
 
static int preprocess_text (const CKTrueBPE *bpe, const char *text, int text_len, char *out, int out_max)
 
static int token_list_append (CKBPETokenList *list, const char *str, size_t len, int32_t id)
 
static void token_list_clear (CKBPETokenList *list)
 
static CKBPETokenList * token_list_create (size_t initial_capacity)
 
static void token_list_free (CKBPETokenList *list)
 
static int token_list_merge_at (CKBPETokenList *list, size_t pos, const char *merged_str, size_t merged_len, int32_t merged_id)
 
static int utf8_char_len (unsigned char c)
 

Macro Definition Documentation

◆ INITIAL_TOKEN_CAPACITY

#define INITIAL_TOKEN_CAPACITY   256 /* Initial capacity for token list */

Definition at line 64 of file true_bpe.c.

◆ MAX_SPECIAL_TOKENS

#define MAX_SPECIAL_TOKENS   32 /* Maximum number of special tokens */

Definition at line 108 of file true_bpe.c.

◆ MAX_TOKEN_LEN

#define MAX_TOKEN_LEN   128 /* Maximum length of a single token string */

Definition at line 65 of file true_bpe.c.

◆ MERGE_HASH_SIZE

#define MERGE_HASH_SIZE   65536 /* Size of merge lookup hash table */

Definition at line 63 of file true_bpe.c.

Enumeration Type Documentation

◆ ChunkType

enum ChunkType
Enumerator
CHUNK_WORD 
CHUNK_NUMBER 
CHUNK_WHITESPACE 
CHUNK_OTHER 

Definition at line 851 of file true_bpe.c.

851  {
852  CHUNK_WORD, /* Letters (with optional leading space) */
853  CHUNK_NUMBER, /* Digits */
854  CHUNK_WHITESPACE, /* Whitespace (not attached to word) */
855  CHUNK_OTHER /* Punctuation, etc. */
856 } ChunkType;
ChunkType
Definition: true_bpe.c:851
@ CHUNK_NUMBER
Definition: true_bpe.c:853
@ CHUNK_OTHER
Definition: true_bpe.c:855
@ CHUNK_WHITESPACE
Definition: true_bpe.c:854
@ CHUNK_WORD
Definition: true_bpe.c:852

Function Documentation

◆ apply_bpe_merges()

static int apply_bpe_merges ( CKTrueBPE *  bpe,
CKBPETokenList *  list 
)
static

Definition at line 1173 of file true_bpe.c.

1173  {
1174  char merged_buf[MAX_TOKEN_LEN * 2];
1175 
1176  while (list->count > 1) {
1177  size_t best_pos;
1178  const CKBPEMerge *best_merge;
1179 
1180  if (find_best_merge(bpe, list, &best_pos, &best_merge) != 0) {
1181  break; /* No more merges possible */
1182  }
1183 
1184  /* Get merged token string */
1185  const char *merged_str = bpe->id_to_token[best_merge->merged_id];
1186  if (!merged_str) {
1187  /* Construct from left + right */
1188  size_t left_len = list->tokens[best_pos].len;
1189  size_t right_len = list->tokens[best_pos + 1].len;
1190 
1191  if (left_len + right_len >= sizeof(merged_buf)) {
1192  break; /* Too long */
1193  }
1194 
1195  memcpy(merged_buf, list->tokens[best_pos].str, left_len);
1196  memcpy(merged_buf + left_len, list->tokens[best_pos + 1].str, right_len);
1197  merged_buf[left_len + right_len] = '\0';
1198  merged_str = merged_buf;
1199  }
1200 
1201  /* Apply the merge */
1202  if (token_list_merge_at(list, best_pos, merged_str, strlen(merged_str), best_merge->merged_id) != 0) {
1203  break;
1204  }
1205  }
1206 
1207  return 0;
1208 }
#define MAX_TOKEN_LEN
Definition: true_bpe.c:65
static int find_best_merge(const CKTrueBPE *bpe, const CKBPETokenList *list, size_t *best_pos, const CKBPEMerge **best_merge)
Definition: true_bpe.c:1149
static int token_list_merge_at(CKBPETokenList *list, size_t pos, const char *merged_str, size_t merged_len, int32_t merged_id)
Definition: true_bpe.c:309

References find_best_merge(), MAX_TOKEN_LEN, and token_list_merge_at().

Referenced by encode_chunk().

◆ byte_to_gpt2()

static int byte_to_gpt2 ( unsigned char  byte,
char *  out 
)
static

Definition at line 705 of file true_bpe.c.

705  {
706  if (byte >= 0x21 && byte <= 0x7E && byte != '!') {
707  /* Printable ASCII (except control chars) stays as-is */
708  out[0] = (char)byte;
709  return 1;
710  }
711 
712  /* Control chars and special bytes map to U+0100 + byte */
713  /* This gives us characters like Ġ (U+0120), Ċ (U+010A), etc. */
714  unsigned int codepoint;
715 
716  /* GPT-2's byte_encoder mapping */
717  if (byte == '!') codepoint = byte;
718  else if (byte == '"') codepoint = byte;
719  else if (byte >= '#' && byte <= '~') codepoint = byte;
720  else if (byte == 0x21) codepoint = '!'; /* Already handled above */
721  else {
722  /* Map 0x00-0x20, 0x7F-0xFF to offset range */
723  if (byte <= 0x20) {
724  codepoint = 0x100 + byte; /* 0x00-0x20 → U+0100-U+0120 */
725  } else if (byte >= 0x7F && byte <= 0xA0) {
726  codepoint = 0x100 + byte; /* 0x7F-0xA0 → U+017F-U+01A0 */
727  } else {
728  codepoint = byte; /* Others: 0xA1-0xFF stay as-is */
729  }
730  }
731 
732  /* Encode as UTF-8 */
733  if (codepoint < 0x80) {
734  out[0] = (char)codepoint;
735  return 1;
736  } else if (codepoint < 0x800) {
737  out[0] = (char)(0xC0 | (codepoint >> 6));
738  out[1] = (char)(0x80 | (codepoint & 0x3F));
739  return 2;
740  } else {
741  out[0] = (char)(0xE0 | (codepoint >> 12));
742  out[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
743  out[2] = (char)(0x80 | (codepoint & 0x3F));
744  return 3;
745  }
746 }

Referenced by preprocess_text().

◆ ck_true_bpe_add_merge()

int ck_true_bpe_add_merge ( CKTrueBPE *  bpe,
int32_t  left_id,
int32_t  right_id,
int32_t  merged_id,
int32_t  priority 
)

Definition at line 497 of file true_bpe.c.

497  {
498  if (!bpe) return -1;
499 
500  CKBPEMerge merge = {
501  .left_id = left_id,
502  .right_id = right_id,
503  .merged_id = merged_id,
504  .priority = priority
505  };
506 
507  int ret = merge_table_insert(bpe->merges, &merge);
508  if (ret == 0) {
509  bpe->num_merges++;
510  }
511  return ret;
512 }
static int merge_table_insert(CKMergeTable *table, const CKBPEMerge *merge)
Definition: true_bpe.c:198
int32_t int32_t int32_t int32_t priority
Definition: true_bpe.h:115
int32_t left_id
Definition: true_bpe.h:112
int32_t int32_t right_id
Definition: true_bpe.h:113
int32_t int32_t int32_t merged_id
Definition: true_bpe.h:114

References left_id, merge_table_insert(), merged_id, priority, and right_id.

Referenced by ck_true_bpe_add_merge_by_tokens(), and ck_true_bpe_load_binary().

◆ ck_true_bpe_add_merge_by_tokens()

int ck_true_bpe_add_merge_by_tokens ( CKTrueBPE *  bpe,
const char *  left,
const char *  right,
int32_t  priority 
)

Definition at line 514 of file true_bpe.c.

514  {
515  if (!bpe || !left || !right) return -1;
516 
517  /* Look up token IDs */
518  BPETokenInfo *left_info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, left);
519  BPETokenInfo *right_info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, right);
520 
521  if (!left_info || !right_info) {
522  return -1; /* Tokens not in vocabulary */
523  }
524 
525  /* Create merged token string */
526  size_t left_len = strlen(left);
527  size_t right_len = strlen(right);
528  size_t merged_len = left_len + right_len;
529 
530  if (merged_len >= bpe->str_buffer_size) {
531  return -1; /* Too long */
532  }
533 
534  memcpy(bpe->str_buffer, left, left_len);
535  memcpy(bpe->str_buffer + left_len, right, right_len);
536  bpe->str_buffer[merged_len] = '\0';
537 
538  /* Look up or create merged token */
539  BPETokenInfo *merged_info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, bpe->str_buffer);
540  int32_t merged_id;
541 
542  if (merged_info) {
543  merged_id = merged_info->id;
544  } else {
545  /* Merged token should already exist in vocabulary */
546  return -1;
547  }
548 
549  return ck_true_bpe_add_merge(bpe, left_info->id, right_info->id, merged_id, priority);
550 }
void * ck_tokenizer_hash_table_lookup(CKTokenizerHashTable *table, const char *key)
Definition: hash_table.c:198
int ck_true_bpe_add_merge(CKTrueBPE *bpe, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority)
Definition: true_bpe.c:497
const char * left
Definition: true_bpe.h:130
const char const char * right
Definition: true_bpe.h:131

References ck_tokenizer_hash_table_lookup(), ck_true_bpe_add_merge(), left, merged_id, priority, and right.

◆ ck_true_bpe_add_special_token()

int ck_true_bpe_add_special_token ( CKTrueBPE *  bpe,
const char *  token,
int32_t  id 
)

Definition at line 565 of file true_bpe.c.

565  {
566  if (!bpe || !token || id < 0) return -1;
567  if (bpe->num_special_tokens >= MAX_SPECIAL_TOKENS) return -1;
568 
569  int token_len = (int)strlen(token);
570  if (token_len == 0) return -1;
571 
572  /* Check if already exists */
573  for (int i = 0; i < bpe->num_special_tokens; i++) {
574  if (bpe->special_tokens[i].token &&
575  strcmp(bpe->special_tokens[i].token, token) == 0) {
576  /* Update ID for existing token */
577  bpe->special_tokens[i].id = id;
578  return 0;
579  }
580  }
581 
582  /* Find insertion point (keep sorted by length, longest first) */
583  int insert_idx = bpe->num_special_tokens;
584  for (int i = 0; i < bpe->num_special_tokens; i++) {
585  if (token_len > bpe->special_tokens[i].len) {
586  insert_idx = i;
587  break;
588  }
589  }
590 
591  /* Shift existing entries down */
592  for (int i = bpe->num_special_tokens; i > insert_idx; i--) {
593  bpe->special_tokens[i] = bpe->special_tokens[i - 1];
594  }
595 
596  /* Insert new entry */
597  bpe->special_tokens[insert_idx].token = strdup(token);
598  if (!bpe->special_tokens[insert_idx].token) return -1;
599  bpe->special_tokens[insert_idx].id = id;
600  bpe->special_tokens[insert_idx].len = token_len;
601  bpe->num_special_tokens++;
602 
603  return 0;
604 }
int32_t id
Definition: tokenizer.h:315
const char * token
Definition: tokenizer.h:306
#define MAX_SPECIAL_TOKENS
Definition: true_bpe.c:108

References id, MAX_SPECIAL_TOKENS, and token.

Referenced by main().

◆ ck_true_bpe_add_token()

int ck_true_bpe_add_token ( CKTrueBPE *  bpe,
const char *  token,
int32_t  id,
float  score 
)

Definition at line 449 of file true_bpe.c.

449  {
450  if (!bpe || !token) return -1;
451 
452  /* Ensure reverse vocab has space */
453  if (id >= (int32_t)bpe->vocab_capacity) {
454  size_t new_cap = bpe->vocab_capacity * 2;
455  while (new_cap <= (size_t)id) new_cap *= 2;
456 
457  char **new_array = (char **)realloc(bpe->id_to_token, new_cap * sizeof(char *));
458  if (!new_array) return -1;
459 
460  memset(new_array + bpe->vocab_capacity, 0, (new_cap - bpe->vocab_capacity) * sizeof(char *));
461  bpe->id_to_token = new_array;
462  bpe->vocab_capacity = new_cap;
463  }
464 
465  /* Check if token exists */
466  BPETokenInfo *existing = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, token);
467  if (existing) {
468  existing->id = id;
469  existing->score = score;
470  if (bpe->id_to_token[id]) free(bpe->id_to_token[id]);
471  bpe->id_to_token[id] = strdup(token);
472  return 0;
473  }
474 
475  /* Create new token info */
476  BPETokenInfo *info = (BPETokenInfo *)malloc(sizeof(BPETokenInfo));
477  if (!info) return -1;
478 
479  info->id = id;
480  info->score = score;
481 
482  if (ck_tokenizer_hash_table_insert(bpe->vocab, token, info) != 0) {
483  free(info);
484  return -1;
485  }
486 
487  if (id >= (int32_t)bpe->vocab_size) {
488  bpe->vocab_size = id + 1;
489  }
490 
491  if (bpe->id_to_token[id]) free(bpe->id_to_token[id]);
492  bpe->id_to_token[id] = strdup(token);
493 
494  return 0;
495 }
int ck_tokenizer_hash_table_insert(CKTokenizerHashTable *table, const char *key, void *value)
Definition: hash_table.c:158
int32_t float * score
Definition: tokenizer.h:327

References ck_tokenizer_hash_table_insert(), ck_tokenizer_hash_table_lookup(), id, score, and token.

Referenced by ck_true_bpe_load_binary().

◆ ck_true_bpe_create()

CKTrueBPE* ck_true_bpe_create ( void  )

Definition at line 342 of file true_bpe.c.

342  {
343  CKTrueBPE *bpe = (CKTrueBPE *)calloc(1, sizeof(CKTrueBPE));
344  if (!bpe) return NULL;
345 
346  /* Create vocabulary hash table */
348  if (!bpe->vocab) {
349  free(bpe);
350  return NULL;
351  }
352 
353  /* Create merge table */
354  bpe->merges = merge_table_create(MERGE_HASH_SIZE);
355  if (!bpe->merges) {
356  ck_tokenizer_hash_table_free(bpe->vocab, true);
357  free(bpe);
358  return NULL;
359  }
360 
361  /* Initialize reverse vocabulary */
362  bpe->vocab_capacity = 4096;
363  bpe->id_to_token = (char **)calloc(bpe->vocab_capacity, sizeof(char *));
364  if (!bpe->id_to_token) {
365  merge_table_free(bpe->merges);
366  ck_tokenizer_hash_table_free(bpe->vocab, true);
367  free(bpe);
368  return NULL;
369  }
370 
371  /* String buffer for token operations */
372  bpe->str_buffer_size = 4096;
373  bpe->str_buffer = (char *)malloc(bpe->str_buffer_size);
374  if (!bpe->str_buffer) {
375  free(bpe->id_to_token);
376  merge_table_free(bpe->merges);
377  ck_tokenizer_hash_table_free(bpe->vocab, true);
378  free(bpe);
379  return NULL;
380  }
381 
382  /* Default special token IDs */
383  bpe->unk_id = 0;
384  bpe->bos_id = -1;
385  bpe->eos_id = -1;
386  bpe->pad_id = -1;
387 
388  /* Initialize special tokens array */
389  bpe->num_special_tokens = 0;
390  for (int i = 0; i < MAX_SPECIAL_TOKENS; i++) {
391  bpe->special_tokens[i].token = NULL;
392  bpe->special_tokens[i].id = -1;
393  bpe->special_tokens[i].len = 0;
394  }
395 
396  /* Default config */
397  bpe->config.add_bos = false;
398  bpe->config.add_eos = false;
399  bpe->config.byte_fallback = true;
400  bpe->config.space_prefix_style = CK_SPACE_PREFIX_AUTO;
401 
402  return bpe;
403 }
#define CK_TOKENIZER_HT_BUCKETS_LARGE
Definition: hash_table.h:142
void ck_tokenizer_hash_table_free(CKTokenizerHashTable *table, bool free_values)
Definition: hash_table.c:140
CKTokenizerHashTable * ck_tokenizer_hash_table_create(size_t bucket_count)
Definition: hash_table.c:80
@ CK_SPACE_PREFIX_AUTO
Definition: tokenizer.h:61
static void merge_table_free(CKMergeTable *table)
Definition: true_bpe.c:182
#define MERGE_HASH_SIZE
Definition: true_bpe.c:63
static CKMergeTable * merge_table_create(size_t num_buckets)
Definition: true_bpe.c:167

References CK_SPACE_PREFIX_AUTO, ck_tokenizer_hash_table_create(), ck_tokenizer_hash_table_free(), CK_TOKENIZER_HT_BUCKETS_LARGE, MAX_SPECIAL_TOKENS, MERGE_HASH_SIZE, merge_table_create(), and merge_table_free().

Referenced by main(), and run_inference().

◆ ck_true_bpe_decode()

int ck_true_bpe_decode ( const CKTrueBPE *  bpe,
const int32_t *  ids,
int  num_ids,
char *  text,
int  max_len 
)

Definition at line 1439 of file true_bpe.c.

1439  {
1440  if (!bpe || !ids || !text || max_len <= 0) return 0;
1441 
1442  int len = 0;
1443  for (int i = 0; i < num_ids && len < max_len - 1; i++) {
1444  int32_t id = ids[i];
1445  if (id < 0) continue;
1446 
1447  /* Skip special tokens */
1448  if (id == bpe->bos_id || id == bpe->eos_id || id == bpe->pad_id) {
1449  continue;
1450  }
1451 
1452  const char *token = ck_true_bpe_id_to_token(bpe, id);
1453  if (!token) continue;
1454 
1455  int token_len = (int)strlen(token);
1456 
1457  /* Check for SentencePiece space marker ▁ (U+2581) at start */
1458  if (token_len >= 3 &&
1459  (unsigned char)token[0] == 0xE2 &&
1460  (unsigned char)token[1] == 0x96 &&
1461  (unsigned char)token[2] == 0x81) {
1462  /* ▁ -> space */
1463  if (len < max_len - 1) text[len++] = ' ';
1464  token += 3;
1465  token_len -= 3;
1466  }
1467 
1468  /* Process rest of token, decoding GPT-2 byte-level encoding */
1469  int pos = 0;
1470  while (pos < token_len && len < max_len - 1) {
1471  unsigned char c0 = (unsigned char)token[pos];
1472 
1473  /* Try GPT-2 byte decoding for 2-byte UTF-8 sequences */
1474  if (pos + 1 < token_len && (c0 & 0xE0) == 0xC0) {
1475  int decoded = gpt2_decode_byte((unsigned char*)token + pos, token_len - pos);
1476  if (decoded >= 0) {
1477  text[len++] = (char)decoded;
1478  pos += 2;
1479  continue;
1480  }
1481  }
1482 
1483  /* Regular character - copy as-is */
1484  int char_len = 1;
1485  if ((c0 & 0x80) == 0) char_len = 1; /* ASCII */
1486  else if ((c0 & 0xE0) == 0xC0) char_len = 2;
1487  else if ((c0 & 0xF0) == 0xE0) char_len = 3;
1488  else if ((c0 & 0xF8) == 0xF0) char_len = 4;
1489 
1490  /* Copy the character */
1491  for (int j = 0; j < char_len && pos + j < token_len && len < max_len - 1; j++) {
1492  text[len++] = token[pos + j];
1493  }
1494  pos += char_len;
1495  }
1496  }
1497 
1498  text[len] = '\0';
1499  return len;
1500 }
const int32_t * ids
Definition: tokenizer.h:443
const int32_t int num_ids
Definition: tokenizer.h:444
const char * text
Definition: tokenizer.h:563
static int gpt2_decode_byte(const unsigned char *s, int len)
Definition: true_bpe.c:1417
const char * ck_true_bpe_id_to_token(const CKTrueBPE *bpe, int32_t id)
Definition: true_bpe.c:645
const int32_t int char int max_len
Definition: true_bpe.h:280

References ck_true_bpe_id_to_token(), gpt2_decode_byte(), ids, max_len, num_ids, text, and token.

◆ ck_true_bpe_detect_space_style()

CKSpacePrefixStyle ck_true_bpe_detect_space_style ( CKTrueBPE *  bpe)

Definition at line 654 of file true_bpe.c.

654  {
655  if (!bpe) return CK_SPACE_PREFIX_GPT2;
656 
657  if (bpe->config.space_prefix_style != CK_SPACE_PREFIX_AUTO) {
658  return bpe->config.space_prefix_style;
659  }
660 
661  /* Count tokens starting with each style */
662  int gpt2_count = 0; /* Ġ (0xC4 0xA0) */
663  int spm_count = 0; /* ▁ (0xE2 0x96 0x81) */
664 
665  for (size_t i = 0; i < bpe->vocab_size && i < 10000; i++) {
666  const char *token = bpe->id_to_token[i];
667  if (!token) continue;
668 
669  unsigned char c0 = (unsigned char)token[0];
670  unsigned char c1 = (unsigned char)token[1];
671 
672  if (c0 == 0xC4 && c1 == 0xA0) {
673  gpt2_count++;
674  } else if (c0 == 0xE2 && c1 == 0x96 && (unsigned char)token[2] == 0x81) {
675  spm_count++;
676  }
677  }
678 
679  CKSpacePrefixStyle detected = (spm_count > gpt2_count * 2) ? CK_SPACE_PREFIX_SPM : CK_SPACE_PREFIX_GPT2;
680  bpe->config.space_prefix_style = detected;
681 
682  return detected;
683 }
CKSpacePrefixStyle
Definition: tokenizer.h:60
@ CK_SPACE_PREFIX_SPM
Definition: tokenizer.h:63
@ CK_SPACE_PREFIX_GPT2
Definition: tokenizer.h:62

References CK_SPACE_PREFIX_AUTO, CK_SPACE_PREFIX_GPT2, CK_SPACE_PREFIX_SPM, and token.

Referenced by ck_true_bpe_encode().

◆ ck_true_bpe_encode()

int ck_true_bpe_encode ( CKTrueBPE *  bpe,
const char *  text,
int  text_len,
int32_t *  ids,
int  max_ids 
)

Definition at line 1338 of file true_bpe.c.

1338  {
1339  if (!bpe || !text || !ids || max_ids <= 0) return 0;
1340  if (text_len < 0) text_len = (int)strlen(text);
1341  if (text_len == 0) return 0;
1342 
1343  /* Auto-detect space style if needed */
1344  if (bpe->config.space_prefix_style == CK_SPACE_PREFIX_AUTO) {
1346  }
1347 
1348  int out_idx = 0;
1349 
1350  /* Add BOS token if configured */
1351  if (bpe->config.add_bos && bpe->bos_id >= 0 && out_idx < max_ids) {
1352  ids[out_idx++] = bpe->bos_id;
1353  }
1354 
1355  /* If no special tokens registered, use fast path */
1356  if (bpe->num_special_tokens == 0) {
1357  out_idx += encode_text_segment(bpe, text, text_len, ids + out_idx, max_ids - out_idx);
1358  } else {
1359  /* Scan for special tokens and encode segments between them */
1360  int pos = 0;
1361  int segment_start = 0;
1362 
1363  while (pos < text_len && out_idx < max_ids) {
1364  int match = match_special_token(bpe, text, text_len, pos);
1365 
1366  if (match >= 0) {
1367  /* Found special token - first encode any text before it */
1368  if (pos > segment_start) {
1369  int seg_len = pos - segment_start;
1370  out_idx += encode_text_segment(bpe, text + segment_start, seg_len,
1371  ids + out_idx, max_ids - out_idx);
1372  }
1373 
1374  /* Output the special token ID */
1375  if (out_idx < max_ids) {
1376  ids[out_idx++] = bpe->special_tokens[match].id;
1377  }
1378 
1379  /* Advance past the special token */
1380  pos += bpe->special_tokens[match].len;
1381  segment_start = pos;
1382  } else {
1383  /* No special token here, advance to next character */
1384  pos++;
1385  }
1386  }
1387 
1388  /* Encode any remaining text after last special token */
1389  if (segment_start < text_len && out_idx < max_ids) {
1390  out_idx += encode_text_segment(bpe, text + segment_start, text_len - segment_start,
1391  ids + out_idx, max_ids - out_idx);
1392  }
1393  }
1394 
1395  /* Add EOS token if configured */
1396  if (bpe->config.add_eos && bpe->eos_id >= 0 && out_idx < max_ids) {
1397  ids[out_idx++] = bpe->eos_id;
1398  }
1399 
1400  return out_idx;
1401 }
static int encode_text_segment(CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: true_bpe.c:1270
CKSpacePrefixStyle ck_true_bpe_detect_space_style(CKTrueBPE *bpe)
Definition: true_bpe.c:654
static int match_special_token(const CKTrueBPE *bpe, const char *text, int text_len, int pos)
Definition: true_bpe.c:1323
const char int text_len
Definition: true_bpe.h:262
const char int int32_t int max_ids
Definition: true_bpe.h:264

References CK_SPACE_PREFIX_AUTO, ck_true_bpe_detect_space_style(), encode_text_segment(), ids, match_special_token(), max_ids, text, and text_len.

Referenced by main(), run_inference(), and run_prompt().

◆ ck_true_bpe_free()

void ck_true_bpe_free ( CKTrueBPE *  bpe)

Definition at line 405 of file true_bpe.c.

405  {
406  if (!bpe) return;
407 
408  if (bpe->vocab) {
409  ck_tokenizer_hash_table_free(bpe->vocab, true);
410  }
411 
412  if (bpe->merges) {
413  merge_table_free(bpe->merges);
414  }
415 
416  if (bpe->id_to_token) {
417  for (size_t i = 0; i < bpe->vocab_size; i++) {
418  if (bpe->id_to_token[i]) {
419  free(bpe->id_to_token[i]);
420  }
421  }
422  free(bpe->id_to_token);
423  }
424 
425  if (bpe->str_buffer) {
426  free(bpe->str_buffer);
427  }
428 
429  /* Free special tokens */
430  for (int i = 0; i < bpe->num_special_tokens; i++) {
431  if (bpe->special_tokens[i].token) {
432  free(bpe->special_tokens[i].token);
433  }
434  }
435 
436  free(bpe);
437 }

References ck_tokenizer_hash_table_free(), and merge_table_free().

Referenced by main(), and run_inference().

◆ ck_true_bpe_id_to_token()

const char* ck_true_bpe_id_to_token ( const CKTrueBPE *  bpe,
int32_t  id 
)

Definition at line 645 of file true_bpe.c.

645  {
646  if (!bpe || id < 0 || id >= (int32_t)bpe->vocab_size) return NULL;
647  return bpe->id_to_token[id];
648 }

References id.

Referenced by ck_true_bpe_decode(), main(), run_inference(), and run_prompt().

◆ ck_true_bpe_load_binary()

int ck_true_bpe_load_binary ( CKTrueBPE *  bpe,
int  vocab_size,
const int32_t *  offsets,
const char *  strings,
int  num_merges,
const int32_t *  merges 
)

Definition at line 606 of file true_bpe.c.

611  {
612  if (!bpe || !offsets || !strings || vocab_size <= 0) return -1;
613 
614  for (int i = 0; i < vocab_size; i++) {
615  const char *token = strings + offsets[i];
616  if (ck_true_bpe_add_token(bpe, token, i, 0.0f) != 0) {
617  return -1;
618  }
619  }
620 
621  if (merges && num_merges > 0) {
622  for (int i = 0; i < num_merges; i++) {
623  int32_t left = merges[i * 3 + 0];
624  int32_t right = merges[i * 3 + 1];
625  int32_t merged = merges[i * 3 + 2];
626  if (left < 0 || right < 0 || merged < 0) {
627  continue;
628  }
629  if (ck_true_bpe_add_merge(bpe, left, right, merged, i) != 0) {
630  return -1;
631  }
632  }
633  }
634 
635  return 0;
636 }
int ck_true_bpe_add_token(CKTrueBPE *bpe, const char *token, int32_t id, float score)
Definition: true_bpe.c:449
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int vocab_size
Definition: true_bpe.h:185
int const int32_t * offsets
Definition: true_bpe.h:186

References ck_true_bpe_add_merge(), ck_true_bpe_add_token(), left, merges, num_merges, offsets, right, strings, token, and vocab_size.

Referenced by main(), and run_inference().

◆ ck_true_bpe_lookup()

int32_t ck_true_bpe_lookup ( const CKTrueBPE *  bpe,
const char *  token 
)

Definition at line 638 of file true_bpe.c.

638  {
639  if (!bpe || !token) return -1;
640 
641  BPETokenInfo *info = (BPETokenInfo *)ck_tokenizer_hash_table_lookup(bpe->vocab, token);
642  return info ? info->id : bpe->unk_id;
643 }

References ck_tokenizer_hash_table_lookup(), and token.

Referenced by encode_chunk(), init_tokens_from_text(), and main().

◆ ck_true_bpe_num_merges()

int32_t ck_true_bpe_num_merges ( const CKTrueBPE *  bpe)

Definition at line 1510 of file true_bpe.c.

1510  {
1511  return bpe ? bpe->num_merges : 0;
1512 }

◆ ck_true_bpe_set_config()

void ck_true_bpe_set_config ( CKTrueBPE *  bpe,
const CKBPEConfig config 
)

Definition at line 560 of file true_bpe.c.

560  {
561  if (!bpe || !config) return;
562  bpe->config = *config;
563 }
const CKBPEConfig * config
Definition: true_bpe.h:171

References config.

◆ ck_true_bpe_set_special_ids()

void ck_true_bpe_set_special_ids ( CKTrueBPE *  bpe,
int32_t  unk,
int32_t  bos,
int32_t  eos,
int32_t  pad 
)

Definition at line 552 of file true_bpe.c.

552  {
553  if (!bpe) return;
554  bpe->unk_id = unk;
555  bpe->bos_id = bos;
556  bpe->eos_id = eos;
557  bpe->pad_id = pad;
558 }
int32_t unk
Definition: tokenizer.h:229
int32_t int32_t int32_t eos
Definition: tokenizer.h:231
int32_t int32_t int32_t int32_t pad
Definition: tokenizer.h:232
int32_t int32_t bos
Definition: tokenizer.h:230

References bos, eos, pad, and unk.

◆ ck_true_bpe_vocab_size()

size_t ck_true_bpe_vocab_size ( const CKTrueBPE *  bpe)

Definition at line 1506 of file true_bpe.c.

1506  {
1507  return bpe ? bpe->vocab_size : 0;
1508 }

◆ encode_chunk()

static int encode_chunk ( CKTrueBPE *  bpe,
const char *  chunk,
int  chunk_len,
int32_t *  ids,
int  max_ids,
CKBPETokenList *  list 
)
static

Definition at line 1213 of file true_bpe.c.

1214  {
1215  if (chunk_len <= 0) return 0;
1216 
1217  /* First, try to look up the entire chunk as a single token */
1218  char chunk_buf[256];
1219  if (chunk_len < (int)sizeof(chunk_buf)) {
1220  memcpy(chunk_buf, chunk, chunk_len);
1221  chunk_buf[chunk_len] = '\0';
1222  int32_t chunk_id = ck_true_bpe_lookup(bpe, chunk_buf);
1223  if (chunk_id >= 0) {
1224  /* Entire chunk is a single token */
1225  if (max_ids >= 1) {
1226  ids[0] = chunk_id;
1227  return 1;
1228  }
1229  return 0;
1230  }
1231  }
1232 
1233  /* Initialize token list from chunk characters */
1234  if (init_tokens_from_text(bpe, list, chunk, chunk_len) != 0) {
1235  return 0;
1236  }
1237 
1238  /* Apply BPE merges to this chunk */
1239  apply_bpe_merges(bpe, list);
1240 
1241  /* Extract token IDs from this chunk */
1242  int out_idx = 0;
1243  for (size_t i = 0; i < list->count && out_idx < max_ids; i++) {
1244  int32_t id = list->tokens[i].id;
1245 
1246  /* Handle unknown tokens */
1247  if (id < 0) {
1248  if (bpe->config.byte_fallback) {
1249  /* Output each byte as separate token (byte fallback) */
1250  for (size_t j = 0; j < list->tokens[i].len && out_idx < max_ids; j++) {
1251  char byte_token[8];
1252  snprintf(byte_token, sizeof(byte_token), "<0x%02X>", (unsigned char)list->tokens[i].str[j]);
1253  int32_t byte_id = ck_true_bpe_lookup(bpe, byte_token);
1254  ids[out_idx++] = (byte_id >= 0) ? byte_id : bpe->unk_id;
1255  }
1256  } else {
1257  ids[out_idx++] = bpe->unk_id;
1258  }
1259  } else {
1260  ids[out_idx++] = id;
1261  }
1262  }
1263 
1264  return out_idx;
1265 }
static int init_tokens_from_text(CKTrueBPE *bpe, CKBPETokenList *list, const char *text, int text_len)
Definition: true_bpe.c:1121
static int apply_bpe_merges(CKTrueBPE *bpe, CKBPETokenList *list)
Definition: true_bpe.c:1173
int32_t ck_true_bpe_lookup(const CKTrueBPE *bpe, const char *token)
Definition: true_bpe.c:638

References apply_bpe_merges(), ck_true_bpe_lookup(), id, ids, init_tokens_from_text(), and max_ids.

Referenced by encode_text_segment().

◆ encode_text_segment()

static int encode_text_segment ( CKTrueBPE *  bpe,
const char *  text,
int  text_len,
int32_t *  ids,
int  max_ids 
)
static

Definition at line 1270 of file true_bpe.c.

1271  {
1272  if (text_len <= 0) return 0;
1273 
1274  /* Preprocess text (byte-level encoding) */
1275  char preprocessed[16384];
1276  int pp_len = preprocess_text(bpe, text, text_len, preprocessed, sizeof(preprocessed) - 1);
1277  if (pp_len < 0) {
1278  return 0;
1279  }
1280  preprocessed[pp_len] = '\0';
1281 
1282  int out_idx = 0;
1283 
1284  /* For GPT-2 style, use pretokenizer to split into chunks */
1285  if (bpe->config.space_prefix_style == CK_SPACE_PREFIX_GPT2 ||
1286  bpe->config.space_prefix_style == CK_SPACE_PREFIX_AUTO) {
1287 
1288  /* Pretokenize */
1289  PretokChunk chunks[1024];
1290  int num_chunks = gpt2_pretokenize(preprocessed, pp_len, chunks, 1024);
1291 
1292  /* Create reusable token list */
1293  CKBPETokenList *list = token_list_create(INITIAL_TOKEN_CAPACITY);
1294  if (!list) return out_idx;
1295 
1296  /* Process each chunk independently with BPE */
1297  for (int c = 0; c < num_chunks && out_idx < max_ids; c++) {
1298  int chunk_ids = encode_chunk(bpe, chunks[c].start, chunks[c].len,
1299  ids + out_idx, max_ids - out_idx, list);
1300  out_idx += chunk_ids;
1301  }
1302 
1303  token_list_free(list);
1304  } else {
1305  /* SentencePiece style: no pretokenization, process entire text */
1306  CKBPETokenList *list = token_list_create(INITIAL_TOKEN_CAPACITY);
1307  if (!list) return out_idx;
1308 
1309  int chunk_ids = encode_chunk(bpe, preprocessed, pp_len,
1310  ids + out_idx, max_ids - out_idx, list);
1311  out_idx += chunk_ids;
1312 
1313  token_list_free(list);
1314  }
1315 
1316  return out_idx;
1317 }
static CKBPETokenList * token_list_create(size_t initial_capacity)
Definition: true_bpe.c:245
static int encode_chunk(CKTrueBPE *bpe, const char *chunk, int chunk_len, int32_t *ids, int max_ids, CKBPETokenList *list)
Definition: true_bpe.c:1213
static int gpt2_pretokenize(const char *text, int text_len, PretokChunk *chunks, int max_chunks)
Definition: true_bpe.c:915
static void token_list_free(CKBPETokenList *list)
Definition: true_bpe.c:260
static int preprocess_text(const CKTrueBPE *bpe, const char *text, int text_len, char *out, int out_max)
Definition: true_bpe.c:749
#define INITIAL_TOKEN_CAPACITY
Definition: true_bpe.c:64
uint32_t start
Definition: utf8.c:214

References CK_SPACE_PREFIX_AUTO, CK_SPACE_PREFIX_GPT2, encode_chunk(), gpt2_pretokenize(), ids, INITIAL_TOKEN_CAPACITY, max_ids, preprocess_text(), start, text, text_len, token_list_create(), and token_list_free().

Referenced by ck_true_bpe_encode().

◆ find_best_merge()

static int find_best_merge ( const CKTrueBPE *  bpe,
const CKBPETokenList *  list,
size_t *  best_pos,
const CKBPEMerge **  best_merge 
)
static

Definition at line 1149 of file true_bpe.c.

1150  {
1151  *best_pos = 0;
1152  *best_merge = NULL;
1153  int32_t best_priority = INT32_MAX;
1154 
1155  for (size_t i = 0; i + 1 < list->count; i++) {
1156  int32_t left_id = list->tokens[i].id;
1157  int32_t right_id = list->tokens[i + 1].id;
1158 
1159  if (left_id < 0 || right_id < 0) continue; /* Unknown tokens can't merge */
1160 
1161  const CKBPEMerge *merge = merge_table_lookup(bpe->merges, left_id, right_id);
1162  if (merge && merge->priority < best_priority) {
1163  best_priority = merge->priority;
1164  *best_pos = i;
1165  *best_merge = merge;
1166  }
1167  }
1168 
1169  return (*best_merge != NULL) ? 0 : -1;
1170 }
static const CKBPEMerge * merge_table_lookup(const CKMergeTable *table, int32_t left_id, int32_t right_id)
Definition: true_bpe.c:226

References left_id, merge_table_lookup(), and right_id.

Referenced by apply_bpe_merges().

◆ gpt2_decode_byte()

static int gpt2_decode_byte ( const unsigned char *  s,
int  len 
)
static

Definition at line 1417 of file true_bpe.c.

1417  {
1418  if (len < 2) return -1;
1419 
1420  /* Check for 2-byte UTF-8 sequence: 110xxxxx 10xxxxxx */
1421  if ((s[0] & 0xE0) == 0xC0 && (s[1] & 0xC0) == 0x80) {
1422  unsigned int codepoint = ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
1423 
1424  /* GPT-2 byte range: U+0100 to U+01FF */
1425  if (codepoint >= 0x100 && codepoint <= 0x1FF) {
1426  /* Decode back to original byte */
1427  if (codepoint <= 0x120) {
1428  /* U+0100-U+0120 -> bytes 0x00-0x20 */
1429  return codepoint - 0x100;
1430  } else if (codepoint >= 0x17F && codepoint <= 0x1A0) {
1431  /* U+017F-U+01A0 -> bytes 0x7F-0xA0 */
1432  return codepoint - 0x100;
1433  }
1434  }
1435  }
1436  return -1;
1437 }

Referenced by ck_true_bpe_decode().

◆ gpt2_pretokenize()

static int gpt2_pretokenize ( const char *  text,
int  text_len,
PretokChunk *  chunks,
int  max_chunks 
)
static

Definition at line 915 of file true_bpe.c.

915  {
916  int num_chunks = 0;
917  int pos = 0;
918 
919  while (pos < text_len && num_chunks < max_chunks) {
920  int chunk_start = pos;
921  int char_len = utf8_char_len((unsigned char)text[pos]);
922  if (pos + char_len > text_len) char_len = text_len - pos;
923 
924  /* Pattern 2: [^\r\n\p{L}\p{N}]?\p{L}+ - Words with optional punctuation prefix */
925  /* This pattern MUST be checked before pattern 4 (punctuation) */
926  /* Check if we have: letter, or punctuation followed by letter */
927  bool is_word = false;
928  int word_start = pos;
929  int prefix_len = 0;
930 
931  if (is_bpe_letter(text + pos, char_len)) {
932  /* Word without prefix */
933  is_word = true;
934  } else if (is_bpe_punct(text + pos, char_len) && !is_gpt2_space(text + pos, text_len - pos)) {
935  /* Check if punctuation is followed by a letter */
936  int after = pos + char_len;
937  if (after < text_len) {
938  int next_len = utf8_char_len((unsigned char)text[after]);
939  if (is_bpe_letter(text + after, next_len)) {
940  /* This is pattern 2: punctuation + letters */
941  is_word = true;
942  prefix_len = char_len;
943  }
944  }
945  }
946 
947  if (is_word) {
948  /* Collect the word (with optional prefix) */
949  pos = word_start + prefix_len; /* Skip prefix if any */
950  while (pos < text_len) {
951  int clen = utf8_char_len((unsigned char)text[pos]);
952  if (is_bpe_letter(text + pos, clen)) {
953  pos += clen;
954  } else {
955  break;
956  }
957  }
958  chunks[num_chunks].start = text + chunk_start;
959  chunks[num_chunks].len = pos - chunk_start;
960  chunks[num_chunks].type = CHUNK_WORD;
961  num_chunks++;
962  continue;
963  }
964 
965  /* Pattern 3: \p{N} - Single digit */
966  if (is_bpe_digit(text + pos, char_len)) {
967  pos += char_len;
968  chunks[num_chunks].start = text + chunk_start;
969  chunks[num_chunks].len = pos - chunk_start;
970  chunks[num_chunks].type = CHUNK_NUMBER;
971  num_chunks++;
972  continue;
973  }
974 
975  /* Pattern 4: ?[^\s\p{L}\p{N}]+[\r\n]* - Optional space + punctuation + newlines */
976  /* At this point, we know the current char is NOT followed by letters (checked above) */
977  /* Check for space (Ġ) followed by punctuation, OR just punctuation */
978  bool has_leading_space = is_gpt2_space(text + pos, text_len - pos);
979  int punct_start = has_leading_space ? pos + 2 : pos;
980 
981  if (punct_start < text_len) {
982  int pchar_len = utf8_char_len((unsigned char)text[punct_start]);
983  if (is_bpe_punct(text + punct_start, pchar_len)) {
984  /* This matches pattern 4: space? + punctuation + newlines? */
985  if (has_leading_space) {
986  pos += 2; /* Include the leading space */
987  }
988  /* Collect punctuation characters */
989  /* Pattern 4 collects ALL consecutive punctuation, NOT stopping before letters */
990  /* The key insight: if we have " (" before "int", the " (" is pattern 4, */
991  /* and "(int" would be pattern 2, but pattern 4 matches first since " (" */
992  /* is a complete match for pattern 4 (space + one punctuation). */
993  while (pos < text_len) {
994  int clen = utf8_char_len((unsigned char)text[pos]);
995  if (is_bpe_punct(text + pos, clen)) {
996  pos += clen;
997  } else {
998  break;
999  }
1000  }
1001  /* Include trailing newlines */
1002  while (pos < text_len && is_bpe_newline(text + pos, text_len - pos)) {
1003  pos += 2;
1004  }
1005  chunks[num_chunks].start = text + chunk_start;
1006  chunks[num_chunks].len = pos - chunk_start;
1007  chunks[num_chunks].type = CHUNK_OTHER;
1008  num_chunks++;
1009  continue;
1010  }
1011  }
1012 
1013  /* Pattern 5/6/7: Whitespace handling */
1014  if (is_gpt2_space(text + pos, text_len - pos)) {
1015  /* Count consecutive spaces */
1016  int space_count = 0;
1017  int space_end = pos;
1018  while (space_end < text_len && is_gpt2_space(text + space_end, text_len - space_end)) {
1019  space_count++;
1020  space_end += 2;
1021  }
1022 
1023  /* Check if spaces are followed by newlines (pattern 5) */
1024  if (space_end < text_len && is_bpe_newline(text + space_end, text_len - space_end)) {
1025  /* Whitespace + newlines */
1026  while (space_end < text_len && is_bpe_newline(text + space_end, text_len - space_end)) {
1027  space_end += 2;
1028  }
1029  chunks[num_chunks].start = text + pos;
1030  chunks[num_chunks].len = space_end - pos;
1031  chunks[num_chunks].type = CHUNK_WHITESPACE;
1032  num_chunks++;
1033  pos = space_end;
1034  continue;
1035  }
1036 
1037  /* Check what follows the spaces */
1038  if (space_end < text_len) {
1039  int next_len = utf8_char_len((unsigned char)text[space_end]);
1040  if (is_bpe_letter(text + space_end, next_len)) {
1041  /* Letters follow - output (n-1) spaces, then space+word (pattern 2) */
1042  if (space_count > 1) {
1043  chunks[num_chunks].start = text + pos;
1044  chunks[num_chunks].len = (space_count - 1) * 2;
1045  chunks[num_chunks].type = CHUNK_WHITESPACE;
1046  num_chunks++;
1047  pos += (space_count - 1) * 2;
1048  if (num_chunks >= max_chunks) break;
1049  }
1050  /* Collect space + word */
1051  chunk_start = pos;
1052  pos += 2; /* Skip the Ġ */
1053  while (pos < text_len) {
1054  int clen = utf8_char_len((unsigned char)text[pos]);
1055  if (is_bpe_letter(text + pos, clen)) {
1056  pos += clen;
1057  } else {
1058  break;
1059  }
1060  }
1061  chunks[num_chunks].start = text + chunk_start;
1062  chunks[num_chunks].len = pos - chunk_start;
1063  chunks[num_chunks].type = CHUNK_WORD;
1064  num_chunks++;
1065  continue;
1066  } else if (is_bpe_digit(text + space_end, next_len)) {
1067  /* Digit follows - output (n-1) spaces, then space+digit */
1068  if (space_count > 1) {
1069  chunks[num_chunks].start = text + pos;
1070  chunks[num_chunks].len = (space_count - 1) * 2;
1071  chunks[num_chunks].type = CHUNK_WHITESPACE;
1072  num_chunks++;
1073  pos += (space_count - 1) * 2;
1074  if (num_chunks >= max_chunks) break;
1075  }
1076  /* Space + single digit */
1077  chunk_start = pos;
1078  pos += 2 + 1; /* Ġ + digit */
1079  chunks[num_chunks].start = text + chunk_start;
1080  chunks[num_chunks].len = pos - chunk_start;
1081  chunks[num_chunks].type = CHUNK_NUMBER;
1082  num_chunks++;
1083  continue;
1084  }
1085  /* Pattern 4 would have caught space+punct, so this is trailing space before something else */
1086  }
1087 
1088  /* Just whitespace */
1089  chunks[num_chunks].start = text + pos;
1090  chunks[num_chunks].len = space_count * 2;
1091  chunks[num_chunks].type = CHUNK_WHITESPACE;
1092  num_chunks++;
1093  pos = space_end;
1094  continue;
1095  }
1096 
1097  /* Pattern 5: Newlines (Ċ) */
1098  if (is_bpe_newline(text + pos, text_len - pos)) {
1099  while (pos < text_len && is_bpe_newline(text + pos, text_len - pos)) {
1100  pos += 2;
1101  }
1102  chunks[num_chunks].start = text + chunk_start;
1103  chunks[num_chunks].len = pos - chunk_start;
1104  chunks[num_chunks].type = CHUNK_OTHER;
1105  num_chunks++;
1106  continue;
1107  }
1108 
1109  /* Fallback: single character chunk */
1110  pos += char_len;
1111  chunks[num_chunks].start = text + chunk_start;
1112  chunks[num_chunks].len = pos - chunk_start;
1113  chunks[num_chunks].type = CHUNK_OTHER;
1114  num_chunks++;
1115  }
1116 
1117  return num_chunks;
1118 }
static bool is_bpe_punct(const char *s, int len)
Definition: true_bpe.c:884
static bool is_gpt2_space(const char *s, int len)
Definition: true_bpe.c:828
static bool is_bpe_letter(const char *s, int len)
Definition: true_bpe.c:833
static bool is_bpe_newline(const char *s, int len)
Definition: true_bpe.c:866
static bool is_bpe_digit(const char *s, int len)
Definition: true_bpe.c:842
static int utf8_char_len(unsigned char c)
Definition: true_bpe.c:790

References CHUNK_NUMBER, CHUNK_OTHER, CHUNK_WHITESPACE, CHUNK_WORD, is_bpe_digit(), is_bpe_letter(), is_bpe_newline(), is_bpe_punct(), is_gpt2_space(), text, text_len, and utf8_char_len().

Referenced by encode_text_segment().

◆ init_tokens_from_text()

static int init_tokens_from_text ( CKTrueBPE *  bpe,
CKBPETokenList *  list,
const char *  text,
int  text_len 
)
static

Definition at line 1121 of file true_bpe.c.

1121  {
1122  token_list_clear(list);
1123 
1124  int pos = 0;
1125  while (pos < text_len) {
1126  int char_len = utf8_char_len((unsigned char)text[pos]);
1127  if (pos + char_len > text_len) {
1128  char_len = text_len - pos; /* Truncated UTF-8 */
1129  }
1130 
1131  /* Look up this character/byte in vocabulary */
1132  char char_buf[8];
1133  memcpy(char_buf, text + pos, char_len);
1134  char_buf[char_len] = '\0';
1135 
1136  int32_t id = ck_true_bpe_lookup(bpe, char_buf);
1137 
1138  if (token_list_append(list, char_buf, char_len, id) != 0) {
1139  return -1;
1140  }
1141 
1142  pos += char_len;
1143  }
1144 
1145  return 0;
1146 }
static void token_list_clear(CKBPETokenList *list)
Definition: true_bpe.c:273
static int token_list_append(CKBPETokenList *list, const char *str, size_t len, int32_t id)
Definition: true_bpe.c:283

References ck_true_bpe_lookup(), text, text_len, token_list_append(), token_list_clear(), and utf8_char_len().

Referenced by encode_chunk().

◆ is_bpe_digit()

static bool is_bpe_digit ( const char *  s,
int  len 
)
static

Definition at line 842 of file true_bpe.c.

842  {
843  if (len == 1) {
844  unsigned char c = (unsigned char)s[0];
845  return is_digit(c);
846  }
847  return false;
848 }
static bool is_digit(unsigned char c)
Definition: true_bpe.c:819

References is_digit().

Referenced by gpt2_pretokenize().

◆ is_bpe_letter()

static bool is_bpe_letter ( const char *  s,
int  len 
)
static

Definition at line 833 of file true_bpe.c.

833  {
834  if (len == 1) {
835  unsigned char c = (unsigned char)s[0];
836  return is_letter(c);
837  }
838  return false;
839 }
static bool is_letter(unsigned char c)
Definition: true_bpe.c:815

References is_letter().

Referenced by gpt2_pretokenize().

◆ is_bpe_newline()

static bool is_bpe_newline ( const char *  s,
int  len 
)
static

Definition at line 866 of file true_bpe.c.

866  {
867  return len >= 2 && (unsigned char)s[0] == 0xC4 && (unsigned char)s[1] == 0x8A;
868 }

Referenced by gpt2_pretokenize(), and is_word_prefix_char().

◆ is_bpe_punct()

static bool is_bpe_punct ( const char *  s,
int  len 
)
static

Definition at line 884 of file true_bpe.c.

884  {
885  if (len == 1) {
886  unsigned char c = (unsigned char)s[0];
887  return !is_letter(c) && !is_digit(c) && c != ' ' && c != '\t' && c != '\n' && c != '\r';
888  }
889  /* Multi-byte: not Ġ (space) or Ċ (newline) */
890  if (is_gpt2_space(s, len)) return false;
891  if (len >= 2 && (unsigned char)s[0] == 0xC4) {
892  unsigned char c1 = (unsigned char)s[1];
893  /* Ċ (newline), ĉ (tab), č (CR) etc. are not punctuation */
894  if (c1 == 0x8A || c1 == 0x89 || c1 == 0x8D) return false;
895  }
896  return true;
897 }

References is_digit(), is_gpt2_space(), and is_letter().

Referenced by gpt2_pretokenize().

◆ is_digit()

static bool is_digit ( unsigned char  c)
static

Definition at line 819 of file true_bpe.c.

819  {
820  return c >= '0' && c <= '9';
821 }

Referenced by is_bpe_digit(), is_bpe_punct(), and is_word_prefix_char().

◆ is_gpt2_space()

static bool is_gpt2_space ( const char *  s,
int  len 
)
static

Definition at line 828 of file true_bpe.c.

828  {
829  return len >= 2 && (unsigned char)s[0] == 0xC4 && (unsigned char)s[1] == 0xA0;
830 }

Referenced by gpt2_pretokenize(), is_bpe_punct(), and is_word_prefix_char().

◆ is_letter()

static bool is_letter ( unsigned char  c)
static

Definition at line 815 of file true_bpe.c.

815  {
816  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
817 }

Referenced by is_bpe_letter(), is_bpe_punct(), and is_word_prefix_char().

◆ is_whitespace()

static bool is_whitespace ( unsigned char  c)
static

Definition at line 823 of file true_bpe.c.

823  {
824  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
825 }

◆ is_word_prefix_char()

static bool is_word_prefix_char ( const char *  s,
int  len 
)
static

Definition at line 871 of file true_bpe.c.

871  {
872  /* In GPT-2 regex: [^\r\n\p{L}\p{N}]? matches any char except newline, letter, digit */
873  if (len == 1) {
874  unsigned char c = (unsigned char)s[0];
875  return !is_letter(c) && !is_digit(c) && c != '\n' && c != '\r';
876  }
877  /* Multi-byte: check if it's not a letter/digit (newline is Ċ which we handle separately) */
878  if (is_gpt2_space(s, len)) return true; /* Space can prefix */
879  if (is_bpe_newline(s, len)) return false; /* Newline cannot prefix */
880  return true; /* Other multi-byte chars can prefix */
881 }

References is_bpe_newline(), is_digit(), is_gpt2_space(), and is_letter().

◆ match_special_token()

static int match_special_token ( const CKTrueBPE *  bpe,
const char *  text,
int  text_len,
int  pos 
)
static

Definition at line 1323 of file true_bpe.c.

1323  {
1324  int remaining = text_len - pos;
1325  const char *cur = text + pos;
1326 
1327  /* Special tokens are sorted longest first, so first match is best */
1328  for (int i = 0; i < bpe->num_special_tokens; i++) {
1329  int tok_len = bpe->special_tokens[i].len;
1330  if (tok_len <= remaining &&
1331  memcmp(cur, bpe->special_tokens[i].token, tok_len) == 0) {
1332  return i;
1333  }
1334  }
1335  return -1;
1336 }

References text, and text_len.

Referenced by ck_true_bpe_encode().

◆ merge_hash()

static size_t merge_hash ( uint64_t  key,
size_t  num_buckets 
)
static

Definition at line 157 of file true_bpe.c.

157  {
158  /* Simple hash mixing */
159  key ^= key >> 33;
160  key *= 0xff51afd7ed558ccdULL;
161  key ^= key >> 33;
162  key *= 0xc4ceb9fe1a85ec53ULL;
163  key ^= key >> 33;
164  return key % num_buckets;
165 }

Referenced by merge_table_insert(), and merge_table_lookup().

◆ merge_key()

static uint64_t merge_key ( int32_t  left_id,
int32_t  right_id 
)
static

Definition at line 153 of file true_bpe.c.

153  {
154  return ((uint64_t)left_id << 32) | (uint32_t)right_id;
155 }

References left_id, and right_id.

Referenced by merge_table_insert(), and merge_table_lookup().

◆ merge_table_create()

static CKMergeTable* merge_table_create ( size_t  num_buckets)
static

Definition at line 167 of file true_bpe.c.

167  {
168  CKMergeTable *table = (CKMergeTable *)malloc(sizeof(CKMergeTable));
169  if (!table) return NULL;
170 
171  table->buckets = (CKMergeEntry **)calloc(num_buckets, sizeof(CKMergeEntry *));
172  if (!table->buckets) {
173  free(table);
174  return NULL;
175  }
176 
177  table->num_buckets = num_buckets;
178  table->num_entries = 0;
179  return table;
180 }

Referenced by ck_true_bpe_create().

◆ merge_table_free()

static void merge_table_free ( CKMergeTable *  table)
static

Definition at line 182 of file true_bpe.c.

182  {
183  if (!table) return;
184 
185  for (size_t i = 0; i < table->num_buckets; i++) {
186  CKMergeEntry *entry = table->buckets[i];
187  while (entry) {
188  CKMergeEntry *next = entry->next;
189  free(entry);
190  entry = next;
191  }
192  }
193 
194  free(table->buckets);
195  free(table);
196 }

Referenced by ck_true_bpe_create(), and ck_true_bpe_free().

◆ merge_table_insert()

static int merge_table_insert ( CKMergeTable *  table,
const CKBPEMerge *  merge 
)
static

Definition at line 198 of file true_bpe.c.

198  {
199  uint64_t key = merge_key(merge->left_id, merge->right_id);
200  size_t bucket = merge_hash(key, table->num_buckets);
201 
202  /* Check if already exists */
203  CKMergeEntry *entry = table->buckets[bucket];
204  while (entry) {
205  if (entry->key == key) {
206  /* Update existing */
207  entry->merge = *merge;
208  return 0;
209  }
210  entry = entry->next;
211  }
212 
213  /* Create new entry */
214  entry = (CKMergeEntry *)malloc(sizeof(CKMergeEntry));
215  if (!entry) return -1;
216 
217  entry->key = key;
218  entry->merge = *merge;
219  entry->next = table->buckets[bucket];
220  table->buckets[bucket] = entry;
221  table->num_entries++;
222 
223  return 0;
224 }
static size_t merge_hash(uint64_t key, size_t num_buckets)
Definition: true_bpe.c:157
static uint64_t merge_key(int32_t left_id, int32_t right_id)
Definition: true_bpe.c:153

References merge_hash(), and merge_key().

Referenced by ck_true_bpe_add_merge().

◆ merge_table_lookup()

static const CKBPEMerge* merge_table_lookup ( const CKMergeTable *  table,
int32_t  left_id,
int32_t  right_id 
)
static

Definition at line 226 of file true_bpe.c.

226  {
227  uint64_t key = merge_key(left_id, right_id);
228  size_t bucket = merge_hash(key, table->num_buckets);
229 
230  CKMergeEntry *entry = table->buckets[bucket];
231  while (entry) {
232  if (entry->key == key) {
233  return &entry->merge;
234  }
235  entry = entry->next;
236  }
237 
238  return NULL;
239 }

References left_id, merge_hash(), merge_key(), and right_id.

Referenced by find_best_merge().

◆ preprocess_text()

static int preprocess_text ( const CKTrueBPE *  bpe,
const char *  text,
int  text_len,
char *  out,
int  out_max 
)
static

Definition at line 749 of file true_bpe.c.

749  {
750  CKSpacePrefixStyle style = bpe->config.space_prefix_style;
751  int out_len = 0;
752 
753  /* For SentencePiece, add ▁ at start */
754  if (style == CK_SPACE_PREFIX_SPM && text_len > 0 && text[0] != ' ') {
755  if (out_len + 3 > out_max) return -1;
756  out[out_len++] = (char)0xE2;
757  out[out_len++] = (char)0x96;
758  out[out_len++] = (char)0x81;
759  }
760 
761  for (int i = 0; i < text_len; i++) {
762  unsigned char byte = (unsigned char)text[i];
763 
764  if (style == CK_SPACE_PREFIX_SPM) {
765  /* SentencePiece style: only convert spaces */
766  if (byte == ' ') {
767  if (out_len + 3 > out_max) return -1;
768  out[out_len++] = (char)0xE2;
769  out[out_len++] = (char)0x96;
770  out[out_len++] = (char)0x81;
771  } else {
772  if (out_len + 1 > out_max) return -1;
773  out[out_len++] = (char)byte;
774  }
775  } else {
776  /* GPT-2 style: full byte-level encoding */
777  char encoded[4];
778  int enc_len = byte_to_gpt2(byte, encoded);
779  if (out_len + enc_len > out_max) return -1;
780  for (int j = 0; j < enc_len; j++) {
781  out[out_len++] = encoded[j];
782  }
783  }
784  }
785 
786  return out_len;
787 }
CKSpacePrefixStyle style
Definition: tokenizer.h:287
const int32_t int int * out_len
Definition: tokenizer.h:445
static int byte_to_gpt2(unsigned char byte, char *out)
Definition: true_bpe.c:705

References byte_to_gpt2(), CK_SPACE_PREFIX_SPM, out_len, style, text, and text_len.

Referenced by encode_text_segment().

◆ token_list_append()

static int token_list_append ( CKBPETokenList *  list,
const char *  str,
size_t  len,
int32_t  id 
)
static

Definition at line 283 of file true_bpe.c.

283  {
284  if (list->count >= list->capacity) {
285  size_t new_cap = list->capacity * 2;
286  CKBPEToken *new_tokens = (CKBPEToken *)realloc(list->tokens, new_cap * sizeof(CKBPEToken));
287  if (!new_tokens) return -1;
288  list->tokens = new_tokens;
289  list->capacity = new_cap;
290  /* Zero new entries */
291  memset(list->tokens + list->count, 0, (new_cap - list->count) * sizeof(CKBPEToken));
292  }
293 
294  CKBPEToken *tok = &list->tokens[list->count];
295  tok->str = (char *)malloc(len + 1);
296  if (!tok->str) return -1;
297 
298  memcpy(tok->str, str, len);
299  tok->str[len] = '\0';
300  tok->len = (uint16_t)len;
301  tok->id = id;
302  tok->is_merged = false;
303 
304  list->count++;
305  return 0;
306 }

References id.

Referenced by init_tokens_from_text().

◆ token_list_clear()

static void token_list_clear ( CKBPETokenList *  list)
static

Definition at line 273 of file true_bpe.c.

273  {
274  for (size_t i = 0; i < list->count; i++) {
275  if (list->tokens[i].str) {
276  free(list->tokens[i].str);
277  list->tokens[i].str = NULL;
278  }
279  }
280  list->count = 0;
281 }

Referenced by init_tokens_from_text().

◆ token_list_create()

static CKBPETokenList* token_list_create ( size_t  initial_capacity)
static

Definition at line 245 of file true_bpe.c.

245  {
246  CKBPETokenList *list = (CKBPETokenList *)malloc(sizeof(CKBPETokenList));
247  if (!list) return NULL;
248 
249  list->tokens = (CKBPEToken *)calloc(initial_capacity, sizeof(CKBPEToken));
250  if (!list->tokens) {
251  free(list);
252  return NULL;
253  }
254 
255  list->count = 0;
256  list->capacity = initial_capacity;
257  return list;
258 }

Referenced by encode_text_segment().

◆ token_list_free()

static void token_list_free ( CKBPETokenList *  list)
static

Definition at line 260 of file true_bpe.c.

260  {
261  if (!list) return;
262 
263  for (size_t i = 0; i < list->count; i++) {
264  if (list->tokens[i].str) {
265  free(list->tokens[i].str);
266  }
267  }
268 
269  free(list->tokens);
270  free(list);
271 }

Referenced by encode_text_segment().

◆ token_list_merge_at()

static int token_list_merge_at ( CKBPETokenList *  list,
size_t  pos,
const char *  merged_str,
size_t  merged_len,
int32_t  merged_id 
)
static

Definition at line 309 of file true_bpe.c.

309  {
310  if (pos + 1 >= list->count) return -1;
311 
312  /* Free old strings */
313  free(list->tokens[pos].str);
314  free(list->tokens[pos + 1].str);
315 
316  /* Create merged token */
317  list->tokens[pos].str = (char *)malloc(merged_len + 1);
318  if (!list->tokens[pos].str) return -1;
319 
320  memcpy(list->tokens[pos].str, merged_str, merged_len);
321  list->tokens[pos].str[merged_len] = '\0';
322  list->tokens[pos].len = (uint16_t)merged_len;
323  list->tokens[pos].id = merged_id;
324  list->tokens[pos].is_merged = true;
325 
326  /* Shift remaining tokens left */
327  for (size_t i = pos + 1; i < list->count - 1; i++) {
328  list->tokens[i] = list->tokens[i + 1];
329  }
330  list->count--;
331 
332  /* Clear the now-unused last slot */
333  list->tokens[list->count].str = NULL;
334 
335  return 0;
336 }

References merged_id.

Referenced by apply_bpe_merges().

◆ utf8_char_len()

static int utf8_char_len ( unsigned char  c)
static

Definition at line 790 of file true_bpe.c.

790  {
791  if ((c & 0x80) == 0) return 1; /* 0xxxxxxx */
792  if ((c & 0xE0) == 0xC0) return 2; /* 110xxxxx */
793  if ((c & 0xF0) == 0xE0) return 3; /* 1110xxxx */
794  if ((c & 0xF8) == 0xF0) return 4; /* 11110xxx */
795  return 1; /* Invalid, treat as single byte */
796 }

Referenced by gpt2_pretokenize(), and init_tokens_from_text().