← Back to C-Kernel-Engine Docs Doxygen Source Documentation
tokenizer.c
Go to the documentation of this file.
1 /*
2  * C-Kernel-Engine Greedy Tokenizer
3  *
4  * High-performance tokenizer with:
5  * - Greedy longest-match encoding
6  * - BPE and WordPiece support
7  * - MurmurHash3 for fast lookups
8  * - AVX-512 string comparison (when available)
9  *
10  * By Anthony Shivakumar
11  */
12 
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <stdint.h>
17 #include <stdbool.h>
18 #include <ctype.h>
19 
20 #include "tokenizer/tokenizer.h"
21 #include "tokenizer/murmurhash3.h"
22 #include "tokenizer/hash_table.h"
23 
/* Token info stored in hash table value.
 * One TokenInfo is heap-allocated per vocabulary entry by
 * ck_tokenizer_add_token and owned by the hash table. */
typedef struct {
    int32_t id;       /* token id; also the index into tok->id_to_token */
    float score;      /* score passed to ck_tokenizer_add_token (SPM merge score) */
    bool is_special;  /* set true by ck_tokenizer_add_special_token */
} TokenInfo;
30 
31 /* Tokenizer structure - defined in tokenizer.h via typedef */
32 
33 /* Create a new tokenizer */
35  CKTokenizer *tok = (CKTokenizer *)malloc(sizeof(CKTokenizer));
36  if (!tok) {
37  return NULL;
38  }
39 
40  memset(tok, 0, sizeof(*tok));
41 
42  /* Create hash table for vocabulary */
44  if (!tok->vocab) {
45  free(tok);
46  return NULL;
47  }
48 
49  /* Create trie for fast lookups (1M nodes for ~50k vocab) */
50  tok->vocab_trie = ck_trie_create(1000000);
51  if (!tok->vocab_trie) {
53  free(tok);
54  return NULL;
55  }
56 
57  /* Initialize reverse vocab */
58  tok->vocab_capacity = 4096;
59  tok->id_to_token = (char **)calloc(tok->vocab_capacity, sizeof(char *));
60  if (!tok->id_to_token) {
62  free(tok);
63  return NULL;
64  }
65 
66  /* Set default special tokens */
67  tok->unk_id = 0;
68  tok->bos_id = 1;
69  tok->eos_id = 2;
70  tok->pad_id = -1;
71  tok->mask_id = -1;
72 
73  /* Initialize scores and types for SPM */
74  tok->scores = NULL;
75  tok->types = NULL;
76 
77  /* Set config */
78  tok->config.type = type;
79  tok->config.add_bos = false;
80  tok->config.add_eos = false;
81  tok->config.add_space_prefix = true;
82  tok->config.unk_score = -1e10f;
84 
85  ck_tokenizer_mempool_init(&tok->pool, 1024 * 1024);
86 
87  return tok;
88 }
89 
90 /* Free a tokenizer */
92  if (!tok) return;
93 
94  /* Free vocabulary entries */
95  if (tok->vocab) {
97  }
98 
99  /* Free trie */
100  if (tok->vocab_trie) {
101  ck_trie_free(tok->vocab_trie);
102  }
103 
104  /* Free reverse vocab strings */
105  if (tok->id_to_token) {
106  /* Note: strings were strdup'd in add_token */
107  for (size_t i = 0; i < tok->vocab_size; i++) {
108  if (tok->id_to_token[i]) {
109  free(tok->id_to_token[i]);
110  }
111  }
112  free(tok->id_to_token);
113  }
114 
115  /* Free SPM-related arrays */
116  if (tok->scores) free(tok->scores);
117  if (tok->types) free(tok->types);
118  if (tok->byte_token_id) free(tok->byte_token_id);
119 
121  free(tok);
122 }
123 
124 /* Reset tokenizer state */
126  if (!tok) return;
127 
129 
130  if (tok->vocab_trie) {
132  }
133 
134  for (size_t i = 0; i < tok->vocab_size; i++) {
135  if (tok->id_to_token[i]) {
136  free(tok->id_to_token[i]);
137  tok->id_to_token[i] = NULL;
138  }
139  }
140 
141  tok->vocab_size = 0;
142 
143  /* Reset SPM-related arrays using actual allocated sizes */
144  if (tok->scores && tok->scores_size > 0) {
145  memset(tok->scores, 0, tok->scores_size * sizeof(float));
146  }
147  if (tok->types && tok->types_size > 0) {
148  memset(tok->types, 0, tok->types_size * sizeof(uint8_t));
149  }
150  /* Clear byte lookup table */
151  if (tok->byte_token_id) {
152  memset(tok->byte_token_id, -1, 256 * sizeof(int32_t));
153  }
154 }
155 
156 /* Add a token to vocabulary */
157 int ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int32_t id, float score) {
158  if (!tok || !token) {
159  return -1;
160  }
161 
162  /* Ensure we have space in reverse vocab */
163  if (id >= (int32_t)tok->vocab_capacity) {
164  size_t new_cap = tok->vocab_capacity * 2;
165  while (new_cap <= (size_t)id) {
166  new_cap *= 2;
167  }
168  char **new_array = (char **)realloc(tok->id_to_token, new_cap * sizeof(char *));
169  if (!new_array) {
170  return -1;
171  }
172  memset(new_array + tok->vocab_capacity, 0, (new_cap - tok->vocab_capacity) * sizeof(char *));
173  tok->id_to_token = new_array;
174  tok->vocab_capacity = new_cap;
175  }
176 
177  /* Check if token already exists */
178  TokenInfo *existing = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, token);
179  if (existing) {
180  existing->id = id;
181  existing->score = score;
182  if (id >= (int32_t)tok->vocab_size) tok->vocab_size = id + 1;
183  if (tok->id_to_token[id]) free(tok->id_to_token[id]);
184  tok->id_to_token[id] = strdup(token);
185  return 0;
186  }
187 
188  /* Create new token info */
189  TokenInfo *info = (TokenInfo *)malloc(sizeof(TokenInfo));
190  if (!info) return -1;
191  info->id = id;
192  info->score = score;
193  info->is_special = false;
194 
195  if (ck_tokenizer_hash_table_insert(tok->vocab, token, info) != 0) {
196  free(info);
197  return -1;
198  }
199 
200  /* Also add to trie for fast longest-match lookups */
201  if (tok->vocab_trie) {
202  ck_trie_insert(tok->vocab_trie, token, id, false, 0);
203  }
204 
205  if (id >= (int32_t)tok->vocab_size) tok->vocab_size = id + 1;
206  if (tok->id_to_token[id]) free(tok->id_to_token[id]);
207  tok->id_to_token[id] = strdup(token);
208 
209  return 0;
210 }
211 
212 /* Add special token */
213 int ck_tokenizer_add_special_token(CKTokenizer *tok, const char *name, int32_t id) {
214  if (!tok || !name) return -1;
215  if (ck_tokenizer_add_token(tok, name, id, -1e10f) != 0) return -1;
216 
217  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, name);
218  if (info) info->is_special = true;
219 
220  /* Also add to trie as special */
221  if (tok->vocab_trie) {
222  ck_trie_insert(tok->vocab_trie, name, id, true, 0);
223  }
224 
225  if (strcmp(name, "<unk>") == 0 || strcmp(name, "[UNK]") == 0) tok->unk_id = id;
226  else if (strcmp(name, "<s>") == 0 || strcmp(name, "<bos>") == 0 || strcmp(name, "[BOS]") == 0) tok->bos_id = id;
227  else if (strcmp(name, "</s>") == 0 || strcmp(name, "<eos>") == 0 || strcmp(name, "[EOS]") == 0) tok->eos_id = id;
228  else if (strcmp(name, "<pad>") == 0 || strcmp(name, "[PAD]") == 0) tok->pad_id = id;
229 
230  return 0;
231 }
232 
233 /* Set special token IDs */
234 void ck_tokenizer_set_special_ids(CKTokenizer *tok, int32_t unk, int32_t bos, int32_t eos, int32_t pad, int32_t mask) {
235  if (!tok) return;
236  tok->unk_id = unk;
237  tok->bos_id = bos;
238  tok->eos_id = eos;
239  tok->pad_id = pad;
240  tok->mask_id = mask;
241 }
242 
244  if (!tok) return;
245  tok->config.add_bos = add_bos;
246  tok->config.add_eos = add_eos;
247 }
248 
250  if (!tok) return;
252 }
253 
255  if (!tok) return;
256  tok->config.spm_mode = spm_mode;
257 }
258 
259 /* Set whether to use trie for lookups */
261  if (!tok) return;
262  tok->config.use_trie = use_trie;
263 }
264 
265 /* Set space prefix style for BPE tokenizers */
267  if (!tok) return;
269  if (style != CK_SPACE_PREFIX_AUTO) {
270  tok->config.space_prefix_detected = true;
271  }
272 }
273 
274 /* Auto-detect space prefix style from vocabulary.
275  * Checks for presence of tokens starting with Ġ (GPT-2) vs ▁ (SentencePiece). */
277  if (!tok) return CK_SPACE_PREFIX_GPT2;
278 
279  /* Already detected? */
281  return tok->config.space_prefix_style;
282  }
283 
284  /* Count tokens starting with each style:
285  * Ġ (U+0120) = bytes 0xC4 0xA0
286  * ▁ (U+2581) = bytes 0xE2 0x96 0x81
287  */
288  int gpt2_count = 0;
289  int spm_count = 0;
290 
291  for (size_t i = 0; i < tok->vocab_size && i < 10000; i++) { /* Sample first 10k tokens */
292  const char *token = tok->id_to_token[i];
293  if (!token) continue;
294 
295  unsigned char c0 = (unsigned char)token[0];
296  unsigned char c1 = (unsigned char)token[1];
297 
298  /* Check for Ġ (0xC4 0xA0) */
299  if (c0 == 0xC4 && c1 == 0xA0) {
300  gpt2_count++;
301  }
302  /* Check for ▁ (0xE2 0x96 0x81) */
303  else if (c0 == 0xE2 && c1 == 0x96 && (unsigned char)token[2] == 0x81) {
304  spm_count++;
305  }
306  }
307 
308  /* Determine style based on counts */
309  CKSpacePrefixStyle detected;
310  if (spm_count > gpt2_count * 2) {
311  detected = CK_SPACE_PREFIX_SPM;
312  } else {
313  detected = CK_SPACE_PREFIX_GPT2; /* Default to GPT-2 if similar counts */
314  }
315 
316  tok->config.space_prefix_style = detected;
317  tok->config.space_prefix_detected = true;
318 
319  return detected;
320 }
321 
322 /* Look up token ID */
323 int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token) {
324  if (!tok || !token) return -1;
325  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, token);
326  return info ? info->id : tok->unk_id;
327 }
328 
329 /* Internal exact lookup (returns -1 if token string is not in vocab). */
330 static int32_t ck_tokenizer_lookup_exact(const CKTokenizer *tok, const char *token) {
331  if (!tok || !token) return -1;
332  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, token);
333  return info ? info->id : -1;
334 }
335 
336 /* Internal exact lookup for non-null-terminated text slices. */
337 static int32_t ck_tokenizer_lookup_exact_n(const CKTokenizer *tok, const char *text, int text_len) {
338  if (!tok || !text || text_len <= 0) return -1;
339  char stack_buf[512];
340  char *tmp = stack_buf;
341  if (text_len >= (int)sizeof(stack_buf)) {
342  tmp = (char *)malloc((size_t)text_len + 1);
343  if (!tmp) return -1;
344  }
345  memcpy(tmp, text, (size_t)text_len);
346  tmp[text_len] = '\0';
347  int32_t id = ck_tokenizer_lookup_exact(tok, tmp);
348  if (tmp != stack_buf) free(tmp);
349  return id;
350 }
351 
352 /* Get token string by ID */
353 const char *ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id) {
354  if (!tok || id < 0 || id >= (int32_t)tok->vocab_size) return NULL;
355  return tok->id_to_token[id];
356 }
357 
358 /* Find longest matching token at position using trie (O(k) where k = token length) */
359 static int32_t find_longest_match_trie(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) {
360  if (!tok || !tok->vocab_trie || !text || pos >= text_len) {
361  *match_len = 0;
362  return tok ? tok->unk_id : -1;
363  }
364 
365  int32_t token_id = ck_trie_find_longest(tok->vocab_trie, text, text_len, pos, match_len);
366  return token_id >= 0 ? token_id : tok->unk_id;
367 }
368 
369 /* Find longest matching token at position using hash table (O(n*k) worst case) */
370 static int32_t find_longest_match_hash(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) {
371  if (!tok || !text || pos >= text_len) {
372  *match_len = 0;
373  return tok ? tok->unk_id : -1;
374  }
375 
376  size_t max_len = 64;
377  if (pos + max_len > text_len) max_len = text_len - pos;
378 
379  int32_t best_id = tok->unk_id;
380  size_t best_len = 0;
381 
382  for (size_t len = max_len; len >= 1; len--) {
383  char tmp[65];
384  memcpy(tmp, text + pos, len);
385  tmp[len] = '\0';
386 
387  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, tmp);
388  if (info) {
389  best_id = info->id;
390  best_len = len;
391  break;
392  }
393  }
394 
395  *match_len = best_len;
396  return best_id;
397 }
398 
399 /* Find longest matching token at position - dispatches to trie or hash table */
400 static int32_t find_longest_match(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) {
401  if (tok->config.use_trie) {
402  return find_longest_match_trie(tok, text, text_len, pos, match_len);
403  } else {
404  return find_longest_match_hash(tok, text, text_len, pos, match_len);
405  }
406 }
407 
408 /* Convert ASCII spaces to space prefix marker.
409  * GPT-2/Qwen use Ġ (U+0120, bytes 0xC4 0xA0) - replaces spaces only
410  * LLaMA/SentencePiece use ▁ (U+2581, bytes 0xE2 0x96 0x81) - adds prefix at start AND replaces spaces
411  * Returns new length, or -1 if buffer too small. */
412 static int preprocess_bpe_spaces(const char *text, int text_len, char *out, int out_max, CKSpacePrefixStyle style) {
413  int out_len = 0;
414 
415  /* For SentencePiece, add ▁ at the start of text (unless text starts with space) */
416  if (style == CK_SPACE_PREFIX_SPM && text_len > 0 && text[0] != ' ') {
417  if (out_len + 3 > out_max) return -1;
418  out[out_len++] = (char)0xE2;
419  out[out_len++] = (char)0x96;
420  out[out_len++] = (char)0x81;
421  }
422 
423  for (int i = 0; i < text_len; i++) {
424  if (text[i] == ' ') {
425  if (style == CK_SPACE_PREFIX_SPM) {
426  /* SentencePiece style: ▁ (3 bytes: 0xE2 0x96 0x81) */
427  if (out_len + 3 > out_max) return -1;
428  out[out_len++] = (char)0xE2;
429  out[out_len++] = (char)0x96;
430  out[out_len++] = (char)0x81;
431  } else {
432  /* GPT-2 style: Ġ (2 bytes: 0xC4 0xA0) */
433  if (out_len + 2 > out_max) return -1;
434  out[out_len++] = (char)0xC4;
435  out[out_len++] = (char)0xA0;
436  }
437  } else {
438  if (out_len + 1 > out_max) return -1;
439  out[out_len++] = text[i];
440  }
441  }
442  return out_len;
443 }
444 
445 /* ============================================================================
446  * SPM (SentencePiece) Tokenization with Viterbi/DP
447  * ============================================================================ */
448 
449 /*
450  * GGUF Token Type enum values (from llama.cpp gguf/constants.py):
451  * NORMAL = 1
452  * UNKNOWN = 2
453  * CONTROL = 3
454  * USER_DEFINED = 4
455  * UNUSED = 5
456  * BYTE = 6
457  *
458  * IMPORTANT: These must match exactly or token filtering will be incorrect!
459  */
460 #define GGUF_TOKEN_NORMAL 1
461 #define GGUF_TOKEN_UNKNOWN 2
462 #define GGUF_TOKEN_CONTROL 3
463 #define GGUF_TOKEN_USER_DEFINED 4
464 #define GGUF_TOKEN_UNUSED 5
465 #define GGUF_TOKEN_BYTE 6
466 
467 /* Check if token type allows inclusion in DP path (exclude CONTROL, UNUSED, BYTE)
468  * Note: UNKNOWN tokens are allowed because they're needed for unknown content */
469 static inline bool spm_token_allowed_in_dp(const CKTokenizer *tok, int32_t token_id) {
470  if (!tok->types || token_id < 0 || token_id >= (int32_t)tok->vocab_size) {
471  return true; /* No type info, allow all */
472  }
473  uint8_t t = tok->types[token_id];
474  /* Reject CONTROL, UNUSED, and BYTE tokens (but allow UNKNOWN for fallback) */
475  return t != GGUF_TOKEN_CONTROL && t != GGUF_TOKEN_UNUSED && t != GGUF_TOKEN_BYTE;
476 }
477 
478 /* Check if token is a byte token (for identification) */
479 static inline bool spm_is_byte_token(const CKTokenizer *tok, int32_t token_id) {
480  if (!tok->types || token_id < 0 || token_id >= (int32_t)tok->vocab_size) {
481  return false;
482  }
483  return tok->types[token_id] == GGUF_TOKEN_BYTE;
484 }
485 
486 /* Find byte token ID using fast lookup table (primary) or <0xXX> fallback */
487 static inline int32_t spm_get_byte_token(const CKTokenizer *tok, unsigned char byte_val) {
488  /* Try fast lookup table first */
489  if (tok->byte_token_id && tok->byte_token_id[byte_val] >= 0) {
490  return tok->byte_token_id[byte_val];
491  }
492  /* Fallback to <0xXX> format */
493  char byte_token[16];
494  int len = snprintf(byte_token, sizeof(byte_token), "<0x%02X>", byte_val);
495  if (len <= 0) return tok->unk_id;
496  return ck_tokenizer_lookup(tok, byte_token);
497 }
498 
/* Return true if `c` is an uppercase hexadecimal digit (0-9 or A-F). */
static inline bool spm_is_upper_hex(char c) {
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F');
}

/* Check if a token string is spelled in the canonical byte-token form
 * "<0xXX>" where XX are two uppercase hex digits.
 *
 * Bug fix: the previous test `c >= '0' && c <= 'F'` also accepted the ASCII
 * punctuation ':' ';' '<' '=' '>' '?' '@' sitting between '9' and 'A', so
 * strings such as "<0x@:>" were misclassified as byte tokens. */
static inline bool spm_token_is_byte_format(const char *token) {
    return token && token[0] == '<' && token[1] == '0' &&
           token[2] == 'x' && spm_is_upper_hex(token[3]) &&
           spm_is_upper_hex(token[4]) && token[5] == '>';
}
505 
506 /* Build byte token lookup table from vocab (called during load) */
507 static void spm_build_byte_lookup(CKTokenizer *tok, const char *strings, const int32_t *offsets, int vocab_size) {
508  /* Reuse existing array or allocate new one */
509  if (!tok->byte_token_id) {
510  tok->byte_token_id = (int32_t *)malloc(256 * sizeof(int32_t));
511  if (!tok->byte_token_id) return;
512  }
513 
514  /* Initialize all entries to -1 */
515  for (int i = 0; i < 256; i++) {
516  tok->byte_token_id[i] = -1;
517  }
518 
519  /* Scan vocab for byte tokens */
520  for (int i = 0; i < vocab_size; i++) {
521  if (!tok->types || tok->types[i] != GGUF_TOKEN_BYTE) continue;
522 
523  const char *token = strings + offsets[i];
524  size_t len = strlen(token);
525 
526  if (len == 1) {
527  /* Raw byte token (single byte) */
528  unsigned char byte_val = (unsigned char)token[0];
529  tok->byte_token_id[byte_val] = i;
530  } else if (spm_token_is_byte_format(token)) {
531  /* <0xXX> format - parse the hex value */
532  unsigned int byte_val;
533  if (sscanf(token, "<0x%02X>", &byte_val) == 1 && byte_val < 256) {
534  tok->byte_token_id[byte_val] = i;
535  }
536  }
537  }
538 }
539 
/* Byte length of the UTF-8 sequence whose lead byte is `c`.
 * Continuation bytes and invalid leads (0xF8 and above) count as 1 so the
 * caller always makes forward progress. */
static inline int utf8_len(unsigned char c) {
    if (c < 0x80)           return 1; /* 0xxxxxxx: ASCII */
    if ((c & 0xE0) == 0xC0) return 2; /* 110xxxxx */
    if ((c & 0xF0) == 0xE0) return 3; /* 1110xxxx */
    if ((c & 0xF8) == 0xF0) return 4; /* 11110xxx */
    return 1;                         /* invalid lead byte */
}
548 
/* llama.cpp-style SPM whitespace handling:
 * - optionally emit a dummy prefix as ▁ (U+2581) for non-empty input
 * - replace each SINGLE ASCII space with ▁
 * - keep multi-space runs literal (llama.cpp parity for this GGUF family)
 * - never trim or collapse whitespace
 * Returns bytes written, or -1 if `out` is too small.
 * A negative text_len means "use strlen(text)". */
static int preprocess_spm_llama_text(const char *text, int text_len, char *out, int out_max, bool add_space_prefix) {
    static const char underscore[3] = { (char)0xE2, (char)0x96, (char)0x81 }; /* ▁ */
    int n = 0;

    if (text_len < 0) {
        text_len = (int)strlen(text);
    }

    if (add_space_prefix && text_len > 0) {
        if (n + 3 > out_max) return -1;
        memcpy(out + n, underscore, 3);
        n += 3;
    }

    int i = 0;
    while (i < text_len) {
        if (text[i] != ' ') {
            if (n + 1 > out_max) return -1;
            out[n++] = text[i++];
            continue;
        }

        /* Measure the run of consecutive spaces starting at i. */
        int run_end = i;
        while (run_end < text_len && text[run_end] == ' ') {
            run_end++;
        }
        int run = run_end - i;

        if (run == 1) {
            /* Single separator becomes ▁. */
            if (n + 3 > out_max) return -1;
            memcpy(out + n, underscore, 3);
            n += 3;
        } else {
            /* Multi-space runs stay literal. */
            if (n + run > out_max) return -1;
            memset(out + n, ' ', (size_t)run);
            n += run;
        }
        i = run_end;
    }

    return n;
}
595 
/* Linked-list element used during llama.cpp-style SPM merging.
 * Symbols start as one UTF-8 codepoint each; when a pair merges, the left
 * symbol absorbs the right one's bytes and the right is unlinked. */
typedef struct {
    int prev;          /* index of previous live symbol, -1 at head */
    int next;          /* index of next live symbol, -1 at tail */
    const char *text;  /* start of this symbol's bytes (into preprocessed buffer) */
    int n;             /* byte length (set to 0 when merged away) */
    int node_id;       /* merge-tree node currently representing this symbol */
} SpmLlamaSymbol;

/* Node of the binary merge tree built during SPM merging: leaves are single
 * codepoints; interior nodes record which two children formed the span
 * (used by spm_llama_resegment_node to re-split unmatched spans). */
typedef struct {
    const char *text;  /* start of the span's bytes */
    int n;             /* byte length of the span */
    int left;          /* left child node index, -1 for leaves */
    int right;         /* right child node index, -1 for leaves */
} SpmLlamaNode;
610 
611 static int spm_llama_resegment_node(const CKTokenizer *tok,
612  const SpmLlamaNode *nodes,
613  int node_id,
614  int32_t *ids,
615  int max_ids,
616  int out_idx) {
617  if (!tok || !nodes || node_id < 0 || !ids || out_idx >= max_ids) {
618  return out_idx;
619  }
620 
621  const SpmLlamaNode *node = &nodes[node_id];
622  int32_t token_id = ck_tokenizer_lookup_exact_n(tok, node->text, node->n);
623  if (token_id >= 0) {
624  ids[out_idx++] = token_id;
625  return out_idx;
626  }
627 
628  if (node->left >= 0 && node->right >= 0) {
629  out_idx = spm_llama_resegment_node(tok, nodes, node->left, ids, max_ids, out_idx);
630  out_idx = spm_llama_resegment_node(tok, nodes, node->right, ids, max_ids, out_idx);
631  return out_idx;
632  }
633 
634  for (int i = 0; i < node->n && out_idx < max_ids; i++) {
635  int32_t byte_token = spm_get_byte_token(tok, (unsigned char)node->text[i]);
636  ids[out_idx++] = (byte_token >= 0) ? byte_token : tok->unk_id;
637  }
638  return out_idx;
639 }
640 
641 /* llama.cpp merge-style SPM path (LLAMA_VOCAB_TYPE_SPM). */
643  const char *text,
644  int text_len,
645  int32_t *ids,
646  int max_ids) {
647  if (!tok || !text || !ids || max_ids <= 0) return 0;
648  if (text_len < 0) text_len = (int)strlen(text);
649  if (text_len == 0) return 0;
650 
651  char preprocessed[8192];
652  int pp_len = preprocess_spm_llama_text(text, text_len, preprocessed, (int)sizeof(preprocessed) - 1,
653  tok->config.add_space_prefix);
654  if (pp_len < 0) return 0;
655  preprocessed[pp_len] = '\0';
656 
657  int num_symbols = 0;
658  for (int offs = 0; offs < pp_len;) {
659  int char_len = utf8_len((unsigned char)preprocessed[offs]);
660  if (char_len <= 0) char_len = 1;
661  if (offs + char_len > pp_len) char_len = pp_len - offs;
662  offs += char_len;
663  num_symbols++;
664  }
665  if (num_symbols <= 0) return 0;
666 
667  SpmLlamaSymbol *symbols = (SpmLlamaSymbol *)calloc((size_t)num_symbols, sizeof(SpmLlamaSymbol));
668  int node_cap = 2 * num_symbols + 1;
669  SpmLlamaNode *nodes = (SpmLlamaNode *)calloc((size_t)node_cap, sizeof(SpmLlamaNode));
670  if (!symbols || !nodes) {
671  if (symbols) free(symbols);
672  if (nodes) free(nodes);
673  return 0;
674  }
675 
676  int index = 0;
677  for (int offs = 0; offs < pp_len && index < num_symbols;) {
678  int char_len = utf8_len((unsigned char)preprocessed[offs]);
679  if (char_len <= 0) char_len = 1;
680  if (offs + char_len > pp_len) char_len = pp_len - offs;
681 
682  symbols[index].text = preprocessed + offs;
683  symbols[index].n = char_len;
684  symbols[index].prev = index - 1;
685  symbols[index].next = (index + 1 < num_symbols) ? (index + 1) : -1;
686  symbols[index].node_id = index;
687 
688  nodes[index].text = preprocessed + offs;
689  nodes[index].n = char_len;
690  nodes[index].left = -1;
691  nodes[index].right = -1;
692 
693  offs += char_len;
694  index++;
695  }
696 
697  int node_count = num_symbols;
698  for (;;) {
699  int best_left = -1;
700  int best_right = -1;
701  float best_score = -1e30f;
702 
703  for (int left = 0; left != -1; left = symbols[left].next) {
704  int right = symbols[left].next;
705  if (right < 0) continue;
706 
707  int pair_len = symbols[left].n + symbols[right].n;
708  int32_t token_id = ck_tokenizer_lookup_exact_n(tok, symbols[left].text, pair_len);
709  if (token_id < 0 || token_id >= (int32_t)tok->vocab_size) continue;
710 
711  float score = 0.0f;
712  if (tok->scores && token_id >= 0 && token_id < (int32_t)tok->scores_size) {
713  score = tok->scores[token_id];
714  }
715 
716  if (best_left < 0 || score > best_score || (score == best_score && left < best_left)) {
717  best_left = left;
718  best_right = right;
719  best_score = score;
720  }
721  }
722 
723  if (best_left < 0 || best_right < 0) break;
724  if (node_count >= node_cap) break;
725 
726  SpmLlamaSymbol *left = &symbols[best_left];
727  SpmLlamaSymbol *right = &symbols[best_right];
728 
729  int new_node_id = node_count++;
730  nodes[new_node_id].text = left->text;
731  nodes[new_node_id].n = left->n + right->n;
732  nodes[new_node_id].left = left->node_id;
733  nodes[new_node_id].right = right->node_id;
734 
735  left->n += right->n;
736  left->node_id = new_node_id;
737  left->next = right->next;
738  if (right->next >= 0) {
739  symbols[right->next].prev = best_left;
740  }
741 
742  right->n = 0;
743  right->prev = -1;
744  right->next = -1;
745  }
746 
747  int out_idx = 0;
748  for (int i = 0; i != -1 && out_idx < max_ids; i = symbols[i].next) {
749  out_idx = spm_llama_resegment_node(tok, nodes, symbols[i].node_id, ids, max_ids, out_idx);
750  }
751 
752  free(symbols);
753  free(nodes);
754  return out_idx;
755 }
756 
/* Replace spaces with the SentencePiece underscore ▁ (U+2581), with
 * SPM-style whitespace normalization:
 * - leading spaces are consumed (a dummy ▁ prefix may be added instead)
 * - interior runs of spaces collapse to a single ▁
 * - trailing spaces are consumed
 * The prefix is skipped if the text already starts with a literal ▁ or if
 * add_space_prefix is false. Returns bytes written, or -1 on overflow. */
static int preprocess_spm_text(const char *text, int text_len, char *out, int out_max, bool add_space_prefix) {
    static const char underscore[3] = { (char)0xE2, (char)0x96, (char)0x81 }; /* ▁ */
    int n = 0;

    /* Consume leading spaces. */
    int start = 0;
    while (start < text_len && text[start] == ' ') {
        start++;
    }

    /* Consume trailing spaces. */
    int end = text_len;
    while (end > start && text[end - 1] == ' ') {
        end--;
    }

    /* Does the input already begin with a literal ▁? */
    bool has_prefix = text_len >= 3 &&
                      (unsigned char)text[0] == 0xE2 &&
                      (unsigned char)text[1] == 0x96 &&
                      (unsigned char)text[2] == 0x81;

    /* Emit the dummy prefix only when there is actual content. */
    bool emitted_prefix = false;
    if (end > start && !has_prefix && add_space_prefix) {
        if (n + 3 > out_max) return -1;
        memcpy(out + n, underscore, 3);
        n += 3;
        emitted_prefix = true;
    }

    /* Copy content, collapsing interior space runs to a single ▁. */
    bool prev_space = has_prefix || emitted_prefix;
    for (int i = start; i < end; i++) {
        if (text[i] == ' ') {
            if (!prev_space) {
                if (n + 3 > out_max) return -1;
                memcpy(out + n, underscore, 3);
                n += 3;
                prev_space = true;
            }
            /* further consecutive spaces are dropped */
        } else {
            if (n + 1 > out_max) return -1;
            out[n++] = text[i];
            prev_space = false;
        }
    }

    return n;
}
818 
819 /* Forward declaration for SPM Viterbi */
820 static int spm_find_candidates_at_pos(const CKTokenizer *tok, const char *text, int text_len,
821  size_t pos, int32_t *candidates, int max_candidates);
822 
823 /* Forward declaration for unknown run counting */
824 static int spm_count_unknown_run(const CKTokenizer *tok, const char *text, int text_len, size_t pos);
825 
826 /* Forward declaration for byte fallback */
827 static int spm_encode_byte_fallback(const CKTokenizer *tok,
828  const char *text, int text_len,
829  int32_t *ids, int max_ids);
830 
831 /* SPM Viterbi/DP encoding - finds best token sequence using token scores */
833  const char *text,
834  int text_len,
835  int32_t *ids,
836  int max_ids) {
837  if (!tok || !text || !ids || max_ids <= 0) return 0;
838  if (text_len < 0) text_len = (int)strlen(text);
839  if (text_len == 0) return 0;
840  const int dbg = getenv("CK_DEBUG_SPM_ENCODE") ? 1 : 0;
841  if (dbg) {
842  fprintf(stderr, "[SPM] encode start: text_len=%d max_ids=%d\n", text_len, max_ids);
843  }
844 
845  /* Preprocess: replace spaces with ▁ */
846  char preprocessed[8192];
847  int pp_len = preprocess_spm_text(text, text_len, preprocessed, sizeof(preprocessed) - 1,
848  tok->config.add_space_prefix);
849  if (pp_len < 0) return 0;
850  preprocessed[pp_len] = '\0';
851  if (dbg) {
852  fprintf(stderr, "[SPM] preprocessed len=%d: \"%.*s\"\n", pp_len, pp_len, preprocessed);
853  }
854 
855  /* DP arrays - use malloc for large inputs */
856  size_t n = (size_t)pp_len + 1;
857  float *best_score = (float *)malloc(n * sizeof(float));
858  int32_t *best_prev = (int32_t *)malloc(n * sizeof(int32_t));
859  int32_t *best_token = (int32_t *)malloc(n * sizeof(int32_t));
860  if (dbg) {
861  fprintf(stderr, "[SPM] DP alloc n=%zu\n", n);
862  }
863 
864  if (!best_score || !best_prev || !best_token) {
865  if (best_score) free(best_score);
866  if (best_prev) free(best_prev);
867  if (best_token) free(best_token);
868  return 0;
869  }
870 
871  /* Initialize DP */
872  const float neg_inf = -1e30f;
873  const float unknown_penalty = -10.0f; /* SentencePiece-style UNK penalty */
874  for (size_t i = 0; i < n; i++) {
875  best_score[i] = neg_inf;
876  best_prev[i] = -1;
877  best_token[i] = -1;
878  }
879  best_score[0] = 0.0f;
880 
881  /* DP: for each position, find best way to reach it */
882  for (size_t pos = 0; pos < n; pos++) {
883  if (best_score[pos] == neg_inf) continue;
884 
885  /* Find all tokens that match at this position */
886  int32_t candidates[64];
887  int num_cand = spm_find_candidates_at_pos(tok, preprocessed, pp_len, pos, candidates, 64);
888  if (dbg && pos < 8) {
889  fprintf(stderr, "[SPM] pos=%zu cand=%d\n", pos, num_cand);
890  }
891 
892  for (int c = 0; c < num_cand; c++) {
893  int32_t token_id = candidates[c];
894 
895  /* Skip disallowed token types in DP */
896  if (!spm_token_allowed_in_dp(tok, token_id)) {
897  continue;
898  }
899 
900  /* Get token string and length */
901  const char *token = ck_tokenizer_id_to_token(tok, token_id);
902  if (!token) continue;
903 
904  /* Calculate token length in bytes */
905  int token_len = (int)strlen(token);
906 
907  /* For UNK token, use the unknown run length to cover all consecutive unknown bytes */
908  if (token_id == tok->unk_id) {
909  token_len = spm_count_unknown_run(tok, preprocessed, pp_len, pos);
910  if (token_len == 0) token_len = 1; /* At least 1 byte */
911  }
912 
913  size_t next_pos = pos + token_len;
914 
915  if (next_pos >= n) continue;
916 
917  /* Get token score for Viterbi */
918  float token_score = 0.0f;
919  if (tok->scores && token_id >= 0 && token_id < (int32_t)tok->vocab_size) {
920  token_score = tok->scores[token_id];
921  }
922 
923  /* USER_DEFINED tokens get score 0 (like llama.cpp) */
924  if (tok->types && token_id >= 0 && token_id < (int32_t)tok->types_size) {
925  if (tok->types[token_id] == GGUF_TOKEN_USER_DEFINED) {
926  token_score = 0.0f;
927  }
928  }
929  /* Apply UNK penalty (SentencePiece behavior) */
930  if (token_id == tok->unk_id) {
931  token_score += unknown_penalty;
932  }
933 
934  /* Transition: score = best_score[pos] + token_score */
935  float new_score = best_score[pos] + token_score;
936 
937  if (new_score > best_score[next_pos]) {
938  best_score[next_pos] = new_score;
939  best_prev[next_pos] = (int32_t)pos;
940  best_token[next_pos] = token_id;
941  }
942  }
943  }
944 
945  /* Backtrack to find best token sequence */
946  int32_t *reverse_ids = (int32_t *)malloc(max_ids * sizeof(int32_t));
947  if (!reverse_ids) {
948  free(best_score);
949  free(best_prev);
950  free(best_token);
951  return 0;
952  }
953 
954  int num_tokens = 0;
955  int32_t curr = (int32_t)(n - 1);
956 
 962  /* Handle trailing UNK tokens by backtracking to the last position with a valid token */
958  while (curr > 0 && best_token[curr] < 0) {
959  curr = best_prev[curr];
960  }
961 
962  /* Backtrack from end to start, collecting tokens.
963  * We track the token's start position to avoid duplicates. */
964  int last_start = -1; /* Track the start position of last added token */
965  while (curr > 0 && num_tokens < max_ids) {
966  int32_t token_id = best_token[curr];
967  if (token_id >= 0) {
968  /* Use the DP backpointer as the true token start */
969  int token_start = best_prev[curr];
970 
971  /* Only add if this is a new token (different start position) */
972  if (token_start != last_start) {
973  reverse_ids[num_tokens++] = token_id;
974  last_start = token_start;
975  }
976  }
977  curr = best_prev[curr];
978  }
979  if (dbg) {
980  fprintf(stderr, "[SPM] backtrack tokens=%d curr=%d\n", num_tokens, curr);
981  }
982 
983  /* Free DP arrays before using reverse_ids */
984  free(best_score);
985  free(best_prev);
986  free(best_token);
987 
988  if (num_tokens > max_ids) num_tokens = max_ids;
989 
990  /* Backtracking collected tokens in reverse order, so reverse once */
991  for (int i = 0; i < num_tokens / 2; i++) {
992  int32_t tmp = reverse_ids[i];
993  reverse_ids[i] = reverse_ids[num_tokens - 1 - i];
994  reverse_ids[num_tokens - 1 - i] = tmp;
995  }
996 
997  /* Copy to output and merge consecutive UNK tokens (SPM behavior) */
998  int out_idx = 0;
999  for (int i = 0; i < num_tokens && out_idx < max_ids; i++) {
1000  int32_t token_id = reverse_ids[i];
1001 
1002  /* Merge consecutive UNK tokens into one */
1003  if (token_id == tok->unk_id && out_idx > 0 && ids[out_idx - 1] == tok->unk_id) {
1004  continue; /* Skip - already have UNK */
1005  }
1006  ids[out_idx++] = token_id;
1007  }
1008  if (dbg) {
1009  fprintf(stderr, "[SPM] encode done: out=%d\n", out_idx);
1010  }
1011 
1012  free(reverse_ids);
1013 
1014  /* If DP failed to produce valid tokens, use byte-fallback */
1015  if (num_tokens == 0) {
1017  }
1018 
1019  return out_idx;
1020 }
1021 
1022 /* Fallback: encode using byte tokens for any unmatched content.
1023  * Uses the ORIGINAL text (not preprocessed), mapping each byte to a byte token. */
1025  const char *text, int text_len,
1026  int32_t *ids, int max_ids) {
1027  if (!tok || !text || !ids || max_ids <= 0) return 0;
1028  if (text_len < 0) text_len = (int)strlen(text);
1029  if (text_len == 0) return 0;
1030 
1031  int count = 0;
1032  for (int i = 0; i < text_len && count < max_ids; i++) {
1033  unsigned char byte_val = (unsigned char)text[i];
1034  int32_t byte_token = spm_get_byte_token(tok, byte_val);
1035 
1036  /* If we have a byte token, use it; otherwise use UNK */
1037  if (byte_token >= 0 && byte_token != tok->unk_id) {
1038  ids[count++] = byte_token;
1039  } else {
1040  ids[count++] = tok->unk_id;
1041  }
1042  }
1043  return count;
1044 }
1045 
1046 /* Find all candidate tokens matching at position */
1047 static int spm_find_candidates_at_pos(const CKTokenizer *tok, const char *text, int text_len,
1048  size_t pos, int32_t *candidates, int max_candidates) {
1049  if (!tok || !text || pos >= (size_t)text_len) return 0;
1050 
1051  int num_found = 0;
1052  int max_len = 64;
1053  if (pos + max_len > (size_t)text_len) max_len = (int)(text_len - pos);
1054 
1055  /* Iterate from longest to shortest to find all matches */
1056  char tmp[65];
1057  for (int len = max_len; len >= 1 && num_found < max_candidates; len--) {
1058  memcpy(tmp, text + pos, len);
1059  tmp[len] = '\0';
1060 
1061  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, tmp);
1062  if (info && info->id >= 0 && info->id != tok->unk_id) {
1063  /* Skip disallowed token types */
1064  if (!spm_token_allowed_in_dp(tok, info->id)) {
1065  continue;
1066  }
1067 
1068  /* Check if already added */
1069  int dup = 0;
1070  for (int j = 0; j < num_found; j++) {
1071  if (candidates[j] == info->id) {
1072  dup = 1;
1073  break;
1074  }
1075  }
1076  if (!dup) {
1077  candidates[num_found++] = info->id;
1078  }
1079  }
1080  }
1081 
1082  /* If no candidates found, add UNK token as fallback.
1083  * For SPM, UNK should cover all consecutive unknown bytes until a known token or end.
1084  * We handle this by adding UNK with a special marker - we'll check at runtime
1085  * how far we can extend it. */
1086  if (num_found == 0 && tok->unk_id >= 0 && max_candidates > 0) {
1087  /* Only add UNK if it's allowed in DP */
1088  if (spm_token_allowed_in_dp(tok, tok->unk_id)) {
1089  candidates[num_found++] = tok->unk_id;
1090  }
1091  }
1092 
1093  return num_found;
1094 }
1095 
1096 /* Count how many consecutive bytes at text[pos] are not start of any vocab token.
1097  * Also stop at UTF-8 encoded '▁' (U+2581 = 0xE2 0x96 0x81) since that's a known token. */
1098 static int spm_count_unknown_run(const CKTokenizer *tok, const char *text, int text_len, size_t pos) {
1099  int run = 0;
1100  while (pos + run < (size_t)text_len) {
1101  /* Stop at '▁' (U+2581 = 0xE2 0x96 0x81) since that's a known token */
1102  if (pos + run + 3 <= (size_t)text_len &&
1103  (unsigned char)text[pos + run] == 0xE2 &&
1104  (unsigned char)text[pos + run + 1] == 0x96 &&
1105  (unsigned char)text[pos + run + 2] == 0x81) {
1106  break;
1107  }
1108 
1109  /* Check if any vocab token matches at this position */
1110  int max_len = 64;
1111  if (pos + run + max_len > (size_t)text_len) {
1112  max_len = (int)(text_len - pos - run);
1113  }
1114  int found = 0;
1115  for (int len = max_len; len >= 1; len--) {
1116  char tmp[65];
1117  memcpy(tmp, text + pos + run, len);
1118  tmp[len] = '\0';
1119  TokenInfo *info = (TokenInfo *)ck_tokenizer_hash_table_lookup(tok->vocab, tmp);
1120  if (info && info->id >= 0 && info->id != tok->unk_id && spm_token_allowed_in_dp(tok, info->id)) {
1121  found = 1;
1122  break;
1123  }
1124  }
1125  if (found) break;
1126  run++;
1127  }
1128  return run;
1129 }
1130 
1131 /* Encode text to token IDs using greedy longest-match or Viterbi for SPM */
1132 int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) {
1133  if (!tok || !text || !ids || max_ids <= 0) return 0;
1134  if (text_len < 0) text_len = (int)strlen(text);
1135 
1136  /* For SPM tokenizers, use either unigram/Viterbi or llama-style SPM. */
1137  if (tok->config.type == CK_TOKENIZER_SPM) {
1138  int out_idx = 0;
1139  if (tok->config.add_bos && tok->bos_id >= 0 && out_idx < max_ids) {
1140  ids[out_idx++] = tok->bos_id;
1141  }
1142  if (text_len == 0) {
1143  if (tok->config.add_eos && tok->eos_id >= 0 && out_idx < max_ids) {
1144  ids[out_idx++] = tok->eos_id;
1145  }
1146  return out_idx;
1147  }
1148  int n = 0;
1149  if (tok->config.spm_mode == CK_SPM_MODE_LLAMA) {
1150  n = ck_tokenizer_encode_spm_llama_impl(tok, text, text_len, ids + out_idx, max_ids - out_idx);
1151  } else {
1152  n = ck_tokenizer_encode_spm_impl(tok, text, text_len, ids + out_idx, max_ids - out_idx);
1153  }
1154  if (n <= 0) return n;
1155  out_idx += n;
1156  if (tok->config.add_eos && tok->eos_id >= 0 && out_idx < max_ids) {
1157  ids[out_idx++] = tok->eos_id;
1158  }
1159  return out_idx;
1160  }
1161 
1162  /* For BPE tokenizers, convert spaces to appropriate prefix marker.
1163  * Auto-detect style from vocabulary if not already set. */
1164  char preprocessed[8192];
1165  const char *input = text;
1166  int input_len = text_len;
1167 
1168  if (tok->config.type == CK_TOKENIZER_BPE) {
1169  /* Get or detect space prefix style */
1170  CKSpacePrefixStyle style = ((CKTokenizer *)tok)->config.space_prefix_style;
1171  if (!((CKTokenizer *)tok)->config.space_prefix_detected) {
1173  }
1174 
1175  int pp_len = preprocess_bpe_spaces(text, text_len, preprocessed, sizeof(preprocessed) - 1, style);
1176  if (pp_len > 0) {
1177  preprocessed[pp_len] = '\0';
1178  input = preprocessed;
1179  input_len = pp_len;
1180  }
1181  }
1182 
1183  int out_idx = 0;
1184  if (tok->config.add_bos && tok->bos_id >= 0 && out_idx < max_ids) {
1185  ids[out_idx++] = tok->bos_id;
1186  }
1187 
1188  size_t pos = 0;
1189  while (pos < (size_t)input_len && out_idx < max_ids) {
1190  size_t match_len = 0;
1191  int32_t id = find_longest_match(tok, input, input_len, pos, &match_len);
1192 
1193  if (match_len == 0) {
1194  /* Emit UNK for unknown characters */
1195  if (tok->unk_id >= 0) ids[out_idx++] = tok->unk_id;
1196  pos++;
1197  } else {
1198  ids[out_idx++] = id;
1199  pos += match_len;
1200  }
1201  }
1202 
1203  if (tok->config.add_eos && tok->eos_id >= 0 && out_idx < max_ids) {
1204  ids[out_idx++] = tok->eos_id;
1205  }
1206 
1207  return out_idx;
1208 }
1209 
1210 /* Decode token IDs to text */
1211 int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len) {
1212  if (!tok || !ids || !text || max_len <= 0) return 0;
1213  int len = 0;
1214  for (int i = 0; i < num_ids; i++) {
1215  int32_t id = ids[i];
1216  if (id < 0) continue;
1217  const char *token = ck_tokenizer_id_to_token(tok, id);
1218  if (!token) continue;
1219  int token_len = (int)strlen(token);
1220 
1221  /* Check for space prefix markers and convert to ASCII space */
1222  unsigned char c0 = (unsigned char)token[0];
1223  unsigned char c1 = (unsigned char)token[1];
1224 
1225  if (c0 == 0xC4 && c1 == 0xA0) {
1226  /* Ġ (U+0120) is 2 bytes - convert to space */
1227  if (len < max_len - 1) text[len++] = ' ';
1228  token += 2; token_len -= 2;
1229  } else if (c0 == 0xE2 && c1 == 0x96 && (unsigned char)token[2] == 0x81) {
1230  /* ▁ (U+2581) is 3 bytes - convert to space */
1231  if (len < max_len - 1) text[len++] = ' ';
1232  token += 3; token_len -= 3;
1233  }
1234 
1235  for (int j = 0; j < token_len && len < max_len - 1; j++) text[len++] = token[j];
1236  }
1237  text[len] = '\0';
1238  return len;
1239 }
1240 
1241 /* Load vocabulary from memory-mapped binary data */
1243  int vocab_size,
1244  const int32_t *offsets,
1245  const char *strings,
1246  int num_merges,
1247  const int32_t *merges) {
1249 }
1250 
1251 /* Load vocabulary from memory-mapped binary data with scores and types */
1253  int vocab_size,
1254  const int32_t *offsets,
1255  const char *strings,
1256  const float *scores,
1257  const uint8_t *types,
1258  int num_merges,
1259  const int32_t *merges) {
1260  if (!tok || !offsets || !strings) return -1;
1261  ck_tokenizer_reset(tok);
1262 
1263  /* Free any existing scores/types arrays before reallocating */
1264  if (tok->scores) {
1265  free(tok->scores);
1266  tok->scores = NULL;
1267  tok->scores_size = 0;
1268  }
1269  if (tok->types) {
1270  free(tok->types);
1271  tok->types = NULL;
1272  tok->types_size = 0;
1273  }
1274 
1275  /* Allocate scores and types arrays if provided */
1276  if (scores && vocab_size > 0) {
1277  tok->scores = (float *)malloc(vocab_size * sizeof(float));
1278  if (!tok->scores) return -1;
1279  memcpy(tok->scores, scores, vocab_size * sizeof(float));
1280  tok->scores_size = (size_t)vocab_size;
1281  }
1282  if (types && vocab_size > 0) {
1283  tok->types = (uint8_t *)malloc(vocab_size * sizeof(uint8_t));
1284  if (!tok->types) {
1285  if (tok->scores) {
1286  free(tok->scores);
1287  tok->scores = NULL;
1288  }
1289  return -1;
1290  }
1291  memcpy(tok->types, types, vocab_size * sizeof(uint8_t));
1292  tok->types_size = (size_t)vocab_size;
1293  }
1294 
1295  for (int i = 0; i < vocab_size; i++) {
1296  const char *token = strings + offsets[i];
1297  float score = scores ? scores[i] : 0.0f;
1299  }
1300 
1301  /* Build byte token lookup table if types are available */
1302  if (types && vocab_size > 0) {
1304 
1305  /* Log token type statistics */
1306  int count_normal = 0, count_unknown = 0, count_control = 0, count_byte = 0, count_other = 0;
1307  int max_type = 0;
1308  for (int i = 0; i < vocab_size; i++) {
1309  uint8_t t = tok->types[i];
1310  if (t > max_type) max_type = t;
1311  switch (t) {
1312  case GGUF_TOKEN_NORMAL: count_normal++; break;
1313  case GGUF_TOKEN_UNKNOWN: count_unknown++; break;
1314  case GGUF_TOKEN_CONTROL: count_control++; break;
1315  case GGUF_TOKEN_BYTE: count_byte++; break;
1316  default: count_other++; break;
1317  }
1318  }
1319  fprintf(stderr, "[TOKENIZER] Loaded %d tokens: normal=%d, unknown=%d, control=%d, byte=%d, other=%d\n",
1320  vocab_size, count_normal, count_unknown, count_control, count_byte, count_other);
1321  if (max_type > GGUF_TOKEN_BYTE) {
1322  fprintf(stderr, "[TOKENIZER] Warning: Unexpected token type %d\n", max_type);
1323  }
1324  }
1325 
1326  /* TODO: Merges */
1327  (void)num_merges; (void)merges;
1328  return 0;
1329 }
1330 
1331 /* Placeholders for header compliance */
1332 int ck_tokenizer_load_gguf(CKTokenizer *tok, const char *path) { (void)tok; (void)path; return -1; }
1333 int ck_tokenizer_load_json(CKTokenizer *tok, const char *path) { (void)tok; (void)path; return -1; }
1334 int ck_tokenizer_load_text(CKTokenizer *tok, const char *path) { (void)tok; (void)path; return -1; }
1335 int ck_tokenizer_load_merges(CKTokenizer *tok, const char *path) { (void)tok; (void)path; return -1; }
1336 int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged, int32_t priority) {
1337  (void)tok; (void)left; (void)right; (void)merged; (void)priority; return 0;
1338 }
#define CK_TOKENIZER_HT_BUCKETS_LARGE
Definition: hash_table.h:142
void ck_tokenizer_hash_table_free(CKTokenizerHashTable *table, bool free_values)
Definition: hash_table.c:140
CKTokenizerHashTable * ck_tokenizer_hash_table_create(size_t bucket_count)
Definition: hash_table.c:80
int ck_tokenizer_hash_table_insert(CKTokenizerHashTable *table, const char *key, void *value)
Definition: hash_table.c:158
void * ck_tokenizer_hash_table_lookup(CKTokenizerHashTable *table, const char *key)
Definition: hash_table.c:198
void ck_tokenizer_hash_table_clear(CKTokenizerHashTable *table, bool free_values)
Definition: hash_table.c:312
void ck_trie_clear(CKTrie *trie)
Definition: trie.c:80
int32_t ck_trie_find_longest(const CKTrie *trie, const char *text, size_t text_len, size_t start_pos, size_t *match_len)
Definition: trie.c:142
int ck_trie_insert(CKTrie *trie, const char *token, int32_t token_id, bool is_special, int32_t priority)
Definition: trie.c:110
void ck_trie_free(CKTrie *trie)
Definition: trie.c:51
CKTrie * ck_trie_create(size_t max_nodes)
Definition: trie.c:29
int ck_tokenizer_mempool_init(CKTokenizerMemPool *pool, size_t size)
Definition: memory_pool.c:11
void ck_tokenizer_mempool_free(CKTokenizerMemPool *pool)
Definition: memory_pool.c:28
bool add_space_prefix
Definition: tokenizer.h:77
CKTokenizerType type
Definition: tokenizer.h:74
CKSpmMode spm_mode
Definition: tokenizer.h:84
bool space_prefix_detected
Definition: tokenizer.h:83
CKSpacePrefixStyle space_prefix_style
Definition: tokenizer.h:82
int32_t bos_id
Definition: ck_tokenizer.h:98
float * scores
Definition: tokenizer.h:111
size_t types_size
Definition: tokenizer.h:114
int32_t * byte_token_id
Definition: tokenizer.h:117
CKMemPool pool
Definition: ck_tokenizer.h:78
int32_t unk_id
Definition: ck_tokenizer.h:97
CKTrie * vocab_trie
Definition: tokenizer.h:103
int32_t eos_id
Definition: ck_tokenizer.h:99
CKTokenizerHashTable * vocab
Definition: tokenizer.h:100
uint8_t * types
Definition: tokenizer.h:113
size_t vocab_capacity
Definition: tokenizer.h:108
size_t scores_size
Definition: tokenizer.h:112
char ** id_to_token
Definition: ck_tokenizer.h:86
int32_t mask_id
Definition: tokenizer.h:124
CKTokenizerConfig config
Definition: tokenizer.h:97
int32_t pad_id
Definition: ck_tokenizer.h:100
static int spm_find_candidates_at_pos(const CKTokenizer *tok, const char *text, int text_len, size_t pos, int32_t *candidates, int max_candidates)
Definition: tokenizer.c:1047
static int32_t ck_tokenizer_lookup_exact(const CKTokenizer *tok, const char *token)
Definition: tokenizer.c:330
void ck_tokenizer_set_add_bos_eos(CKTokenizer *tok, bool add_bos, bool add_eos)
Definition: tokenizer.c:243
static int preprocess_spm_llama_text(const char *text, int text_len, char *out, int out_max, bool add_space_prefix)
Definition: tokenizer.c:554
static bool spm_token_is_byte_format(const char *token)
Definition: tokenizer.c:500
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token)
Definition: tokenizer.c:323
int ck_tokenizer_load_binary_with_scores(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, const float *scores, const uint8_t *types, int num_merges, const int32_t *merges)
Definition: tokenizer.c:1252
int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
Definition: tokenizer.c:1211
int ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int32_t id, float score)
Definition: tokenizer.c:157
static int ck_tokenizer_encode_spm_llama_impl(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:642
CKSpacePrefixStyle ck_tokenizer_detect_space_prefix_style(CKTokenizer *tok)
Definition: tokenizer.c:276
static bool spm_token_allowed_in_dp(const CKTokenizer *tok, int32_t token_id)
Definition: tokenizer.c:469
#define GGUF_TOKEN_CONTROL
Definition: tokenizer.c:462
int ck_tokenizer_load_text(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1334
int ck_tokenizer_load_gguf(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1332
int ck_tokenizer_load_json(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1333
void ck_tokenizer_set_spm_mode(CKTokenizer *tok, CKSpmMode spm_mode)
Definition: tokenizer.c:254
static void spm_build_byte_lookup(CKTokenizer *tok, const char *strings, const int32_t *offsets, int vocab_size)
Definition: tokenizer.c:507
CKTokenizer * ck_tokenizer_create(CKTokenizerType type)
Definition: tokenizer.c:34
static int32_t find_longest_match(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
Definition: tokenizer.c:400
int ck_tokenizer_load_binary(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
Definition: tokenizer.c:1242
static int preprocess_spm_text(const char *text, int text_len, char *out, int out_max, bool add_space_prefix)
Definition: tokenizer.c:763
static int spm_encode_byte_fallback(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:1024
static int spm_llama_resegment_node(const CKTokenizer *tok, const SpmLlamaNode *nodes, int node_id, int32_t *ids, int max_ids, int out_idx)
Definition: tokenizer.c:611
static int32_t ck_tokenizer_lookup_exact_n(const CKTokenizer *tok, const char *text, int text_len)
Definition: tokenizer.c:337
#define GGUF_TOKEN_USER_DEFINED
Definition: tokenizer.c:463
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
Definition: tokenizer.c:353
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged, int32_t priority)
Definition: tokenizer.c:1336
static int utf8_len(unsigned char c)
Definition: tokenizer.c:541
int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:1132
void ck_tokenizer_set_special_ids(CKTokenizer *tok, int32_t unk, int32_t bos, int32_t eos, int32_t pad, int32_t mask)
Definition: tokenizer.c:234
#define GGUF_TOKEN_UNUSED
Definition: tokenizer.c:464
static int32_t find_longest_match_trie(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
Definition: tokenizer.c:359
void ck_tokenizer_reset(CKTokenizer *tok)
Definition: tokenizer.c:125
static bool spm_is_byte_token(const CKTokenizer *tok, int32_t token_id)
Definition: tokenizer.c:479
static int32_t spm_get_byte_token(const CKTokenizer *tok, unsigned char byte_val)
Definition: tokenizer.c:487
#define GGUF_TOKEN_BYTE
Definition: tokenizer.c:465
int ck_tokenizer_add_special_token(CKTokenizer *tok, const char *name, int32_t id)
Definition: tokenizer.c:213
void ck_tokenizer_free(CKTokenizer *tok)
Definition: tokenizer.c:91
int ck_tokenizer_load_merges(CKTokenizer *tok, const char *path)
Definition: tokenizer.c:1335
static int spm_count_unknown_run(const CKTokenizer *tok, const char *text, int text_len, size_t pos)
Definition: tokenizer.c:1098
static int preprocess_bpe_spaces(const char *text, int text_len, char *out, int out_max, CKSpacePrefixStyle style)
Definition: tokenizer.c:412
#define GGUF_TOKEN_UNKNOWN
Definition: tokenizer.c:461
void ck_tokenizer_set_use_trie(CKTokenizer *tok, bool use_trie)
Definition: tokenizer.c:260
void ck_tokenizer_set_add_space_prefix(CKTokenizer *tok, bool add_space_prefix)
Definition: tokenizer.c:249
static int32_t find_longest_match_hash(const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len)
Definition: tokenizer.c:370
void ck_tokenizer_set_space_prefix_style(CKTokenizer *tok, CKSpacePrefixStyle style)
Definition: tokenizer.c:266
static int ck_tokenizer_encode_spm_impl(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: tokenizer.c:832
#define GGUF_TOKEN_NORMAL
Definition: tokenizer.c:460
int32_t int32_t int32_t int32_t int32_t mask
Definition: tokenizer.h:233
const int32_t * ids
Definition: tokenizer.h:443
int32_t id
Definition: tokenizer.h:315
CKSpacePrefixStyle
Definition: tokenizer.h:60
@ CK_SPACE_PREFIX_AUTO
Definition: tokenizer.h:61
@ CK_SPACE_PREFIX_SPM
Definition: tokenizer.h:63
@ CK_SPACE_PREFIX_GPT2
Definition: tokenizer.h:62
const int32_t int num_ids
Definition: tokenizer.h:444
CKTokenizerType
Definition: tokenizer.h:53
@ CK_TOKENIZER_BPE
Definition: tokenizer.h:54
@ CK_TOKENIZER_SPM
Definition: tokenizer.h:56
const char * text
Definition: tokenizer.h:563
bool bool add_eos
Definition: tokenizer.h:242
bool add_space_prefix
Definition: tokenizer.h:252
CKSpmMode spm_mode
Definition: tokenizer.h:260
bool add_bos
Definition: tokenizer.h:242
const char * token
Definition: tokenizer.h:306
int32_t float * score
Definition: tokenizer.h:327
CKSpmMode
Definition: tokenizer.h:67
@ CK_SPM_MODE_UNIGRAM
Definition: tokenizer.h:68
@ CK_SPM_MODE_LLAMA
Definition: tokenizer.h:69
int32_t unk
Definition: tokenizer.h:229
bool use_trie
Definition: tokenizer.h:276
int32_t int32_t int32_t eos
Definition: tokenizer.h:231
int32_t int32_t int32_t int32_t pad
Definition: tokenizer.h:232
CKSpacePrefixStyle style
Definition: tokenizer.h:287
int32_t int32_t bos
Definition: tokenizer.h:230
const int32_t int int * out_len
Definition: tokenizer.h:445
const CKBPEConfig * config
Definition: true_bpe.h:171
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int32_t int32_t int32_t int32_t priority
Definition: true_bpe.h:115
const int32_t int char int max_len
Definition: true_bpe.h:280
const char int text_len
Definition: true_bpe.h:262
int vocab_size
Definition: true_bpe.h:185
int const int32_t * offsets
Definition: true_bpe.h:186
const char * left
Definition: true_bpe.h:130
const char int int32_t int max_ids
Definition: true_bpe.h:264
const char const char * right
Definition: true_bpe.h:131