#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <ctype.h>
#include "tokenizer/tokenizer.h"
#include "tokenizer/murmurhash3.h"
#include "tokenizer/hash_table.h"
Go to the source code of this file.
Macros | |
| #define | GGUF_TOKEN_BYTE 6 |
| #define | GGUF_TOKEN_CONTROL 3 |
| #define | GGUF_TOKEN_NORMAL 1 |
| #define | GGUF_TOKEN_UNKNOWN 2 |
| #define | GGUF_TOKEN_UNUSED 5 |
| #define | GGUF_TOKEN_USER_DEFINED 4 |
Functions | |
| int | ck_tokenizer_add_merge (CKTokenizer *tok, int32_t left_id, int32_t right_id, int32_t merged_id, int32_t priority) |
| int | ck_tokenizer_add_special_token (CKTokenizer *tok, const char *name, int32_t id) |
| int | ck_tokenizer_add_token (CKTokenizer *tok, const char *token, int32_t id, float score) |
| CKTokenizer * | ck_tokenizer_create (CKTokenizerType type) |
| int | ck_tokenizer_decode (const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len) |
| CKSpacePrefixStyle | ck_tokenizer_detect_space_prefix_style (CKTokenizer *tok) |
| int | ck_tokenizer_encode (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) |
| static int | ck_tokenizer_encode_spm_impl (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) |
| static int | ck_tokenizer_encode_spm_llama_impl (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) |
| void | ck_tokenizer_free (CKTokenizer *tok) |
| const char * | ck_tokenizer_id_to_token (const CKTokenizer *tok, int32_t id) |
| int | ck_tokenizer_load_binary (CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges) |
| int | ck_tokenizer_load_binary_with_scores (CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, const float *scores, const uint8_t *types, int num_merges, const int32_t *merges) |
| int | ck_tokenizer_load_gguf (CKTokenizer *tok, const char *path) |
| int | ck_tokenizer_load_json (CKTokenizer *tok, const char *path) |
| int | ck_tokenizer_load_merges (CKTokenizer *tok, const char *path) |
| int | ck_tokenizer_load_text (CKTokenizer *tok, const char *path) |
| int32_t | ck_tokenizer_lookup (const CKTokenizer *tok, const char *token) |
| static int32_t | ck_tokenizer_lookup_exact (const CKTokenizer *tok, const char *token) |
| static int32_t | ck_tokenizer_lookup_exact_n (const CKTokenizer *tok, const char *text, int text_len) |
| void | ck_tokenizer_reset (CKTokenizer *tok) |
| void | ck_tokenizer_set_add_bos_eos (CKTokenizer *tok, bool add_bos, bool add_eos) |
| void | ck_tokenizer_set_add_space_prefix (CKTokenizer *tok, bool add_space_prefix) |
| void | ck_tokenizer_set_space_prefix_style (CKTokenizer *tok, CKSpacePrefixStyle style) |
| void | ck_tokenizer_set_special_ids (CKTokenizer *tok, int32_t unk, int32_t bos, int32_t eos, int32_t pad, int32_t mask) |
| void | ck_tokenizer_set_spm_mode (CKTokenizer *tok, CKSpmMode spm_mode) |
| void | ck_tokenizer_set_use_trie (CKTokenizer *tok, bool use_trie) |
| static int32_t | find_longest_match (const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) |
| static int32_t | find_longest_match_hash (const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) |
| static int32_t | find_longest_match_trie (const CKTokenizer *tok, const char *text, size_t text_len, size_t pos, size_t *match_len) |
| static int | preprocess_bpe_spaces (const char *text, int text_len, char *out, int out_max, CKSpacePrefixStyle style) |
| static int | preprocess_spm_llama_text (const char *text, int text_len, char *out, int out_max, bool add_space_prefix) |
| static int | preprocess_spm_text (const char *text, int text_len, char *out, int out_max, bool add_space_prefix) |
| static void | spm_build_byte_lookup (CKTokenizer *tok, const char *strings, const int32_t *offsets, int vocab_size) |
| static int | spm_count_unknown_run (const CKTokenizer *tok, const char *text, int text_len, size_t pos) |
| static int | spm_encode_byte_fallback (const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids) |
| static int | spm_find_candidates_at_pos (const CKTokenizer *tok, const char *text, int text_len, size_t pos, int32_t *candidates, int max_candidates) |
| static int32_t | spm_get_byte_token (const CKTokenizer *tok, unsigned char byte_val) |
| static bool | spm_is_byte_token (const CKTokenizer *tok, int32_t token_id) |
| static int | spm_llama_resegment_node (const CKTokenizer *tok, const SpmLlamaNode *nodes, int node_id, int32_t *ids, int max_ids, int out_idx) |
| static bool | spm_token_allowed_in_dp (const CKTokenizer *tok, int32_t token_id) |
| static bool | spm_token_is_byte_format (const char *token) |
| static int | utf8_len (unsigned char c) |
| #define GGUF_TOKEN_BYTE 6 |
Definition at line 465 of file tokenizer.c.
| #define GGUF_TOKEN_CONTROL 3 |
Definition at line 462 of file tokenizer.c.
| #define GGUF_TOKEN_NORMAL 1 |
Definition at line 460 of file tokenizer.c.
| #define GGUF_TOKEN_UNKNOWN 2 |
Definition at line 461 of file tokenizer.c.
| #define GGUF_TOKEN_UNUSED 5 |
Definition at line 464 of file tokenizer.c.
| #define GGUF_TOKEN_USER_DEFINED 4 |
Definition at line 463 of file tokenizer.c.
| int ck_tokenizer_add_merge | ( | CKTokenizer * | tok, |
| int32_t | left_id, | ||
| int32_t | right_id, | ||
| int32_t | merged_id, | ||
| int32_t | priority | ||
| ) |
Add a BPE merge rule.
| tok | Tokenizer |
| left_id | Left token ID |
| right_id | Right token ID |
| merged_id | Merged token ID |
| priority | Lower = higher priority (applied first) |
Definition at line 1336 of file tokenizer.c.
| int ck_tokenizer_add_special_token | ( | CKTokenizer * | tok, |
| const char * | name, | ||
| int32_t | id | ||
| ) |
Add special token (UNK, BOS, EOS, PAD, MASK).
| tok | Tokenizer |
| name | Special token name ("unk", "bos", "eos", "pad", "mask") |
| id | Token ID |
Definition at line 213 of file tokenizer.c.
References CKTokenizer::bos_id, ck_tokenizer_add_token(), ck_tokenizer_hash_table_lookup(), ck_trie_insert(), CKTokenizer::eos_id, id, CKTokenizer::pad_id, CKTokenizer::unk_id, CKTokenizer::vocab, and CKTokenizer::vocab_trie.
Referenced by main().
| int ck_tokenizer_add_token | ( | CKTokenizer * | tok, |
| const char * | token, | ||
| int32_t | id, | ||
| float | score | ||
| ) |
Add a token to vocabulary.
| tok | Tokenizer |
| token | Token string |
| id | Token ID |
| score | Token score (for SPM) |
Definition at line 157 of file tokenizer.c.
References ck_tokenizer_hash_table_insert(), ck_tokenizer_hash_table_lookup(), ck_trie_insert(), id, CKTokenizer::id_to_token, score, token, CKTokenizer::vocab, CKTokenizer::vocab_capacity, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.
Referenced by ck_tokenizer_add_special_token(), and ck_tokenizer_load_binary_with_scores().
| CKTokenizer* ck_tokenizer_create | ( | CKTokenizerType | type | ) |
Definition at line 34 of file tokenizer.c.
References CKTokenizerConfig::add_bos, CKTokenizerConfig::add_eos, CKTokenizerConfig::add_space_prefix, CKTokenizer::bos_id, CK_SPM_MODE_UNIGRAM, ck_tokenizer_hash_table_create(), ck_tokenizer_hash_table_free(), CK_TOKENIZER_HT_BUCKETS_LARGE, ck_tokenizer_mempool_init(), ck_trie_create(), CKTokenizer::config, CKTokenizer::eos_id, CKTokenizer::id_to_token, CKTokenizer::mask_id, CKTokenizer::pad_id, CKTokenizer::pool, CKTokenizer::scores, CKTokenizerConfig::spm_mode, CKTokenizerConfig::type, CKTokenizer::types, CKTokenizer::unk_id, CKTokenizerConfig::unk_score, CKTokenizer::vocab, CKTokenizer::vocab_capacity, and CKTokenizer::vocab_trie.
Referenced by ck_tokenizer_create_bpe(), ck_tokenizer_create_spm(), ck_tokenizer_create_wordpiece(), and main().
| int ck_tokenizer_decode | ( | const CKTokenizer * | tok, |
| const int32_t * | ids, | ||
| int | num_ids, | ||
| char * | text, | ||
| int | max_len | ||
| ) |
Decode token IDs to text.
| tok | Tokenizer |
| ids | Input token IDs |
| num_ids | Number of IDs |
| text | Output text buffer |
| max_len | Maximum text length |
Definition at line 1211 of file tokenizer.c.
References CKTokenizer::bos_id, ck_tokenizer_id_to_token(), CKTokenizer::eos_id, ids, max_len, num_ids, CKTokenizer::pad_id, text, and token.
Referenced by main().
| CKSpacePrefixStyle ck_tokenizer_detect_space_prefix_style | ( | CKTokenizer * | tok | ) |
Definition at line 276 of file tokenizer.c.
References CK_SPACE_PREFIX_AUTO, CK_SPACE_PREFIX_GPT2, CK_SPACE_PREFIX_SPM, CKTokenizer::config, CKTokenizer::id_to_token, CKTokenizerConfig::space_prefix_detected, CKTokenizerConfig::space_prefix_style, token, and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_encode().
| int ck_tokenizer_encode | ( | const CKTokenizer * | tok, |
| const char * | text, | ||
| int | text_len, | ||
| int32_t * | ids, | ||
| int | max_ids | ||
| ) |
Encode text to token IDs using greedy longest-match.
For BPE: applies merge rules iteratively. For WordPiece/SPM: greedy longest-match from vocabulary.
| tok | Tokenizer |
| text | Input text |
| text_len | Text length, or -1 for null-terminated |
| ids | Output token IDs |
| max_ids | Maximum IDs to write |
Definition at line 1132 of file tokenizer.c.
References CKTokenizer::add_bos, CKTokenizerConfig::add_bos, CKTokenizer::add_eos, CKTokenizerConfig::add_eos, CKTokenizer::bos_id, CK_SPM_MODE_LLAMA, CK_TOKENIZER_BPE, ck_tokenizer_detect_space_prefix_style(), ck_tokenizer_encode_spm_impl(), ck_tokenizer_encode_spm_llama_impl(), ck_tokenizer_lookup(), ck_tokenizer_lookup_merge(), CK_TOKENIZER_SPM, CKTokenizer::config, config, CKTokenizer::eos_id, find_longest_match(), id, ids, max_ids, CKMergeRule::merged, CKTokenizer::merges, CKTokenizer::num_merges, out_len, preprocess_bpe_spaces(), CKMergeRule::priority, CKTokenizerConfig::spm_mode, style, text, text_len, CKTokenizerConfig::type, CKTokenizer::unk_id, and utf8_len().
Referenced by main(), and run_inference().
|
static |
Definition at line 832 of file tokenizer.c.
References CKTokenizerConfig::add_space_prefix, ck_tokenizer_id_to_token(), CKTokenizer::config, GGUF_TOKEN_USER_DEFINED, ids, max_ids, preprocess_spm_text(), CKTokenizer::scores, spm_count_unknown_run(), spm_encode_byte_fallback(), spm_find_candidates_at_pos(), spm_token_allowed_in_dp(), text, text_len, token, CKTokenizer::types, CKTokenizer::types_size, CKTokenizer::unk_id, and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_encode().
|
static |
Definition at line 642 of file tokenizer.c.
References CKTokenizerConfig::add_space_prefix, ck_tokenizer_lookup_exact_n(), CKTokenizer::config, ids, left, max_ids, preprocess_spm_llama_text(), right, score, CKTokenizer::scores, CKTokenizer::scores_size, spm_llama_resegment_node(), text, text_len, utf8_len(), and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_encode().
| void ck_tokenizer_free | ( | CKTokenizer * | tok | ) |
Definition at line 91 of file tokenizer.c.
References CKTokenizer::byte_token_id, ck_pool_free(), ck_tokenizer_hash_table_free(), ck_tokenizer_mempool_free(), ck_trie_free(), CKTokenizer::id_to_token, CKTokenizer::merge_hash, CKTokenizer::merges, CKTokenizer::pool, CKTokenizer::scores, CKTokenizer::types, CKTokenizer::vocab, CKTokenizer::vocab_hash, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.
Referenced by main(), and run_inference().
| const char* ck_tokenizer_id_to_token | ( | const CKTokenizer * | tok, |
| int32_t | id | ||
| ) |
Definition at line 353 of file tokenizer.c.
References id, CKTokenizer::id_to_token, and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_decode(), ck_tokenizer_encode_spm_impl(), main(), and run_inference().
| int ck_tokenizer_load_binary | ( | CKTokenizer * | tok, |
| int | vocab_size, | ||
| const int32_t * | offsets, | ||
| const char * | strings, | ||
| int | num_merges, | ||
| const int32_t * | merges | ||
| ) |
Load vocabulary from memory-mapped binary data.
| tok | Tokenizer |
| vocab_size | Number of tokens |
| offsets | Array of offsets into strings pool |
| strings | String pool containing null-terminated tokens |
| num_merges | Number of BPE merges |
| merges | Merge rules as (left, right, merged) triplets |
Definition at line 1242 of file tokenizer.c.
References ck_tokenizer_load_binary_with_scores(), merges, num_merges, offsets, strings, and vocab_size.
| int ck_tokenizer_load_binary_with_scores | ( | CKTokenizer * | tok, |
| int | vocab_size, | ||
| const int32_t * | offsets, | ||
| const char * | strings, | ||
| const float * | scores, | ||
| const uint8_t * | types, | ||
| int | num_merges, | ||
| const int32_t * | merges | ||
| ) |
Load vocabulary from memory-mapped binary data with scores and types.
This extended version supports SPM (SentencePiece) tokenizers, which require token scores for Viterbi/DP encoding.
| tok | Tokenizer |
| vocab_size | Number of tokens |
| offsets | Array of offsets into strings pool |
| strings | String pool containing null-terminated tokens |
| scores | Array of token scores (float32), can be NULL |
| types | Array of token types (uint8), can be NULL |
| num_merges | Number of BPE merges |
| merges | Merge rules as (left, right, merged) triplets |
Definition at line 1252 of file tokenizer.c.
References ck_tokenizer_add_token(), ck_tokenizer_reset(), GGUF_TOKEN_BYTE, GGUF_TOKEN_CONTROL, GGUF_TOKEN_NORMAL, GGUF_TOKEN_UNKNOWN, merges, num_merges, offsets, score, CKTokenizer::scores, CKTokenizer::scores_size, spm_build_byte_lookup(), strings, token, CKTokenizer::types, CKTokenizer::types_size, and vocab_size.
Referenced by ck_tokenizer_load_binary().
| int ck_tokenizer_load_gguf | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load vocabulary from GGUF file.
| tok | Tokenizer |
| path | Path to GGUF file |
Definition at line 1332 of file tokenizer.c.
| int ck_tokenizer_load_json | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load vocabulary from JSON file (HuggingFace format).
| tok | Tokenizer |
| path | Path to vocab.json or tokenizer.json |
Definition at line 1333 of file tokenizer.c.
| int ck_tokenizer_load_merges | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load BPE merges from text file.
Format: token1 token2 (one merge per line)
| tok | Tokenizer |
| path | Path to merges.txt |
Definition at line 1335 of file tokenizer.c.
| int ck_tokenizer_load_text | ( | CKTokenizer * | tok, |
| const char * | path | ||
| ) |
Load vocabulary from text file (one token per line).
Format: token_string [id] [score]. Lines starting with # are comments.
| tok | Tokenizer |
| path | Path to vocabulary file |
Definition at line 1334 of file tokenizer.c.
| int32_t ck_tokenizer_lookup | ( | const CKTokenizer * | tok, |
| const char * | token | ||
| ) |
Definition at line 323 of file tokenizer.c.
References ck_tokenizer_hash_table_lookup(), token, CKTokenizer::unk_id, and CKTokenizer::vocab.
Referenced by spm_get_byte_token().
|
static |
Definition at line 330 of file tokenizer.c.
References ck_tokenizer_hash_table_lookup(), token, and CKTokenizer::vocab.
Referenced by ck_tokenizer_lookup_exact_n().
|
static |
Definition at line 337 of file tokenizer.c.
References ck_tokenizer_lookup_exact(), id, text, and text_len.
Referenced by ck_tokenizer_encode_spm_llama_impl(), and spm_llama_resegment_node().
| void ck_tokenizer_reset | ( | CKTokenizer * | tok | ) |
Definition at line 125 of file tokenizer.c.
References CKTokenizer::byte_token_id, ck_tokenizer_hash_table_clear(), ck_trie_clear(), CKTokenizer::id_to_token, CKTokenizer::scores, CKTokenizer::scores_size, CKTokenizer::types, CKTokenizer::types_size, CKTokenizer::vocab, CKTokenizer::vocab_size, and CKTokenizer::vocab_trie.
Referenced by ck_tokenizer_load_binary_with_scores().
| void ck_tokenizer_set_add_bos_eos | ( | CKTokenizer * | tok, |
| bool | add_bos, | ||
| bool | add_eos | ||
| ) |
Definition at line 243 of file tokenizer.c.
References CKTokenizerConfig::add_bos, add_bos, CKTokenizerConfig::add_eos, add_eos, and CKTokenizer::config.
| void ck_tokenizer_set_add_space_prefix | ( | CKTokenizer * | tok, |
| bool | add_space_prefix | ||
| ) |
Definition at line 249 of file tokenizer.c.
References CKTokenizerConfig::add_space_prefix, add_space_prefix, and CKTokenizer::config.
| void ck_tokenizer_set_space_prefix_style | ( | CKTokenizer * | tok, |
| CKSpacePrefixStyle | style | ||
| ) |
Definition at line 266 of file tokenizer.c.
References CK_SPACE_PREFIX_AUTO, CKTokenizer::config, CKTokenizerConfig::space_prefix_detected, CKTokenizerConfig::space_prefix_style, and style.
| void ck_tokenizer_set_special_ids | ( | CKTokenizer * | tok, |
| int32_t | unk, | ||
| int32_t | bos, | ||
| int32_t | eos, | ||
| int32_t | pad, | ||
| int32_t | mask | ||
| ) |
Definition at line 234 of file tokenizer.c.
References bos, CKTokenizer::bos_id, eos, CKTokenizer::eos_id, mask, CKTokenizer::mask_id, pad, CKTokenizer::pad_id, unk, and CKTokenizer::unk_id.
| void ck_tokenizer_set_spm_mode | ( | CKTokenizer * | tok, |
| CKSpmMode | spm_mode | ||
| ) |
Definition at line 254 of file tokenizer.c.
References CKTokenizer::config, CKTokenizerConfig::spm_mode, and spm_mode.
| void ck_tokenizer_set_use_trie | ( | CKTokenizer * | tok, |
| bool | use_trie | ||
| ) |
Definition at line 260 of file tokenizer.c.
References CKTokenizer::config, CKTokenizerConfig::use_trie, and use_trie.
|
static |
Definition at line 400 of file tokenizer.c.
References CKTokenizer::config, find_longest_match_hash(), find_longest_match_trie(), text, text_len, and CKTokenizerConfig::use_trie.
Referenced by ck_tokenizer_encode().
|
static |
Definition at line 370 of file tokenizer.c.
References ck_tokenizer_hash_table_lookup(), max_len, text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab.
Referenced by find_longest_match().
|
static |
Definition at line 359 of file tokenizer.c.
References ck_trie_find_longest(), text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab_trie.
Referenced by find_longest_match().
|
static |
Definition at line 412 of file tokenizer.c.
References CK_SPACE_PREFIX_SPM, out_len, style, text, and text_len.
Referenced by ck_tokenizer_encode().
|
static |
Definition at line 554 of file tokenizer.c.
References add_space_prefix, out_len, text, and text_len.
Referenced by ck_tokenizer_encode_spm_llama_impl().
|
static |
Definition at line 763 of file tokenizer.c.
References add_space_prefix, out_len, text, and text_len.
Referenced by ck_tokenizer_encode_spm_impl().
|
static |
Definition at line 507 of file tokenizer.c.
References CKTokenizer::byte_token_id, GGUF_TOKEN_BYTE, offsets, spm_token_is_byte_format(), strings, token, CKTokenizer::types, and vocab_size.
Referenced by ck_tokenizer_load_binary_with_scores().
|
static |
Definition at line 1098 of file tokenizer.c.
References ck_tokenizer_hash_table_lookup(), max_len, spm_token_allowed_in_dp(), text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab.
Referenced by ck_tokenizer_encode_spm_impl().
|
static |
Definition at line 1024 of file tokenizer.c.
References ids, max_ids, spm_get_byte_token(), text, text_len, and CKTokenizer::unk_id.
Referenced by ck_tokenizer_encode_spm_impl().
|
static |
Definition at line 1047 of file tokenizer.c.
References ck_tokenizer_hash_table_lookup(), max_len, spm_token_allowed_in_dp(), text, text_len, CKTokenizer::unk_id, and CKTokenizer::vocab.
Referenced by ck_tokenizer_encode_spm_impl().
|
inlinestatic |
Definition at line 487 of file tokenizer.c.
References CKTokenizer::byte_token_id, ck_tokenizer_lookup(), and CKTokenizer::unk_id.
Referenced by spm_encode_byte_fallback(), and spm_llama_resegment_node().
|
inlinestatic |
Definition at line 479 of file tokenizer.c.
References GGUF_TOKEN_BYTE, CKTokenizer::types, and CKTokenizer::vocab_size.
|
static |
Definition at line 611 of file tokenizer.c.
References ck_tokenizer_lookup_exact_n(), ids, max_ids, spm_get_byte_token(), and CKTokenizer::unk_id.
Referenced by ck_tokenizer_encode_spm_llama_impl().
|
inlinestatic |
Definition at line 469 of file tokenizer.c.
References GGUF_TOKEN_BYTE, GGUF_TOKEN_CONTROL, GGUF_TOKEN_UNUSED, CKTokenizer::types, and CKTokenizer::vocab_size.
Referenced by ck_tokenizer_encode_spm_impl(), spm_count_unknown_run(), and spm_find_candidates_at_pos().
|
inlinestatic |
|
inlinestatic |
Definition at line 541 of file tokenizer.c.
Referenced by ck_tokenizer_encode(), and ck_tokenizer_encode_spm_llama_impl().