← Back to C-Kernel-Engine Docs Doxygen Source Documentation
ck_tokenizer_v6.c File Reference
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
#include "ck_tokenizer.h"

Go to the source code of this file.

Functions

int ck_tokenizer_load_binary (CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
 
static uint32_t hash_string (const char *s, int len)
 

Function Documentation

◆ ck_tokenizer_load_binary()

int ck_tokenizer_load_binary ( CKTokenizer tok,
int  vocab_size,
const int32_t *  offsets,
const char *  strings,
int  num_merges,
const int32_t *  merges 
)

Load vocabulary from memory-mapped binary data.

Parameters
tok	Tokenizer
vocab_size	Number of tokens
offsets	Array of offsets into the strings pool
strings	String pool containing null-terminated tokens
num_merges	Number of BPE merges
merges	Merge rules as (left, right, merged) triplets
Returns
0 on success, -1 on error

Definition at line 18 of file ck_tokenizer_v6.c.

23  {
24  if (!tok || !offsets || !strings) return -1;
25 
26  // We assume ck_tokenizer_init was already called to alloc hash tables
27  tok->vocab_size = 0;
28 
29  for (int i = 0; i < vocab_size; i++) {
30  const char *token = strings + offsets[i];
31  int len = (int)strlen(token);
32 
33  CKVocabEntry *entry = (CKVocabEntry *)ck_pool_alloc(&tok->pool, sizeof(CKVocabEntry));
34  entry->token = (char *)token;
35  entry->token_len = len;
36  entry->id = i;
37 
38  uint32_t bucket = hash_string(token, len) % tok->vocab_hash_size;
39  entry->next = tok->vocab_hash[bucket];
40  tok->vocab_hash[bucket] = entry;
41 
42  tok->id_to_token[i] = entry->token;
43  tok->vocab_size++;
44  }
45 
46  if (merges && num_merges > 0) {
47  for (int i = 0; i < num_merges; i++) {
48  int32_t left = merges[i*3 + 0];
49  int32_t right = merges[i*3 + 1];
50  int32_t merged = merges[i*3 + 2];
51  ck_tokenizer_add_merge(tok, left, right, merged);
52  }
53  }
54 
55  return 0;
56 }
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
Definition: ck_tokenizer.c:248
void * ck_pool_alloc(CKMemPool *pool, size_t size)
Definition: ck_tokenizer.c:69
static uint32_t hash_string(const char *s, int len)
CKMemPool pool
Definition: ck_tokenizer.h:78
CKVocabEntry ** vocab_hash
Definition: ck_tokenizer.h:82
int vocab_hash_size
Definition: ck_tokenizer.h:83
char ** id_to_token
Definition: ck_tokenizer.h:86
struct CKVocabEntry * next
Definition: ck_tokenizer.h:59
char * token
Definition: ck_tokenizer.h:56
int32_t id
Definition: ck_tokenizer.h:58
const char * token
Definition: tokenizer.h:306
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int const int32_t const char int const int32_t * merges
Definition: true_bpe.h:189
int vocab_size
Definition: true_bpe.h:185
int const int32_t * offsets
Definition: true_bpe.h:186
const char * left
Definition: true_bpe.h:130
const char const char * right
Definition: true_bpe.h:131

References ck_pool_alloc(), ck_tokenizer_add_merge(), hash_string(), CKVocabEntry::id, CKTokenizer::id_to_token, left, merges, CKVocabEntry::next, num_merges, offsets, CKTokenizer::pool, right, strings, CKVocabEntry::token, token, CKVocabEntry::token_len, CKTokenizer::vocab_hash, CKTokenizer::vocab_hash_size, CKTokenizer::vocab_size, and vocab_size.

◆ hash_string()

static uint32_t hash_string ( const char *  s,
int  len 
)
static

Definition at line 9 of file ck_tokenizer_v6.c.

/* FNV-1a 32-bit hash over the first len bytes of s. */
static uint32_t hash_string(const char *s, int len) {
    const uint8_t *p = (const uint8_t *)s;
    const uint8_t *end = p + len;
    uint32_t h = 0x811C9DC5u;           /* FNV offset basis */
    while (p < end) {
        h = (h ^ *p++) * 0x01000193u;   /* xor byte, multiply by FNV prime */
    }
    return h;
}

Referenced by ck_tokenizer_load_binary().