CK
← Back to C-Kernel-Engine Docs
Doxygen Source Documentation
ck_tokenizer_v2.c
Go to the documentation of this file.
1
#include <stdio.h>
2
#include <stdlib.h>
3
#include <stdint.h>
4
#include <string.h>
5
#include <ctype.h>
6
7
#include "ck_tokenizer.h"
8
9
/*
 * 32-bit FNV-1a hash of the first `len` bytes of `s`.
 *
 * s   - bytes to hash; only read when len > 0.
 * len - number of bytes to consume; a value <= 0 hashes nothing.
 *
 * Returns the hash value; with no input bytes this is the FNV offset
 * basis (2166136261).
 */
static uint32_t hash_string(const char *s, int len) {
    uint32_t hash = 2166136261u;        /* FNV-1a 32-bit offset basis */
    for (int i = 0; i < len; i++) {
        /* Go through uint8_t so bytes >= 0x80 are not sign-extended
         * on platforms where plain char is signed. */
        hash ^= (uint8_t)s[i];
        hash *= 16777619u;              /* FNV 32-bit prime; wraps mod 2^32 */
    }
    return hash;
}
17
18
/*
 * Fill a tokenizer's vocabulary and merge list from flat, caller-owned buffers.
 *
 * tok        - tokenizer to populate. Its vocab_hash / id_to_token tables are
 *              assumed to have been allocated already by ck_tokenizer_init.
 * vocab_size - number of tokens to load; a value <= 0 loads none.
 * offsets    - offsets[i] is the byte offset of token i inside `strings`.
 * strings    - blob of NUL-terminated token strings. Entries keep pointers
 *              INTO this buffer (nothing is copied), so it must outlive `tok`.
 * num_merges - number of merge records in `merges`.
 * merges     - optional; num_merges consecutive (left, right, merged) int32
 *              triples. Skipped entirely when NULL or when num_merges <= 0.
 *
 * Returns 0 on success. Returns -1 when a required pointer is NULL, the
 * tokenizer's tables are missing or vocab_hash_size is not positive, an
 * offset is negative, or a vocab entry cannot be allocated. On a mid-load
 * failure tok->vocab_size holds the number of tokens inserted so far.
 *
 * NOTE(review): no capacity for id_to_token is visible from here, so the
 * caller must guarantee it has room for `vocab_size` entries. Likewise the
 * upper bound of each offset cannot be validated without the blob's length.
 */
int ck_tokenizer_load_binary(CKTokenizer *tok,
                             int vocab_size,
                             const int32_t *offsets,
                             const char *strings,
                             int num_merges,
                             const int32_t *merges) {
    if (!tok || !offsets || !strings) return -1;

    /* We assume ck_tokenizer_init was already called to alloc hash tables;
     * verify that instead of dereferencing NULL or taking `% 0` below. */
    if (!tok->vocab_hash || !tok->id_to_token || tok->vocab_hash_size <= 0) {
        return -1;
    }

    tok->vocab_size = 0;

    for (int i = 0; i < vocab_size; i++) {
        /* A negative offset would point before the start of the blob. */
        if (offsets[i] < 0) return -1;

        const char *token = strings + offsets[i];
        int len = (int)strlen(token);

        CKVocabEntry *entry = ck_pool_alloc(&tok->pool, sizeof *entry);
        if (!entry) return -1;

        /* The entry aliases the caller's buffer. The cast only satisfies the
         * non-const field type; the string must not be written through it. */
        entry->token = (char *)token;
        entry->token_len = len;
        entry->id = i;

        /* Push onto the head of the bucket's chain. */
        uint32_t bucket = hash_string(token, len) % (uint32_t)tok->vocab_hash_size;
        entry->next = tok->vocab_hash[bucket];
        tok->vocab_hash[bucket] = entry;

        tok->id_to_token[i] = entry->token;
        tok->vocab_size++;
    }

    if (merges && num_merges > 0) {
        for (int i = 0; i < num_merges; i++) {
            /* Index in size_t so i * 3 cannot overflow int. */
            const int32_t *rec = merges + (size_t)i * 3;
            /* NOTE(review): the result is discarded, as in the original; the
             * return convention of ck_tokenizer_add_merge is not visible
             * here - confirm whether failures should abort the load. */
            (void)ck_tokenizer_add_merge(tok, rec[0], rec[1], rec[2]);
        }
    }

    return 0;
}
ck_tokenizer.h
ck_tokenizer_add_merge
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
Definition:
ck_tokenizer.c:248
ck_pool_alloc
void * ck_pool_alloc(CKMemPool *pool, size_t size)
Definition:
ck_tokenizer.c:69
ck_tokenizer_load_binary
int ck_tokenizer_load_binary(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
Definition:
ck_tokenizer_v2.c:18
hash_string
static uint32_t hash_string(const char *s, int len)
Definition:
ck_tokenizer_v2.c:9
CKTokenizer
Definition:
ck_tokenizer.h:76
CKTokenizer::pool
CKMemPool pool
Definition:
ck_tokenizer.h:78
CKTokenizer::vocab_hash
CKVocabEntry ** vocab_hash
Definition:
ck_tokenizer.h:82
CKTokenizer::vocab_hash_size
int vocab_hash_size
Definition:
ck_tokenizer.h:83
CKTokenizer::id_to_token
char ** id_to_token
Definition:
ck_tokenizer.h:86
CKTokenizer::vocab_size
int vocab_size
Definition:
ck_tokenizer.h:81
CKVocabEntry
Definition:
ck_tokenizer.h:55
CKVocabEntry::next
struct CKVocabEntry * next
Definition:
ck_tokenizer.h:59
CKVocabEntry::token
char * token
Definition:
ck_tokenizer.h:56
CKVocabEntry::token_len
int token_len
Definition:
ck_tokenizer.h:57
CKVocabEntry::id
int32_t id
Definition:
ck_tokenizer.h:58
token
const char * token
Definition:
tokenizer.h:306
num_merges
int const int32_t const char int num_merges
Definition:
true_bpe.h:188
strings
int const int32_t const char * strings
Definition:
true_bpe.h:187
merges
int const int32_t const char int const int32_t * merges
Definition:
true_bpe.h:189
vocab_size
int vocab_size
Definition:
true_bpe.h:185
offsets
int const int32_t * offsets
Definition:
true_bpe.h:186
left
const char * left
Definition:
true_bpe.h:130
right
const char const char * right
Definition:
true_bpe.h:131
src
ck_tokenizer_v2.c
Generated by
Doxygen 1.9.1