#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
#include "ck_tokenizer.h"
Go to the source code of this file.
◆ ck_tokenizer_load_binary()

int ck_tokenizer_load_binary(CKTokenizer   *tok,
                             int            vocab_size,
                             const int32_t *offsets,
                             const char    *strings,
                             int            num_merges,
                             const int32_t *merges)
Load vocabulary from memory-mapped binary data.
Parameters
| tok | Tokenizer |
| vocab_size | Number of tokens |
| offsets | Array of offsets into strings pool |
| strings | String pool containing null-terminated tokens |
| num_merges | Number of BPE merges |
| merges | Merge rules as (left, right, merged) triplets |
Returns
| 0 on success, -1 on error |
Definition at line 18 of file v2_legacy/ck_tokenizer_v2.c.
Excerpts:
  line 31:  int len = (int)strlen(token);
  line 50:  int32_t merged = merges[i*3 + 2];
int ck_tokenizer_add_merge(CKTokenizer *tok, int32_t left, int32_t right, int32_t merged)
void *ck_pool_alloc(CKMemPool *pool, size_t size)
CKVocabEntry **vocab_hash
struct CKVocabEntry *next
int num_merges
const char *strings
const int32_t *merges
const int32_t *offsets
const char *right
static uint32_t hash_string(const char *s, int len)
References ck_pool_alloc(), ck_tokenizer_add_merge(), hash_string(), CKVocabEntry::id, CKTokenizer::id_to_token, left, merges, CKVocabEntry::next, num_merges, offsets, CKTokenizer::pool, right, strings, CKVocabEntry::token, token, CKVocabEntry::token_len, CKTokenizer::vocab_hash, CKTokenizer::vocab_hash_size, CKTokenizer::vocab_size, and vocab_size.
◆ hash_string()

static uint32_t hash_string(const char *s,
                            int         len)