#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
#include "ck_tokenizer.h"

Go to the source code of this file.
Functions | |
| int | ck_tokenizer_load_binary (CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges) |
| static uint32_t | hash_string (const char *s, int len) |
int ck_tokenizer_load_binary(CKTokenizer *tok, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
Load vocabulary from memory-mapped binary data.
| tok | Tokenizer |
| vocab_size | Number of tokens |
| offsets | Array of offsets into strings pool |
| strings | String pool containing null-terminated tokens |
| num_merges | Number of BPE merges |
| merges | Merge rules as (left, right, merged) triplets |
Definition at line 18 of file ck_tokenizer_v6.6.c.
References ck_pool_alloc(), ck_tokenizer_add_merge(), hash_string(), CKVocabEntry::id, CKTokenizer::id_to_token, left, merges, CKVocabEntry::next, num_merges, offsets, CKTokenizer::pool, right, strings, CKVocabEntry::token, token, CKVocabEntry::token_len, CKTokenizer::vocab_hash, CKTokenizer::vocab_hash_size, CKTokenizer::vocab_size, and vocab_size.
Referenced by main().
|
static |