/*
 * Locate the manifest entry named `name` inside the JSON text and pull out
 * its "file_offset" and "size" fields.
 *
 * This is a deliberately minimal string scan (no JSON parser): it finds the
 * literal `"name": "<name>"` and then the first "file_offset" / "size" keys
 * that follow it.  NOTE(review): the scan is not bounded to the entry's own
 * object, so if the matched entry lacks one of the fields the value may be
 * read from a *following* entry -- acceptable only for a trusted,
 * tool-generated manifest; confirm against the manifest writer.
 *
 * Returns true and fills *offset / *size on success; false when the entry
 * or either field cannot be found.
 */
static bool parse_manifest_entry(const char *json, const char *name,
                                 size_t *offset, size_t *size) {
    char search[256];
    snprintf(search, sizeof(search), "\"name\": \"%s\"", name);

    char *pos = strstr(json, search);
    if (!pos)
        return false;

    char *off_pos = strstr(pos, "\"file_offset\":");
    if (!off_pos)
        return false;
    /* +14 == strlen("\"file_offset\":"); strtoull skips leading whitespace
     * and base 0 also accepts an optional 0x prefix. */
    *offset = strtoull(off_pos + 14, NULL, 0);

    char *size_pos = strstr(pos, "\"size\":");
    if (!size_pos)
        return false;
    *size = strtoull(size_pos + 7, NULL, 0); /* +7 == strlen("\"size\":") */

    return true;
}
/*
 * Fetch a top-level integer value `"<key>": N` from the JSON text.
 *
 * Returns the parsed value, or -1 when the key is absent.  The original
 * code passed the unchecked strstr() result straight to atoi(), which is
 * undefined behavior (NULL-plus-offset dereference) for a missing key.
 */
static int parse_manifest_int(const char *json, const char *key) {
    char search[128];
    snprintf(search, sizeof(search), "\"%s\":", key);

    char *pos = strstr(json, search);
    if (!pos)
        return -1; /* key not present in manifest */
    return atoi(pos + strlen(search));
}
44 int main(
int argc,
char **argv) {
46 fprintf(stderr,
"Usage: %s <weights.bump> <manifest.json> [prompt]\n", argv[0]);
50 const char *bump_path = argv[1];
51 const char *manifest_path = argv[2];
52 const char *prompt = argc > 3 ? argv[3] :
"Hello, world!";
54 printf(
"Loading tokenizer from BUMP file...\n");
55 printf(
" BUMP: %s\n", bump_path);
56 printf(
" Manifest: %s\n", manifest_path);
57 printf(
" Prompt: \"%s\"\n\n", prompt);
60 FILE *mf = fopen(manifest_path,
"r");
62 fprintf(stderr,
"Cannot open manifest: %s\n", manifest_path);
65 fseek(mf, 0, SEEK_END);
66 long mlen = ftell(mf);
67 fseek(mf, 0, SEEK_SET);
68 char *manifest = malloc(mlen + 1);
69 fread(manifest, 1, mlen, mf);
70 manifest[mlen] =
'\0';
74 size_t vocab_off_offset, vocab_off_size;
75 size_t vocab_str_offset, vocab_str_size;
76 size_t vocab_merge_offset, vocab_merge_size;
79 fprintf(stderr,
"vocab_offsets not found in manifest\n");
84 fprintf(stderr,
"vocab_strings not found in manifest\n");
89 fprintf(stderr,
"vocab_merges not found in manifest\n");
98 printf(
"Manifest entries:\n");
101 printf(
" vocab_offsets: offset=%zu size=%zu\n", vocab_off_offset, vocab_off_size);
102 printf(
" vocab_strings: offset=%zu size=%zu\n", vocab_str_offset, vocab_str_size);
103 printf(
" vocab_merges: offset=%zu size=%zu\n", vocab_merge_offset, vocab_merge_size);
107 FILE *bf = fopen(bump_path,
"rb");
109 fprintf(stderr,
"Cannot open BUMP: %s\n", bump_path);
114 int32_t *vocab_offsets = malloc(vocab_off_size);
115 fseek(bf, vocab_off_offset, SEEK_SET);
116 fread(vocab_offsets, 1, vocab_off_size, bf);
119 char *vocab_strings = malloc(vocab_str_size);
120 fseek(bf, vocab_str_offset, SEEK_SET);
121 fread(vocab_strings, 1, vocab_str_size, bf);
124 int32_t *vocab_merges = NULL;
126 vocab_merges = malloc(vocab_merge_size);
127 fseek(bf, vocab_merge_offset, SEEK_SET);
128 fread(vocab_merges, 1, vocab_merge_size, bf);
136 fprintf(stderr,
"Failed to create tokenizer\n");
145 vocab_strings,
num_merges, vocab_merges) != 0) {
146 fprintf(stderr,
"Failed to load tokenizer from BUMP\n");
154 printf(
"Tokenizer loaded successfully!\n");
163 printf(
"Tokenization results:\n");
164 printf(
" Input: \"%s\"\n", prompt);
165 printf(
" Tokens (%d):", num_tokens);
166 for (
int i = 0; i < num_tokens; i++) {
168 printf(
" [%d:'%s']", tokens[i], tok_str ? tok_str :
"?");
173 char decoded[1024] = {0};
175 for (
int i = 0; i < num_tokens && decoded_len < 1020; i++) {
178 int len = strlen(tok_str);
179 if (decoded_len + len < 1020) {
180 strcpy(decoded + decoded_len, tok_str);
185 printf(
" Decoded: \"%s\"\n", decoded);
193 printf(
"\nTokenizer test PASSED!\n");
int ck_true_bpe_encode(CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
void ck_true_bpe_free(CKTrueBPE *bpe)
CKTrueBPE * ck_true_bpe_create(void)
int ck_true_bpe_load_binary(CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
const char * ck_true_bpe_id_to_token(const CKTrueBPE *bpe, int32_t id)
int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges
static int parse_manifest_int(const char *json, const char *key)
static bool parse_manifest_entry(const char *json, const char *name, size_t *offset, size_t *size)
int main(int argc, char **argv)