23 uint32_t intermediate_size;
24 uint32_t context_length;
26 uint32_t num_kv_heads;
28 uint64_t aligned_embed_dim;
29 uint64_t aligned_head_dim;
30 uint64_t aligned_intermediate;
31 uint64_t aligned_context;
34 uint32_t total_vocab_bytes;
/* NOTE(review): presumably the size in bytes of the fixed BUMP file header;
 * no visible line uses this constant, so confirm against the header struct. */
39 #define BUMP_HEADER_SIZE 128
50 size_t *out_offset,
size_t *out_size) {
/*
 * Looks up one entry in the manifest file at json_path by plain substring
 * search (no JSON parser) and stores the entry's file offset and size in
 * *out_offset and *out_size.  Every failure path visible here returns
 * false; the success return is not among the visible lines.
 */
51 FILE *f = fopen(json_path,
"r");
/* Seek to the end and back to the start; presumably the length `len` is
 * taken with ftell() in between (that line is not visible here). */
54 fseek(f, 0, SEEK_END);
56 fseek(f, 0, SEEK_SET);
/* One byte more than the file contents, presumably for a terminating NUL
 * so the strstr() calls below can treat buf as a C string. */
58 char *buf = malloc(len + 1);
59 if (!buf) { fclose(f);
return false; }
/* NOTE(review): the fread() result is not checked, so a short read goes
 * unnoticed. */
60 fread(buf, 1, len, f);
/* Build the literal pattern "name": "<entry_name>".  The match depends on
 * the manifest using exactly this spacing after the colon. */
68 snprintf(search,
sizeof(search),
"\"name\": \"%s\"", entry_name);
69 char *pos = strstr(buf, search);
/* Only buf is released on the not-found paths below; f is presumably
 * closed in a line that is not visible here - TODO confirm. */
70 if (!pos) { free(buf);
return false; }
/* Search forward from the name match for the first "file_offset": key.
 * 14 is the length of that key text including quotes and colon, so parsing
 * starts right after it.  Base 0 lets strtoull() accept decimal, 0x-prefixed
 * hex or 0-prefixed octal.
 * NOTE(review): the search is not limited to the matched entry, so a key
 * missing from this entry would be picked up from a later one; no end
 * pointer is passed, so a malformed number is not detected. */
73 char *offset_pos = strstr(pos,
"\"file_offset\":");
74 if (!offset_pos) { free(buf);
return false; }
75 *out_offset = strtoull(offset_pos + 14, NULL, 0);
/* Same approach for the "size": key; 7 is the length of that key text
 * including quotes and colon. */
78 char *size_pos = strstr(pos,
"\"size\":");
79 if (!size_pos) { free(buf);
return false; }
80 *out_size = strtoull(size_pos + 7, NULL, 0);
/*
 * Test driver: opens the BUMP weights file named by argv[1], validates and
 * prints its header, and, when the tokenizer entries have been located,
 * prints a few sample tokens.  argv[2] optionally names a manifest JSON file.
 */
86 int main(
int argc,
char **argv) {
/* Usage text on stderr; presumably guarded by an argument-count check that
 * is not among the visible lines. */
88 fprintf(stderr,
"Usage: %s <weights.bump> [manifest.json]\n", argv[0]);
89 fprintf(stderr,
"\nTests reading tokenizer data from BUMP file\n");
/* The manifest path is optional and stays NULL when it is not supplied. */
93 const char *bump_path = argv[1];
94 const char *manifest_path = argc > 2 ? argv[2] : NULL;
96 printf(
"Testing BUMP tokenizer extraction\n");
97 printf(
"==================================\n");
98 printf(
"BUMP file: %s\n", bump_path);
99 if (manifest_path) printf(
"Manifest: %s\n", manifest_path);
/* Binary mode: the header and the tokenizer tables are read as raw bytes. */
103 FILE *f = fopen(bump_path,
"rb");
105 fprintf(stderr,
"Error: Cannot open %s\n", bump_path);
/* Read the whole header struct with a single fread(); anything other than
 * one complete item is reported as an error. */
111 if (fread(&header,
sizeof(header), 1, f) != 1) {
112 fprintf(stderr,
"Error: Cannot read header\n");
/* The first 8 bytes of header.magic must equal "BUMPWGT4". */
118 if (memcmp(header.magic,
"BUMPWGT4", 8) != 0) {
/* %.8s caps the output at 8 bytes, so the magic need not be NUL-terminated. */
119 fprintf(stderr,
"Error: Invalid magic: %.8s\n", header.magic);
/* Dump the header fields for inspection. */
124 printf(
"BUMP Header:\n");
125 printf(
" Version: %u\n", header.version);
126 printf(
" Layers: %u\n", header.num_layers);
127 printf(
" Vocab size: %u\n", header.vocab_size);
128 printf(
" Embed dim: %u\n", header.embed_dim);
129 printf(
" Heads: %u/%u\n", header.num_heads, header.num_kv_heads);
130 printf(
" Head dim: %u\n", header.head_dim);
131 printf(
" Intermediate: %u\n", header.intermediate_size);
132 printf(
" Context: %u\n", header.context_length);
133 printf(
" Num merges: %u\n", header.num_merges);
134 printf(
" Vocab bytes: %u\n", header.total_vocab_bytes);
/* File offset and byte size of each tokenizer table, filled in through the
 * out-pointers passed below.  The heads of those calls are not visible;
 * presumably they are read_manifest_entry() calls - TODO confirm. */
139 size_t vocab_offsets_off, vocab_offsets_size;
140 size_t vocab_strings_off, vocab_strings_size;
141 size_t vocab_merges_off, vocab_merges_size;
144 &vocab_offsets_off, &vocab_offsets_size);
146 &vocab_strings_off, &vocab_strings_size);
148 &vocab_merges_off, &vocab_merges_size);
150 printf(
"Manifest entries:\n");
/* NOTE(review): in this and the two matching printf() calls below, the
 * offset is a size_t but is printed with %lx; %zx is the specifier that
 * matches size_t (the size argument already uses %zu). */
152 printf(
" vocab_offsets: offset=0x%lx size=%zu bytes\n",
153 vocab_offsets_off, vocab_offsets_size);
155 printf(
" vocab_offsets: NOT FOUND\n");
158 printf(
" vocab_strings: offset=0x%lx size=%zu bytes\n",
159 vocab_strings_off, vocab_strings_size);
161 printf(
" vocab_strings: NOT FOUND\n");
164 printf(
" vocab_merges: offset=0x%lx size=%zu bytes\n",
165 vocab_merges_off, vocab_merges_size);
167 printf(
" vocab_merges: NOT FOUND\n");
/* Sample output needs both the offsets table and the strings blob. */
171 if (have_offsets && have_strings) {
173 printf(
"Sample tokens from BUMP:\n");
/* Load the offsets table from its recorded position in the BUMP file.
 * NOTE(review): the malloc(), fseek() and fread() results are not checked
 * here or for the strings blob below. */
176 int32_t *
offsets = malloc(vocab_offsets_size);
177 fseek(f, vocab_offsets_off, SEEK_SET);
178 fread(
offsets, 1, vocab_offsets_size, f);
/* Load the strings blob the same way. */
181 char *
strings = malloc(vocab_strings_size);
182 fseek(f, vocab_strings_off, SEEK_SET);
183 fread(
strings, 1, vocab_strings_size, f);
/* 4 = bytes per offsets entry (the table is held as int32_t).
 * NOTE(review): the size_t quotient is narrowed to int. */
185 int num_tokens = vocab_offsets_size / 4;
/* First tokens, at most 10.  `token` is derived in a line that is not
 * visible here; %s assumes it points at a NUL-terminated string. */
186 for (
int i = 0; i < 10 && i < num_tokens; i++) {
188 printf(
" [%d] '%s'\n", i,
token);
/* Last 5 tokens.
 * NOTE(review): with fewer than 5 tokens the start index num_tokens - 5 is
 * negative; confirm how `token` is derived from i to rule out an
 * out-of-range read. */
191 for (
int i = num_tokens - 5; i < num_tokens; i++) {
194 printf(
" [%d] '%s'\n", i,
token);
201 printf(
"\nTokenizer successfully read from BUMP file!\n");
/* Hint printed when the tokenizer entries cannot be located; presumably
 * the branch taken when no manifest path was given. */
204 printf(
"No manifest provided - cannot locate tokenizer entries.\n");
205 printf(
"Run converter with --manifest-out to generate manifest.\n");
int const int32_t const char int num_merges
int const int32_t const char * strings
int const int32_t * offsets
int main(int argc, char **argv)
static bool read_manifest_entry(const char *json_path, const char *entry_name, size_t *out_offset, size_t *out_size)