← Back to C-Kernel-Engine Docs Doxygen Source Documentation
v6.5/test_bump_tokenizer.c
Go to the documentation of this file.
1 /*
2  * test_bump_tokenizer.c - Test reading tokenizer from BUMP file
3  *
4  * This test verifies that tokenizer data is correctly embedded in the BUMP file
5  * and can be read directly without requiring a separate tokenizer.json file.
6  */
7 
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <stdint.h>
11 #include <string.h>
12 #include <stdbool.h>
13 
/* BUMP file header structure (must match convert_gguf_to_bump_v6_5.py) */
/* Packed to 1-byte alignment so the in-memory layout matches the on-disk
 * bytes exactly: 8 (magic) + 10*4 (u32 fields) + 4*8 (u64 fields)
 * + 2*4 (tokenizer u32s) + 32 (checksum) = 120 bytes.
 * NOTE(review): BUMP_HEADER_SIZE below is 128 -- presumably the file pads
 * the header region; confirm against the converter. */
#pragma pack(push, 1)
typedef struct {
 char magic[8]; /* "BUMPWGT4" */
 uint32_t version; /* 4 */
 uint32_t model_type; /* 1 = legacy */
 uint32_t num_layers;
 uint32_t vocab_size; /* number of tokenizer vocabulary entries */
 uint32_t embed_dim;
 uint32_t intermediate_size;
 uint32_t context_length;
 uint32_t num_heads;
 uint32_t num_kv_heads; /* key/value head count */
 uint32_t head_dim;
 uint64_t aligned_embed_dim; /* "aligned_*": padded dims -- alignment unit not visible here */
 uint64_t aligned_head_dim;
 uint64_t aligned_intermediate;
 uint64_t aligned_context;
 /* Tokenizer metadata (v4.1+) */
 uint32_t num_merges; /* BPE merge-rule count */
 uint32_t total_vocab_bytes; /* presumably size of the vocab strings blob -- TODO confirm */
 uint8_t checksum[32]; /* SHA-256 */
} BumpHeader;
#pragma pack(pop)
38 
/* Size reserved for the header region on disk.  NOTE(review): the packed
 * BumpHeader is 120 bytes, so 128 implies 8 bytes of padding -- confirm
 * against the converter.  Unused in this test: data is located via
 * absolute offsets taken from the manifest. */
#define BUMP_HEADER_SIZE 128

/* Read manifest from JSON file to get tokenizer offsets */
/* One manifest entry.  Currently unused here: read_manifest_entry parses
 * the offset/size fields it needs directly from the JSON text. */
typedef struct {
 const char *name; /* entry name, e.g. "vocab_offsets" */
 const char *dtype; /* data-type string as written by the converter */
 size_t offset; /* byte offset within the BUMP file */
 size_t size; /* size in bytes */
} ManifestEntry;
48 
/*
 * Locate a named entry in the converter's manifest JSON and return its
 * file offset and size.
 *
 * Deliberately minimal "parser": scans for the literal pattern
 *   "name": "<entry_name>"
 * then for the first "file_offset": and "size": fields that follow.
 * This matches the exact formatting emitted by
 * convert_gguf_to_bump_v6_5.py; it is not a general JSON parser.
 * NOTE(review): the strstr calls may run past this entry's closing brace
 * and pick up the next entry's field if this one lacks it -- acceptable
 * for trusted converter output.
 *
 * json_path  - path to the manifest JSON file
 * entry_name - manifest entry to look up (e.g. "vocab_offsets")
 * out_offset - receives the entry's byte offset within the BUMP file
 * out_size   - receives the entry's size in bytes
 *
 * Returns true on success; false on any I/O or parse failure.
 */
static bool read_manifest_entry(const char *json_path, const char *entry_name,
                                size_t *out_offset, size_t *out_size) {
    FILE *f = fopen(json_path, "r");
    if (!f) return false;

    /* Determine file length; ftell can fail (returns -1). */
    if (fseek(f, 0, SEEK_END) != 0) { fclose(f); return false; }
    long len = ftell(f);
    if (len < 0) { fclose(f); return false; }
    if (fseek(f, 0, SEEK_SET) != 0) { fclose(f); return false; }

    char *buf = malloc((size_t)len + 1);
    if (!buf) { fclose(f); return false; }
    size_t nread = fread(buf, 1, (size_t)len, f);
    fclose(f);
    buf[nread] = '\0'; /* terminate at what was actually read */
    if (nread != (size_t)len) { free(buf); return false; }

    char search[256];
    snprintf(search, sizeof(search), "\"name\": \"%s\"", entry_name);
    char *pos = strstr(buf, search);
    if (!pos) { free(buf); return false; }

    /* Find file_offset field within this entry.  sizeof(key)-1 skips the
     * key text (avoids the fragile hand-counted +14/+7 offsets). */
    static const char offset_key[] = "\"file_offset\":";
    char *offset_pos = strstr(pos, offset_key);
    if (!offset_pos) { free(buf); return false; }
    *out_offset = (size_t)strtoull(offset_pos + sizeof(offset_key) - 1, NULL, 0);

    /* Find size field within this entry */
    static const char size_key[] = "\"size\":";
    char *size_pos = strstr(pos, size_key);
    if (!size_pos) { free(buf); return false; }
    *out_size = (size_t)strtoull(size_pos + sizeof(size_key) - 1, NULL, 0);

    free(buf);
    return true;
}
85 
86 int main(int argc, char **argv) {
87  if (argc < 2) {
88  fprintf(stderr, "Usage: %s <weights.bump> [manifest.json]\n", argv[0]);
89  fprintf(stderr, "\nTests reading tokenizer data from BUMP file\n");
90  return 1;
91  }
92 
93  const char *bump_path = argv[1];
94  const char *manifest_path = argc > 2 ? argv[2] : NULL;
95 
96  printf("Testing BUMP tokenizer extraction\n");
97  printf("==================================\n");
98  printf("BUMP file: %s\n", bump_path);
99  if (manifest_path) printf("Manifest: %s\n", manifest_path);
100  printf("\n");
101 
102  /* Open BUMP file */
103  FILE *f = fopen(bump_path, "rb");
104  if (!f) {
105  fprintf(stderr, "Error: Cannot open %s\n", bump_path);
106  return 1;
107  }
108 
109  /* Read header */
110  BumpHeader header;
111  if (fread(&header, sizeof(header), 1, f) != 1) {
112  fprintf(stderr, "Error: Cannot read header\n");
113  fclose(f);
114  return 1;
115  }
116 
117  /* Verify magic */
118  if (memcmp(header.magic, "BUMPWGT4", 8) != 0) {
119  fprintf(stderr, "Error: Invalid magic: %.8s\n", header.magic);
120  fclose(f);
121  return 1;
122  }
123 
124  printf("BUMP Header:\n");
125  printf(" Version: %u\n", header.version);
126  printf(" Layers: %u\n", header.num_layers);
127  printf(" Vocab size: %u\n", header.vocab_size);
128  printf(" Embed dim: %u\n", header.embed_dim);
129  printf(" Heads: %u/%u\n", header.num_heads, header.num_kv_heads);
130  printf(" Head dim: %u\n", header.head_dim);
131  printf(" Intermediate: %u\n", header.intermediate_size);
132  printf(" Context: %u\n", header.context_length);
133  printf(" Num merges: %u\n", header.num_merges);
134  printf(" Vocab bytes: %u\n", header.total_vocab_bytes);
135  printf("\n");
136 
137  /* If we have a manifest, use it to find tokenizer entries */
138  if (manifest_path) {
139  size_t vocab_offsets_off, vocab_offsets_size;
140  size_t vocab_strings_off, vocab_strings_size;
141  size_t vocab_merges_off, vocab_merges_size;
142 
143  bool have_offsets = read_manifest_entry(manifest_path, "vocab_offsets",
144  &vocab_offsets_off, &vocab_offsets_size);
145  bool have_strings = read_manifest_entry(manifest_path, "vocab_strings",
146  &vocab_strings_off, &vocab_strings_size);
147  bool have_merges = read_manifest_entry(manifest_path, "vocab_merges",
148  &vocab_merges_off, &vocab_merges_size);
149 
150  printf("Manifest entries:\n");
151  if (have_offsets) {
152  printf(" vocab_offsets: offset=0x%lx size=%zu bytes\n",
153  vocab_offsets_off, vocab_offsets_size);
154  } else {
155  printf(" vocab_offsets: NOT FOUND\n");
156  }
157  if (have_strings) {
158  printf(" vocab_strings: offset=0x%lx size=%zu bytes\n",
159  vocab_strings_off, vocab_strings_size);
160  } else {
161  printf(" vocab_strings: NOT FOUND\n");
162  }
163  if (have_merges) {
164  printf(" vocab_merges: offset=0x%lx size=%zu bytes\n",
165  vocab_merges_off, vocab_merges_size);
166  } else {
167  printf(" vocab_merges: NOT FOUND\n");
168  }
169  printf("\n");
170 
171  if (have_offsets && have_strings) {
172  /* Read and display some tokens */
173  printf("Sample tokens from BUMP:\n");
174 
175  /* Read vocab_offsets */
176  int32_t *offsets = malloc(vocab_offsets_size);
177  fseek(f, vocab_offsets_off, SEEK_SET);
178  fread(offsets, 1, vocab_offsets_size, f);
179 
180  /* Read vocab_strings */
181  char *strings = malloc(vocab_strings_size);
182  fseek(f, vocab_strings_off, SEEK_SET);
183  fread(strings, 1, vocab_strings_size, f);
184 
185  int num_tokens = vocab_offsets_size / 4;
186  for (int i = 0; i < 10 && i < num_tokens; i++) {
187  const char *token = strings + offsets[i];
188  printf(" [%d] '%s'\n", i, token);
189  }
190  printf(" ...\n");
191  for (int i = num_tokens - 5; i < num_tokens; i++) {
192  if (i >= 10) {
193  const char *token = strings + offsets[i];
194  printf(" [%d] '%s'\n", i, token);
195  }
196  }
197 
198  free(offsets);
199  free(strings);
200 
201  printf("\nTokenizer successfully read from BUMP file!\n");
202  }
203  } else {
204  printf("No manifest provided - cannot locate tokenizer entries.\n");
205  printf("Run converter with --manifest-out to generate manifest.\n");
206  }
207 
208  fclose(f);
209  return 0;
210 }
const char * token
Definition: tokenizer.h:306
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int const int32_t const char * strings
Definition: true_bpe.h:187
int vocab_size
Definition: true_bpe.h:185
int const int32_t * offsets
Definition: true_bpe.h:186
int main(int argc, char **argv)
static bool read_manifest_entry(const char *json_path, const char *entry_name, size_t *out_offset, size_t *out_size)