← Back to C-Kernel-Engine Docs Doxygen Source Documentation
v6.6/test_inference_with_bump_tokenizer.c
Go to the documentation of this file.
1 /*
2  * test_inference_with_bump_tokenizer.c - Test tokenizer loading from BUMP
3  *
4  * This demonstrates loading a tokenizer embedded in a BUMP file without
5  * needing a separate tokenizer.json file.
6  */
7 
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <stdint.h>
11 #include <string.h>
12 #include <stdbool.h>
13 
14 #include "tokenizer/true_bpe.h"
15 
/* Simple JSON parsing to extract manifest entries.
 *
 * Locates the entry `"name": "<name>"` in the manifest text by substring
 * search (not a full JSON parser; assumes the writer's pretty-printed
 * layout with a space after the colon) and extracts the entry's
 * "file_offset" and "size" fields.
 *
 * json   - NUL-terminated manifest text.
 * name   - entry name to look for.
 * offset - out: byte offset of the entry's data within the BUMP file.
 * size   - out: byte size of the entry's data.
 *
 * Returns true if the name and both fields were found; false otherwise.
 * NOTE: on failure *offset may already have been written (size missing). */
static bool parse_manifest_entry(const char *json, const char *name,
                                 size_t *offset, size_t *size) {
    /* Named key literals so the skip distance below is strlen(key),
     * not a hard-coded 14 / 7 that silently breaks if a key changes. */
    static const char off_key[]  = "\"file_offset\":";
    static const char size_key[] = "\"size\":";

    char search[256];
    snprintf(search, sizeof(search), "\"name\": \"%s\"", name);
    const char *pos = strstr(json, search);
    if (!pos) return false;

    const char *off_pos = strstr(pos, off_key);
    if (!off_pos) return false;
    *offset = (size_t)strtoull(off_pos + strlen(off_key), NULL, 0);

    const char *size_pos = strstr(pos, size_key);
    if (!size_pos) return false;
    *size = (size_t)strtoull(size_pos + strlen(size_key), NULL, 0);

    return true;
}
34 
/* Extract a top-level integer field (e.g. "vocab_size", "num_merges")
 * from the manifest JSON by naive substring search for `"<key>":`.
 * Returns 0 if the key is absent or its value is not numeric. */
static int parse_manifest_int(const char *json, const char *key) {
    char search[256];
    snprintf(search, sizeof(search), "\"%s\":", key);
    const char *pos = strstr(json, search);
    if (!pos) return 0;
    /* strtol instead of atoi: atoi has undefined behavior when the value
     * is out of range; strtol clamps and reports via errno instead. */
    return (int)strtol(pos + strlen(search), NULL, 10);
}
43 
44 int main(int argc, char **argv) {
45  if (argc < 3) {
46  fprintf(stderr, "Usage: %s <weights.bump> <manifest.json> [prompt]\n", argv[0]);
47  return 1;
48  }
49 
50  const char *bump_path = argv[1];
51  const char *manifest_path = argv[2];
52  const char *prompt = argc > 3 ? argv[3] : "Hello, world!";
53 
54  printf("Loading tokenizer from BUMP file...\n");
55  printf(" BUMP: %s\n", bump_path);
56  printf(" Manifest: %s\n", manifest_path);
57  printf(" Prompt: \"%s\"\n\n", prompt);
58 
59  /* Read manifest JSON */
60  FILE *mf = fopen(manifest_path, "r");
61  if (!mf) {
62  fprintf(stderr, "Cannot open manifest: %s\n", manifest_path);
63  return 1;
64  }
65  fseek(mf, 0, SEEK_END);
66  long mlen = ftell(mf);
67  fseek(mf, 0, SEEK_SET);
68  char *manifest = malloc(mlen + 1);
69  fread(manifest, 1, mlen, mf);
70  manifest[mlen] = '\0';
71  fclose(mf);
72 
73  /* Parse manifest entries */
74  size_t vocab_off_offset, vocab_off_size;
75  size_t vocab_str_offset, vocab_str_size;
76  size_t vocab_merge_offset, vocab_merge_size;
77 
78  if (!parse_manifest_entry(manifest, "vocab_offsets", &vocab_off_offset, &vocab_off_size)) {
79  fprintf(stderr, "vocab_offsets not found in manifest\n");
80  free(manifest);
81  return 1;
82  }
83  if (!parse_manifest_entry(manifest, "vocab_strings", &vocab_str_offset, &vocab_str_size)) {
84  fprintf(stderr, "vocab_strings not found in manifest\n");
85  free(manifest);
86  return 1;
87  }
88  if (!parse_manifest_entry(manifest, "vocab_merges", &vocab_merge_offset, &vocab_merge_size)) {
89  fprintf(stderr, "vocab_merges not found in manifest\n");
90  free(manifest);
91  return 1;
92  }
93 
94  int vocab_size = parse_manifest_int(manifest, "vocab_size");
95  int num_merges = parse_manifest_int(manifest, "num_merges");
96  free(manifest);
97 
98  printf("Manifest entries:\n");
99  printf(" vocab_size: %d\n", vocab_size);
100  printf(" num_merges: %d\n", num_merges);
101  printf(" vocab_offsets: offset=%zu size=%zu\n", vocab_off_offset, vocab_off_size);
102  printf(" vocab_strings: offset=%zu size=%zu\n", vocab_str_offset, vocab_str_size);
103  printf(" vocab_merges: offset=%zu size=%zu\n", vocab_merge_offset, vocab_merge_size);
104  printf("\n");
105 
106  /* Open BUMP file and read tokenizer data */
107  FILE *bf = fopen(bump_path, "rb");
108  if (!bf) {
109  fprintf(stderr, "Cannot open BUMP: %s\n", bump_path);
110  return 1;
111  }
112 
113  /* Read vocab offsets */
114  int32_t *vocab_offsets = malloc(vocab_off_size);
115  fseek(bf, vocab_off_offset, SEEK_SET);
116  fread(vocab_offsets, 1, vocab_off_size, bf);
117 
118  /* Read vocab strings */
119  char *vocab_strings = malloc(vocab_str_size);
120  fseek(bf, vocab_str_offset, SEEK_SET);
121  fread(vocab_strings, 1, vocab_str_size, bf);
122 
123  /* Read merges */
124  int32_t *vocab_merges = NULL;
125  if (num_merges > 0) {
126  vocab_merges = malloc(vocab_merge_size);
127  fseek(bf, vocab_merge_offset, SEEK_SET);
128  fread(vocab_merges, 1, vocab_merge_size, bf);
129  }
130 
131  fclose(bf);
132 
133  /* Initialize tokenizer */
134  CKTrueBPE *tokenizer = ck_true_bpe_create();
135  if (!tokenizer) {
136  fprintf(stderr, "Failed to create tokenizer\n");
137  free(vocab_offsets);
138  free(vocab_strings);
139  free(vocab_merges);
140  return 1;
141  }
142 
143  /* Load tokenizer from BUMP data */
144  if (ck_true_bpe_load_binary(tokenizer, vocab_size, vocab_offsets,
145  vocab_strings, num_merges, vocab_merges) != 0) {
146  fprintf(stderr, "Failed to load tokenizer from BUMP\n");
147  ck_true_bpe_free(tokenizer);
148  free(vocab_offsets);
149  free(vocab_strings);
150  free(vocab_merges);
151  return 1;
152  }
153 
154  printf("Tokenizer loaded successfully!\n");
155  printf(" Vocab size: %d\n", vocab_size);
156  printf(" Num merges: %d\n", num_merges);
157  printf("\n");
158 
159  /* Tokenize the prompt */
160  int32_t tokens[512];
161  int num_tokens = ck_true_bpe_encode(tokenizer, prompt, -1, tokens, 512);
162 
163  printf("Tokenization results:\n");
164  printf(" Input: \"%s\"\n", prompt);
165  printf(" Tokens (%d):", num_tokens);
166  for (int i = 0; i < num_tokens; i++) {
167  const char *tok_str = ck_true_bpe_id_to_token(tokenizer, tokens[i]);
168  printf(" [%d:'%s']", tokens[i], tok_str ? tok_str : "?");
169  }
170  printf("\n");
171 
172  /* Decode tokens back to string */
173  char decoded[1024] = {0};
174  int decoded_len = 0;
175  for (int i = 0; i < num_tokens && decoded_len < 1020; i++) {
176  const char *tok_str = ck_true_bpe_id_to_token(tokenizer, tokens[i]);
177  if (tok_str) {
178  int len = strlen(tok_str);
179  if (decoded_len + len < 1020) {
180  strcpy(decoded + decoded_len, tok_str);
181  decoded_len += len;
182  }
183  }
184  }
185  printf(" Decoded: \"%s\"\n", decoded);
186 
187  /* Cleanup */
188  ck_true_bpe_free(tokenizer);
189  free(vocab_offsets);
190  free(vocab_strings);
191  free(vocab_merges);
192 
193  printf("\nTokenizer test PASSED!\n");
194  return 0;
195 }
int ck_true_bpe_encode(CKTrueBPE *bpe, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: true_bpe.c:1338
void ck_true_bpe_free(CKTrueBPE *bpe)
Definition: true_bpe.c:405
CKTrueBPE * ck_true_bpe_create(void)
Definition: true_bpe.c:342
int ck_true_bpe_load_binary(CKTrueBPE *bpe, int vocab_size, const int32_t *offsets, const char *strings, int num_merges, const int32_t *merges)
Definition: true_bpe.c:606
const char * ck_true_bpe_id_to_token(const CKTrueBPE *bpe, int32_t id)
Definition: true_bpe.c:645
int const int32_t const char int num_merges
Definition: true_bpe.h:188
int vocab_size
Definition: true_bpe.h:185
static int parse_manifest_int(const char *json, const char *key)
static bool parse_manifest_entry(const char *json, const char *name, size_t *offset, size_t *size)
int main(int argc, char **argv)