← Back to C-Kernel-Engine Docs Doxygen Source Documentation
test_tokenizer.c
Go to the documentation of this file.
1 /*
2  * Simple tokenizer test
3  */
4 
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include "tokenizer/tokenizer.h"
9 
10 int main(int argc, char **argv) {
11  printf("=== C-Kernel-Engine Tokenizer Test ===\n\n");
12 
13  /* Create BPE tokenizer */
14  CKTokenizer *tok = ck_tokenizer_create_bpe();
15  if (!tok) {
16  fprintf(stderr, "Failed to create tokenizer\n");
17  return 1;
18  }
19 
20  printf("Tokenizer type: %s\n\n", ck_tokenizer_type_name(tok));
21 
22  /* Add some test tokens */
23  printf("Adding test vocabulary...\n");
24 
25  /* Special tokens */
26  ck_tokenizer_add_special_token(tok, "<unk>", 0);
27  ck_tokenizer_add_special_token(tok, "<s>", 1);
28  ck_tokenizer_add_special_token(tok, "</s>", 2);
29  ck_tokenizer_add_special_token(tok, "<pad>", 3);
30 
31  /* Common tokens */
32  ck_tokenizer_add_token(tok, "hello", 100, 0.0f);
33  ck_tokenizer_add_token(tok, "world", 101, 0.0f);
34  ck_tokenizer_add_token(tok, "hello world", 102, 0.0f);
35  ck_tokenizer_add_token(tok, "test", 103, 0.0f);
36  ck_tokenizer_add_token(tok, "ing", 104, 0.0f);
37  ck_tokenizer_add_token(tok, "testing", 105, 0.0f);
38  ck_tokenizer_add_token(tok, "token", 106, 0.0f);
39  ck_tokenizer_add_token(tok, "izer", 107, 0.0f);
40  ck_tokenizer_add_token(tok, "hello</s>", 108, 0.0f);
41 
42  /* WordPiece style with ## prefix */
43  ck_tokenizer_add_token(tok, "##ing", 200, 0.0f);
44  ck_tokenizer_add_token(tok, "##er", 201, 0.0f);
45 
46  printf("Vocabulary size: %zu\n\n", ck_tokenizer_vocab_size(tok));
47 
48  /* Test encoding */
49  const char *test_strings[] = {
50  "hello world",
51  "testing",
52  "tokenizer",
53  "hello world testing tokenizer",
54  NULL
55  };
56 
57  printf("=== Encoding Tests ===\n\n");
58 
59  for (int i = 0; test_strings[i] != NULL; i++) {
60  const char *text = test_strings[i];
61  int32_t ids[256];
62  int max_ids = 256;
63 
64  /* Enable BOS/EOS */
65  tok->config.add_bos = true;
66  tok->config.add_eos = true;
67 
68  int num_ids = ck_tokenizer_encode(tok, text, -1, ids, max_ids);
69 
70  printf("Input: \"%s\"\n", text);
71  printf("Tokens [%d]: ", num_ids);
72 
73  for (int j = 0; j < num_ids; j++) {
74  const char *token = ck_tokenizer_id_to_token(tok, ids[j]);
75  printf("%d", ids[j]);
76  if (token) {
77  printf("(%s)", token);
78  }
79  if (j < num_ids - 1) printf(", ");
80  }
81  printf("\n");
82 
83  /* Test decoding */
84  char decoded[1024];
85  ck_tokenizer_decode(tok, ids, num_ids, decoded, sizeof(decoded));
86  printf("Decoded: \"%s\"\n\n", decoded);
87  }
88 
89  /* Test lookup */
90  printf("=== Lookup Tests ===\n\n");
91  printf("'hello' -> id %d\n", ck_tokenizer_lookup(tok, "hello"));
92  printf("'world' -> id %d\n", ck_tokenizer_lookup(tok, "world"));
93  printf("'unknown' -> id %d (should be unk_id=0)\n", ck_tokenizer_lookup(tok, "unknown"));
94 
95  /* Clean up */
96  ck_tokenizer_free(tok);
97 
98  printf("\n=== Test Complete ===\n");
99  return 0;
100 }
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len)
Definition: ck_tokenizer.c:227
int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
Definition: ck_tokenizer.c:737
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
Definition: ck_tokenizer.c:239
int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
Definition: ck_tokenizer.c:638
int32_t ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int len)
Definition: ck_tokenizer.c:196
void ck_tokenizer_free(CKTokenizer *tok)
Definition: ck_tokenizer.c:183
static int ck_tokenizer_vocab_size(const CKTokenizer *tok)
Definition: ck_tokenizer.h:196
CKTokenizerConfig config
Definition: tokenizer.h:97
int main(int argc, char **argv)
const int32_t * ids
Definition: tokenizer.h:443
static CKTokenizer * ck_tokenizer_create_bpe(void)
Definition: tokenizer.h:156
int num_ids
Definition: tokenizer.h:444
const char * text
Definition: tokenizer.h:563
const char * token
Definition: tokenizer.h:306
int ck_tokenizer_add_special_token(CKTokenizer *tok, const char *name, int32_t id)
Definition: tokenizer.c:213
int max_ids
Definition: true_bpe.h:264