#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "tokenizer/tokenizer.h"
Go to the source code of this file.
int main(int argc, char **argv)
◆ main()
int main(int argc, char **argv)
Definition at line 10 of file test_tokenizer.c.
11 printf(
"=== C-Kernel-Engine Tokenizer Test ===\n\n");
16 fprintf(stderr,
"Failed to create tokenizer\n");
20 printf(
"Tokenizer type: %s\n\n", ck_tokenizer_type_name(tok));
23 printf(
"Adding test vocabulary...\n");
49 const char *test_strings[] = {
53 "hello world testing tokenizer",
57 printf(
"=== Encoding Tests ===\n\n");
59 for (
int i = 0; test_strings[i] != NULL; i++) {
60 const char *
text = test_strings[i];
70 printf(
"Input: \"%s\"\n",
text);
71 printf(
"Tokens [%d]: ",
num_ids);
73 for (
int j = 0; j <
num_ids; j++) {
77 printf(
"(%s)",
token);
79 if (j <
num_ids - 1) printf(
", ");
86 printf(
"Decoded: \"%s\"\n\n", decoded);
90 printf(
"=== Lookup Tests ===\n\n");
98 printf(
"\n=== Test Complete ===\n");
int32_t ck_tokenizer_lookup(const CKTokenizer *tok, const char *token, int len)
int ck_tokenizer_decode(const CKTokenizer *tok, const int32_t *ids, int num_ids, char *text, int max_len)
const char * ck_tokenizer_id_to_token(const CKTokenizer *tok, int32_t id)
int ck_tokenizer_encode(const CKTokenizer *tok, const char *text, int text_len, int32_t *ids, int max_ids)
int32_t ck_tokenizer_add_token(CKTokenizer *tok, const char *token, int len)
void ck_tokenizer_free(CKTokenizer *tok)
static int ck_tokenizer_vocab_size(const CKTokenizer *tok)
static CKTokenizer * ck_tokenizer_create_bpe(void)
int num_ids
int ck_tokenizer_add_special_token(CKTokenizer *tok, const char *name, int32_t id)
int max_ids
References CKTokenizerConfig::add_bos, CKTokenizerConfig::add_eos, ck_tokenizer_add_special_token(), ck_tokenizer_add_token(), ck_tokenizer_create_bpe(), ck_tokenizer_decode(), ck_tokenizer_encode(), ck_tokenizer_free(), ck_tokenizer_id_to_token(), ck_tokenizer_lookup(), ck_tokenizer_vocab_size(), CKTokenizer::config, ids, max_ids, num_ids, text, and token.