← Back to C-Kernel-Engine Docs Doxygen Source Documentation
include/data_structures/tries/trie.h
Go to the documentation of this file.
1 /*
2  * Trie-based Token Lookup
3  *
4  * Provides O(k) token lookups where k = token length
5  * Much faster than hash table's O(n*k) for longest-match
6  */
7 
8 #ifndef CK_TRIE_H
9 #define CK_TRIE_H
10 
11 #include <stdint.h>
12 #include <stdbool.h>
13 
14 #ifdef __cplusplus
15 extern "C" {
16 #endif
17 
18 /* Trie node structure */
19 typedef struct CKTrieNode {
20  /* Children indexed by first byte (0-255) */
21  struct CKTrieNode *children[256];
22 
23  /* Token ID if this node ends a valid token (-1 = not a token) */
24  int32_t token_id;
25 
26  /* Is this a special token? */
27  bool is_special;
28 
29  /* Priority for ordering (BPE merge order) */
30  int32_t priority;
31 } CKTrieNode;
32 
33 /* Trie structure */
34 typedef struct {
36  size_t node_count;
37  size_t max_nodes;
38 } CKTrie;
39 
40 /* Create a new trie */
41 CKTrie *ck_trie_create(size_t max_nodes);
42 
43 /* Free a trie */
44 void ck_trie_free(CKTrie *trie);
45 
46 /* Reset trie to empty state */
47 void ck_trie_clear(CKTrie *trie);
48 
49 /* Insert a token into the trie */
50 int ck_trie_insert(CKTrie *trie, const char *token, int32_t token_id, bool is_special, int32_t priority);
51 
52 /* Find the longest matching token starting at position */
53 /* Returns token_id or -1 if no match found, sets match_len */
54 int32_t ck_trie_find_longest(const CKTrie *trie, const char *text, size_t text_len,
55  size_t start_pos, size_t *match_len);
56 
57 /* Check if a prefix exists in the trie */
58 bool ck_trie_has_prefix(const CKTrie *trie, const char *text, size_t text_len, size_t pos);
59 
60 /* Get the number of nodes in the trie */
61 size_t ck_trie_node_count(const CKTrie *trie);
62 
63 #ifdef __cplusplus
64 }
65 #endif
66 
67 #endif /* CK_TRIE_H */
void ck_trie_clear(CKTrie *trie)
Definition: trie.c:80
int32_t ck_trie_find_longest(const CKTrie *trie, const char *text, size_t text_len, size_t start_pos, size_t *match_len)
Definition: trie.c:142
int ck_trie_insert(CKTrie *trie, const char *token, int32_t token_id, bool is_special, int32_t priority)
Definition: trie.c:110
bool ck_trie_has_prefix(const CKTrie *trie, const char *text, size_t text_len, size_t pos)
Definition: trie.c:181
void ck_trie_free(CKTrie *trie)
Definition: trie.c:51
size_t ck_trie_node_count(const CKTrie *trie)
Definition: trie.c:195
CKTrie * ck_trie_create(size_t max_nodes)
Definition: trie.c:29
struct CKTrieNode * children[256]
const char * text
Definition: tokenizer.h:563
const char * token
Definition: tokenizer.h:306
int32_t int32_t int32_t int32_t priority
Definition: true_bpe.h:115
const char int text_len
Definition: true_bpe.h:262