← Back to C-Kernel-Engine Docs Doxygen Source Documentation
trie.c File Reference
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include "data_structures/tries/trie.h"

Go to the source code of this file.

Macros

#define DEFAULT_MAX_NODES   1000000
 

Functions

void ck_trie_clear (CKTrie *trie)
 
CKTrieck_trie_create (size_t max_nodes)
 
int32_t ck_trie_find_longest (const CKTrie *trie, const char *text, size_t text_len, size_t start_pos, size_t *match_len)
 
void ck_trie_free (CKTrie *trie)
 
bool ck_trie_has_prefix (const CKTrie *trie, const char *text, size_t text_len, size_t pos)
 
int ck_trie_insert (CKTrie *trie, const char *token, int32_t token_id, bool is_special, int32_t priority)
 
size_t ck_trie_node_count (const CKTrie *trie)
 
static CKTrieNodecreate_node (void)
 

Macro Definition Documentation

◆ DEFAULT_MAX_NODES

#define DEFAULT_MAX_NODES   1000000

Definition at line 16 of file trie.c.

Function Documentation

◆ ck_trie_clear()

void ck_trie_clear ( CKTrie trie)

Definition at line 80 of file trie.c.

80  {
81  if (!trie) return;
82 
83  /* Free all nodes except root */
84  CKTrieNode **queue = (CKTrieNode **)malloc(trie->node_count * sizeof(CKTrieNode *));
85  if (!queue) return;
86 
87  size_t head = 0, tail = 0;
88  queue[tail++] = trie->root;
89 
90  while (head < tail) {
91  CKTrieNode *node = queue[head++];
92  for (int i = 0; i < 256; i++) {
93  if (node->children[i]) {
94  queue[tail++] = node->children[i];
95  }
96  }
97  /* Clear children pointers but don't free root */
98  if (node != trie->root) {
99  free(node);
100  } else {
101  memset(node->children, 0, sizeof(node->children));
102  node->token_id = -1;
103  }
104  }
105 
106  free(queue);
107  trie->node_count = 1;
108 }
struct CKTrieNode * children[256]

References CKTrieNode::children, CKTrie::node_count, CKTrie::root, and CKTrieNode::token_id.

Referenced by ck_tokenizer_reset().

◆ ck_trie_create()

CKTrie* ck_trie_create ( size_t  max_nodes)

Definition at line 29 of file trie.c.

29  {
30  CKTrie *trie = (CKTrie *)malloc(sizeof(CKTrie));
31  if (!trie) {
32  return NULL;
33  }
34 
35  if (max_nodes == 0) {
36  max_nodes = DEFAULT_MAX_NODES;
37  }
38 
39  trie->root = create_node();
40  if (!trie->root) {
41  free(trie);
42  return NULL;
43  }
44 
45  trie->max_nodes = max_nodes;
46  trie->node_count = 1; /* Root node */
47 
48  return trie;
49 }
#define DEFAULT_MAX_NODES
Definition: trie.c:16
static CKTrieNode * create_node(void)
Definition: trie.c:19

References create_node(), DEFAULT_MAX_NODES, CKTrie::max_nodes, CKTrie::node_count, and CKTrie::root.

Referenced by ck_tokenizer_create().

◆ ck_trie_find_longest()

int32_t ck_trie_find_longest ( const CKTrie trie,
const char *  text,
size_t  text_len,
size_t  start_pos,
size_t *  match_len 
)

Definition at line 142 of file trie.c.

143  {
144  if (!trie || !text || start_pos >= text_len || !match_len) {
145  *match_len = 0;
146  return -1;
147  }
148 
149  CKTrieNode *node = trie->root;
150  CKTrieNode *last_token_node = NULL;
151  size_t last_token_len = 0;
152  size_t pos = start_pos;
153 
154  /* Traverse as long as we have matching children */
155  while (pos < text_len && node) {
156  unsigned char c = (unsigned char)text[pos];
157 
158  if (!node->children[c]) {
159  break;
160  }
161 
162  node = node->children[c];
163  pos++;
164 
165  /* Record if this is a valid token end */
166  if (node->token_id >= 0) {
167  last_token_node = node;
168  last_token_len = pos - start_pos;
169  }
170  }
171 
172  *match_len = last_token_len;
173 
174  if (last_token_node) {
175  return last_token_node->token_id;
176  }
177 
178  return -1;
179 }
const char * text
Definition: tokenizer.h:563
const char int text_len
Definition: true_bpe.h:262

References CKTrieNode::children, CKTrie::root, text, text_len, and CKTrieNode::token_id.

Referenced by find_longest_match_trie().

◆ ck_trie_free()

void ck_trie_free ( CKTrie trie)

Definition at line 51 of file trie.c.

51  {
52  if (!trie) return;
53 
54  /* Free all nodes using BFS */
55  CKTrieNode **queue = (CKTrieNode **)malloc(trie->node_count * sizeof(CKTrieNode *));
56  if (!queue) {
57  /* Fallback to simple free */
58  free(trie->root);
59  free(trie);
60  return;
61  }
62 
63  size_t head = 0, tail = 0;
64  queue[tail++] = trie->root;
65 
66  while (head < tail) {
67  CKTrieNode *node = queue[head++];
68  for (int i = 0; i < 256; i++) {
69  if (node->children[i]) {
70  queue[tail++] = node->children[i];
71  }
72  }
73  free(node);
74  }
75 
76  free(queue);
77  free(trie);
78 }

References CKTrieNode::children, CKTrie::node_count, and CKTrie::root.

Referenced by ck_tokenizer_free().

◆ ck_trie_has_prefix()

bool ck_trie_has_prefix ( const CKTrie trie,
const char *  text,
size_t  text_len,
size_t  pos 
)

Definition at line 181 of file trie.c.

181  {
182  if (!trie || !text || pos >= text_len) return false;
183 
184  CKTrieNode *node = trie->root;
185 
186  while (pos < text_len && node) {
187  unsigned char c = (unsigned char)text[pos];
188  node = node->children[c];
189  pos++;
190  }
191 
192  return node != NULL;
193 }

References CKTrieNode::children, CKTrie::root, text, and text_len.

◆ ck_trie_insert()

int ck_trie_insert ( CKTrie trie,
const char *  token,
int32_t  token_id,
bool  is_special,
int32_t  priority 
)

Definition at line 110 of file trie.c.

110  {
111  if (!trie || !token) return -1;
112 
113  CKTrieNode *node = trie->root;
114  const unsigned char *p = (const unsigned char *)token;
115 
116  while (*p) {
117  unsigned char c = *p++;
118 
119  if (!node->children[c]) {
120  if (trie->node_count >= trie->max_nodes) {
121  return -1; /* Out of nodes */
122  }
123 
124  CKTrieNode *new_node = create_node();
125  if (!new_node) return -1;
126 
127  node->children[c] = new_node;
128  trie->node_count++;
129  }
130 
131  node = node->children[c];
132  }
133 
134  /* Mark end of token */
135  node->token_id = token_id;
136  node->is_special = is_special;
137  node->priority = priority;
138 
139  return 0;
140 }
const char * token
Definition: tokenizer.h:306
int32_t int32_t int32_t int32_t priority
Definition: true_bpe.h:115

References CKTrieNode::children, create_node(), CKTrieNode::is_special, CKTrie::max_nodes, CKTrie::node_count, CKTrieNode::priority, priority, CKTrie::root, token, and CKTrieNode::token_id.

Referenced by ck_tokenizer_add_special_token(), and ck_tokenizer_add_token().

◆ ck_trie_node_count()

size_t ck_trie_node_count ( const CKTrie trie)

Definition at line 195 of file trie.c.

195  {
196  return trie ? trie->node_count : 0;
197 }

References CKTrie::node_count.

◆ create_node()

static CKTrieNode* create_node ( void  )
static

Definition at line 19 of file trie.c.

19  {
20  CKTrieNode *node = (CKTrieNode *)calloc(1, sizeof(CKTrieNode));
21  if (node) {
22  node->token_id = -1;
23  node->is_special = false;
24  node->priority = 0;
25  }
26  return node;
27 }

References CKTrieNode::is_special, CKTrieNode::priority, and CKTrieNode::token_id.

Referenced by ck_trie_create(), and ck_trie_insert().