← Back to C-Kernel-Engine Docs Doxygen Source Documentation
hash_table.c File Reference
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <stdio.h>
#include "tokenizer/hash_table.h"
#include "tokenizer/murmurhash3.h"

Go to the source code of this file.

Macros

#define CK_TOKENIZER_HASH_SEED   0x9747b28c
 

Functions

uint32_t ck_tokenizer_hash (const char *key, size_t len)
 
uint32_t ck_tokenizer_hash_str (const char *key)
 
void ck_tokenizer_hash_table_clear (CKTokenizerHashTable *table, bool free_values)
 
bool ck_tokenizer_hash_table_contains (CKTokenizerHashTable *table, const char *key)
 
size_t ck_tokenizer_hash_table_count (CKTokenizerHashTable *table)
 
CKTokenizerHashTableck_tokenizer_hash_table_create (size_t bucket_count)
 
int ck_tokenizer_hash_table_delete (CKTokenizerHashTable *table, const char *key, bool free_value)
 
void ck_tokenizer_hash_table_free (CKTokenizerHashTable *table, bool free_values)
 
int ck_tokenizer_hash_table_insert (CKTokenizerHashTable *table, const char *key, void *value)
 
int ck_tokenizer_hash_table_iterate (CKTokenizerHashTable *table, CKTokenizerHashCallback callback, void *user_data)
 
size_t ck_tokenizer_hash_table_keys (CKTokenizerHashTable *table, const char **out_keys, size_t max_keys)
 
void * ck_tokenizer_hash_table_lookup (CKTokenizerHashTable *table, const char *key)
 
void * ck_tokenizer_hash_table_lookup_avx (CKTokenizerHashTable *table, const char *key)
 
static CKTokenizerHashEntrycreate_entry (const char *key, const void *value, size_t value_size)
 
static void free_entry (CKTokenizerHashEntry *entry, bool free_value)
 
static int simd_strcmp (const char *s1, const char *s2)
 

Macro Definition Documentation

◆ CK_TOKENIZER_HASH_SEED

#define CK_TOKENIZER_HASH_SEED   0x9747b28c

Definition at line 16 of file hash_table.c.

Function Documentation

◆ ck_tokenizer_hash()

uint32_t ck_tokenizer_hash ( const char *  key,
size_t  len 
)

Definition at line 18 of file hash_table.c.

18  {
19  return ck_murmurhash3(key, (uint32_t)len, CK_TOKENIZER_HASH_SEED);
20 }
#define CK_TOKENIZER_HASH_SEED
Definition: hash_table.c:16
uint32_t ck_murmurhash3(const char *key, uint32_t len, uint32_t seed)
Definition: murmurhash3.c:11

References ck_murmurhash3(), and CK_TOKENIZER_HASH_SEED.

◆ ck_tokenizer_hash_str()

uint32_t ck_tokenizer_hash_str ( const char *  key)

Definition at line 22 of file hash_table.c.

22  {
24 }
static uint32_t ck_murmurhash3_str(const char *str, uint32_t seed)
Definition: murmurhash3.h:55

References ck_murmurhash3_str(), and CK_TOKENIZER_HASH_SEED.

Referenced by ck_tokenizer_hash_table_delete(), ck_tokenizer_hash_table_insert(), ck_tokenizer_hash_table_lookup(), and ck_tokenizer_hash_table_lookup_avx().

◆ ck_tokenizer_hash_table_clear()

void ck_tokenizer_hash_table_clear ( CKTokenizerHashTable table,
bool  free_values 
)

Clear all entries (but keep bucket array).

Parameters
tableHash table
free_valuesIf true, free all value pointers

Definition at line 312 of file hash_table.c.

312  {
313  if (!table) {
314  return;
315  }
316 
317  for (size_t i = 0; i < table->size; i++) {
318  CKTokenizerHashEntry *entry = table->entries[i];
319  while (entry) {
320  CKTokenizerHashEntry *next = entry->next;
321  free_entry(entry, free_values);
322  entry = next;
323  }
324  table->entries[i] = NULL;
325  }
326 
327  table->count = 0;
328 }
static void free_entry(CKTokenizerHashEntry *entry, bool free_value)
Definition: hash_table.c:131
struct CKTokenizerHashEntry * next
Definition: hash_table.h:25
CKTokenizerHashEntry ** entries
Definition: hash_table.h:30

References CKTokenizerHashTable::count, CKTokenizerHashTable::entries, free_entry(), CKTokenizerHashEntry::next, and CKTokenizerHashTable::size.

Referenced by ck_tokenizer_reset().

◆ ck_tokenizer_hash_table_contains()

bool ck_tokenizer_hash_table_contains ( CKTokenizerHashTable table,
const char *  key 
)

Check if key exists.

Parameters
tableHash table
keyKey to check
Returns
true if key exists

Definition at line 268 of file hash_table.c.

268  {
269  return ck_tokenizer_hash_table_lookup(table, key) != NULL;
270 }
void * ck_tokenizer_hash_table_lookup(CKTokenizerHashTable *table, const char *key)
Definition: hash_table.c:198

References ck_tokenizer_hash_table_lookup().

◆ ck_tokenizer_hash_table_count()

size_t ck_tokenizer_hash_table_count ( CKTokenizerHashTable table)

Get the number of entries.

Parameters
tableHash table
Returns
Number of entries

Definition at line 264 of file hash_table.c.

264  {
265  return table ? table->count : 0;
266 }

References CKTokenizerHashTable::count.

◆ ck_tokenizer_hash_table_create()

CKTokenizerHashTable* ck_tokenizer_hash_table_create ( size_t  bucket_count)

Create a hash table.

Parameters
bucket_countNumber of buckets (0 = auto-size)
Returns
Newly allocated hash table, or NULL on error

Definition at line 80 of file hash_table.c.

80  {
81  if (bucket_count == 0) {
82  bucket_count = CK_TOKENIZER_HT_BUCKETS_SMALL;
83  }
84 
86  if (!table) {
87  return NULL;
88  }
89 
90  table->entries = (CKTokenizerHashEntry **)calloc(bucket_count, sizeof(CKTokenizerHashEntry *));
91  if (!table->entries) {
92  free(table);
93  return NULL;
94  }
95 
96  table->size = bucket_count;
97  table->count = 0;
98  table->load_factor = 0.75f;
99 
100  return table;
101 }
#define CK_TOKENIZER_HT_BUCKETS_SMALL
Definition: hash_table.h:140

References CK_TOKENIZER_HT_BUCKETS_SMALL, CKTokenizerHashTable::count, CKTokenizerHashTable::entries, CKTokenizerHashTable::load_factor, and CKTokenizerHashTable::size.

Referenced by ck_tokenizer_create(), and ck_true_bpe_create().

◆ ck_tokenizer_hash_table_delete()

int ck_tokenizer_hash_table_delete ( CKTokenizerHashTable table,
const char *  key,
bool  free_value 
)

Delete a key.

Parameters
tableHash table
keyKey to delete
free_valueIf true, free the value pointer
Returns
0 if found and deleted, -1 if not found

Definition at line 235 of file hash_table.c.

237  {
238  if (!table || !key) {
239  return -1;
240  }
241 
242  uint32_t bucket = ck_tokenizer_hash_str(key) % table->size;
243  CKTokenizerHashEntry *entry = table->entries[bucket];
244  CKTokenizerHashEntry *prev = NULL;
245 
246  while (entry) {
247  if (strcmp(entry->key, key) == 0) {
248  if (prev) {
249  prev->next = entry->next;
250  } else {
251  table->entries[bucket] = entry->next;
252  }
253  free_entry(entry, free_value);
254  table->count--;
255  return 0;
256  }
257  prev = entry;
258  entry = entry->next;
259  }
260 
261  return -1;
262 }
uint32_t ck_tokenizer_hash_str(const char *key)
Definition: hash_table.c:22

References ck_tokenizer_hash_str(), CKTokenizerHashTable::count, CKTokenizerHashTable::entries, free_entry(), CKTokenizerHashEntry::key, CKTokenizerHashEntry::next, and CKTokenizerHashTable::size.

◆ ck_tokenizer_hash_table_free()

void ck_tokenizer_hash_table_free ( CKTokenizerHashTable table,
bool  free_values 
)

Free a hash table.

Parameters
tableHash table to free
free_valuesIf true, also free all value pointers

Definition at line 140 of file hash_table.c.

140  {
141  if (!table) {
142  return;
143  }
144 
145  for (size_t i = 0; i < table->size; i++) {
146  CKTokenizerHashEntry *entry = table->entries[i];
147  while (entry) {
148  CKTokenizerHashEntry *next = entry->next;
149  free_entry(entry, free_values);
150  entry = next;
151  }
152  }
153 
154  free(table->entries);
155  free(table);
156 }

References CKTokenizerHashTable::entries, free_entry(), CKTokenizerHashEntry::next, and CKTokenizerHashTable::size.

Referenced by ck_tokenizer_create(), ck_tokenizer_free(), ck_true_bpe_create(), and ck_true_bpe_free().

◆ ck_tokenizer_hash_table_insert()

int ck_tokenizer_hash_table_insert ( CKTokenizerHashTable table,
const char *  key,
void *  value 
)

Insert a key-value pair.

Parameters
tableHash table
keyKey string
valueValue pointer
Returns
0 on success, -1 on error

Definition at line 158 of file hash_table.c.

160  {
161  if (!table || !key) {
162  return -1;
163  }
164 
165  uint32_t bucket = ck_tokenizer_hash_str(key) % table->size;
166  CKTokenizerHashEntry *entry = table->entries[bucket];
167 
168  /* Check if key already exists */
169  while (entry) {
170  if (strcmp(entry->key, key) == 0) {
171  /* Update existing entry - just replace value pointer */
172  entry->value = value;
173  return 0;
174  }
175  entry = entry->next;
176  }
177 
178  /* Create new entry with NULL value (caller manages memory) */
179  CKTokenizerHashEntry *new_entry = (CKTokenizerHashEntry *)malloc(sizeof(CKTokenizerHashEntry));
180  if (!new_entry) {
181  return -1;
182  }
183 
184  new_entry->key = strdup(key);
185  if (!new_entry->key) {
186  free(new_entry);
187  return -1;
188  }
189 
190  new_entry->value = value;
191  new_entry->next = table->entries[bucket];
192  table->entries[bucket] = new_entry;
193  table->count++;
194 
195  return 0;
196 }

References ck_tokenizer_hash_str(), CKTokenizerHashTable::count, CKTokenizerHashTable::entries, CKTokenizerHashEntry::key, CKTokenizerHashEntry::next, CKTokenizerHashTable::size, and CKTokenizerHashEntry::value.

Referenced by ck_tokenizer_add_token(), and ck_true_bpe_add_token().

◆ ck_tokenizer_hash_table_iterate()

int ck_tokenizer_hash_table_iterate ( CKTokenizerHashTable table,
CKTokenizerHashCallback  callback,
void *  user_data 
)

Definition at line 272 of file hash_table.c.

274  {
275  if (!table || !callback) {
276  return -1;
277  }
278 
279  for (size_t i = 0; i < table->size; i++) {
280  CKTokenizerHashEntry *entry = table->entries[i];
281  while (entry) {
282  int ret = callback(entry->key, entry->value, user_data);
283  if (ret != 0) {
284  return ret;
285  }
286  entry = entry->next;
287  }
288  }
289 
290  return 0;
291 }

References CKTokenizerHashTable::entries, CKTokenizerHashEntry::key, CKTokenizerHashEntry::next, CKTokenizerHashTable::size, and CKTokenizerHashEntry::value.

◆ ck_tokenizer_hash_table_keys()

size_t ck_tokenizer_hash_table_keys ( CKTokenizerHashTable table,
const char **  out_keys,
size_t  max_keys 
)

Get all keys as an array.

Parameters
tableHash table
out_keysOutput array for keys (must be pre-allocated)
max_keysMaximum keys to write
Returns
Number of keys written

Definition at line 293 of file hash_table.c.

295  {
296  if (!table || !out_keys) {
297  return 0;
298  }
299 
300  size_t written = 0;
301  for (size_t i = 0; i < table->size && written < max_keys; i++) {
302  CKTokenizerHashEntry *entry = table->entries[i];
303  while (entry && written < max_keys) {
304  out_keys[written++] = entry->key;
305  entry = entry->next;
306  }
307  }
308 
309  return written;
310 }

References CKTokenizerHashTable::entries, CKTokenizerHashEntry::key, CKTokenizerHashEntry::next, and CKTokenizerHashTable::size.

◆ ck_tokenizer_hash_table_lookup()

void* ck_tokenizer_hash_table_lookup ( CKTokenizerHashTable table,
const char *  key 
)

Look up a key.

Parameters
tableHash table
keyKey to look up
Returns
Value pointer, or NULL if not found

Definition at line 198 of file hash_table.c.

198  {
199  if (!table || !key) {
200  return NULL;
201  }
202 
203  uint32_t bucket = ck_tokenizer_hash_str(key) % table->size;
204  CKTokenizerHashEntry *entry = table->entries[bucket];
205 
206  while (entry) {
207  if (strcmp(entry->key, key) == 0) {
208  return entry->value;
209  }
210  entry = entry->next;
211  }
212 
213  return NULL;
214 }

References ck_tokenizer_hash_str(), CKTokenizerHashTable::entries, CKTokenizerHashEntry::key, CKTokenizerHashEntry::next, CKTokenizerHashTable::size, and CKTokenizerHashEntry::value.

Referenced by ck_tokenizer_add_special_token(), ck_tokenizer_add_token(), ck_tokenizer_hash_table_contains(), ck_tokenizer_lookup(), ck_tokenizer_lookup_exact(), ck_true_bpe_add_merge_by_tokens(), ck_true_bpe_add_token(), ck_true_bpe_lookup(), find_longest_match_hash(), spm_count_unknown_run(), and spm_find_candidates_at_pos().

◆ ck_tokenizer_hash_table_lookup_avx()

void* ck_tokenizer_hash_table_lookup_avx ( CKTokenizerHashTable table,
const char *  key 
)

Definition at line 217 of file hash_table.c.

217  {
218  if (!table || !key) {
219  return NULL;
220  }
221 
222  uint32_t bucket = ck_tokenizer_hash_str(key) % table->size;
223  CKTokenizerHashEntry *entry = table->entries[bucket];
224 
225  while (entry) {
226  if (simd_strcmp(entry->key, key) == 0) {
227  return entry->value;
228  }
229  entry = entry->next;
230  }
231 
232  return NULL;
233 }
static int simd_strcmp(const char *s1, const char *s2)
Definition: hash_table.c:28

References ck_tokenizer_hash_str(), CKTokenizerHashTable::entries, CKTokenizerHashEntry::key, CKTokenizerHashEntry::next, simd_strcmp(), CKTokenizerHashTable::size, and CKTokenizerHashEntry::value.

◆ create_entry()

static CKTokenizerHashEntry* create_entry ( const char *  key,
const void *  value,
size_t  value_size 
)
static

Definition at line 103 of file hash_table.c.

103  {
105  if (!entry) {
106  return NULL;
107  }
108 
109  entry->key = strdup(key);
110  if (!entry->key) {
111  free(entry);
112  return NULL;
113  }
114 
115  if (value) {
116  entry->value = malloc(value_size);
117  if (!entry->value) {
118  free(entry->key);
119  free(entry);
120  return NULL;
121  }
122  memcpy(entry->value, value, value_size);
123  } else {
124  entry->value = NULL;
125  }
126 
127  entry->next = NULL;
128  return entry;
129 }

References CKTokenizerHashEntry::key, CKTokenizerHashEntry::next, and CKTokenizerHashEntry::value.

◆ free_entry()

static void free_entry ( CKTokenizerHashEntry entry,
bool  free_value 
)
static

Definition at line 131 of file hash_table.c.

131  {
132  if (!entry) return;
133  free(entry->key);
134  if (free_value && entry->value) {
135  free(entry->value);
136  }
137  free(entry);
138 }

References CKTokenizerHashEntry::key, and CKTokenizerHashEntry::value.

Referenced by ck_tokenizer_hash_table_clear(), ck_tokenizer_hash_table_delete(), and ck_tokenizer_hash_table_free().

◆ simd_strcmp()

static int simd_strcmp ( const char *  s1,
const char *  s2 
)
inlinestatic

Definition at line 28 of file hash_table.c.

28  {
29  size_t len1 = strlen(s1);
30  size_t len2 = strlen(s2);
31 
32  /* Fallback to regular strcmp for short strings */
33  if (len1 < 64 || len2 < 64) {
34  return strcmp(s1, s2);
35  }
36 
37 #if defined(__AVX512F__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
38  /* AVX-512 path */
39  while (1) {
40  /* Load 64 bytes from each string into AVX-512 registers */
41  __m512i chunk1 = _mm512_loadu_si512((const __m512i *)s1);
42  __m512i chunk2 = _mm512_loadu_si512((const __m512i *)s2);
43 
44  /* Compare the chunks byte by byte */
45  __mmask64 cmp_mask = _mm512_cmpeq_epu8_mask(chunk1, chunk2);
46 
47  /* Check if all bytes are equal */
48  if (cmp_mask != 0xFFFFFFFFFFFFFFFF) {
49  /* Find the first differing byte */
50  int first_diff = __builtin_ctzll(~cmp_mask);
51  return (unsigned char)s1[first_diff] - (unsigned char)s2[first_diff];
52  }
53 
54  /* Check if we hit a null character in s1 or s2 */
55  __mmask64 null_mask1 = _mm512_test_epi8_mask(chunk1, _mm512_set1_epi8('\0'));
56  __mmask64 null_mask2 = _mm512_test_epi8_mask(chunk2, _mm512_set1_epi8('\0'));
57 
58  if (null_mask1 || null_mask2) {
59  if (len1 == len2) {
60  return 0;
61  } else {
62  return (len1 < len2) ? -1 : 1;
63  }
64  }
65 
66  /* Advance the pointers by 64 bytes for the next iteration */
67  s1 += 64;
68  s2 += 64;
69  }
70 #else
71  /* SSE/AVX fallback - just use regular strcmp for simplicity */
72  (void)s1;
73  (void)s2;
74  (void)len1;
75  (void)len2;
76  return strcmp(s1, s2);
77 #endif
78 }

Referenced by ck_tokenizer_hash_table_lookup_avx().