← Back to C-Kernel-Engine Docs Doxygen Source Documentation
hash_table.c
Go to the documentation of this file.
1 /*
2  * Hash Table with AVX-512 Optimized String Comparison
3  *
4  * Direct copy from HPC_Embeddings with SIMD optimizations.
5  */
6 
7 #include <stdint.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <immintrin.h>
11 #include <stdio.h>
12 #include "tokenizer/hash_table.h"
13 #include "tokenizer/murmurhash3.h"
14 
15 /* Hash seed constant from HPC_Embeddings */
16 #define CK_TOKENIZER_HASH_SEED 0x9747b28c
17 
18 uint32_t ck_tokenizer_hash(const char *key, size_t len) {
19  return ck_murmurhash3(key, (uint32_t)len, CK_TOKENIZER_HASH_SEED);
20 }
21 
22 uint32_t ck_tokenizer_hash_str(const char *key) {
24 }
25 
26 /* SIMD-optimized string comparison using AVX-512 */
27 /* Falls back to regular strcmp if AVX-512 is not available at compile time */
28 static inline int simd_strcmp(const char *s1, const char *s2) {
29  size_t len1 = strlen(s1);
30  size_t len2 = strlen(s2);
31 
32  /* Fallback to regular strcmp for short strings */
33  if (len1 < 64 || len2 < 64) {
34  return strcmp(s1, s2);
35  }
36 
37 #if defined(__AVX512F__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
38  /* AVX-512 path */
39  while (1) {
40  /* Load 64 bytes from each string into AVX-512 registers */
41  __m512i chunk1 = _mm512_loadu_si512((const __m512i *)s1);
42  __m512i chunk2 = _mm512_loadu_si512((const __m512i *)s2);
43 
44  /* Compare the chunks byte by byte */
45  __mmask64 cmp_mask = _mm512_cmpeq_epu8_mask(chunk1, chunk2);
46 
47  /* Check if all bytes are equal */
48  if (cmp_mask != 0xFFFFFFFFFFFFFFFF) {
49  /* Find the first differing byte */
50  int first_diff = __builtin_ctzll(~cmp_mask);
51  return (unsigned char)s1[first_diff] - (unsigned char)s2[first_diff];
52  }
53 
54  /* Check if we hit a null character in s1 or s2 */
55  __mmask64 null_mask1 = _mm512_test_epi8_mask(chunk1, _mm512_set1_epi8('\0'));
56  __mmask64 null_mask2 = _mm512_test_epi8_mask(chunk2, _mm512_set1_epi8('\0'));
57 
58  if (null_mask1 || null_mask2) {
59  if (len1 == len2) {
60  return 0;
61  } else {
62  return (len1 < len2) ? -1 : 1;
63  }
64  }
65 
66  /* Advance the pointers by 64 bytes for the next iteration */
67  s1 += 64;
68  s2 += 64;
69  }
70 #else
71  /* SSE/AVX fallback - just use regular strcmp for simplicity */
72  (void)s1;
73  (void)s2;
74  (void)len1;
75  (void)len2;
76  return strcmp(s1, s2);
77 #endif
78 }
79 
81  if (bucket_count == 0) {
82  bucket_count = CK_TOKENIZER_HT_BUCKETS_SMALL;
83  }
84 
86  if (!table) {
87  return NULL;
88  }
89 
90  table->entries = (CKTokenizerHashEntry **)calloc(bucket_count, sizeof(CKTokenizerHashEntry *));
91  if (!table->entries) {
92  free(table);
93  return NULL;
94  }
95 
96  table->size = bucket_count;
97  table->count = 0;
98  table->load_factor = 0.75f;
99 
100  return table;
101 }
102 
103 static CKTokenizerHashEntry *create_entry(const char *key, const void *value, size_t value_size) {
105  if (!entry) {
106  return NULL;
107  }
108 
109  entry->key = strdup(key);
110  if (!entry->key) {
111  free(entry);
112  return NULL;
113  }
114 
115  if (value) {
116  entry->value = malloc(value_size);
117  if (!entry->value) {
118  free(entry->key);
119  free(entry);
120  return NULL;
121  }
122  memcpy(entry->value, value, value_size);
123  } else {
124  entry->value = NULL;
125  }
126 
127  entry->next = NULL;
128  return entry;
129 }
130 
131 static void free_entry(CKTokenizerHashEntry *entry, bool free_value) {
132  if (!entry) return;
133  free(entry->key);
134  if (free_value && entry->value) {
135  free(entry->value);
136  }
137  free(entry);
138 }
139 
140 void ck_tokenizer_hash_table_free(CKTokenizerHashTable *table, bool free_values) {
141  if (!table) {
142  return;
143  }
144 
145  for (size_t i = 0; i < table->size; i++) {
146  CKTokenizerHashEntry *entry = table->entries[i];
147  while (entry) {
148  CKTokenizerHashEntry *next = entry->next;
149  free_entry(entry, free_values);
150  entry = next;
151  }
152  }
153 
154  free(table->entries);
155  free(table);
156 }
157 
159  const char *key,
160  void *value) {
161  if (!table || !key) {
162  return -1;
163  }
164 
165  uint32_t bucket = ck_tokenizer_hash_str(key) % table->size;
166  CKTokenizerHashEntry *entry = table->entries[bucket];
167 
168  /* Check if key already exists */
169  while (entry) {
170  if (strcmp(entry->key, key) == 0) {
171  /* Update existing entry - just replace value pointer */
172  entry->value = value;
173  return 0;
174  }
175  entry = entry->next;
176  }
177 
178  /* Create new entry with NULL value (caller manages memory) */
179  CKTokenizerHashEntry *new_entry = (CKTokenizerHashEntry *)malloc(sizeof(CKTokenizerHashEntry));
180  if (!new_entry) {
181  return -1;
182  }
183 
184  new_entry->key = strdup(key);
185  if (!new_entry->key) {
186  free(new_entry);
187  return -1;
188  }
189 
190  new_entry->value = value;
191  new_entry->next = table->entries[bucket];
192  table->entries[bucket] = new_entry;
193  table->count++;
194 
195  return 0;
196 }
197 
198 void *ck_tokenizer_hash_table_lookup(CKTokenizerHashTable *table, const char *key) {
199  if (!table || !key) {
200  return NULL;
201  }
202 
203  uint32_t bucket = ck_tokenizer_hash_str(key) % table->size;
204  CKTokenizerHashEntry *entry = table->entries[bucket];
205 
206  while (entry) {
207  if (strcmp(entry->key, key) == 0) {
208  return entry->value;
209  }
210  entry = entry->next;
211  }
212 
213  return NULL;
214 }
215 
216 /* AVX-512 optimized lookup */
218  if (!table || !key) {
219  return NULL;
220  }
221 
222  uint32_t bucket = ck_tokenizer_hash_str(key) % table->size;
223  CKTokenizerHashEntry *entry = table->entries[bucket];
224 
225  while (entry) {
226  if (simd_strcmp(entry->key, key) == 0) {
227  return entry->value;
228  }
229  entry = entry->next;
230  }
231 
232  return NULL;
233 }
234 
236  const char *key,
237  bool free_value) {
238  if (!table || !key) {
239  return -1;
240  }
241 
242  uint32_t bucket = ck_tokenizer_hash_str(key) % table->size;
243  CKTokenizerHashEntry *entry = table->entries[bucket];
244  CKTokenizerHashEntry *prev = NULL;
245 
246  while (entry) {
247  if (strcmp(entry->key, key) == 0) {
248  if (prev) {
249  prev->next = entry->next;
250  } else {
251  table->entries[bucket] = entry->next;
252  }
253  free_entry(entry, free_value);
254  table->count--;
255  return 0;
256  }
257  prev = entry;
258  entry = entry->next;
259  }
260 
261  return -1;
262 }
263 
265  return table ? table->count : 0;
266 }
267 
269  return ck_tokenizer_hash_table_lookup(table, key) != NULL;
270 }
271 
273  CKTokenizerHashCallback callback,
274  void *user_data) {
275  if (!table || !callback) {
276  return -1;
277  }
278 
279  for (size_t i = 0; i < table->size; i++) {
280  CKTokenizerHashEntry *entry = table->entries[i];
281  while (entry) {
282  int ret = callback(entry->key, entry->value, user_data);
283  if (ret != 0) {
284  return ret;
285  }
286  entry = entry->next;
287  }
288  }
289 
290  return 0;
291 }
292 
294  const char **out_keys,
295  size_t max_keys) {
296  if (!table || !out_keys) {
297  return 0;
298  }
299 
300  size_t written = 0;
301  for (size_t i = 0; i < table->size && written < max_keys; i++) {
302  CKTokenizerHashEntry *entry = table->entries[i];
303  while (entry && written < max_keys) {
304  out_keys[written++] = entry->key;
305  entry = entry->next;
306  }
307  }
308 
309  return written;
310 }
311 
312 void ck_tokenizer_hash_table_clear(CKTokenizerHashTable *table, bool free_values) {
313  if (!table) {
314  return;
315  }
316 
317  for (size_t i = 0; i < table->size; i++) {
318  CKTokenizerHashEntry *entry = table->entries[i];
319  while (entry) {
320  CKTokenizerHashEntry *next = entry->next;
321  free_entry(entry, free_values);
322  entry = next;
323  }
324  table->entries[i] = NULL;
325  }
326 
327  table->count = 0;
328 }
bool ck_tokenizer_hash_table_contains(CKTokenizerHashTable *table, const char *key)
Definition: hash_table.c:268
#define CK_TOKENIZER_HASH_SEED
Definition: hash_table.c:16
size_t ck_tokenizer_hash_table_count(CKTokenizerHashTable *table)
Definition: hash_table.c:264
size_t ck_tokenizer_hash_table_keys(CKTokenizerHashTable *table, const char **out_keys, size_t max_keys)
Definition: hash_table.c:293
void ck_tokenizer_hash_table_free(CKTokenizerHashTable *table, bool free_values)
Definition: hash_table.c:140
CKTokenizerHashTable * ck_tokenizer_hash_table_create(size_t bucket_count)
Definition: hash_table.c:80
static void free_entry(CKTokenizerHashEntry *entry, bool free_value)
Definition: hash_table.c:131
int ck_tokenizer_hash_table_iterate(CKTokenizerHashTable *table, CKTokenizerHashCallback callback, void *user_data)
Definition: hash_table.c:272
int ck_tokenizer_hash_table_insert(CKTokenizerHashTable *table, const char *key, void *value)
Definition: hash_table.c:158
uint32_t ck_tokenizer_hash(const char *key, size_t len)
Definition: hash_table.c:18
uint32_t ck_tokenizer_hash_str(const char *key)
Definition: hash_table.c:22
void * ck_tokenizer_hash_table_lookup_avx(CKTokenizerHashTable *table, const char *key)
Definition: hash_table.c:217
void * ck_tokenizer_hash_table_lookup(CKTokenizerHashTable *table, const char *key)
Definition: hash_table.c:198
static CKTokenizerHashEntry * create_entry(const char *key, const void *value, size_t value_size)
Definition: hash_table.c:103
static int simd_strcmp(const char *s1, const char *s2)
Definition: hash_table.c:28
int ck_tokenizer_hash_table_delete(CKTokenizerHashTable *table, const char *key, bool free_value)
Definition: hash_table.c:235
void ck_tokenizer_hash_table_clear(CKTokenizerHashTable *table, bool free_values)
Definition: hash_table.c:312
int(* CKTokenizerHashCallback)(const char *key, void *value, void *user_data)
Definition: hash_table.h:112
#define CK_TOKENIZER_HT_BUCKETS_SMALL
Definition: hash_table.h:140
uint32_t ck_murmurhash3(const char *key, uint32_t len, uint32_t seed)
Definition: murmurhash3.c:11
static uint32_t ck_murmurhash3_str(const char *str, uint32_t seed)
Definition: murmurhash3.h:55
struct CKTokenizerHashEntry * next
Definition: hash_table.h:25
CKTokenizerHashEntry ** entries
Definition: hash_table.h:30