10 #include <immintrin.h>
16 #define CK_TOKENIZER_HASH_SEED 0x9747b28c
/**
 * Compare two NUL-terminated strings with strcmp() ordering.
 *
 * When both strings are at least 64 bytes long and AVX-512 (F/BW/DQ) is
 * enabled at compile time, the first 64 bytes are compared with a single
 * vector instruction; everything else is delegated to strcmp().
 *
 * @param s1 First string (must not be NULL).
 * @param s2 Second string (must not be NULL).
 * @return Negative, zero, or positive if s1 sorts before, equal to, or
 *         after s2, comparing bytes as unsigned char.
 */
static inline int simd_strcmp(const char *s1, const char *s2) {
    size_t len1 = strlen(s1);
    size_t len2 = strlen(s2);

    /* A 64-byte vector load is only in bounds when both strings hold at
     * least 64 characters; shorter inputs take the scalar path. */
    if (len1 < 64 || len2 < 64) {
        return strcmp(s1, s2);
    }

#if defined(__AVX512F__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
    __m512i chunk1 = _mm512_loadu_si512((const void *)s1);
    __m512i chunk2 = _mm512_loadu_si512((const void *)s2);

    /* Bit i of the mask is set when byte i is equal in both chunks. */
    __mmask64 cmp_mask = _mm512_cmpeq_epu8_mask(chunk1, chunk2);

    if (cmp_mask != ~(__mmask64)0) {
        /* Lowest clear bit is the first differing byte. */
        int first_diff = __builtin_ctzll(~cmp_mask);
        return (unsigned char)s1[first_diff] - (unsigned char)s2[first_diff];
    }

    /* No NUL probe is needed here: both lengths are >= 64, so bytes 0..63
     * of each string are non-NUL.  (Testing a chunk against an all-zero
     * vector with _mm512_test_epi8_mask can never report a hit anyway,
     * because it ANDs the operands.)  The prefixes are identical, so the
     * result is decided entirely by the tails; s1 + 64 and s2 + 64 are
     * still inside the strings (at worst they point at the terminator). */
    return strcmp(s1 + 64, s2 + 64);
#else
    return strcmp(s1, s2);
#endif
}
81 if (bucket_count == 0) {
96 table->
size = bucket_count;
109 entry->
key = strdup(key);
116 entry->
value = malloc(value_size);
122 memcpy(entry->
value, value, value_size);
134 if (free_value && entry->
value) {
145 for (
size_t i = 0; i < table->
size; i++) {
161 if (!table || !key) {
170 if (strcmp(entry->
key, key) == 0) {
172 entry->
value = value;
184 new_entry->
key = strdup(key);
185 if (!new_entry->
key) {
190 new_entry->
value = value;
192 table->
entries[bucket] = new_entry;
199 if (!table || !key) {
207 if (strcmp(entry->
key, key) == 0) {
218 if (!table || !key) {
238 if (!table || !key) {
247 if (strcmp(entry->
key, key) == 0) {
265 return table ? table->
count : 0;
275 if (!table || !callback) {
279 for (
size_t i = 0; i < table->
size; i++) {
282 int ret = callback(entry->
key, entry->
value, user_data);
294 const char **out_keys,
296 if (!table || !out_keys) {
301 for (
size_t i = 0; i < table->
size && written < max_keys; i++) {
303 while (entry && written < max_keys) {
304 out_keys[written++] = entry->
key;
317 for (
size_t i = 0; i < table->
size; i++) {
bool ck_tokenizer_hash_table_contains(CKTokenizerHashTable *table, const char *key)
#define CK_TOKENIZER_HASH_SEED
size_t ck_tokenizer_hash_table_count(CKTokenizerHashTable *table)
size_t ck_tokenizer_hash_table_keys(CKTokenizerHashTable *table, const char **out_keys, size_t max_keys)
void ck_tokenizer_hash_table_free(CKTokenizerHashTable *table, bool free_values)
CKTokenizerHashTable * ck_tokenizer_hash_table_create(size_t bucket_count)
static void free_entry(CKTokenizerHashEntry *entry, bool free_value)
int ck_tokenizer_hash_table_iterate(CKTokenizerHashTable *table, CKTokenizerHashCallback callback, void *user_data)
int ck_tokenizer_hash_table_insert(CKTokenizerHashTable *table, const char *key, void *value)
uint32_t ck_tokenizer_hash(const char *key, size_t len)
uint32_t ck_tokenizer_hash_str(const char *key)
void * ck_tokenizer_hash_table_lookup_avx(CKTokenizerHashTable *table, const char *key)
void * ck_tokenizer_hash_table_lookup(CKTokenizerHashTable *table, const char *key)
static CKTokenizerHashEntry * create_entry(const char *key, const void *value, size_t value_size)
static int simd_strcmp(const char *s1, const char *s2)
int ck_tokenizer_hash_table_delete(CKTokenizerHashTable *table, const char *key, bool free_value)
void ck_tokenizer_hash_table_clear(CKTokenizerHashTable *table, bool free_values)
int(* CKTokenizerHashCallback)(const char *key, void *value, void *user_data)
#define CK_TOKENIZER_HT_BUCKETS_SMALL
uint32_t ck_murmurhash3(const char *key, uint32_t len, uint32_t seed)
static uint32_t ck_murmurhash3_str(const char *str, uint32_t seed)
struct CKTokenizerHashEntry * next
CKTokenizerHashEntry ** entries