10 #ifndef CK_TOKENIZER_UTF8_H
11 #define CK_TOKENIZER_UTF8_H
112 char *dst,
size_t dst_size);
118 return (
unsigned char)s[0];
125 return (c & 0xC0) == 0x80;
132 return ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
139 return ((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
146 return ((s[0] & 0x07) << 18) | ((s[1] & 0x3F) << 12) |
147 ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
/* Unicode code points (not encoded bytes) for whitespace-like characters. */
151 #define CK_UTF8_SPACE 0x0020 /* U+0020 SPACE */
152 #define CK_UTF8_TAB 0x0009 /* U+0009 CHARACTER TABULATION */
153 #define CK_UTF8_NEWLINE 0x000A /* U+000A LINE FEED */
154 #define CK_UTF8_CARRIAGE_RETURN 0x000D /* U+000D CARRIAGE RETURN */
155 #define CK_UTF8_NBSP 0x00A0 /* U+00A0 NO-BREAK SPACE */
156 #define CK_UTF8_WORD_JOINER 0x2060 /* U+2060 WORD JOINER */
/*
 * Two raw bytes; taken together, 0xC4 0xA0 is the UTF-8 encoding of U+0120.
 * NOTE(review): presumably the marker GPT-2 style byte-level vocabularies use
 * in place of a leading space -- confirm against the tokenizer code.
 */
159 #define CK_UTF8_GPT2_SPACE_HIGH 0xC4
160 #define CK_UTF8_GPT2_SPACE_LOW 0xA0
/*
 * Three raw bytes; 0xE2 0x96 0x81 is the UTF-8 encoding of U+2581.
 * NOTE(review): presumably the SentencePiece space marker -- confirm.
 * NOTE(review): this expands to three separate integer tokens with no commas
 * or parentheses, so it is not a single value and cannot be used directly in
 * an expression or an initializer list -- verify how callers consume it.
 */
163 #define CK_UTF8_SPM_SPACE 0xE2 0x96 0x81
const int32_t int int * out_len
bool ck_utf8_is_whitespace(uint32_t cp)
int32_t ck_utf8_next_char(const char **str, int *out_len)
size_t ck_utf8_byte_to_offset(const char *str, size_t len, size_t byte_offset)
size_t ck_utf8_count_chars(const char *str, size_t len)
static uint32_t ck_utf8_decode_3(const char *s)
int ck_utf8_from_cp(uint32_t cp, char *out)
static int ck_utf8_is_continuation(unsigned char c)
size_t ck_utf8_offset_to_byte(const char *str, size_t len, size_t n)
bool ck_utf8_is_valid(const char *str, size_t len)
static uint32_t ck_utf8_decode_4(const char *s)
size_t ck_utf8_validate(const char *str, size_t len)
static unsigned char ck_utf8_first_byte(const char *s)
size_t ck_utf8_normalize_nfc(const char *src, size_t src_len, char *dst, size_t dst_size)
static uint32_t ck_utf8_decode_2(const char *s)
int ck_utf8_char_length(unsigned char c)