13 if ((c & 0x80) == 0) {
15 }
else if ((c & 0xE0) == 0xC0) {
17 }
else if ((c & 0xF0) == 0xE0) {
19 }
else if ((c & 0xF8) == 0xF0) {
26 if (len == (
size_t)-1) {
32 unsigned char c = (
unsigned char)str[i];
40 for (
int j = 1; j < char_len; j++) {
44 unsigned char cont = (
unsigned char)str[i + j];
45 if ((cont & 0xC0) != 0x80) {
51 if (char_len == 2 && c < 0xC2) {
53 }
else if (char_len == 3) {
54 if (c == 0xE0 && (str[i + 1] & 0xE0) == 0x80) {
57 if (c == 0xED && (str[i + 1] & 0xE0) == 0xA0) {
60 }
else if (char_len == 4) {
61 if (c == 0xF0 && (str[i + 1] & 0xF0) == 0x80) {
67 if (c == 0xF4 && (str[i + 1] & 0xF0) == 0x80) {
83 if (len == (
size_t)-1) {
105 if (!str || !*str || !
out_len) {
109 unsigned char c = (
unsigned char)(**str);
148 }
else if (cp < 0x800) {
149 out[0] = (char)(0xC0 | (cp >> 6));
150 out[1] = (char)(0x80 | (cp & 0x3F));
152 }
else if (cp < 0x10000) {
153 out[0] = (char)(0xE0 | (cp >> 12));
154 out[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
155 out[2] = (char)(0x80 | (cp & 0x3F));
158 out[0] = (char)(0xF0 | (cp >> 18));
159 out[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
160 out[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
161 out[3] = (char)(0x80 | (cp & 0x3F));
167 if (len == (
size_t)-1) {
171 size_t byte_offset = 0;
174 while (byte_offset < len && char_idx < n) {
179 byte_offset += char_len;
188 if (len == (
size_t)-1) {
192 if (byte_offset >= len) {
199 while (i < byte_offset) {
213 static const struct {
241 char *dst,
size_t dst_size) {
242 if (src_len == (
size_t)-1) {
243 src_len = strlen(src);
251 size_t to_copy = src_len < dst_size - 1 ? src_len : dst_size - 1;
252 memcpy(dst, src, to_copy);
const int32_t int int * out_len
bool ck_utf8_is_whitespace(uint32_t cp)
int32_t ck_utf8_next_char(const char **str, int *out_len)
size_t ck_utf8_byte_to_offset(const char *str, size_t len, size_t byte_offset)
size_t ck_utf8_count_chars(const char *str, size_t len)
int ck_utf8_from_cp(uint32_t cp, char *out)
size_t ck_utf8_offset_to_byte(const char *str, size_t len, size_t n)
bool ck_utf8_is_valid(const char *str, size_t len)
size_t ck_tokenizer_utf8_normalize_nfc(const char *src, size_t src_len, char *dst, size_t dst_size)
static const struct @1 CK_UTF8_WHITESPACE_RANGES[]
size_t ck_utf8_validate(const char *str, size_t len)
int ck_utf8_char_length(unsigned char c)
static uint32_t ck_utf8_decode_3(const char *s)
static uint32_t ck_utf8_decode_4(const char *s)
static uint32_t ck_utf8_decode_2(const char *s)