#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
Go to the source code of this file.
Macros | |
| #define | CK_UTF8_CARRIAGE_RETURN 0x000D |
| #define | CK_UTF8_GPT2_SPACE_HIGH 0xC4 |
| #define | CK_UTF8_GPT2_SPACE_LOW 0xA0 |
| #define | CK_UTF8_NBSP 0x00A0 |
| #define | CK_UTF8_NEWLINE 0x000A |
| #define | CK_UTF8_SPACE 0x0020 |
| #define | CK_UTF8_SPM_SPACE 0xE2 0x96 0x81 |
| #define | CK_UTF8_TAB 0x0009 |
| #define | CK_UTF8_WORD_JOINER 0x2060 |
Functions | |
| size_t | ck_utf8_byte_to_offset (const char *str, size_t len, size_t byte_offset) |
| int | ck_utf8_char_length (unsigned char c) |
| size_t | ck_utf8_count_chars (const char *str, size_t len) |
| static uint32_t | ck_utf8_decode_2 (const char *s) |
| static uint32_t | ck_utf8_decode_3 (const char *s) |
| static uint32_t | ck_utf8_decode_4 (const char *s) |
| static unsigned char | ck_utf8_first_byte (const char *s) |
| int | ck_utf8_from_cp (uint32_t cp, char *out) |
| static int | ck_utf8_is_continuation (unsigned char c) |
| bool | ck_utf8_is_valid (const char *str, size_t len) |
| bool | ck_utf8_is_whitespace (uint32_t cp) |
| int32_t | ck_utf8_next_char (const char **str, int *out_len) |
| size_t | ck_utf8_normalize_nfc (const char *src, size_t src_len, char *dst, size_t dst_size) |
| size_t | ck_utf8_offset_to_byte (const char *str, size_t len, size_t n) |
| size_t | ck_utf8_validate (const char *str, size_t len) |
| size_t ck_utf8_byte_to_offset | ( | const char * | str, |
| size_t | len, | ||
| size_t | byte_offset | ||
| ) |
Get the character index from byte offset.
| str | String |
| len | String length in bytes |
| byte_offset | Byte offset |
Definition at line 187 of file utf8.c.
References ck_utf8_char_length(), and ck_utf8_count_chars().
| int ck_utf8_char_length | ( | unsigned char | c | ) |
Get the length of a UTF-8 character from its first byte.
| c | First byte of UTF-8 character |
Definition at line 12 of file utf8.c.
Referenced by ck_utf8_byte_to_offset(), ck_utf8_count_chars(), ck_utf8_next_char(), ck_utf8_offset_to_byte(), and ck_utf8_validate().
| size_t ck_utf8_count_chars | ( | const char * | str, |
| size_t | len | ||
| ) |
Count UTF-8 characters in a string.
| str | String to count |
| len | Length in bytes, or -1 (i.e. (size_t)-1, since len is a size_t) for a null-terminated string |
Definition at line 82 of file utf8.c.
References ck_utf8_char_length().
Referenced by ck_utf8_byte_to_offset().
| static uint32_t ck_utf8_decode_2 | ( | const char * | s | ) | inline static |
Get 2-byte UTF-8 sequence value.
Definition at line 131 of file utf8.h.
Referenced by ck_utf8_next_char().
| static uint32_t ck_utf8_decode_3 | ( | const char * | s | ) | inline static |
Get 3-byte UTF-8 sequence value.
Definition at line 138 of file utf8.h.
Referenced by ck_utf8_next_char().
| static uint32_t ck_utf8_decode_4 | ( | const char * | s | ) | inline static |
Get 4-byte UTF-8 sequence value.
Definition at line 145 of file utf8.h.
Referenced by ck_utf8_next_char().
| static unsigned char ck_utf8_first_byte | ( | const char * | s | ) | inline static |
| int ck_utf8_from_cp | ( | uint32_t | cp, |
| char * | out | ||
| ) |
| static int ck_utf8_is_continuation | ( | unsigned char | c | ) | inline static |
| bool ck_utf8_is_valid | ( | const char * | str, |
| size_t | len | ||
| ) |
Check if a byte sequence is valid UTF-8.
| str | String to check |
| len | Length in bytes |
Definition at line 78 of file utf8.c.
References ck_utf8_validate().
| bool ck_utf8_is_whitespace | ( | uint32_t | cp | ) |
Check if character is whitespace (Unicode White_Space property).
| cp | Unicode code point |
Definition at line 229 of file utf8.c.
References CK_UTF8_WHITESPACE_RANGES, end, and start.
| int32_t ck_utf8_next_char | ( | const char ** | str, |
| int * | out_len | ||
| ) |
Get next UTF-8 character, return its code point.
| str | String pointer (will be updated) |
| out_len | Output: length of character in bytes |
Definition at line 104 of file utf8.c.
References ck_utf8_char_length(), ck_utf8_decode_2(), ck_utf8_decode_3(), ck_utf8_decode_4(), and out_len.
| size_t ck_utf8_normalize_nfc | ( | const char * | src, |
| size_t | src_len, | ||
| char * | dst, | ||
| size_t | dst_size | ||
| ) |
Normalize UTF-8 string (Unicode normalization form NFC).
| src | Source string |
| src_len | Source length |
| dst | Destination buffer |
| dst_size | Destination size |
| size_t ck_utf8_offset_to_byte | ( | const char * | str, |
| size_t | len, | ||
| size_t | n | ||
| ) |
Get the byte offset of the N-th character.
| str | String |
| len | String length in bytes |
| n | Character index (0-based) |
Definition at line 166 of file utf8.c.
References ck_utf8_char_length().
| size_t ck_utf8_validate | ( | const char * | str, |
| size_t | len | ||
| ) |
Validate a UTF-8 string.
| str | String to validate |
| len | Length in bytes, or -1 (i.e. (size_t)-1, since len is a size_t) for a null-terminated string |
Definition at line 25 of file utf8.c.
References ck_utf8_char_length().
Referenced by ck_utf8_is_valid().