Go to the source code of this file.
Functions | |
| size_t | ck_tokenizer_utf8_normalize_nfc (const char *src, size_t src_len, char *dst, size_t dst_size) |
| size_t | ck_utf8_byte_to_offset (const char *str, size_t len, size_t byte_offset) |
| int | ck_utf8_char_length (unsigned char c) |
| size_t | ck_utf8_count_chars (const char *str, size_t len) |
| int | ck_utf8_from_cp (uint32_t cp, char *out) |
| bool | ck_utf8_is_valid (const char *str, size_t len) |
| bool | ck_utf8_is_whitespace (uint32_t cp) |
| int32_t | ck_utf8_next_char (const char **str, int *out_len) |
| size_t | ck_utf8_offset_to_byte (const char *str, size_t len, size_t n) |
| size_t | ck_utf8_validate (const char *str, size_t len) |
Variables | |
| struct { | |
| uint32_t end | |
| uint32_t start | |
| } | CK_UTF8_WHITESPACE_RANGES [] |
| size_t ck_tokenizer_utf8_normalize_nfc | ( | const char * | src, |
| size_t | src_len, | ||
| char * | dst, | ||
| size_t | dst_size | ||
| ) |
| size_t ck_utf8_byte_to_offset | ( | const char * | str, |
| size_t | len, | ||
| size_t | byte_offset | ||
| ) |
Get the character index that corresponds to a byte offset.
| str | String |
| len | String length in bytes |
| byte_offset | Byte offset |
Definition at line 187 of file utf8.c.
References ck_utf8_char_length() and ck_utf8_count_chars().
| int ck_utf8_char_length | ( | unsigned char | c | ) |
Get the length of a UTF-8 character from its first byte.
| c | First byte of UTF-8 character |
Definition at line 12 of file utf8.c.
Referenced by ck_utf8_byte_to_offset(), ck_utf8_count_chars(), ck_utf8_next_char(), ck_utf8_offset_to_byte(), and ck_utf8_validate().
| size_t ck_utf8_count_chars | ( | const char * | str, |
| size_t | len | ||
| ) |
Count UTF-8 characters in a string.
| str | String to count |
| len | Length in bytes, or (size_t)-1 if the string is null-terminated |
Definition at line 82 of file utf8.c.
References ck_utf8_char_length().
Referenced by ck_utf8_byte_to_offset().
| int ck_utf8_from_cp | ( | uint32_t | cp, |
| char * | out | ||
| ) |
| bool ck_utf8_is_valid | ( | const char * | str, |
| size_t | len | ||
| ) |
Check if a byte sequence is valid UTF-8.
| str | String to check |
| len | Length in bytes |
Definition at line 78 of file utf8.c.
References ck_utf8_validate().
| bool ck_utf8_is_whitespace | ( | uint32_t | cp | ) |
Check if a code point is whitespace (Unicode White_Space property).
| cp | Unicode code point |
Definition at line 229 of file utf8.c.
References CK_UTF8_WHITESPACE_RANGES, end, and start.
| int32_t ck_utf8_next_char | ( | const char ** | str, |
| int * | out_len | ||
| ) |
Get the next UTF-8 character and return its code point.
| str | String pointer (will be updated) |
| out_len | Output: length of character in bytes |
Definition at line 104 of file utf8.c.
References ck_utf8_char_length(), ck_utf8_decode_2(), ck_utf8_decode_3(), ck_utf8_decode_4(), and out_len.
| size_t ck_utf8_offset_to_byte | ( | const char * | str, |
| size_t | len, | ||
| size_t | n | ||
| ) |
Get the byte offset of the N-th character.
| str | String |
| len | String length in bytes |
| n | Character index (0-based) |
Definition at line 166 of file utf8.c.
References ck_utf8_char_length().
| size_t ck_utf8_validate | ( | const char * | str, |
| size_t | len | ||
| ) |
Validate a UTF-8 string.
| str | String to validate |
| len | Length in bytes, or (size_t)-1 if the string is null-terminated |
Definition at line 25 of file utf8.c.
References ck_utf8_char_length().
Referenced by ck_utf8_is_valid().
| const { ... } CK_UTF8_WHITESPACE_RANGES[] |
Referenced by ck_utf8_is_whitespace().
| uint32_t end |
Definition at line 215 of file utf8.c.
Referenced by ck_ir_v2_apply_meta(), ck_ir_v2_apply_weight_dtypes(), ck_ir_v2_find_array_end(), ck_ir_v2_find_key(), ck_ir_v2_next_object(), ck_ir_v2_parse_bool(), ck_ir_v2_parse_buffers(), ck_ir_v2_parse_float(), ck_ir_v2_parse_int(), ck_ir_v2_parse_json(), ck_ir_v2_parse_nodes(), ck_ir_v2_parse_string(), ck_ir_v2_parse_string_field(), ck_ir_v2_skip_string(), ck_ir_v2_skip_ws(), ck_parse_env_int(), ck_utf8_is_whitespace(), load_manifest(), parse_eos_ids(), parse_float_field_in_range(), parse_int_field_in_range(), run_benchmark(), topology_discover_numa(), topology_measure_memory_bandwidth_ex(), and trim_string().
| uint32_t start |
Definition at line 214 of file utf8.c.
Referenced by bump_bytes(), ck_ir_v2_apply_weight_dtypes(), ck_ir_v2_next_object(), ck_ir_v2_parse_bindings(), ck_ir_v2_parse_buffers(), ck_ir_v2_parse_nodes(), ck_ir_v2_parse_shape(), ck_ir_v2_parse_string(), ck_utf8_is_whitespace(), encode_text_segment(), find_object_range(), load_manifest(), run_benchmark(), sample_token(), sample_topk(), topology_discover_numa(), topology_measure_memory_bandwidth_ex(), and trim_string().