← Back to C-Kernel-Engine Docs Doxygen Source Documentation
utf8.h
Go to the documentation of this file.
1 /*
2  * UTF-8 Utilities
3  *
4  * Provides UTF-8 character handling for tokenization.
5  * Handles multi-byte sequences, validation, and normalization.
6  *
7  * By Anthony Shivakumar
8  */
9 
10 #ifndef CK_TOKENIZER_UTF8_H
11 #define CK_TOKENIZER_UTF8_H
12 
13 #include <stddef.h>
14 #include <stdint.h>
15 #include <stdbool.h>
16 
17 #ifdef __cplusplus
18 extern "C" {
19 #endif
20 
21 /**
22  * Get the length of a UTF-8 character from its first byte.
23  *
24  * @param c First byte of UTF-8 character
25  * @return Number of bytes in character (1-4), or 0 for invalid
26  */
27 int ck_utf8_char_length(unsigned char c);
28 
29 /**
30  * Validate a UTF-8 string.
31  *
32  * @param str String to validate
33  * @param len Length in bytes, or (size_t)-1 (i.e. SIZE_MAX) for a null-terminated string
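34  * @return 0 if valid, position of first invalid byte otherwise (NOTE(review): a 0-based position cannot distinguish an invalid first byte from success; confirm the convention against utf8.c)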
35  */
36 size_t ck_utf8_validate(const char *str, size_t len);
37 
38 /**
39  * Check if a byte sequence is valid UTF-8.
40  *
41  * @param str String to check
42  * @param len Length in bytes
43  * @return true if all bytes form valid UTF-8
44  */
45 bool ck_utf8_is_valid(const char *str, size_t len);
46 
47 /**
48  * Count UTF-8 characters in a string.
49  *
50  * @param str String to count
51  * @param len Length in bytes, or (size_t)-1 (i.e. SIZE_MAX) for a null-terminated string
52  * @return Number of Unicode code points
53  */
54 size_t ck_utf8_count_chars(const char *str, size_t len);
55 
56 /**
57  * Get next UTF-8 character, return its code point.
58  *
59  * @param str String pointer (will be updated)
60  * @param out_len Output: length of character in bytes
61  * @return Unicode code point, or -1 on error
62  */
63 int32_t ck_utf8_next_char(const char **str, int *out_len);
64 
65 /**
66  * Write a Unicode code point as UTF-8.
67  *
68  * @param cp Unicode code point
69  * @param out Output buffer (must have 4+ bytes)
70  * @return Number of bytes written
71  */
72 int ck_utf8_from_cp(uint32_t cp, char *out);
73 
74 /**
75  * Get the byte offset of the N-th character.
76  *
77  * @param str String
78  * @param len String length in bytes
79  * @param n Character index (0-based)
80  * @return Byte offset, or len if n >= char count
81  */
82 size_t ck_utf8_offset_to_byte(const char *str, size_t len, size_t n);
83 
84 /**
85  * Get the character index from byte offset.
86  *
87  * @param str String
88  * @param len String length in bytes
89  * @param byte_offset Byte offset
90  * @return Character index, or total chars if offset beyond end
91  */
92 size_t ck_utf8_byte_to_offset(const char *str, size_t len, size_t byte_offset);
93 
94 /**
95  * Check if character is whitespace (Unicode White_Space property).
96  *
97  * @param cp Unicode code point
98  * @return true if whitespace
99  */
100 bool ck_utf8_is_whitespace(uint32_t cp);
101 
102 /**
103  * Normalize UTF-8 string (Unicode normalization form NFC).
104  *
105  * @param src Source string
106  * @param src_len Source length
107  * @param dst Destination buffer
108  * @param dst_size Destination size
109  * @return Length written, or required size if dst=NULL
110  */
111 size_t ck_utf8_normalize_nfc(const char *src, size_t src_len,
112  char *dst, size_t dst_size);
113 
/**
 * Read the leading byte of the byte sequence starting at s.
 *
 * @param s Pointer to at least one readable byte
 * @return The byte stored at s[0], converted to unsigned char
 */
static inline unsigned char ck_utf8_first_byte(const char *s) {
    const unsigned char lead = (unsigned char)*s;
    return lead;
}
120 
/**
 * Test whether a byte has the UTF-8 continuation pattern 10xxxxxx.
 *
 * @param c Byte to test
 * @return 1 if the top two bits of c are 1 and 0, otherwise 0
 */
static inline int ck_utf8_is_continuation(unsigned char c) {
    /* Dropping the low six bits leaves only the two marker bits (0..3). */
    const int marker_bits = c >> 6;
    if (marker_bits == 0x2) {
        return 1;
    }
    return 0;
}
127 
/**
 * Assemble the value carried by a 2-byte UTF-8 sequence.
 *
 * Takes the low 5 bits of s[0] and the low 6 bits of s[1]. No validation
 * is performed on either byte.
 *
 * @param s Pointer to at least two readable bytes
 * @return The combined 11-bit value
 */
static inline uint32_t ck_utf8_decode_2(const char *s) {
    const int lead_payload = s[0] & 0x1F; /* 110xxxxx -> xxxxx  */
    const int tail_payload = s[1] & 0x3F; /* 10yyyyyy -> yyyyyy */
    const int combined = (lead_payload << 6) | tail_payload;
    return combined;
}
134 
/**
 * Assemble the value carried by a 3-byte UTF-8 sequence.
 *
 * Takes the low 4 bits of s[0] and the low 6 bits of s[1] and s[2].
 * No validation is performed on any byte.
 *
 * @param s Pointer to at least three readable bytes
 * @return The combined 16-bit value
 */
static inline uint32_t ck_utf8_decode_3(const char *s) {
    /*
     * Widen each byte to uint32_t before shifting: shifting a promoted
     * (signed) int left by 12 can overflow where int is 16 bits wide, and
     * unsigned arithmetic avoids an implicit signed-to-unsigned conversion
     * on return.
     */
    const uint32_t b0 = (unsigned char)s[0];
    const uint32_t b1 = (unsigned char)s[1];
    const uint32_t b2 = (unsigned char)s[2];

    return ((b0 & 0x0Fu) << 12) | ((b1 & 0x3Fu) << 6) | (b2 & 0x3Fu);
}
141 
/**
 * Assemble the value carried by a 4-byte UTF-8 sequence.
 *
 * Takes the low 3 bits of s[0] and the low 6 bits of s[1], s[2] and s[3].
 * No validation is performed on any byte.
 *
 * @param s Pointer to at least four readable bytes
 * @return The combined 21-bit value
 */
static inline uint32_t ck_utf8_decode_4(const char *s) {
    /*
     * Widen each byte to uint32_t before shifting: shifts by 12 and 18 on a
     * promoted (signed) int can overflow where int is 16 bits wide, and
     * unsigned arithmetic avoids an implicit signed-to-unsigned conversion
     * on return.
     */
    const uint32_t b0 = (unsigned char)s[0];
    const uint32_t b1 = (unsigned char)s[1];
    const uint32_t b2 = (unsigned char)s[2];
    const uint32_t b3 = (unsigned char)s[3];

    return ((b0 & 0x07u) << 18) | ((b1 & 0x3Fu) << 12) |
           ((b2 & 0x3Fu) << 6) | (b3 & 0x3Fu);
}
149 
/* Common Unicode code points */
#define CK_UTF8_SPACE 0x0020
#define CK_UTF8_TAB 0x0009
#define CK_UTF8_NEWLINE 0x000A
#define CK_UTF8_CARRIAGE_RETURN 0x000D
#define CK_UTF8_NBSP 0x00A0
#define CK_UTF8_WORD_JOINER 0x2060

/* GPT-2 space marker: Ġ (U+0120 in UTF-8: 0xC4 0xA0) */
#define CK_UTF8_GPT2_SPACE_HIGH 0xC4
#define CK_UTF8_GPT2_SPACE_LOW 0xA0

/*
 * SentencePiece space marker: ▁ (U+2581 in UTF-8: 0xE2 0x96 0x81).
 *
 * Defined as a string literal holding the three encoded bytes. A bare
 * sequence of three integer tokens is not a valid C expression or
 * initializer, so it could not be used anywhere in code. As a literal the
 * marker works with memcmp/strncmp, and sizeof(CK_UTF8_SPM_SPACE) - 1
 * gives its byte length (3).
 */
#define CK_UTF8_SPM_SPACE "\xE2\x96\x81"
164 
165 #ifdef __cplusplus
166 }
167 #endif
168 
169 #endif /* CK_TOKENIZER_UTF8_H */
const int32_t int int * out_len
Definition: tokenizer.h:445
bool ck_utf8_is_whitespace(uint32_t cp)
Definition: utf8.c:229
int32_t ck_utf8_next_char(const char **str, int *out_len)
Definition: utf8.c:104
size_t ck_utf8_byte_to_offset(const char *str, size_t len, size_t byte_offset)
Definition: utf8.c:187
size_t ck_utf8_count_chars(const char *str, size_t len)
Definition: utf8.c:82
static uint32_t ck_utf8_decode_3(const char *s)
Definition: utf8.h:138
int ck_utf8_from_cp(uint32_t cp, char *out)
Definition: utf8.c:144
static int ck_utf8_is_continuation(unsigned char c)
Definition: utf8.h:124
size_t ck_utf8_offset_to_byte(const char *str, size_t len, size_t n)
Definition: utf8.c:166
bool ck_utf8_is_valid(const char *str, size_t len)
Definition: utf8.c:78
static uint32_t ck_utf8_decode_4(const char *s)
Definition: utf8.h:145
size_t ck_utf8_validate(const char *str, size_t len)
Definition: utf8.c:25
static unsigned char ck_utf8_first_byte(const char *s)
Definition: utf8.h:117
size_t ck_utf8_normalize_nfc(const char *src, size_t src_len, char *dst, size_t dst_size)
static uint32_t ck_utf8_decode_2(const char *s)
Definition: utf8.h:131
int ck_utf8_char_length(unsigned char c)
Definition: utf8.c:12