← Back to C-Kernel-Engine Docs Doxygen Source Documentation
utf8.c
Go to the documentation of this file.
1 /*
2  * UTF-8 Utilities Implementation
3  *
4  * Provides UTF-8 character handling for tokenization.
5  */
6 
7 #include "tokenizer/utf8.h"
8 #include <string.h>
9 #include <stdlib.h>
10 #include <stdbool.h>
11 
/*
 * Classify a byte as the lead byte of a UTF-8 sequence.
 *
 * Returns the sequence length implied by the byte's high bits:
 *   0x00-0x7F -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4.
 * Every other byte (continuation bytes 0x80-0xBF and 0xF8-0xFF) yields 0.
 * Only the bit pattern is inspected; no overlong or range checks are made.
 */
int ck_utf8_char_length(unsigned char c) {
    if (c < 0x80) {
        return 1; /* 0xxxxxxx: ASCII */
    }
    if (c < 0xC0) {
        return 0; /* 10xxxxxx: continuation byte, not a lead byte */
    }
    if (c < 0xE0) {
        return 2; /* 110xxxxx */
    }
    if (c < 0xF0) {
        return 3; /* 1110xxxx */
    }
    if (c < 0xF8) {
        return 4; /* 11110xxx */
    }
    return 0;     /* 11111xxx: never a lead byte */
}
24 
/*
 * Scan str[0..len) for the first ill-formed UTF-8 sequence.
 *
 * Returns true and stores the byte offset of the problem in *err_pos when
 * one is found; returns false (leaving *err_pos untouched) when the whole
 * range is well-formed.
 *
 * Reported offsets:
 *   - invalid lead byte:                    offset of that byte
 *   - truncated or invalid continuation:    offset of the missing/bad byte
 *   - overlong, surrogate, or out of range: offset of the lead byte
 */
static bool utf8_find_first_error(const char *str, size_t len, size_t *err_pos) {
    size_t i = 0;

    while (i < len) {
        const unsigned char lead = (unsigned char)str[i];
        const int char_len = ck_utf8_char_length(lead);

        if (char_len == 0) {
            *err_pos = i; /* Invalid lead byte */
            return true;
        }

        /* Every trailing byte must exist and look like 10xxxxxx. */
        for (int j = 1; j < char_len; j++) {
            const size_t k = i + (size_t)j;
            if (k >= len || ((unsigned char)str[k] & 0xC0) != 0x80) {
                *err_pos = k;
                return true;
            }
        }

        if (char_len > 1) {
            /* Known to be a continuation byte (0x80-0xBF) at this point. */
            const unsigned char second = (unsigned char)str[i + 1];
            bool bad;

            switch (char_len) {
            case 2:
                /* 0xC0/0xC1 can only encode code points below U+0080. */
                bad = lead < 0xC2;
                break;
            case 3:
                /* E0 80..9F is overlong; ED A0..BF is a UTF-16 surrogate. */
                bad = (lead == 0xE0 && second < 0xA0) ||
                      (lead == 0xED && second >= 0xA0);
                break;
            case 4:
                /*
                 * F0 80..8F is overlong; F5 and above, and F4 90..BF, lie
                 * beyond U+10FFFF. F4 80..8F (U+100000..U+10FFFF) is valid.
                 */
                bad = lead > 0xF4 ||
                      (lead == 0xF0 && second < 0x90) ||
                      (lead == 0xF4 && second > 0x8F);
                break;
            default:
                bad = false;
                break;
            }

            if (bad) {
                *err_pos = i;
                return true;
            }
        }

        i += (size_t)char_len;
    }

    return false;
}

/*
 * Validate a UTF-8 byte range.
 *
 * str: bytes to check (must not be NULL).
 * len: number of bytes, or (size_t)-1 to use strlen(str).
 *
 * Returns 0 when the range is well-formed, otherwise the byte offset of the
 * first problem (see utf8_find_first_error for which byte is reported).
 *
 * NOTE: an error located at offset 0 is also reported as 0, so the return
 * value alone cannot distinguish that case from success. Use
 * ck_utf8_is_valid() for an unambiguous yes/no answer.
 */
size_t ck_utf8_validate(const char *str, size_t len) {
    if (len == (size_t)-1) {
        len = strlen(str);
    }

    size_t err_pos = 0;
    return utf8_find_first_error(str, len, &err_pos) ? err_pos : 0;
}

/*
 * Report whether str[0..len) is well-formed UTF-8.
 *
 * len may be (size_t)-1 to use strlen(str). Unlike comparing the result of
 * ck_utf8_validate() against 0, this correctly reports input whose very
 * first sequence is ill-formed as invalid.
 */
bool ck_utf8_is_valid(const char *str, size_t len) {
    if (len == (size_t)-1) {
        len = strlen(str);
    }

    size_t err_pos = 0;
    return !utf8_find_first_error(str, len, &err_pos);
}
81 
/*
 * Count the UTF-8 sequences that start within str[0..len).
 *
 * len may be (size_t)-1, in which case strlen(str) is used. Each byte that
 * ck_utf8_char_length() accepts as a lead byte contributes one to the count
 * and skips its whole sequence; any other byte is stepped over one byte at
 * a time without being counted.
 */
size_t ck_utf8_count_chars(const char *str, size_t len) {
    const size_t total = (len == (size_t)-1) ? strlen(str) : len;
    size_t n_chars = 0;

    for (size_t pos = 0; pos < total; ) {
        const int width = ck_utf8_char_length((unsigned char)str[pos]);

        if (width != 0) {
            pos += width;
            n_chars++;
        } else {
            /* Not a lead byte: skip it, do not count it */
            pos++;
        }
    }

    return n_chars;
}
103 
/*
 * Decode the UTF-8 sequence at *str and advance *str past it.
 *
 * str:     in/out cursor into the text.
 * out_len: receives the number of bytes consumed.
 *
 * Returns the decoded code point. Returns -1 without touching *str or
 * *out_len when any argument (or *str) is NULL. Returns -1 after consuming
 * exactly one byte (*out_len = 1) when the byte at the cursor is not a
 * valid lead byte or is not followed by the required continuation bytes.
 *
 * Continuation bytes are checked left to right and checking stops at the
 * first byte that is not of the form 10xxxxxx. A NUL terminator is such a
 * byte, so a truncated sequence at the end of a NUL-terminated string is
 * neither read past nor skipped over.
 */
int32_t ck_utf8_next_char(const char **str, int *out_len) {
    if (!str || !*str || !out_len) {
        return -1;
    }

    const char *p = *str;
    const unsigned char c = (unsigned char)p[0];
    const int char_len = ck_utf8_char_length(c);

    bool well_formed = (char_len != 0);
    for (int k = 1; well_formed && k < char_len; k++) {
        if (((unsigned char)p[k] & 0xC0) != 0x80) {
            well_formed = false;
        }
    }

    if (!well_formed) {
        /* Resynchronise by dropping just the offending lead byte. */
        *out_len = 1;
        *str = p + 1;
        return -1;
    }

    uint32_t cp;

    switch (char_len) {
    case 2:
        cp = ck_utf8_decode_2(p);
        break;
    case 3:
        cp = ck_utf8_decode_3(p);
        break;
    case 4:
        cp = ck_utf8_decode_4(p);
        break;
    default: /* char_len == 1: ASCII */
        cp = c;
        break;
    }

    *out_len = char_len;
    *str = p + char_len;

    return (int32_t)cp;
}
143 
/*
 * Encode a code point as UTF-8.
 *
 * cp:  code point to encode.
 * out: destination with room for at least 4 bytes; no terminator is written.
 *
 * Returns the number of bytes written (1-4), or 0 without writing anything
 * when out is NULL or cp is above U+10FFFF (such values have no UTF-8
 * encoding; without this check the lead byte would be corrupted).
 *
 * NOTE(review): surrogate code points (U+D800..U+DFFF) are still encoded as
 * three bytes, as before; strict UTF-8 forbids them. Confirm whether callers
 * rely on this before tightening.
 */
int ck_utf8_from_cp(uint32_t cp, char *out) {
    if (out == NULL || cp > 0x10FFFF) {
        return 0;
    }

    if (cp < 0x80) {
        out[0] = (char)cp;
        return 1;
    }
    if (cp < 0x800) {
        out[0] = (char)(0xC0 | (cp >> 6));
        out[1] = (char)(0x80 | (cp & 0x3F));
        return 2;
    }
    if (cp < 0x10000) {
        out[0] = (char)(0xE0 | (cp >> 12));
        out[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        out[2] = (char)(0x80 | (cp & 0x3F));
        return 3;
    }

    out[0] = (char)(0xF0 | (cp >> 18));
    out[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
    out[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
    out[3] = (char)(0x80 | (cp & 0x3F));
    return 4;
}
165 
/*
 * Convert a character index into a byte offset.
 *
 * str: text to walk (must not be NULL).
 * len: length in bytes, or (size_t)-1 to use strlen(str).
 * n:   number of characters to skip.
 *
 * Returns the byte offset reached after skipping n characters, or len if
 * the text holds fewer than n. A byte that is not a valid lead byte is
 * treated as a one-byte character. The result never exceeds len: a lead
 * byte whose sequence would run past the end of the buffer advances only
 * to len.
 */
size_t ck_utf8_offset_to_byte(const char *str, size_t len, size_t n) {
    if (len == (size_t)-1) {
        len = strlen(str);
    }

    size_t byte_offset = 0;
    size_t char_idx = 0;

    while (byte_offset < len && char_idx < n) {
        const int char_len = ck_utf8_char_length((unsigned char)str[byte_offset]);
        const size_t step = (char_len > 0) ? (size_t)char_len : 1;
        const size_t remaining = len - byte_offset;

        /* Clamp so a truncated trailing sequence cannot push us past len. */
        byte_offset += (step < remaining) ? step : remaining;
        char_idx++;
    }

    return byte_offset;
}
186 
/*
 * Convert a byte offset into a character index.
 *
 * str:         text to walk (must not be NULL).
 * len:         length in bytes, or (size_t)-1 to use strlen(str).
 * byte_offset: byte position to translate; values beyond len are clamped
 *              to len.
 *
 * Returns the number of characters that start before byte_offset. A byte
 * that is not a valid lead byte counts as a one-byte character, for every
 * byte_offset including the end of the text, so results are monotonic in
 * byte_offset. An offset that lands inside a multi-byte sequence counts
 * that sequence.
 */
size_t ck_utf8_byte_to_offset(const char *str, size_t len, size_t byte_offset) {
    if (len == (size_t)-1) {
        len = strlen(str);
    }

    if (byte_offset > len) {
        byte_offset = len;
    }

    size_t char_idx = 0;
    size_t i = 0;

    while (i < byte_offset) {
        const int char_len = ck_utf8_char_length((unsigned char)str[i]);

        i += (char_len > 0) ? (size_t)char_len : 1;
        char_idx++;
    }

    return char_idx;
}
211 
/*
 * Unicode White_Space characters (common ones), stored as inclusive
 * [start, end] code point ranges in ascending order.
 */
static const struct {
    uint32_t start; /* First code point of the range (inclusive) */
    uint32_t end;   /* Last code point of the range (inclusive) */
} CK_UTF8_WHITESPACE_RANGES[] = {
    {0x0009, 0x000D}, /* Tab, LF, VT, FF, CR */
    {0x0020, 0x0020}, /* Space */
    {0x0085, 0x0085}, /* NEL */
    {0x00A0, 0x00A0}, /* NBSP */
    {0x1680, 0x1680}, /* Ogham space mark */
    {0x2000, 0x200A}, /* Various widths of spaces */
    {0x2028, 0x2029}, /* Line/Paragraph separator */
    {0x202F, 0x202F}, /* Narrow NBSP */
    {0x205F, 0x205F}, /* Medium mathematical space */
    {0x3000, 0x3000}, /* Ideographic space */
};
228 
229 bool ck_utf8_is_whitespace(uint32_t cp) {
230  for (size_t i = 0; i < sizeof(CK_UTF8_WHITESPACE_RANGES) / sizeof(CK_UTF8_WHITESPACE_RANGES[0]); i++) {
231  if (cp >= CK_UTF8_WHITESPACE_RANGES[i].start &&
232  cp <= CK_UTF8_WHITESPACE_RANGES[i].end) {
233  return true;
234  }
235  }
236  return false;
237 }
238 
/*
 * Simple NFC normalization (handles common cases).
 *
 * Currently a placeholder: the input is copied through unchanged, because
 * full NFC is complex.
 *
 * src:      input bytes (must not be NULL).
 * src_len:  input length in bytes, or (size_t)-1 to use strlen(src).
 * dst:      output buffer, or NULL to query the required size.
 * dst_size: capacity of dst in bytes, including room for the terminator.
 *
 * Returns src_len when dst is NULL. Otherwise returns the number of bytes
 * copied into dst, excluding the terminating '\0'; the copy is truncated to
 * dst_size - 1 bytes. When dst_size is 0 nothing is written and 0 is
 * returned.
 */
size_t ck_tokenizer_utf8_normalize_nfc(const char *src, size_t src_len,
                                       char *dst, size_t dst_size) {
    if (src_len == (size_t)-1) {
        src_len = strlen(src);
    }

    /* For now, just copy (full NFC is complex) */
    if (dst == NULL) {
        return src_len;
    }

    /*
     * No room even for the terminator. Without this guard dst_size - 1
     * wraps to SIZE_MAX and the copy below overruns dst.
     */
    if (dst_size == 0) {
        return 0;
    }

    const size_t capacity = dst_size - 1;
    const size_t to_copy = (src_len < capacity) ? src_len : capacity;

    memcpy(dst, src, to_copy);
    dst[to_copy] = '\0';

    return to_copy;
}
const int32_t int int * out_len
Definition: tokenizer.h:445
bool ck_utf8_is_whitespace(uint32_t cp)
Definition: utf8.c:229
uint32_t end
Definition: utf8.c:215
int32_t ck_utf8_next_char(const char **str, int *out_len)
Definition: utf8.c:104
size_t ck_utf8_byte_to_offset(const char *str, size_t len, size_t byte_offset)
Definition: utf8.c:187
size_t ck_utf8_count_chars(const char *str, size_t len)
Definition: utf8.c:82
uint32_t start
Definition: utf8.c:214
int ck_utf8_from_cp(uint32_t cp, char *out)
Definition: utf8.c:144
size_t ck_utf8_offset_to_byte(const char *str, size_t len, size_t n)
Definition: utf8.c:166
bool ck_utf8_is_valid(const char *str, size_t len)
Definition: utf8.c:78
size_t ck_tokenizer_utf8_normalize_nfc(const char *src, size_t src_len, char *dst, size_t dst_size)
Definition: utf8.c:240
static const struct @1 CK_UTF8_WHITESPACE_RANGES[]
size_t ck_utf8_validate(const char *str, size_t len)
Definition: utf8.c:25
int ck_utf8_char_length(unsigned char c)
Definition: utf8.c:12
static uint32_t ck_utf8_decode_3(const char *s)
Definition: utf8.h:138
static uint32_t ck_utf8_decode_4(const char *s)
Definition: utf8.h:145
static uint32_t ck_utf8_decode_2(const char *s)
Definition: utf8.h:131