← Back to C-Kernel-Engine Docs Doxygen Source Documentation
utf8.c File Reference
#include "tokenizer/utf8.h"
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>

Go to the source code of this file.

Functions

size_t ck_tokenizer_utf8_normalize_nfc (const char *src, size_t src_len, char *dst, size_t dst_size)
 
size_t ck_utf8_byte_to_offset (const char *str, size_t len, size_t byte_offset)
 
int ck_utf8_char_length (unsigned char c)
 
size_t ck_utf8_count_chars (const char *str, size_t len)
 
int ck_utf8_from_cp (uint32_t cp, char *out)
 
bool ck_utf8_is_valid (const char *str, size_t len)
 
bool ck_utf8_is_whitespace (uint32_t cp)
 
int32_t ck_utf8_next_char (const char **str, int *out_len)
 
size_t ck_utf8_offset_to_byte (const char *str, size_t len, size_t n)
 
size_t ck_utf8_validate (const char *str, size_t len)
 

Variables

struct {
   uint32_t   end
 
   uint32_t   start
 
CK_UTF8_WHITESPACE_RANGES []
 

Function Documentation

◆ ck_tokenizer_utf8_normalize_nfc()

size_t ck_tokenizer_utf8_normalize_nfc ( const char *  src,
size_t  src_len,
char *  dst,
size_t  dst_size 
)

Definition at line 240 of file utf8.c.

/**
 * Placeholder for NFC normalization: copies the input bytes unchanged.
 *
 * Full NFC composition is not implemented; src is copied verbatim into dst
 * and NUL-terminated. When the copy has to be truncated, the cut is moved
 * back so that it does not fall inside a multi-byte UTF-8 sequence.
 *
 * @param src      Input bytes (NULL is treated as an empty string).
 * @param src_len  Length of src in bytes, or (size_t)-1 to use strlen(src).
 * @param dst      Output buffer, or NULL to query the required length.
 * @param dst_size Capacity of dst in bytes, including the terminator.
 * @return src_len when dst is NULL; otherwise the number of bytes copied,
 *         not counting the terminator. Returns 0 without touching dst when
 *         dst_size is 0.
 */
size_t ck_tokenizer_utf8_normalize_nfc(const char *src, size_t src_len, char *dst, size_t dst_size) {
    if (src == NULL) {
        src_len = 0;
    } else if (src_len == (size_t)-1) {
        src_len = strlen(src);
    }

    /* For now, just copy (full NFC is complex) */
    if (dst == NULL) {
        return src_len;
    }

    /* No room even for the terminator; dst_size - 1 below would wrap around. */
    if (dst_size == 0) {
        return 0;
    }

    size_t to_copy = src_len < dst_size - 1 ? src_len : dst_size - 1;

    if (to_copy < src_len) {
        /*
         * Truncating: if the first dropped byte is a continuation byte
         * (10xxxxxx) the cut is inside a character. Step back to its lead
         * byte; a sequence has at most 3 continuation bytes.
         */
        for (int back = 0;
             back < 3 && to_copy > 0 && ((unsigned char)src[to_copy] & 0xC0) == 0x80;
             back++) {
            to_copy--;
        }
    }

    if (to_copy > 0) {
        memcpy(dst, src, to_copy);
    }
    dst[to_copy] = '\0';

    return to_copy;
}

◆ ck_utf8_byte_to_offset()

size_t ck_utf8_byte_to_offset ( const char *  str,
size_t  len,
size_t  byte_offset 
)

Get the character index from byte offset.

Parameters
str: String
len: String length in bytes, or (size_t)-1 for null-terminated
byte_offset: Byte offset
Returns
Character index, or total chars if offset beyond end

Definition at line 187 of file utf8.c.

/**
 * Convert a byte offset into a character index.
 *
 * Characters are counted from the start of the string. Each byte that
 * cannot start a UTF-8 sequence counts as one character, and a sequence cut
 * short by the end of the buffer counts as one character. Only characters
 * that end at or before byte_offset are counted, so an offset that points
 * into the middle of a multi-byte character yields the index of that
 * character.
 *
 * @param str         String.
 * @param len         String length in bytes, or (size_t)-1 to use strlen(str).
 * @param byte_offset Byte offset; values beyond len are treated as len.
 * @return Character index of byte_offset, or the total number of characters
 *         when byte_offset is at or beyond the end.
 */
size_t ck_utf8_byte_to_offset(const char *str, size_t len, size_t byte_offset) {
    if (len == (size_t)-1) {
        len = strlen(str);
    }

    /* One loop serves both cases so in-range and past-the-end offsets count alike. */
    if (byte_offset > len) {
        byte_offset = len;
    }

    size_t char_idx = 0;
    size_t i = 0;

    while (i < byte_offset) {
        int char_len = ck_utf8_char_length((unsigned char)str[i]);
        size_t step = char_len > 0 ? (size_t)char_len : 1;

        /* A sequence truncated by the end of the buffer spans what is left. */
        if (step > len - i) {
            step = len - i;
        }

        /* byte_offset lies inside this character: it is not complete yet. */
        if (step > byte_offset - i) {
            break;
        }

        i += step;
        char_idx++;
    }

    return char_idx;
}
size_t ck_utf8_count_chars(const char *str, size_t len)
Definition: utf8.c:82
int ck_utf8_char_length(unsigned char c)
Definition: utf8.c:12

References ck_utf8_char_length(), and ck_utf8_count_chars().

◆ ck_utf8_char_length()

int ck_utf8_char_length ( unsigned char  c)

Get the length of a UTF-8 character from its first byte.

Parameters
c: First byte of UTF-8 character
Returns
Number of bytes in character (1-4), or 0 for invalid

Definition at line 12 of file utf8.c.

/**
 * Classify a byte as the first byte of a UTF-8 sequence.
 *
 * The classification looks only at the high bits of the byte. Lead bytes
 * 0xC0, 0xC1 and 0xF5..0xF7 therefore still report a length of 2 or 4 even
 * though no well-formed sequence starts with them.
 *
 * @param c First byte of a UTF-8 character.
 * @return Sequence length in bytes (1-4), or 0 when c is a continuation
 *         byte (0x80..0xBF) or is 0xF8 or above.
 */
int ck_utf8_char_length(unsigned char c) {
    if (c < 0x80) {
        return 1; /* 0xxxxxxx: ASCII */
    }
    if (c < 0xC0) {
        return 0; /* 10xxxxxx: continuation byte, cannot start a sequence */
    }
    if (c < 0xE0) {
        return 2; /* 110xxxxx */
    }
    if (c < 0xF0) {
        return 3; /* 1110xxxx */
    }
    if (c < 0xF8) {
        return 4; /* 11110xxx */
    }
    return 0; /* 11111xxx: never a lead byte */
}

Referenced by ck_utf8_byte_to_offset(), ck_utf8_count_chars(), ck_utf8_next_char(), ck_utf8_offset_to_byte(), and ck_utf8_validate().

◆ ck_utf8_count_chars()

size_t ck_utf8_count_chars ( const char *  str,
size_t  len 
)

Count UTF-8 characters in a string.

Parameters
str: String to count
len: Length in bytes, or (size_t)-1 for null-terminated
Returns
Number of Unicode code points

Definition at line 82 of file utf8.c.

/**
 * Count the characters in a UTF-8 string.
 *
 * Every sequence introduced by a lead byte counts as one character. A byte
 * that cannot start a sequence also counts as one character, which is the
 * same rule ck_utf8_offset_to_byte() applies when it steps over such a byte,
 * so character indices and this count agree on malformed input.
 *
 * @param str String to count.
 * @param len Length in bytes, or (size_t)-1 to use strlen(str).
 * @return Number of characters (code points on well-formed input).
 */
size_t ck_utf8_count_chars(const char *str, size_t len) {
    if (len == (size_t)-1) {
        len = strlen(str);
    }

    size_t count = 0;
    size_t i = 0;

    while (i < len) {
        int char_len = ck_utf8_char_length((unsigned char)str[i]);

        /* Invalid lead byte: consume exactly one byte and count it. */
        i += char_len > 0 ? (size_t)char_len : 1;
        count++;
    }

    return count;
}

References ck_utf8_char_length().

Referenced by ck_utf8_byte_to_offset().

◆ ck_utf8_from_cp()

int ck_utf8_from_cp ( uint32_t  cp,
char *  out 
)

Write a Unicode code point as UTF-8.

Parameters
cp: Unicode code point
out: Output buffer (must have 4+ bytes)
Returns
Number of bytes written

Definition at line 144 of file utf8.c.

/**
 * Encode a Unicode code point as UTF-8.
 *
 * Surrogates (U+D800..U+DFFF) and values above U+10FFFF have no well-formed
 * UTF-8 encoding; for those nothing is written and 0 is returned. No
 * terminator is appended.
 *
 * @param cp  Unicode code point.
 * @param out Output buffer with room for at least 4 bytes.
 * @return Number of bytes written (1-4), or 0 if cp cannot be encoded or
 *         out is NULL.
 */
int ck_utf8_from_cp(uint32_t cp, char *out) {
    if (out == NULL) {
        return 0;
    }

    /* Outside the Unicode scalar value range: would produce ill-formed bytes. */
    if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
        return 0;
    }

    if (cp < 0x80) {
        out[0] = (char)cp;
        return 1;
    }

    if (cp < 0x800) {
        out[0] = (char)(0xC0 | (cp >> 6));
        out[1] = (char)(0x80 | (cp & 0x3F));
        return 2;
    }

    if (cp < 0x10000) {
        out[0] = (char)(0xE0 | (cp >> 12));
        out[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        out[2] = (char)(0x80 | (cp & 0x3F));
        return 3;
    }

    out[0] = (char)(0xF0 | (cp >> 18));
    out[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
    out[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
    out[3] = (char)(0x80 | (cp & 0x3F));
    return 4;
}

◆ ck_utf8_is_valid()

bool ck_utf8_is_valid ( const char *  str,
size_t  len 
)

Check if a byte sequence is valid UTF-8.

Parameters
str: String to check
len: Length in bytes, or (size_t)-1 for null-terminated
Returns
true if all bytes form valid UTF-8

Definition at line 78 of file utf8.c.

/*
 * Length of the well-formed UTF-8 sequence at s, or 0 if the bytes there are
 * ill-formed (bad lead byte, truncated, bad continuation, overlong form,
 * surrogate, or above U+10FFFF). avail must be at least 1.
 */
static size_t ck_utf8_well_formed_len(const unsigned char *s, size_t avail) {
    unsigned char lead = s[0];
    unsigned char second_min = 0x80;
    unsigned char second_max = 0xBF;
    size_t need;

    if (lead < 0x80) {
        return 1;
    }

    if (lead >= 0xC2 && lead <= 0xDF) {
        need = 2;
    } else if (lead >= 0xE0 && lead <= 0xEF) {
        need = 3;
        if (lead == 0xE0) {
            second_min = 0xA0; /* below this is an overlong 3-byte form */
        } else if (lead == 0xED) {
            second_max = 0x9F; /* above this is a UTF-16 surrogate */
        }
    } else if (lead >= 0xF0 && lead <= 0xF4) {
        need = 4;
        if (lead == 0xF0) {
            second_min = 0x90; /* below this is an overlong 4-byte form */
        } else if (lead == 0xF4) {
            second_max = 0x8F; /* above this exceeds U+10FFFF */
        }
    } else {
        return 0; /* continuation byte, 0xC0/0xC1, or 0xF5 and above */
    }

    if (avail < need) {
        return 0;
    }
    if (s[1] < second_min || s[1] > second_max) {
        return 0;
    }
    for (size_t k = 2; k < need; k++) {
        if ((s[k] & 0xC0) != 0x80) {
            return 0;
        }
    }
    return need;
}

/**
 * Check if a byte sequence is valid UTF-8.
 *
 * This does not delegate to ck_utf8_validate(): that function returns 0
 * both for a valid string and for an error located at byte 0, so comparing
 * its result with 0 would accept inputs such as a lone 0xFF byte.
 *
 * @param str String to check (NULL is valid only together with len 0).
 * @param len Length in bytes, or (size_t)-1 to use strlen(str).
 * @return true if all bytes form well-formed UTF-8.
 */
bool ck_utf8_is_valid(const char *str, size_t len) {
    if (str == NULL) {
        return len == 0;
    }
    if (len == (size_t)-1) {
        len = strlen(str);
    }

    const unsigned char *bytes = (const unsigned char *)str;
    size_t i = 0;

    while (i < len) {
        size_t seq_len = ck_utf8_well_formed_len(bytes + i, len - i);
        if (seq_len == 0) {
            return false;
        }
        i += seq_len;
    }

    return true;
}
size_t ck_utf8_validate(const char *str, size_t len)
Definition: utf8.c:25

References ck_utf8_validate().

◆ ck_utf8_is_whitespace()

bool ck_utf8_is_whitespace ( uint32_t  cp)

Check if character is whitespace (Unicode White_Space property).

Parameters
cp: Unicode code point
Returns
true if whitespace

Definition at line 229 of file utf8.c.

229  {
230  for (size_t i = 0; i < sizeof(CK_UTF8_WHITESPACE_RANGES) / sizeof(CK_UTF8_WHITESPACE_RANGES[0]); i++) {
231  if (cp >= CK_UTF8_WHITESPACE_RANGES[i].start &&
232  cp <= CK_UTF8_WHITESPACE_RANGES[i].end) {
233  return true;
234  }
235  }
236  return false;
237 }
uint32_t end
Definition: utf8.c:215
uint32_t start
Definition: utf8.c:214
static const struct @1 CK_UTF8_WHITESPACE_RANGES[]

References CK_UTF8_WHITESPACE_RANGES, end, and start.

◆ ck_utf8_next_char()

int32_t ck_utf8_next_char ( const char **  str,
int *  out_len 
)

Get next UTF-8 character, return its code point.

Parameters
str: String pointer (will be updated)
out_len: Output: length of character in bytes
Returns
Unicode code point, or -1 on error

Definition at line 104 of file utf8.c.

/**
 * Decode the UTF-8 character at *str and advance *str past it.
 *
 * Continuation bytes are checked before decoding. Because a NUL byte is
 * never a continuation byte, the check stops at the terminator, so a
 * sequence cut short by the end of a NUL-terminated string is reported as
 * an error instead of being read (and stepped) past the terminator.
 *
 * On a bad lead byte or a bad/missing continuation byte, *str advances by
 * one byte, *out_len is set to 1 and -1 is returned, so callers can
 * resynchronise byte by byte.
 *
 * Overlong forms, surrogates and values above U+10FFFF are not rejected
 * here. A NUL at *str decodes as code point 0 with length 1; the caller is
 * expected to stop there.
 *
 * @param str     In/out: pointer to the current position (presumably in a
 *                NUL-terminated string, since no length is passed).
 * @param out_len Output: number of bytes consumed.
 * @return Decoded code point, or -1 on error or NULL arguments.
 */
int32_t ck_utf8_next_char(const char **str, int *out_len) {
    if (!str || !*str || !out_len) {
        return -1;
    }

    const unsigned char *s = (const unsigned char *)*str;
    int char_len = ck_utf8_char_length(s[0]);

    /* s[k] is only read once every earlier byte of the sequence checked out. */
    bool well_formed = char_len != 0;
    for (int k = 1; well_formed && k < char_len; k++) {
        well_formed = (s[k] & 0xC0) == 0x80;
    }

    if (!well_formed) {
        *out_len = 1;
        (*str)++;
        return -1;
    }

    uint32_t cp;

    switch (char_len) {
        case 1:
            cp = s[0];
            break;
        case 2:
            cp = ck_utf8_decode_2(*str);
            break;
        case 3:
            cp = ck_utf8_decode_3(*str);
            break;
        case 4:
            cp = ck_utf8_decode_4(*str);
            break;
        default:
            /* Not reachable while ck_utf8_char_length() returns 0..4. */
            *out_len = 1;
            (*str)++;
            return -1;
    }

    *out_len = char_len;
    *str += char_len;

    return (int32_t)cp;
}
const int32_t int int * out_len
Definition: tokenizer.h:445
static uint32_t ck_utf8_decode_3(const char *s)
Definition: utf8.h:138
static uint32_t ck_utf8_decode_4(const char *s)
Definition: utf8.h:145
static uint32_t ck_utf8_decode_2(const char *s)
Definition: utf8.h:131

References ck_utf8_char_length(), ck_utf8_decode_2(), ck_utf8_decode_3(), ck_utf8_decode_4(), and out_len.

◆ ck_utf8_offset_to_byte()

size_t ck_utf8_offset_to_byte ( const char *  str,
size_t  len,
size_t  n 
)

Get the byte offset of the N-th character.

Parameters
str: String
len: String length in bytes, or (size_t)-1 for null-terminated
n: Character index (0-based)
Returns
Byte offset, or len if n >= char count

Definition at line 166 of file utf8.c.

/**
 * Get the byte offset of the N-th character.
 *
 * Each byte that cannot start a UTF-8 sequence is stepped over as a single
 * character. A sequence whose lead byte announces more bytes than remain in
 * the buffer is treated as ending at len, so the result never exceeds len.
 *
 * @param str String.
 * @param len String length in bytes, or (size_t)-1 to use strlen(str).
 * @param n   Character index (0-based).
 * @return Byte offset of character n, or len if the string holds n or fewer
 *         characters.
 */
size_t ck_utf8_offset_to_byte(const char *str, size_t len, size_t n) {
    if (len == (size_t)-1) {
        len = strlen(str);
    }

    size_t byte_offset = 0;

    for (size_t char_idx = 0; char_idx < n && byte_offset < len; char_idx++) {
        int char_len = ck_utf8_char_length((unsigned char)str[byte_offset]);
        size_t step = char_len > 0 ? (size_t)char_len : 1;
        size_t remaining = len - byte_offset;

        /* Clamp so a truncated trailing sequence cannot push us past len. */
        byte_offset += step < remaining ? step : remaining;
    }

    return byte_offset;
}

References ck_utf8_char_length().

◆ ck_utf8_validate()

size_t ck_utf8_validate ( const char *  str,
size_t  len 
)

Validate a UTF-8 string.

Parameters
str: String to validate
len: Length in bytes, or (size_t)-1 for null-terminated
Returns
0 if valid, position of first invalid byte otherwise (note: an error at byte 0 also yields 0, so a 0 result alone does not prove validity)

Definition at line 25 of file utf8.c.

/**
 * Validate a UTF-8 string.
 *
 * Rejects bad lead bytes, truncated sequences, bad continuation bytes,
 * overlong encodings, UTF-16 surrogates and code points above U+10FFFF.
 *
 * The return value is ambiguous for errors at the very start: an invalid
 * first character is reported as position 0, which is also the "valid"
 * result. Callers that need a strict yes/no answer must not rely on a
 * comparison with 0 alone.
 *
 * @param str String to validate.
 * @param len Length in bytes, or (size_t)-1 to use strlen(str).
 * @return 0 if valid. Otherwise the byte position of the problem: the index
 *         of the missing or bad continuation byte for truncated/malformed
 *         sequences, or the index of the lead byte for a bad lead byte,
 *         overlong form, surrogate or out-of-range value.
 */
size_t ck_utf8_validate(const char *str, size_t len) {
    if (len == (size_t)-1) {
        len = strlen(str);
    }

    size_t i = 0;
    while (i < len) {
        unsigned char c = (unsigned char)str[i];
        int char_len = ck_utf8_char_length(c);

        if (char_len == 0) {
            return i; /* Invalid byte */
        }

        /* Check continuation bytes */
        for (int j = 1; j < char_len; j++) {
            size_t pos = i + (size_t)j;
            if (pos >= len) {
                return pos; /* Truncated sequence */
            }
            if (((unsigned char)str[pos] & 0xC0) != 0x80) {
                return pos; /* Invalid continuation */
            }
        }

        /*
         * Range checks on the lead byte and the first continuation byte.
         * At this point the continuation byte is known to be 0x80..0xBF.
         */
        if (char_len == 2) {
            if (c < 0xC2) {
                return i; /* Overlong 2-byte */
            }
        } else if (char_len == 3) {
            unsigned char second = (unsigned char)str[i + 1];
            if (c == 0xE0 && second < 0xA0) {
                return i; /* Overlong 3-byte */
            }
            if (c == 0xED && second >= 0xA0) {
                return i; /* Surrogate U+D800..U+DFFF */
            }
        } else if (char_len == 4) {
            unsigned char second = (unsigned char)str[i + 1];
            if (c == 0xF0 && second < 0x90) {
                return i; /* Overlong 4-byte */
            }
            if (c > 0xF4) {
                return i; /* Lead byte beyond the Unicode range */
            }
            /* F4 80..8F encodes U+100000..U+10FFFF; only F4 90..BF is too large. */
            if (c == 0xF4 && second > 0x8F) {
                return i; /* Above U+10FFFF */
            }
        }

        i += (size_t)char_len;
    }

    return 0; /* Valid */
}

References ck_utf8_char_length().

Referenced by ck_utf8_is_valid().

Variable Documentation

◆ 

const { ... } CK_UTF8_WHITESPACE_RANGES[]
Initial value:
= {
{0x0009, 0x000D}, /* U+0009..U+000D: TAB, LF, VT, FF, CR */
{0x0020, 0x0020}, /* U+0020: SPACE */
{0x0085, 0x0085}, /* U+0085: NEXT LINE (NEL) */
{0x00A0, 0x00A0}, /* U+00A0: NO-BREAK SPACE */
{0x1680, 0x1680}, /* U+1680: OGHAM SPACE MARK */
{0x2000, 0x200A}, /* U+2000..U+200A: EN QUAD through HAIR SPACE */
{0x2028, 0x2029}, /* U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR */
{0x202F, 0x202F}, /* U+202F: NARROW NO-BREAK SPACE */
{0x205F, 0x205F}, /* U+205F: MEDIUM MATHEMATICAL SPACE */
{0x3000, 0x3000}, /* U+3000: IDEOGRAPHIC SPACE */
}

Referenced by ck_utf8_is_whitespace().

◆ end

◆ start