1#include "include/pw.h"
2#include "src/types/string/string_internal.h"
3
4char32_t _pw_decode_utf8_char(char8_t** str)
5{
6 char8_t* p = *str;
7 char8_t c = *p++;
8 if (c < 0x80) {
9 *str = p;
10 return c;
11 }
12
13 char32_t codepoint;
14 char8_t next;
15
16# define APPEND_NEXT \
17 next = *p++; \
18 if (_pw_unlikely(next == 0)) goto end_of_string; \
19 if (_pw_unlikely((next & 0b1100'0000) != 0b1000'0000)) goto bad_utf8; \
20 codepoint <<= 6; \
21 codepoint |= next & 0x3F;
22
23 if ((c & 0b1110'0000) == 0b1100'0000) {
24 codepoint = c & 0b0011'1111;
25 APPEND_NEXT
26 } else if ((c & 0b1111'0000) == 0b1110'0000) {
27 codepoint = c & 0b0001'1111;
28 APPEND_NEXT
29 APPEND_NEXT
30 } else if ((c & 0b1111'1000) == 0b1111'0000) {
31 codepoint = c & 0b0000'1111;
32 APPEND_NEXT
33 APPEND_NEXT
34 APPEND_NEXT
35 } else {
36 goto bad_utf8;
37 }
38 if (codepoint == 0) {
39 // zero codepoint encoded with 2 or more bytes,
40 // make it invalid to avoid mixing up with 1-byte null character
41 codepoint = 0xFFFFFFFF;
42/*
43 } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
44 // surrogate pairs are prohibited, return inalid codepoint for them
45 codepoint = 0xFFFFFFFF;
46*/
47 }
48 *str = p;
49 return codepoint;
50
51end_of_string:
52 *str = p;
53 return 0;
54
55bad_utf8:
56 *str = --p; // rollback to bad octet, will process it on the next call
57 return 0xFFFFFFFF;
58
59# undef APPEND_NEXT
60}
61
62char32_t _pw_decode_utf8_char_reverse(char8_t** ptr)
63{
64 // XXX work in progress
65 // XXX if sequence is wrong, always decrement ptr by 1
66
67 char32_t codepoint;
68 char8_t c;
69 char8_t next;
70 char8_t* p = *ptr;
71 char8_t* end_ptr = p;
72
73 // seek to the start of UTF-8 sequence
74 char8_t* str_start = p - 4;
75 for (;;) {
76 if (p < str_start) {
77 goto bad_utf8;
78 }
79 c = *--p;
80 if (c < 0x80) {
81 codepoint = c;
82 goto done;
83 }
84 if ((c & 0b1100'0000) != 0b1000'0000) {
85 break;
86 }
87 }
88 char8_t* np = p + 1;
89
90# define APPEND_NEXT \
91 if (_pw_unlikely(np >= end_ptr)) goto bad_utf8; \
92 next = *np++; \
93 if (_pw_unlikely((next & 0b1100'0000) != 0b1000'0000)) goto bad_utf8; \
94 codepoint <<= 6; \
95 codepoint |= next & 0x3F;
96
97 if ((c & 0b1110'0000) == 0b1100'0000) {
98 codepoint = c & 0b0011'1111;
99 APPEND_NEXT
100 } else if ((c & 0b1111'0000) == 0b1110'0000) {
101 codepoint = c & 0b0001'1111;
102 APPEND_NEXT
103 APPEND_NEXT
104 } else if ((c & 0b1111'1000) == 0b1111'0000) {
105 codepoint = c & 0b0000'1111;
106 APPEND_NEXT
107 APPEND_NEXT
108 APPEND_NEXT
109 } else {
110 goto bad_utf8;
111 }
112 if (codepoint == 0) {
113 // zero codepoint encoded with 2 or more bytes,
114 // make it invalid to avoid mixing up with 1-byte null character
115 codepoint = 0xFFFFFFFF;
116 goto done;
117 }
118/*
119 if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
120 // surrogate pairs are prohibited, return inalid codepoint for them
121 codepoint = 0xFFFFFFFF;
122 goto done;
123 }
124*/
125
126done:
127 *ptr = p;
128 return codepoint;
129
130bad_utf8:
131 *ptr = p;
132 return 0xFFFFFFFF;
133
134# undef APPEND_NEXT
135}
136
137bool _pw_decode_utf8_buffer(char8_t** ptr, unsigned* bytes_remaining, char32_t* result)
138{
139 char8_t* p = *ptr;
140 unsigned remaining = *bytes_remaining;
141 if (!remaining) {
142 return false;
143 }
144
145 char32_t codepoint;
146 char8_t next;
147
148# define APPEND_NEXT \
149 next = *p++; \
150 remaining--; \
151 if (_pw_unlikely((next & 0b1100'0000) != 0b1000'0000)) goto bad_utf8; \
152 codepoint <<= 6; \
153 codepoint |= next & 0x3F;
154
155 char8_t c = *p++;
156 remaining--;
157 if (c < 0x80) {
158 codepoint = c;
159 goto done;
160 }
161 if ((c & 0b1110'0000) == 0b1100'0000) {
162 if (_pw_unlikely(!remaining)) return false;
163 codepoint = c & 0b0011'1111;
164 APPEND_NEXT
165 } else if ((c & 0b1111'0000) == 0b1110'0000) {
166 if (_pw_unlikely(remaining < 2)) return false;
167 codepoint = c & 0b0001'1111;
168 APPEND_NEXT
169 APPEND_NEXT
170 } else if ((c & 0b1111'1000) == 0b1111'0000) {
171 if (_pw_unlikely(remaining < 3)) return false;
172 codepoint = c & 0b0000'1111;
173 APPEND_NEXT
174 APPEND_NEXT
175 APPEND_NEXT
176 } else {
177 goto bad_utf8;
178 }
179 if (codepoint == 0) {
180 // zero codepoint encoded with 2 or more bytes,
181 // make it invalid to avoid mixing up with 1-byte null character
182 codepoint = 0xFFFFFFFF;
183/*
184 } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
185 // surrogate pairs are prohibited, return inalid codepoint for them
186 codepoint = 0xFFFFFFFF;
187*/
188 }
189
190done:
191 *ptr = p;
192 *bytes_remaining = remaining;
193 *result = codepoint;
194 return true;
195
196bad_utf8:
197 p--; // rollback to bad octet, will process it on the next call
198 codepoint = 0xFFFFFFFF;
199 goto done;
200
201# undef APPEND_NEXT
202}
203
/*
 * Encode `codepoint` as UTF-8 into `buffer` and return the number of octets
 * written (1 to 4). No terminating NUL is appended.
 *
 *   U+0000   - U+007F     0xxxxxxx
 *   U+0080   - U+07FF     110xxxxx 10xxxxxx
 *   U+0800   - U+FFFF     1110xxxx 10xxxxxx 10xxxxxx
 *   U+010000 - U+10FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Values above U+FFFF always produce four octets; bits above the 21st
 * are discarded.
 */
unsigned pw_char32_to_utf8(char32_t codepoint, char* buffer)
{
    if (codepoint < 0x80) {
        buffer[0] = (char) codepoint;
        return 1;
    }

    unsigned size;
    unsigned lead_marker;

    if (codepoint < 0x800) {
        size = 2;
        lead_marker = 0xC0;
    } else if (codepoint < 0x10000) {
        size = 3;
        lead_marker = 0xE0;
    } else {
        size = 4;
        lead_marker = 0xF0;
        codepoint &= 0x1FFFFF;  // only 21 bits fit into four octets
    }

    // fill continuation octets from the last one, six bits at a time
    for (unsigned i = size - 1; i > 0; i--) {
        buffer[i] = (char) (0x80 | (codepoint & 0x3F));
        codepoint >>= 6;
    }
    // remaining high bits go to the lead octet
    buffer[0] = (char) (lead_marker | codepoint);
    return size;
}
233
234unsigned utf8_strlen(char8_t* str)
235{
236 unsigned length = 0;
237 while(*str != 0) {
238 char32_t c = _pw_decode_utf8_char(&str);
239 if (c != 0xFFFFFFFF) {
240 length++;
241 }
242 }
243 return length;
244}
245
246unsigned utf8_strlen2(char8_t* str, uint8_t* char_size)
247{
248 unsigned length = 0;
249 char32_t width = 0;
250 while(*str != 0) {
251 char32_t c = _pw_decode_utf8_char(&str);
252 if (c != 0xFFFFFFFF) {
253 width |= c;
254 length++;
255 }
256 }
257 *char_size = calc_char_size(width);
258 return length;
259}
260
261unsigned utf8_strlen3(char8_t* str, uint8_t* char_size, char8_t** end_ptr)
262{
263 unsigned length = 0;
264 char32_t width = 0;
265 while(*str != 0) {
266 char32_t c = _pw_decode_utf8_char(&str);
267 if (c != 0xFFFFFFFF) {
268 width |= c;
269 length++;
270 }
271 }
272 *char_size = calc_char_size(width);
273 *end_ptr = str;
274 return length;
275}
276
277unsigned utf8_strlen2_buf(char8_t* buffer, unsigned* size, uint8_t* char_size)
278{
279 char8_t* ptr = buffer;
280 unsigned bytes_remaining = *size;
281 unsigned length = 0;
282 char32_t width = 0;
283
284 while (bytes_remaining) {
285 char32_t c;
286 if (!_pw_decode_utf8_buffer(&ptr, &bytes_remaining, &c)) {
287 break;
288 }
289 if (c != 0xFFFFFFFF) {
290 width |= c;
291 length++;
292 }
293 }
294 *size -= bytes_remaining;
295
296 if (char_size) {
297 *char_size = calc_char_size(width);
298 }
299
300 return length;
301}
302
303uint8_t utf8_char_size(char8_t* str, unsigned max_len)
304{
305 char32_t width = 0;
306 while(*str != 0) {
307 char32_t c = _pw_decode_utf8_char(&str);
308 if (c != 0xFFFFFFFF) {
309 width |= c;
310 }
311 }
312 return calc_char_size(width);
313}
314
315char8_t* utf8_skip(char8_t* str, unsigned n)
316{
317 while(n--) {
318 _pw_decode_utf8_char(&str);
319 if (*str == 0) {
320 break;
321 }
322 }
323 return str;
324}
325
/*
 * Write `codepoint` to `fp` encoded as UTF-8, one octet at a time.
 * The result of fputc is not checked.
 */
void _pw_putchar32_utf8(FILE* fp, char32_t codepoint)
{
    char utf8[5];
    unsigned size = pw_char32_to_utf8(codepoint, utf8);
    for (unsigned i = 0; i < size; i++) {
        fputc(utf8[i], fp);
    }
}
335
/*
 * Return the number of characters in a zero-terminated UTF-32 string,
 * not counting the terminator.
 */
unsigned utf32_strlen(char32_t* str)
{
    char32_t* end = str;
    while (*end != 0) {
        end++;
    }
    return (unsigned) (end - str);
}
344
/*
 * Return the number of characters in a zero-terminated UTF-32 string and
 * store in `*char_size` the result of calc_char_size() for the bitwise OR
 * of all its characters.
 */
unsigned utf32_strlen2(char32_t* str, uint8_t* char_size)
{
    char32_t all_bits = 0;
    unsigned num_chars = 0;
    for (; str[num_chars] != 0; num_chars++) {
        all_bits |= str[num_chars];
    }
    *char_size = calc_char_size(all_bits);
    return num_chars;
}
357
358/*
359int utf32_strcmp(char32_t* a, char32_t* b)
360{
361 if (a == b) {
362 return 0;
363 }
364 for (;;) {
365 char32_t ca = *a++;
366 char32_t cb = *b++;
367 if (ca < cb) {
368 return -1;
369 } else if (ca > cb) {
370 return 1;
371 } else if (ca == 0) {
372 return 0;
373 }
374 }
375}
376
377int utf32_strcmp_utf8(char32_t* a, char8_t* b)
378{
379 for (;;) {
380 char32_t ca = *a++;
381 char32_t cb = _pw_decode_utf8_char(&b);
382 if (ca < cb) {
383 return -1;
384 } else if (ca > cb) {
385 return 1;
386 } else if (ca == 0) {
387 return 0;
388 }
389 }
390}
391*/
392
393char32_t* utf32_strchr(char32_t* str, char32_t chr)
394{
395 char32_t c;
396 while ((c = *str) != 0) {
397 if (c == chr) {
398 return str;
399 }
400 str++;
401 }
402 return nullptr;
403}
404
/*
 * Return the result of calc_char_size() for the bitwise OR of the
 * characters of a UTF-32 string. Scanning stops at the zero terminator or
 * after `max_len` characters, whichever comes first.
 */
uint8_t utf32_char_size(char32_t* str, unsigned max_len)
{
    char32_t all_bits = 0;
    for (unsigned i = 0; i < max_len; i++) {
        char32_t c = str[i];
        if (c == 0) {
            break;
        }
        all_bits |= c;
    }
    return calc_char_size(all_bits);
}
417
418unsigned pw_strlen_in_utf8(PwValuePtr str)
419{
420 pw_assert_string(str);
421 unsigned length = 0;
422 uint8_t char_size = str->char_size;
423 unsigned n;
424 uint8_t* ptr = _pw_string_start_length(str, &n);
425 while (n) {
426 char32_t c = _pw_get_char(ptr, char_size);
427 if (c < 0x80) {
428 length++;
429 } else if (c < 0b1'00000'000000) {
430 length += 2;
431 } else if (c < 0b1'0000'000000'000000) {
432 length += 3;
433 } else {
434 length += 4;
435 }
436 ptr += char_size;
437 n--;
438 }
439 return length;
440}
441
/*
 * Convert `length` one-octet characters starting at `self_ptr` to UTF-8
 * and append the terminating NUL.
 *
 * Characters 0x80..0xFF cannot be copied verbatim: as single octets they
 * are not valid UTF-8, and pw_strlen_in_utf8 counts two octets for each of
 * them, so they are encoded the same way as in the wider variants.
 * For pure ASCII content the output is the same as a plain copy.
 *
 * NOTE(review): `dest` must have room for up to 2 * length + 1 octets;
 * this assumes callers size it with pw_strlen_in_utf8 -- confirm.
 */
static void _cp_to_utf8_uint8_t(uint8_t* self_ptr, char* dest, unsigned length)
{
    while (length--) {
        dest += pw_char32_to_utf8(*self_ptr++, dest);
    }
    *dest = 0;
}
447
448// integral types:
449
/*
 * Generate _cp_to_utf8_<type>: convert `length` characters stored as an
 * array of `type_name_self` starting at `self_ptr` to UTF-8, writing the
 * result followed by the terminating NUL to `dest`.
 */
#define STR_COPY_TO_UTF8_IMPL(type_name_self)  \
    static void _cp_to_utf8_##type_name_self(uint8_t* self_ptr, char* dest, unsigned length)  \
    {  \
        type_name_self* chars = (type_name_self*) self_ptr;  \
        for (unsigned i = 0; i < length; i++) {  \
            dest += pw_char32_to_utf8(chars[i], dest);  \
        }  \
        *dest = 0;  \
    }

STR_COPY_TO_UTF8_IMPL(uint16_t)
STR_COPY_TO_UTF8_IMPL(uint32_t)
462
463// uint24_t
464
/*
 * Convert `length` three-octet characters starting at `self_ptr` to UTF-8,
 * writing the result followed by the terminating NUL to `dest`.
 * Each character is stored with its least significant octet first.
 */
static void _cp_to_utf8_uint24_t(uint8_t* self_ptr, char* dest, unsigned length)
{
    for (unsigned i = 0; i < length; i++) {
        char32_t c = self_ptr[0] | (self_ptr[1] << 8) | (self_ptr[2] << 16);
        self_ptr += 3;
        dest += pw_char32_to_utf8(c, dest);
    }
    *dest = 0;
}
475
476typedef void (*CopyToUtf8)(uint8_t* self_ptr, char* dest_ptr, unsigned length);
477
478static CopyToUtf8 _pw_copy_to_utf8_variants[5] = {
479 nullptr,
480 _cp_to_utf8_uint8_t,
481 _cp_to_utf8_uint16_t,
482 _cp_to_utf8_uint24_t,
483 _cp_to_utf8_uint32_t
484};
485
486void pw_string_to_utf8(PwValuePtr str, char* buffer)
487{
488 pw_assert_string(str);
489 unsigned length;
490 uint8_t* ptr = _pw_string_start_length(str, &length);
491 CopyToUtf8 fn_copy_to_utf8 = _pw_copy_to_utf8_variants[str->char_size];
492 fn_copy_to_utf8(ptr, buffer, length);
493}
494
495void pw_substr_to_utf8(PwValuePtr str, unsigned start_pos, unsigned end_pos, char* buffer)
496{
497 pw_assert_string(str);
498 unsigned length;
499 uint8_t* ptr = _pw_string_start_length(str, &length);
500 if (end_pos >= length) {
501 end_pos = length;
502 }
503 if (end_pos <= start_pos) {
504 *buffer = 0;
505 return;
506 }
507 CopyToUtf8 fn_copy_to_utf8 = _pw_copy_to_utf8_variants[str->char_size];
508 fn_copy_to_utf8(
509 ptr + start_pos * str->char_size,
510 buffer,
511 end_pos - start_pos
512 );
513}