dte test coverage


Directory: ./
File: src/util/utf8.c
Date: 2025-05-08 15:05:54
Exec Total Coverage
Lines: 117 118 99.2%
Functions: 13 13 100.0%
Branches: 44 48 91.7%

Line Branch Exec Source
1 #include <stdbool.h>
2 #include <stdint.h>
3 #include "utf8.h"
4 #include "ascii.h"
5 #include "debug.h"
6 #include "numtostr.h"
7
// Sentinel entries used in seq_len_table (below) for byte values that
// cannot begin a UTF-8 sequence. Both are distinct from the real
// sequence lengths stored in that table (1 through 4).
8 enum {
9 I = -1, // Invalid byte
10 C = 0, // Continuation byte
11 };
12
13 // https://en.wikipedia.org/wiki/UTF-8#Byte_map
14 // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
// Lookup table indexed by the first byte of a sequence. Each entry is
// the total length (in bytes) of the sequence that byte introduces:
//
//  • 00..7F → 1 (single byte)
//  • 80..BF → C (continuation byte; cannot start a sequence)
//  • C0..C1 → I (invalid)
//  • C2..DF → 2
//  • E0..EF → 3
//  • F0..F4 → 4
//  • F5..FF → I (invalid)
15 static const int8_t seq_len_table[256] = {
16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
22 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
23 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
24 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // 80..8F
25 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // 90..9F
26 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // A0..AF
27 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // B0..BF
28 I, I, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..CF
29 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
30 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
31 4, 4, 4, 4, 4, I, I, I, I, I, I, I, I, I, I, I // F0..FF
32 };
33
// Look up `first_byte` in seq_len_table and return the stored value:
// a sequence length (1..4), C (0) for a continuation byte or I (-1)
// for an invalid byte.
34 116 static int u_seq_len(unsigned char first_byte)
35 {
36 116 int8_t len = seq_len_table[first_byte];
// Sanity check that the table entry lies in [I, UTF8_MAX_SEQ_LEN].
// NOTE(review): BUG_ON() comes from debug.h and presumably fires when
// its condition is true; confirm against that header.
37 116 BUG_ON(len < I || len > UTF8_MAX_SEQ_LEN);
38 116 return len;
39 }
40
41 // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27288
// Return true if `u` has the bit pattern 10xxxxxx, i.e. its two most
// significant bits are 1 and 0 respectively
42 127 static bool u_is_continuation_byte(unsigned char u)
43 {
44 // (u & 0b11000000) == 0b10000000
45 127 return (u & 0xC0) == 0x80;
46 }
47
48 // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
49 // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G31703:~:text=%E2%80%9Cnon%2Dshortest%20form%E2%80%9D
50 // https://en.wikipedia.org/wiki/UTF-8#Overlong_encodings
51 // https://en.wikipedia.org/wiki/UTF-8#Error_handling
// Return true if the decoded code point `u` came from a sequence whose
// length (`seq_len`) differs from u_char_size(u). u_char_size() is
// declared elsewhere; u_set_char_raw() in this file uses its result as
// the number of bytes it writes when encoding `u`.
52 48 static bool u_is_overlong_sequence(CodePoint u, size_t seq_len)
53 {
54 48 BUG_ON(seq_len > UTF8_MAX_SEQ_LEN);
55 48 return u_char_size(u) != seq_len;
56 }
57
58 /*
59 * Unicode §3.9.4: "A conformant encoding form conversion will treat any
60 * ill-formed code unit sequence as an error condition. (See conformance
61 * clause C10.)"
62 *
63 * C10: "When a process interprets a code unit sequence which purports
64 * to be in a Unicode character encoding form, it shall treat ill-formed
65 * code unit sequences as an error condition and shall not interpret such
66 * sequences as characters."
67 *
68 * Unicode §3.9.3:
69 *
70 * • "Before the Unicode Standard, Version 3.1, the problematic “non-shortest
71 * form” byte sequences in UTF-8 were those where BMP characters could be
72 * represented in more than one way. These sequences are ill-formed, because
73 * they are not allowed by Table 3-7."
74 * • "Because surrogate code points are not Unicode scalar values, any UTF-8
75 * byte sequence that would otherwise map to code points U+D800..U+DFFF
76 * is ill-formed."
77 *
78 * See also:
79 *
80 * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G31737
81 * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G23402
82 * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
83 * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G31703
84 */
85 48 static bool u_is_illformed(CodePoint u, size_t seq_len)
86 {
// A decoded sequence is rejected if either check below is true. The
// `||` short-circuits, so u_is_surrogate() is only evaluated when the
// sequence length matched u_char_size(u).
87
4/4
✓ Branch 0 (3→4) taken 44 times.
✓ Branch 1 (3→6) taken 4 times.
✓ Branch 2 (4→5) taken 2 times.
✓ Branch 3 (4→6) taken 42 times.
48 return u_is_overlong_sequence(u, seq_len) || u_is_surrogate(u);
88 }
89
90 /*
91 * Len Mask Note
92 * -------------------------------------------------
93 * 1 0111 1111 Not supported by this function!
94 * 2 0001 1111
95 * 3 0000 1111
96 * 4 0000 0111
97 * 5 0000 0011 Forbidden by RFC 3629
98 * 6 0000 0001 Forbidden by RFC 3629
99 */
// Return the bit mask that extracts the payload bits from the first
// byte of a sequence that is `seq_len` bytes long. Only lengths from 2
// to UTF8_MAX_SEQ_LEN are accepted (see the BUG_ON() check).
100 56 static unsigned int u_get_first_byte_mask(unsigned int seq_len)
101 {
102 56 BUG_ON(seq_len < 2 || seq_len > UTF8_MAX_SEQ_LEN);
// 0x80 >> seq_len is a single set bit; subtracting 1 sets every bit
// below it: 2 → 0x1F, 3 → 0x0F, 4 → 0x07
103 56 return (0x80 >> seq_len) - 1;
104 }
105
// Return the sum of u_char_width() over every code point decoded from
// the NUL-terminated string `str`
106 50 size_t u_str_width(const unsigned char *str)
107 {
108 50 size_t i = 0, w = 0;
// NOTE(review): the loop never modifies `i` itself; it relies on
// u_str_get_char() (declared elsewhere) advancing `i` via the pointer
// passed to it.
109
2/2
✓ Branch 0 (5→3) taken 274 times.
✓ Branch 1 (5→6) taken 50 times.
324 while (str[i]) {
110 274 w += u_char_width(u_str_get_char(str, &i));
111 }
112 50 return w;
113 }
114
// Decode the code point that ends immediately before str[*idx], by
// scanning backwards from that position.
//
// On success, *idx is set to the index of the first byte of the decoded
// sequence and the code point is returned. On failure, *idx is moved
// back by exactly one byte and the negated value of that byte is
// returned.
//
// The byte at str[*idx - 1] is read unconditionally, so callers
// presumably guarantee *idx > 0.
115 18 CodePoint u_prev_char(const unsigned char *str, size_t *idx)
116 {
117 18 size_t i = *idx;
118 18 unsigned char ch = str[--i];
// Fast path: a byte below 0x80 is returned as-is
119
2/2
✓ Branch 0 (2→3) taken 7 times.
✓ Branch 1 (2→4) taken 11 times.
18 if (likely(ch < 0x80)) {
120 7 *idx = i;
121 7 return (CodePoint)ch;
122 }
123
// The last byte of a multi-byte sequence must be a continuation byte
124
2/2
✓ Branch 0 (4→5) taken 5 times.
✓ Branch 1 (4→6) taken 6 times.
11 if (!u_is_continuation_byte(ch)) {
125 5 goto invalid;
126 }
127
// Low 6 payload bits come from the final continuation byte
128 6 CodePoint u = ch & 0x3f;
// `count` is the number of bytes consumed so far (including the final
// byte handled above) and `shift` is the bit position at which the next
// (earlier) byte's payload is inserted. The loop stops at the start of
// the buffer (i == 0) and then falls through to `invalid`.
129
1/2
✓ Branch 0 (19→7) taken 15 times.
✗ Branch 1 (19→20) not taken.
15 for (unsigned int count = 1, shift = 6; i > 0; ) {
130 15 ch = str[--i];
// u_seq_len() returns int; storing it in an unsigned variable turns
// I (-1) into a large value, which then fails the `count != len`
// comparison below
131 15 unsigned int len = u_seq_len(ch);
132 15 count++;
133
2/2
✓ Branch 0 (8→9) taken 9 times.
✓ Branch 1 (8→12) taken 6 times.
15 if (len == 0) {
// Another continuation byte: 4 bytes have been consumed without
// finding a lead byte
134
1/2
✗ Branch 0 (9→10) not taken.
✓ Branch 1 (9→11) taken 9 times.
9 if (count == 4) {
135 // Too long sequence
136 break;
137 }
138 9 u |= (ch & 0x3f) << shift;
139 9 shift += 6;
// Not a continuation byte: its table length must equal the number of
// bytes consumed (this also rejects single-byte and invalid bytes)
140
1/2
✗ Branch 0 (12→13) not taken.
✓ Branch 1 (12→14) taken 6 times.
6 } else if (count != len) {
141 // Incorrect length
142 break;
143 } else {
// Lead byte with matching length: merge its payload bits and validate
144 6 u |= (ch & u_get_first_byte_mask(len)) << shift;
145
1/2
✗ Branch 0 (16→17) not taken.
✓ Branch 1 (16→18) taken 6 times.
6 if (u_is_illformed(u, len)) {
146 break;
147 }
148 6 *idx = i;
149 6 return u;
150 }
151 }
152
// *idx has not been written on any path that reaches this label, so
// this steps back one byte from the caller's original position
153 invalid:
154 5 *idx = *idx - 1;
155 5 u = str[*idx];
156 5 return -u;
157 }
158
// Decode the code point starting at str[*idx] and advance *idx past it.
// Bytes below 0x80 are handled inline; everything else is delegated to
// u_get_nonascii(), which is the only place `size` is used.
//
// Note that str[*idx] is read without first comparing *idx to `size`.
159 14659 CodePoint u_get_char(const unsigned char *str, size_t size, size_t *idx)
160 {
161 14659 size_t i = *idx;
162 14659 CodePoint u = str[i];
163
2/2
✓ Branch 0 (2→3) taken 14560 times.
✓ Branch 1 (2→4) taken 99 times.
14659 if (likely(u < 0x80)) {
164 14560 *idx = i + 1;
165 14560 return u;
166 }
167 99 return u_get_nonascii(str, size, idx);
168 }
169
// Decode the multi-byte sequence starting at str[*idx], where `size` is
// the length of `str` in bytes.
//
// On success, *idx is advanced past the whole sequence and the decoded
// code point is returned. On failure, *idx is advanced by exactly one
// byte and the negated value of the first byte is returned.
170 101 CodePoint u_get_nonascii(const unsigned char *str, size_t size, size_t *idx)
171 {
172 101 size_t i = *idx;
173 101 unsigned int first = str[i++];
174 101 int seq_len = u_seq_len(first);
// seq_len < 2 covers invalid bytes (-1), continuation bytes (0) and
// single-byte values (1). Since `i` has already been incremented,
// size - i + 1 equals size - *idx, i.e. the number of bytes available
// from the first byte to the end of the buffer.
175
4/4
✓ Branch 0 (3→4) taken 51 times.
✓ Branch 1 (3→5) taken 50 times.
✓ Branch 2 (4→5) taken 1 times.
✓ Branch 3 (4→6) taken 50 times.
101 if (unlikely(seq_len < 2 || seq_len > size - i + 1)) {
176 51 goto invalid;
177 }
178
// The do/while below tests `count--` after each iteration, so starting
// from seq_len - 2 makes it run seq_len - 1 times (once for each byte
// following the first)
179 50 unsigned int count = seq_len - 2;
180 50 CodePoint u = first & u_get_first_byte_mask(seq_len);
181
182 116 do {
183 116 unsigned char ch = str[i++];
184
2/2
✓ Branch 0 (8→9) taken 8 times.
✓ Branch 1 (8→10) taken 108 times.
116 if (!u_is_continuation_byte(ch)) {
185 8 goto invalid;
186 }
// Append the 6 payload bits of this continuation byte
187 108 u = (u << 6) | (ch & 0x3f);
188
2/2
✓ Branch 0 (10→8) taken 66 times.
✓ Branch 1 (10→11) taken 42 times.
108 } while (count--);
189
190
2/2
✓ Branch 0 (12→13) taken 6 times.
✓ Branch 1 (12→14) taken 36 times.
42 if (u_is_illformed(u, seq_len)) {
191 6 goto invalid;
192 }
193
194 36 *idx = i;
195 36 return u;
196
// *idx has not been written before reaching this label, so only the
// first byte is consumed
197 65 invalid:
198 65 *idx += 1;
199 65 return -first;
200 }
201
// Encode `u` as UTF-8 into `buf` and return the number of bytes written
// (which is u_char_size(u)). Unlike u_set_char(), no caret or hex
// notation is applied here. No terminating NUL byte is written.
202 147 size_t u_set_char_raw(char *buf, CodePoint u)
203 {
204 147 unsigned int prefix = 0;
205 147 size_t len = u_char_size(u);
206 147 BUG_ON(len == 0 || len > UTF8_MAX_SEQ_LEN);
207
// The cases deliberately fall through, from the longest length down.
// Each one stores the lowest 6 bits of `u` as a continuation byte
// (last byte first) and shifts them out. The accumulated `prefix`
// ends up as 0xF0 for len 4, 0xE0 for len 3, 0xC0 for len 2 and stays
// 0 for len 1 (which matches no case).
208
4/4
✓ Branch 0 (4→5) taken 8 times.
✓ Branch 1 (4→6) taken 9 times.
✓ Branch 2 (4→7) taken 4 times.
✓ Branch 3 (4→8) taken 126 times.
147 switch (len) {
209 8 case 4:
210 8 buf[3] = (u & 0x3F) | 0x80;
211 8 u >>= 6;
212 8 prefix |= 0xF0;
213 // Fallthrough
214 17 case 3:
215 17 buf[2] = (u & 0x3F) | 0x80;
216 17 u >>= 6;
217 17 prefix |= 0xE0;
218 // Fallthrough
219 21 case 2:
220 21 buf[1] = (u & 0x3F) | 0x80;
221 21 u >>= 6;
222 21 prefix |= 0xC0;
223 }
224
// Whatever remains of `u` forms the payload of the first byte
225 147 buf[0] = (u & 0xFF) | prefix;
226 147 return len;
227 }
228
// Write a displayable representation of `u` to `buf` and return the
// number of bytes written:
//
//  • u <= 0x7F and ascii_iscntrl(u): '^' followed by (u + 64) & 0x7F
//  • any other u <= 0x7F: the byte itself
//  • u_is_unprintable(u): the output of u_set_hex()
//  • otherwise: the output of u_set_char_raw()
229 8517 size_t u_set_char(char *buf, CodePoint u)
230 {
231
2/2
✓ Branch 0 (2→3) taken 8498 times.
✓ Branch 1 (2→6) taken 19 times.
8517 if (likely(u <= 0x7F)) {
232 8498 size_t i = 0;
233
2/2
✓ Branch 0 (3→4) taken 8 times.
✓ Branch 1 (3→5) taken 8490 times.
8498 if (unlikely(ascii_iscntrl(u))) {
234 // Use caret notation for control chars:
// Adding 64 maps 0x00..0x1F onto '@'..'_'; the mask wraps 0x7F
// around to '?' (0x3F)
235 8 buf[i++] = '^';
236 8 u = (u + 64) & 0x7F;
237 }
238 8498 buf[i++] = u;
239 8498 return i;
240 }
241
242
2/2
✓ Branch 0 (7→8) taken 8 times.
✓ Branch 1 (7→9) taken 11 times.
19 if (u_is_unprintable(u)) {
243 8 return u_set_hex(buf, u);
244 }
245
246 11 BUG_ON(u > 0x10FFFF); // (implied by !u_is_unprintable(u))
247 11 return u_set_char_raw(buf, u);
248 }
249
// Write a bracketed placeholder for `u` to `buf` and return
// U_SET_HEX_LEN. The output is '<', two characters, then '>'; buf[3]
// is the highest index written.
//
// If u_is_unicode(u) is false, `u` is treated as a negated invalid byte
// (as returned by the decoding functions in this file on failure) and
// the two characters are produced by hex_encode_byte(). Otherwise the
// two characters are "??".
250 8 size_t u_set_hex(char buf[U_SET_HEX_LEN], CodePoint u)
251 {
252 8 buf[0] = '<';
253
2/2
✓ Branch 0 (2→3) taken 5 times.
✓ Branch 1 (2→4) taken 3 times.
8 if (!u_is_unicode(u)) {
254 // Invalid byte (negated)
// Undo the negation to recover the original byte value.
// NOTE(review): hex_encode_byte() is presumably what fills buf[1] and
// buf[2]; confirm against its declaration.
255 5 u *= -1;
256 5 hex_encode_byte(buf + 1, u & 0xFF);
257 } else {
258 3 buf[1] = '?';
259 3 buf[2] = '?';
260 }
261 8 buf[3] = '>';
262 8 return U_SET_HEX_LEN;
263 }
264
265 // Return the number of bytes that must be skipped at the start of `str`
266 // in order to trim at least `skip_width` columns of display width. This
267 // can be used to e.g. obtain the longest suffix of `str` that can be
268 // displayed in a given number of columns.
269 19 size_t u_skip_chars(const char *str, unsigned int skip_width)
270 {
271 19 size_t idx = 0;
// `w` accumulates the width of the characters skipped so far. The loop
// ends at the NUL terminator or as soon as w >= skip_width; since whole
// characters are consumed, `w` may end up larger than `skip_width`.
// NOTE(review): `idx` is presumably advanced by u_str_get_char()
// (declared elsewhere) via the pointer passed to it.
272
4/4
✓ Branch 0 (5→6) taken 93 times.
✓ Branch 1 (5→7) taken 4 times.
✓ Branch 2 (6→3) taken 78 times.
✓ Branch 3 (6→7) taken 15 times.
97 for (unsigned int w = 0; str[idx] && w < skip_width; ) {
273 78 w += u_char_width(u_str_get_char(str, &idx));
274 }
275 19 return idx;
276 }
277