Line | Branch | Exec | Source |
---|---|---|---|
1 | #include <stdbool.h> | ||
2 | #include <stdint.h> | ||
3 | #include "utf8.h" | ||
4 | #include "ascii.h" | ||
5 | #include "debug.h" | ||
6 | #include "numtostr.h" | ||
7 | |||
// Single-letter markers that keep the rows of seq_len_table aligned
enum {
    I = -1, // Invalid byte
    C = 0, // Continuation byte
};

// Indexed by the value of a byte; yields the length of the sequence
// that byte would start (1..4), or one of the markers defined above.
// Each row is anchored with a designator for its first index.
// https://en.wikipedia.org/wiki/UTF-8#Byte_map
// https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
static const int8_t seq_len_table[256] = {
    [0x00] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0x10] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0x20] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0x30] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0x40] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0x50] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0x60] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0x70] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    [0x80] = C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
    [0x90] = C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
    [0xA0] = C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
    [0xB0] = C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C,
    [0xC0] = I, I, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    [0xD0] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    [0xE0] = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    [0xF0] = 4, 4, 4, 4, 4, I, I, I, I, I, I, I, I, I, I, I,
};
33 | |||
34 | 116 | static int u_seq_len(unsigned char first_byte) | |
35 | { | ||
36 | 116 | int8_t len = seq_len_table[first_byte]; | |
37 | 116 | BUG_ON(len < I || len > UTF8_MAX_SEQ_LEN); | |
38 | 116 | return len; | |
39 | } | ||
40 | |||
41 | // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27288 | ||
// Return true if `u` has the bit pattern 10xxxxxx (i.e. 0x80..0xBF)
static bool u_is_continuation_byte(unsigned char u)
{
    // Only the two most significant bits matter; they must be 0b10
    const unsigned int top_bits = u >> 6;
    return top_bits == 2;
}
47 | |||
48 | // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 | ||
49 | // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G31703:~:text=%E2%80%9Cnon%2Dshortest%20form%E2%80%9D | ||
50 | // https://en.wikipedia.org/wiki/UTF-8#Overlong_encodings | ||
51 | // https://en.wikipedia.org/wiki/UTF-8#Error_handling | ||
52 | 48 | static bool u_is_overlong_sequence(CodePoint u, size_t seq_len) | |
53 | { | ||
54 | 48 | BUG_ON(seq_len > UTF8_MAX_SEQ_LEN); | |
55 | 48 | return u_char_size(u) != seq_len; | |
56 | } | ||
57 | |||
58 | /* | ||
59 | * Unicode §3.9.4: "A conformant encoding form conversion will treat any | ||
60 | * ill-formed code unit sequence as an error condition. (See conformance | ||
61 | * clause C10.)" | ||
62 | * | ||
63 | * C10: "When a process interprets a code unit sequence which purports | ||
64 | * to be in a Unicode character encoding form, it shall treat ill-formed | ||
65 | * code unit sequences as an error condition and shall not interpret such | ||
66 | * sequences as characters." | ||
67 | * | ||
68 | * Unicode §3.9.3: | ||
69 | * | ||
70 | * • "Before the Unicode Standard, Version 3.1, the problematic “non-shortest | ||
71 | * form” byte sequences in UTF-8 were those where BMP characters could be | ||
72 | * represented in more than one way. These sequences are ill-formed, because | ||
73 | * they are not allowed by Table 3-7." | ||
74 | * • "Because surrogate code points are not Unicode scalar values, any UTF-8 | ||
75 | * byte sequence that would otherwise map to code points U+D800..U+DFFF | ||
76 | * is ill-formed." | ||
77 | * | ||
78 | * See also: | ||
79 | * | ||
80 | * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G31737 | ||
81 | * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G23402 | ||
82 | * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 | ||
83 | * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G31703 | ||
84 | */ | ||
85 | 48 | static bool u_is_illformed(CodePoint u, size_t seq_len) | |
86 | { | ||
87 |
4/4✓ Branch 0 (3→4) taken 44 times.
✓ Branch 1 (3→6) taken 4 times.
✓ Branch 2 (4→5) taken 2 times.
✓ Branch 3 (4→6) taken 42 times.
|
48 | return u_is_overlong_sequence(u, seq_len) || u_is_surrogate(u); |
88 | } | ||
89 | |||
90 | /* | ||
91 | * Len Mask Note | ||
92 | * ------------------------------------------------- | ||
93 | * 1 0111 1111 Not supported by this function! | ||
94 | * 2 0001 1111 | ||
95 | * 3 0000 1111 | ||
96 | * 4 0000 0111 | ||
97 | * 5 0000 0011 Forbidden by RFC 3629 | ||
98 | * 6 0000 0001 Forbidden by RFC 3629 | ||
99 | */ | ||
100 | 56 | static unsigned int u_get_first_byte_mask(unsigned int seq_len) | |
101 | { | ||
102 | 56 | BUG_ON(seq_len < 2 || seq_len > UTF8_MAX_SEQ_LEN); | |
103 | 56 | return (0x80 >> seq_len) - 1; | |
104 | } | ||
105 | |||
106 | 50 | size_t u_str_width(const unsigned char *str) | |
107 | { | ||
108 | 50 | size_t i = 0, w = 0; | |
109 |
2/2✓ Branch 0 (5→3) taken 274 times.
✓ Branch 1 (5→6) taken 50 times.
|
324 | while (str[i]) { |
110 | 274 | w += u_char_width(u_str_get_char(str, &i)); | |
111 | } | ||
112 | 50 | return w; | |
113 | } | ||
114 | |||
115 | 18 | CodePoint u_prev_char(const unsigned char *str, size_t *idx) | |
116 | { | ||
117 | 18 | size_t i = *idx; | |
118 | 18 | unsigned char ch = str[--i]; | |
119 |
2/2✓ Branch 0 (2→3) taken 7 times.
✓ Branch 1 (2→4) taken 11 times.
|
18 | if (likely(ch < 0x80)) { |
120 | 7 | *idx = i; | |
121 | 7 | return (CodePoint)ch; | |
122 | } | ||
123 | |||
124 |
2/2✓ Branch 0 (4→5) taken 5 times.
✓ Branch 1 (4→6) taken 6 times.
|
11 | if (!u_is_continuation_byte(ch)) { |
125 | 5 | goto invalid; | |
126 | } | ||
127 | |||
128 | 6 | CodePoint u = ch & 0x3f; | |
129 |
1/2✓ Branch 0 (19→7) taken 15 times.
✗ Branch 1 (19→20) not taken.
|
15 | for (unsigned int count = 1, shift = 6; i > 0; ) { |
130 | 15 | ch = str[--i]; | |
131 | 15 | unsigned int len = u_seq_len(ch); | |
132 | 15 | count++; | |
133 |
2/2✓ Branch 0 (8→9) taken 9 times.
✓ Branch 1 (8→12) taken 6 times.
|
15 | if (len == 0) { |
134 |
1/2✗ Branch 0 (9→10) not taken.
✓ Branch 1 (9→11) taken 9 times.
|
9 | if (count == 4) { |
135 | // Too long sequence | ||
136 | break; | ||
137 | } | ||
138 | 9 | u |= (ch & 0x3f) << shift; | |
139 | 9 | shift += 6; | |
140 |
1/2✗ Branch 0 (12→13) not taken.
✓ Branch 1 (12→14) taken 6 times.
|
6 | } else if (count != len) { |
141 | // Incorrect length | ||
142 | break; | ||
143 | } else { | ||
144 | 6 | u |= (ch & u_get_first_byte_mask(len)) << shift; | |
145 |
1/2✗ Branch 0 (16→17) not taken.
✓ Branch 1 (16→18) taken 6 times.
|
6 | if (u_is_illformed(u, len)) { |
146 | break; | ||
147 | } | ||
148 | 6 | *idx = i; | |
149 | 6 | return u; | |
150 | } | ||
151 | } | ||
152 | |||
153 | ✗ | invalid: | |
154 | 5 | *idx = *idx - 1; | |
155 | 5 | u = str[*idx]; | |
156 | 5 | return -u; | |
157 | } | ||
158 | |||
159 | 14659 | CodePoint u_get_char(const unsigned char *str, size_t size, size_t *idx) | |
160 | { | ||
161 | 14659 | size_t i = *idx; | |
162 | 14659 | CodePoint u = str[i]; | |
163 |
2/2✓ Branch 0 (2→3) taken 14560 times.
✓ Branch 1 (2→4) taken 99 times.
|
14659 | if (likely(u < 0x80)) { |
164 | 14560 | *idx = i + 1; | |
165 | 14560 | return u; | |
166 | } | ||
167 | 99 | return u_get_nonascii(str, size, idx); | |
168 | } | ||
169 | |||
170 | 101 | CodePoint u_get_nonascii(const unsigned char *str, size_t size, size_t *idx) | |
171 | { | ||
172 | 101 | size_t i = *idx; | |
173 | 101 | unsigned int first = str[i++]; | |
174 | 101 | int seq_len = u_seq_len(first); | |
175 |
4/4✓ Branch 0 (3→4) taken 51 times.
✓ Branch 1 (3→5) taken 50 times.
✓ Branch 2 (4→5) taken 1 times.
✓ Branch 3 (4→6) taken 50 times.
|
101 | if (unlikely(seq_len < 2 || seq_len > size - i + 1)) { |
176 | 51 | goto invalid; | |
177 | } | ||
178 | |||
179 | 50 | unsigned int count = seq_len - 2; | |
180 | 50 | CodePoint u = first & u_get_first_byte_mask(seq_len); | |
181 | |||
182 | 116 | do { | |
183 | 116 | unsigned char ch = str[i++]; | |
184 |
2/2✓ Branch 0 (8→9) taken 8 times.
✓ Branch 1 (8→10) taken 108 times.
|
116 | if (!u_is_continuation_byte(ch)) { |
185 | 8 | goto invalid; | |
186 | } | ||
187 | 108 | u = (u << 6) | (ch & 0x3f); | |
188 |
2/2✓ Branch 0 (10→8) taken 66 times.
✓ Branch 1 (10→11) taken 42 times.
|
108 | } while (count--); |
189 | |||
190 |
2/2✓ Branch 0 (12→13) taken 6 times.
✓ Branch 1 (12→14) taken 36 times.
|
42 | if (u_is_illformed(u, seq_len)) { |
191 | 6 | goto invalid; | |
192 | } | ||
193 | |||
194 | 36 | *idx = i; | |
195 | 36 | return u; | |
196 | |||
197 | 65 | invalid: | |
198 | 65 | *idx += 1; | |
199 | 65 | return -first; | |
200 | } | ||
201 | |||
202 | 147 | size_t u_set_char_raw(char *buf, CodePoint u) | |
203 | { | ||
204 | 147 | unsigned int prefix = 0; | |
205 | 147 | size_t len = u_char_size(u); | |
206 | 147 | BUG_ON(len == 0 || len > UTF8_MAX_SEQ_LEN); | |
207 | |||
208 |
4/4✓ Branch 0 (4→5) taken 8 times.
✓ Branch 1 (4→6) taken 9 times.
✓ Branch 2 (4→7) taken 4 times.
✓ Branch 3 (4→8) taken 126 times.
|
147 | switch (len) { |
209 | 8 | case 4: | |
210 | 8 | buf[3] = (u & 0x3F) | 0x80; | |
211 | 8 | u >>= 6; | |
212 | 8 | prefix |= 0xF0; | |
213 | // Fallthrough | ||
214 | 17 | case 3: | |
215 | 17 | buf[2] = (u & 0x3F) | 0x80; | |
216 | 17 | u >>= 6; | |
217 | 17 | prefix |= 0xE0; | |
218 | // Fallthrough | ||
219 | 21 | case 2: | |
220 | 21 | buf[1] = (u & 0x3F) | 0x80; | |
221 | 21 | u >>= 6; | |
222 | 21 | prefix |= 0xC0; | |
223 | } | ||
224 | |||
225 | 147 | buf[0] = (u & 0xFF) | prefix; | |
226 | 147 | return len; | |
227 | } | ||
228 | |||
229 | 8517 | size_t u_set_char(char *buf, CodePoint u) | |
230 | { | ||
231 |
2/2✓ Branch 0 (2→3) taken 8498 times.
✓ Branch 1 (2→6) taken 19 times.
|
8517 | if (likely(u <= 0x7F)) { |
232 | 8498 | size_t i = 0; | |
233 |
2/2✓ Branch 0 (3→4) taken 8 times.
✓ Branch 1 (3→5) taken 8490 times.
|
8498 | if (unlikely(ascii_iscntrl(u))) { |
234 | // Use caret notation for control chars: | ||
235 | 8 | buf[i++] = '^'; | |
236 | 8 | u = (u + 64) & 0x7F; | |
237 | } | ||
238 | 8498 | buf[i++] = u; | |
239 | 8498 | return i; | |
240 | } | ||
241 | |||
242 |
2/2✓ Branch 0 (7→8) taken 8 times.
✓ Branch 1 (7→9) taken 11 times.
|
19 | if (u_is_unprintable(u)) { |
243 | 8 | return u_set_hex(buf, u); | |
244 | } | ||
245 | |||
246 | 11 | BUG_ON(u > 0x10FFFF); // (implied by !u_is_unprintable(u)) | |
247 | 11 | return u_set_char_raw(buf, u); | |
248 | } | ||
249 | |||
250 | 8 | size_t u_set_hex(char buf[U_SET_HEX_LEN], CodePoint u) | |
251 | { | ||
252 | 8 | buf[0] = '<'; | |
253 |
2/2✓ Branch 0 (2→3) taken 5 times.
✓ Branch 1 (2→4) taken 3 times.
|
8 | if (!u_is_unicode(u)) { |
254 | // Invalid byte (negated) | ||
255 | 5 | u *= -1; | |
256 | 5 | hex_encode_byte(buf + 1, u & 0xFF); | |
257 | } else { | ||
258 | 3 | buf[1] = '?'; | |
259 | 3 | buf[2] = '?'; | |
260 | } | ||
261 | 8 | buf[3] = '>'; | |
262 | 8 | return U_SET_HEX_LEN; | |
263 | } | ||
264 | |||
265 | // Return the number of bytes that must be skipped at the start of `str` | ||
266 | // in order to trim at least `skip_width` columns of display width. This | ||
267 | // can be used to e.g. obtain the longest suffix of `str` that can be | ||
268 | // displayed in a given number of columns. | ||
269 | 19 | size_t u_skip_chars(const char *str, unsigned int skip_width) | |
270 | { | ||
271 | 19 | size_t idx = 0; | |
272 |
4/4✓ Branch 0 (5→6) taken 93 times.
✓ Branch 1 (5→7) taken 4 times.
✓ Branch 2 (6→3) taken 78 times.
✓ Branch 3 (6→7) taken 15 times.
|
97 | for (unsigned int w = 0; str[idx] && w < skip_width; ) { |
273 | 78 | w += u_char_width(u_str_get_char(str, &idx)); | |
274 | } | ||
275 | 19 | return idx; | |
276 | } | ||
277 |