Line | Branch | Exec | Source |
---|---|---|---|
1 | #include <stdbool.h> | ||
2 | #include <stdint.h> | ||
3 | #include "utf8.h" | ||
4 | #include "ascii.h" | ||
5 | #include "debug.h" | ||
6 | #include "numtostr.h" | ||
7 | |||
8 | /* Sentinel values stored in seq_len_table for bytes that cannot start a sequence. */ enum { | ||
9 | I = -1, // Invalid byte (table entry for 0xC0, 0xC1 and 0xF5..0xFF) | ||
10 | C = 0, // Continuation byte (table entry for 0x80..0xBF) | ||
11 | }; | ||
12 | |||
13 | // https://en.wikipedia.org/wiki/UTF-8#Byte_map | ||
14 | // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 | ||
15 | /* Indexed by the first byte of a sequence: 1..4 is the total sequence length in bytes, C marks a continuation byte and I marks a byte that has no valid use as a first byte. */ static const int8_t seq_len_table[256] = { | ||
16 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F | ||
17 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F | ||
18 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F | ||
19 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F | ||
20 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F | ||
21 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F | ||
22 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F | ||
23 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F | ||
24 | C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // 80..8F | ||
25 | C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // 90..9F | ||
26 | C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // A0..AF | ||
27 | C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // B0..BF | ||
28 | I, I, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..CF | ||
29 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF | ||
30 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF | ||
31 | 4, 4, 4, 4, 4, I, I, I, I, I, I, I, I, I, I, I // F0..FF | ||
32 | }; | ||
33 | |||
34 | 55 | /* Look up first_byte in seq_len_table. Returns 1..4 for a byte that starts a sequence, C (0) for a continuation byte or I (-1) for an invalid byte. */ static int u_seq_len(unsigned char first_byte) | |
35 | { | ||
36 | 55 | int8_t len = seq_len_table[first_byte]; | |
37 | 55 | BUG_ON(len < I || len > UTF8_MAX_SEQ_LEN); /* sanity check on the table contents */ | |
38 | 55 | return len; | |
39 | } | ||
40 | |||
41 | // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27288 | ||
42 | 74 | /* True if u has the bit pattern 10xxxxxx, i.e. lies in the range 0x80..0xBF. */ static bool u_is_continuation_byte(unsigned char u) | |
43 | { | ||
44 | // Keep only the top two bits and require them to be 10: (u & 0b11000000) == 0b10000000 | ||
45 | 74 | return (u & 0xC0) == 0x80; | |
46 | } | ||
47 | |||
48 | // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 | ||
49 | // https://en.wikipedia.org/wiki/UTF-8#Overlong_encodings | ||
50 | // https://en.wikipedia.org/wiki/UTF-8#Error_handling | ||
51 | 33 | /* True if len equals u_char_size(u). The decoders in this file call it after assembling a code point, to reject sequences whose byte count does not match the decoded value (e.g. overlong encodings). */ static bool u_seq_len_ok(CodePoint u, int len) | |
52 | { | ||
53 | 33 | return u_char_size(u) == len; | |
54 | } | ||
55 | |||
56 | /* | ||
57 | * Len Mask Note | ||
58 | * ------------------------------------------------- | ||
59 | * 1 0111 1111 Not supported by this function! | ||
60 | * 2 0001 1111 | ||
61 | * 3 0000 1111 | ||
62 | * 4 0000 0111 | ||
63 | * 5 0000 0011 Forbidden by RFC 3629 | ||
64 | * 6 0000 0001 Forbidden by RFC 3629 | ||
65 | */ | ||
66 | 35 | /* Return the mask that selects the payload bits of the first byte of a len-byte sequence (len 2 gives 0x1F, 3 gives 0x0F, 4 gives 0x07). len must be at least 2 and at most UTF8_MAX_SEQ_LEN. */ static unsigned int u_get_first_byte_mask(unsigned int len) | |
67 | { | ||
68 | 35 | BUG_ON(len < 2); | |
69 | 35 | BUG_ON(len > UTF8_MAX_SEQ_LEN); | |
70 | 35 | return (0x80 >> len) - 1; /* e.g. len 3: 0x80 >> 3 is 0x10, minus 1 is 0x0F */ | |
71 | } | ||
72 | |||
73 | 49 | /* Return the sum of u_char_width() over every character decoded from str, stopping at the first NUL byte. */ size_t u_str_width(const unsigned char *str) | |
74 | { | ||
75 | 49 | size_t i = 0, w = 0; | |
76 |
2/2✓ Branch 0 taken 266 times.
✓ Branch 1 taken 49 times.
|
315 | while (str[i]) { |
77 | 266 | w += u_char_width(u_str_get_char(str, &i)); /* u_str_get_char() advances i past the decoded character */ | |
78 | } | ||
79 | 49 | return w; | |
80 | } | ||
81 | |||
82 | 14 | /* Decode the character that ends just before str[*idx] by scanning backwards. On success *idx is moved to the first byte of that character and its code point is returned. If the preceding bytes do not form an accepted sequence, *idx is decremented by one and the negated value of that single byte is returned. str[*idx - 1] is read unconditionally, so the caller must ensure *idx > 0. */ CodePoint u_prev_char(const unsigned char *str, size_t *idx) | |
83 | { | ||
84 | 14 | size_t i = *idx; | |
85 | 14 | unsigned char ch = str[--i]; | |
86 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 11 times.
|
14 | if (likely(ch < 0x80)) { /* ASCII: a complete one-byte character */ |
87 | 3 | *idx = i; | |
88 | 3 | return (CodePoint)ch; | |
89 | } | ||
90 | |||
91 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 6 times.
|
11 | if (!u_is_continuation_byte(ch)) { /* a multi-byte sequence must end with a continuation byte */ |
92 | 5 | goto invalid; | |
93 | } | ||
94 | |||
95 | 6 | CodePoint u = ch & 0x3f; /* lowest 6 payload bits come from the last byte */ | |
96 |
1/2✓ Branch 0 taken 15 times.
✗ Branch 1 not taken.
|
15 | for (unsigned int count = 1, shift = 6; i > 0; ) { /* count: bytes consumed so far; shift: bit position for the next payload bits */ |
97 | 15 | ch = str[--i]; | |
98 | 15 | unsigned int len = u_seq_len(ch); | |
99 | 15 | count++; | |
100 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 6 times.
|
15 | if (len == 0) { /* C: another continuation byte, keep scanning backwards */ |
101 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 9 times.
|
9 | if (count == 4) { |
102 | // Too long sequence: four bytes consumed and still no first byte | ||
103 | break; | ||
104 | } | ||
105 | 9 | u |= (ch & 0x3f) << shift; | |
106 | 9 | shift += 6; | |
107 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | } else if (count != len) { |
108 | // Incorrect length (also catches invalid bytes: I (-1) becomes a huge value in the unsigned len) | ||
109 | break; | ||
110 | } else { | ||
111 | 6 | u |= (ch & u_get_first_byte_mask(len)) << shift; /* first byte supplies the highest payload bits */ | |
112 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
|
6 | if (!u_seq_len_ok(u, len)) { |
113 | break; | ||
114 | } | ||
115 | 6 | *idx = i; | |
116 | 6 | return u; | |
117 | } | ||
118 | } | ||
119 | |||
120 | ✗ | invalid: | |
121 | 5 | *idx = *idx - 1; /* step back over exactly one byte, whatever the scan consumed */ | |
122 | 5 | u = str[*idx]; | |
123 | 5 | return -u; /* negated byte value marks an invalid byte */ | |
124 | } | ||
125 | |||
126 | 316 | /* Decode the character at str[*idx] without an explicit buffer size and advance *idx past it. ASCII bytes are returned directly; anything else is delegated to u_get_nonascii(). */ CodePoint u_str_get_char(const unsigned char *str, size_t *idx) | |
127 | { | ||
128 | 316 | size_t i = *idx; | |
129 | 316 | CodePoint u = str[i]; | |
130 |
2/2✓ Branch 0 taken 301 times.
✓ Branch 1 taken 15 times.
|
316 | if (likely(u < 0x80)) { |
131 | 301 | *idx = i + 1; | |
132 | 301 | return u; | |
133 | } | ||
134 | 15 | return u_get_nonascii(str, i + UTF8_MAX_SEQ_LEN, idx); /* no buffer size is known here; with this bound the size check in u_get_nonascii() cannot fail for any length u_seq_len() accepts, so a truncated sequence is only caught when a non-continuation byte (such as a NUL terminator) is read */ | |
135 | } | ||
136 | |||
137 | 1198 | /* Decode the character at str[*idx] in a buffer of size bytes and advance *idx past it. ASCII bytes are returned directly; anything else is delegated to u_get_nonascii(). str[*idx] is read without a bounds check, so the caller must ensure *idx < size. */ CodePoint u_get_char(const unsigned char *str, size_t size, size_t *idx) | |
138 | { | ||
139 | 1198 | size_t i = *idx; | |
140 | 1198 | CodePoint u = str[i]; | |
141 |
2/2✓ Branch 0 taken 1175 times.
✓ Branch 1 taken 23 times.
|
1198 | if (likely(u < 0x80)) { |
142 | 1175 | *idx = i + 1; | |
143 | 1175 | return u; | |
144 | } | ||
145 | 23 | return u_get_nonascii(str, size, idx); | |
146 | } | ||
147 | |||
148 | 40 | /* Decode the sequence that starts at str[*idx] in a buffer of size bytes. On success *idx is advanced past the whole sequence and the code point is returned. On failure *idx is advanced by one byte and the negated first byte is returned. Failure means: the first byte has a table length below 2, the sequence would run past size, a following byte is not a continuation byte, or u_seq_len_ok() rejects the result. */ CodePoint u_get_nonascii(const unsigned char *str, size_t size, size_t *idx) | |
149 | { | ||
150 | 40 | size_t i = *idx; | |
151 | 40 | unsigned int first = str[i++]; | |
152 | 40 | int len = u_seq_len(first); | |
153 |
4/4✓ Branch 0 taken 30 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 29 times.
|
40 | if (unlikely(len < 2 || len > size - i + 1)) { /* i is already one past the first byte, so size - i + 1 is the byte count from the first byte to the end of the buffer */ |
154 | 11 | goto invalid; | |
155 | } | ||
156 | |||
157 | 29 | CodePoint u = first & u_get_first_byte_mask(len); | |
158 | 29 | int c = len - 1; /* number of continuation bytes still to read */ | |
159 | 63 | do { | |
160 | 63 | unsigned char ch = str[i++]; | |
161 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 61 times.
|
63 | if (!u_is_continuation_byte(ch)) { |
162 | 2 | goto invalid; | |
163 | } | ||
164 | 61 | u = (u << 6) | (ch & 0x3f); /* append the 6 payload bits of this continuation byte */ | |
165 |
2/2✓ Branch 0 taken 34 times.
✓ Branch 1 taken 27 times.
|
61 | } while (--c); |
166 | |||
167 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 25 times.
|
27 | if (!u_seq_len_ok(u, len)) { |
168 | // Overlong encoding (u_char_size(u) differs from the number of bytes consumed) | ||
169 | 2 | goto invalid; | |
170 | } | ||
171 | |||
172 | 25 | *idx = i; | |
173 | 25 | return u; | |
174 | |||
175 | 15 | invalid: | |
176 | 15 | *idx += 1; /* skip only the first byte, regardless of how many bytes were examined */ | |
177 | 15 | return -first; | |
178 | } | ||
179 | |||
180 | 89 | /* Encode u as UTF-8 into buf with no escaping and return the number of bytes written, which is u_char_size(u). No terminating NUL is written. */ size_t u_set_char_raw(char *buf, CodePoint u) | |
181 | { | ||
182 | 89 | unsigned int prefix = 0; | |
183 | 89 | size_t len = u_char_size(u); | |
184 | 89 | BUG_ON(len == 0 || len > UTF8_MAX_SEQ_LEN); | |
185 | |||
186 |
4/4✓ Branch 0 taken 8 times.
✓ Branch 1 taken 9 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 68 times.
|
89 | switch (len) { /* cases fall through on purpose: each one writes a trailing continuation byte from the low 6 bits of u, shifts u right by 6 and adds marker bits to prefix */ |
187 | 8 | case 4: | |
188 | 8 | buf[3] = (u & 0x3F) | 0x80; | |
189 | 8 | u >>= 6; | |
190 | 8 | prefix |= 0xF0; | |
191 | // Fallthrough | ||
192 | 17 | case 3: | |
193 | 17 | buf[2] = (u & 0x3F) | 0x80; | |
194 | 17 | u >>= 6; | |
195 | 17 | prefix |= 0xE0; | |
196 | // Fallthrough | ||
197 | 21 | case 2: | |
198 | 21 | buf[1] = (u & 0x3F) | 0x80; | |
199 | 21 | u >>= 6; | |
200 | 21 | prefix |= 0xC0; | |
201 | } | ||
202 | |||
203 | 89 | buf[0] = (u & 0xFF) | prefix; /* remaining high bits plus the length marker; prefix is still 0 when len is 1 */ | |
204 | 89 | return len; | |
205 | } | ||
206 | |||
207 | 320 | /* Write a displayable form of u to buf and return the number of bytes written: caret notation for ASCII control characters, the u_set_hex() placeholder when u_is_unprintable(u) is true, otherwise the raw UTF-8 encoding from u_set_char_raw(). */ size_t u_set_char(char *buf, CodePoint u) | |
208 | { | ||
209 |
2/2✓ Branch 0 taken 301 times.
✓ Branch 1 taken 19 times.
|
320 | if (likely(u <= 0x7F)) { |
210 | 301 | size_t i = 0; | |
211 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 296 times.
|
301 | if (unlikely(ascii_iscntrl(u))) { |
212 | // Use caret notation for control chars: | ||
213 | 5 | buf[i++] = '^'; | |
214 | 5 | u = (u + 64) & 0x7F; /* 0x00..0x1F become '@'..'_'; 0x7F would become '?' (assuming ascii_iscntrl() accepts 0x7F - confirm) */ | |
215 | } | ||
216 | 301 | buf[i++] = u; | |
217 | 301 | return i; | |
218 | } | ||
219 | |||
220 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 11 times.
|
19 | if (u_is_unprintable(u)) { |
221 | 8 | return u_set_hex(buf, u); | |
222 | } | ||
223 | |||
224 | 11 | BUG_ON(u > 0x10FFFF); // (implied by !u_is_unprintable(u)) | |
225 | 11 | return u_set_char_raw(buf, u); | |
226 | } | ||
227 | |||
228 | 8 | /* Write a bracketed placeholder for u into buf[0..3] and return U_SET_HEX_LEN. When u_is_unicode(u) is false the brackets enclose the output of hex_encode_byte() for the low byte of the un-negated value; otherwise they enclose two question marks. */ size_t u_set_hex(char buf[U_SET_HEX_LEN], CodePoint u) | |
229 | { | ||
230 | 8 | buf[0] = '<'; | |
231 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 3 times.
|
8 | if (!u_is_unicode(u)) { |
232 | // Invalid byte (negated), as returned by the decoders in this file for bytes they reject | ||
233 | 5 | u *= -1; /* undo the negation to recover the original byte value */ | |
234 | 5 | hex_encode_byte(buf + 1, u & 0xFF); /* presumably writes two hex digits to buf[1] and buf[2] - confirm */ | |
235 | } else { | ||
236 | 3 | buf[1] = '?'; | |
237 | 3 | buf[2] = '?'; | |
238 | } | ||
239 | 8 | buf[3] = '>'; | |
240 | 8 | return U_SET_HEX_LEN; | |
241 | } | ||
242 | |||
243 | /* | ||
244 | * Total width of skipped characters is stored back to @width. | ||
245 | * | ||
246 | * Stored @width can be 1 more than given width if the last skipped | ||
247 | * character was double width or even 3 more if the last skipped | ||
248 | * character was invalid (<xx>). | ||
249 | * | ||
250 | * Returns number of bytes skipped. | ||
251 | */ | ||
252 | ✗ | /* NOTE(review): str is const char * here but u_str_get_char() takes const unsigned char * - confirm the build suppresses or tolerates the pointer-sign diagnostic. */ size_t u_skip_chars(const char *str, int *width) | |
253 | { | ||
254 | ✗ | int w = *width; | |
255 | ✗ | size_t idx = 0; | |
256 | ✗ | while (str[idx] && w > 0) { | |
257 | ✗ | w -= u_char_width(u_str_get_char(str, &idx)); /* w can drop below zero when the last character is wider than the width that was left */ | |
258 | } | ||
259 | |||
260 | // Add 1..3 if skipped 'too much' (the last char was double | ||
261 | // width or invalid (<xx>)) | ||
262 | ✗ | *width -= w; /* w is the unused remainder: negative on overshoot (so *width grows), positive if the string ended early (so *width shrinks) */ | |
263 | ✗ | return idx; | |
264 | } | ||
265 |