dte test coverage


Directory: ./
File: src/util/utf8.c
Date: 2024-12-21 16:03:22
Exec Total Coverage
Lines: 117 125 93.6%
Functions: 12 13 92.3%
Branches: 38 46 82.6%

Line Branch Exec Source
1 #include <stdbool.h>
2 #include <stdint.h>
3 #include "utf8.h"
4 #include "ascii.h"
5 #include "debug.h"
6 #include "numtostr.h"
7
8 enum {
9 I = -1, // Invalid byte
10 C = 0, // Continuation byte
11 };
12
13 // https://en.wikipedia.org/wiki/UTF-8#Byte_map
14 // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
15 static const int8_t seq_len_table[256] = {
16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
22 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
23 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
24 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // 80..8F
25 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // 90..9F
26 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // A0..AF
27 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, // B0..BF
28 I, I, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..CF
29 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
30 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
31 4, 4, 4, 4, 4, I, I, I, I, I, I, I, I, I, I, I // F0..FF
32 };
33
34 55 static int u_seq_len(unsigned char first_byte)
35 {
36 55 int8_t len = seq_len_table[first_byte];
37 55 BUG_ON(len < I || len > UTF8_MAX_SEQ_LEN);
38 55 return len;
39 }
40
41 // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27288
42 74 static bool u_is_continuation_byte(unsigned char u)
43 {
44 // (u & 0b11000000) == 0b10000000
45 74 return (u & 0xC0) == 0x80;
46 }
47
48 // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
49 // https://en.wikipedia.org/wiki/UTF-8#Overlong_encodings
50 // https://en.wikipedia.org/wiki/UTF-8#Error_handling
51 33 static bool u_seq_len_ok(CodePoint u, int len)
52 {
53 33 return u_char_size(u) == len;
54 }
55
56 /*
57 * Len Mask Note
58 * -------------------------------------------------
59 * 1 0111 1111 Not supported by this function!
60 * 2 0001 1111
61 * 3 0000 1111
62 * 4 0000 0111
63 * 5 0000 0011 Forbidden by RFC 3629
64 * 6 0000 0001 Forbidden by RFC 3629
65 */
66 35 static unsigned int u_get_first_byte_mask(unsigned int len)
67 {
68 35 BUG_ON(len < 2);
69 35 BUG_ON(len > UTF8_MAX_SEQ_LEN);
70 35 return (0x80 >> len) - 1;
71 }
72
73 49 size_t u_str_width(const unsigned char *str)
74 {
75 49 size_t i = 0, w = 0;
76
2/2
✓ Branch 0 taken 266 times.
✓ Branch 1 taken 49 times.
315 while (str[i]) {
77 266 w += u_char_width(u_str_get_char(str, &i));
78 }
79 49 return w;
80 }
81
82 14 CodePoint u_prev_char(const unsigned char *str, size_t *idx)
83 {
84 14 size_t i = *idx;
85 14 unsigned char ch = str[--i];
86
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 11 times.
14 if (likely(ch < 0x80)) {
87 3 *idx = i;
88 3 return (CodePoint)ch;
89 }
90
91
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 6 times.
11 if (!u_is_continuation_byte(ch)) {
92 5 goto invalid;
93 }
94
95 6 CodePoint u = ch & 0x3f;
96
1/2
✓ Branch 0 taken 15 times.
✗ Branch 1 not taken.
15 for (unsigned int count = 1, shift = 6; i > 0; ) {
97 15 ch = str[--i];
98 15 unsigned int len = u_seq_len(ch);
99 15 count++;
100
2/2
✓ Branch 0 taken 9 times.
✓ Branch 1 taken 6 times.
15 if (len == 0) {
101
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 9 times.
9 if (count == 4) {
102 // Sequence too long (4 continuation bytes with no lead byte)
103 break;
104 }
105 9 u |= (ch & 0x3f) << shift;
106 9 shift += 6;
107
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 } else if (count != len) {
108 // Incorrect length
109 break;
110 } else {
111 6 u |= (ch & u_get_first_byte_mask(len)) << shift;
112
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6 times.
6 if (!u_seq_len_ok(u, len)) {
113 break;
114 }
115 6 *idx = i;
116 6 return u;
117 }
118 }
119
120 invalid:
121 5 *idx = *idx - 1;
122 5 u = str[*idx];
123 5 return -u;
124 }
125
126 316 CodePoint u_str_get_char(const unsigned char *str, size_t *idx)
127 {
128 316 size_t i = *idx;
129 316 CodePoint u = str[i];
130
2/2
✓ Branch 0 taken 301 times.
✓ Branch 1 taken 15 times.
316 if (likely(u < 0x80)) {
131 301 *idx = i + 1;
132 301 return u;
133 }
134 15 return u_get_nonascii(str, i + UTF8_MAX_SEQ_LEN, idx);
135 }
136
137 1198 CodePoint u_get_char(const unsigned char *str, size_t size, size_t *idx)
138 {
139 1198 size_t i = *idx;
140 1198 CodePoint u = str[i];
141
2/2
✓ Branch 0 taken 1175 times.
✓ Branch 1 taken 23 times.
1198 if (likely(u < 0x80)) {
142 1175 *idx = i + 1;
143 1175 return u;
144 }
145 23 return u_get_nonascii(str, size, idx);
146 }
147
148 40 CodePoint u_get_nonascii(const unsigned char *str, size_t size, size_t *idx)
149 {
150 40 size_t i = *idx;
151 40 unsigned int first = str[i++];
152 40 int len = u_seq_len(first);
153
4/4
✓ Branch 0 taken 30 times.
✓ Branch 1 taken 10 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 29 times.
40 if (unlikely(len < 2 || len > size - i + 1)) {
154 11 goto invalid;
155 }
156
157 29 CodePoint u = first & u_get_first_byte_mask(len);
158 29 int c = len - 1;
159 63 do {
160 63 unsigned char ch = str[i++];
161
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 61 times.
63 if (!u_is_continuation_byte(ch)) {
162 2 goto invalid;
163 }
164 61 u = (u << 6) | (ch & 0x3f);
165
2/2
✓ Branch 0 taken 34 times.
✓ Branch 1 taken 27 times.
61 } while (--c);
166
167
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 25 times.
27 if (!u_seq_len_ok(u, len)) {
168 // Overlong encoding
169 2 goto invalid;
170 }
171
172 25 *idx = i;
173 25 return u;
174
175 15 invalid:
176 15 *idx += 1;
177 15 return -first;
178 }
179
180 89 size_t u_set_char_raw(char *buf, CodePoint u)
181 {
182 89 unsigned int prefix = 0;
183 89 size_t len = u_char_size(u);
184 89 BUG_ON(len == 0 || len > UTF8_MAX_SEQ_LEN);
185
186
4/4
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 9 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 68 times.
89 switch (len) {
187 8 case 4:
188 8 buf[3] = (u & 0x3F) | 0x80;
189 8 u >>= 6;
190 8 prefix |= 0xF0;
191 // Fallthrough
192 17 case 3:
193 17 buf[2] = (u & 0x3F) | 0x80;
194 17 u >>= 6;
195 17 prefix |= 0xE0;
196 // Fallthrough
197 21 case 2:
198 21 buf[1] = (u & 0x3F) | 0x80;
199 21 u >>= 6;
200 21 prefix |= 0xC0;
201 }
202
203 89 buf[0] = (u & 0xFF) | prefix;
204 89 return len;
205 }
206
207 320 size_t u_set_char(char *buf, CodePoint u)
208 {
209
2/2
✓ Branch 0 taken 301 times.
✓ Branch 1 taken 19 times.
320 if (likely(u <= 0x7F)) {
210 301 size_t i = 0;
211
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 296 times.
301 if (unlikely(ascii_iscntrl(u))) {
212 // Use caret notation for control chars:
213 5 buf[i++] = '^';
214 5 u = (u + 64) & 0x7F;
215 }
216 301 buf[i++] = u;
217 301 return i;
218 }
219
220
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 11 times.
19 if (u_is_unprintable(u)) {
221 8 return u_set_hex(buf, u);
222 }
223
224 11 BUG_ON(u > 0x10FFFF); // (implied by !u_is_unprintable(u))
225 11 return u_set_char_raw(buf, u);
226 }
227
228 8 size_t u_set_hex(char buf[U_SET_HEX_LEN], CodePoint u)
229 {
230 8 buf[0] = '<';
231
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 3 times.
8 if (!u_is_unicode(u)) {
232 // Invalid byte (negated)
233 5 u *= -1;
234 5 hex_encode_byte(buf + 1, u & 0xFF);
235 } else {
236 3 buf[1] = '?';
237 3 buf[2] = '?';
238 }
239 8 buf[3] = '>';
240 8 return U_SET_HEX_LEN;
241 }
242
243 /*
244 * Total width of skipped characters is stored back to @width.
245 *
246 * Stored @width can be 1 more than given width if the last skipped
247 * character was double width or even 3 more if the last skipped
248 * character was invalid (<xx>).
249 *
250 * Returns number of bytes skipped.
251 */
252 size_t u_skip_chars(const char *str, int *width)
253 {
254 int w = *width;
255 size_t idx = 0;
256 while (str[idx] && w > 0) {
257 w -= u_char_width(u_str_get_char(str, &idx));
258 }
259
260 // Add 1..3 if skipped 'too much' (the last char was double
261 // width or invalid (<xx>))
262 *width -= w;
263 return idx;
264 }
265