| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include <stdbool.h> | ||
| 2 | #include <stdint.h> | ||
| 3 | #include "utf8.h" | ||
| 4 | #include "ascii.h" | ||
| 5 | #include "debug.h" | ||
| 6 | #include "numtostr.h" | ||
| 7 | |||
// Symbolic (non-length) entries used in seq_len_table. Every other
// entry in the table is a positive sequence length in bytes.
enum {
    I = -1, // Invalid byte
    C = 0, // Continuation byte
};

// Expands to 16 identical table entries (one row of the byte map)
#define ROW16(v) v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v

// Maps every possible byte value to the length of the UTF-8 sequence
// it introduces, or to one of the markers above if the byte cannot
// start a sequence.
// https://en.wikipedia.org/wiki/UTF-8#Byte_map
// https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
static const int8_t seq_len_table[256] = {
    ROW16(1), ROW16(1), ROW16(1), ROW16(1), // 00..3F
    ROW16(1), ROW16(1), ROW16(1), ROW16(1), // 40..7F
    ROW16(C), ROW16(C), ROW16(C), ROW16(C), // 80..BF
    I, I, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..CF
    ROW16(2), // D0..DF
    ROW16(3), // E0..EF
    4, 4, 4, 4, 4, I, I, I, I, I, I, I, I, I, I, I // F0..FF
};

#undef ROW16
| 33 | |||
| 34 | 116 | static int u_seq_len(unsigned char first_byte) | |
| 35 | { | ||
| 36 | 116 | int8_t len = seq_len_table[first_byte]; | |
| 37 | 116 | BUG_ON(len < I || len > UTF8_MAX_SEQ_LEN); | |
| 38 | 116 | return len; | |
| 39 | } | ||
| 40 | |||
// Return true if the two most significant bits of `u` are `10`
// (i.e. the byte has the form 0b10xxxxxx)
// https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27288
static bool u_is_continuation_byte(unsigned char u)
{
    // Equivalent to (u & 0xC0) == 0x80
    const unsigned int top_two_bits = u >> 6;
    return top_two_bits == 0x2;
}
| 47 | |||
| 48 | // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 | ||
| 49 | // https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G31703:~:text=%E2%80%9Cnon%2Dshortest%20form%E2%80%9D | ||
| 50 | // https://en.wikipedia.org/wiki/UTF-8#Overlong_encodings | ||
| 51 | // https://en.wikipedia.org/wiki/UTF-8#Error_handling | ||
| 52 | 48 | static bool u_is_overlong_sequence(CodePoint u, size_t seq_len) | |
| 53 | { | ||
| 54 | 48 | BUG_ON(seq_len > UTF8_MAX_SEQ_LEN); | |
| 55 | 48 | return u_char_size(u) != seq_len; | |
| 56 | } | ||
| 57 | |||
| 58 | /* | ||
| 59 | * Unicode §3.9.4: "A conformant encoding form conversion will treat any | ||
| 60 | * ill-formed code unit sequence as an error condition. (See conformance | ||
| 61 | * clause C10.)" | ||
| 62 | * | ||
| 63 | * C10: "When a process interprets a code unit sequence which purports | ||
| 64 | * to be in a Unicode character encoding form, it shall treat ill-formed | ||
| 65 | * code unit sequences as an error condition and shall not interpret such | ||
| 66 | * sequences as characters." | ||
| 67 | * | ||
| 68 | * Unicode §3.9.3: | ||
| 69 | * | ||
| 70 | * • "Before the Unicode Standard, Version 3.1, the problematic “non-shortest | ||
| 71 | * form” byte sequences in UTF-8 were those where BMP characters could be | ||
| 72 | * represented in more than one way. These sequences are ill-formed, because | ||
| 73 | * they are not allowed by Table 3-7." | ||
| 74 | * • "Because surrogate code points are not Unicode scalar values, any UTF-8 | ||
| 75 | * byte sequence that would otherwise map to code points U+D800..U+DFFF | ||
| 76 | * is ill-formed." | ||
| 77 | * | ||
| 78 | * See also: | ||
| 79 | * | ||
| 80 | * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G31737 | ||
| 81 | * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G23402 | ||
| 82 | * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 | ||
| 83 | * • https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G31703 | ||
| 84 | */ | ||
| 85 | 48 | static bool u_is_illformed(CodePoint u, size_t seq_len) | |
| 86 | { | ||
| 87 |
4/4✓ Branch 0 (3→4) taken 44 times.
✓ Branch 1 (3→6) taken 4 times.
✓ Branch 2 (4→5) taken 2 times.
✓ Branch 3 (4→6) taken 42 times.
|
48 | return u_is_overlong_sequence(u, seq_len) || u_is_surrogate(u); |
| 88 | } | ||
| 89 | |||
| 90 | /* | ||
| 91 | * Len Mask Note | ||
| 92 | * ------------------------------------------------- | ||
| 93 | * 1 0111 1111 Not supported by this function! | ||
| 94 | * 2 0001 1111 | ||
| 95 | * 3 0000 1111 | ||
| 96 | * 4 0000 0111 | ||
| 97 | * 5 0000 0011 Forbidden by RFC 3629 | ||
| 98 | * 6 0000 0001 Forbidden by RFC 3629 | ||
| 99 | */ | ||
| 100 | 56 | static unsigned int u_get_first_byte_mask(unsigned int seq_len) | |
| 101 | { | ||
| 102 | 56 | BUG_ON(seq_len < 2 || seq_len > UTF8_MAX_SEQ_LEN); | |
| 103 | 56 | return (0x80 >> seq_len) - 1; | |
| 104 | } | ||
| 105 | |||
// Return the sum of u_char_width() over every character decoded
// from the NUL-terminated string `str`
size_t u_str_width(const char *str)
{
    size_t width = 0;

    // u_str_get_char() advances `pos` past each decoded character
    for (size_t pos = 0; str[pos] != '\0'; ) {
        width += u_char_width(u_str_get_char(str, &pos));
    }

    return width;
}
| 114 | |||
| 115 | 19 | CodePoint u_prev_char(const char *str, size_t *idx) | |
| 116 | { | ||
| 117 | 19 | size_t i = *idx; | |
| 118 | 19 | unsigned char ch = str[--i]; | |
| 119 |
2/2✓ Branch 0 (2→3) taken 8 times.
✓ Branch 1 (2→4) taken 11 times.
|
19 | if (likely(ch < 0x80)) { |
| 120 | 8 | *idx = i; | |
| 121 | 8 | return (CodePoint)ch; | |
| 122 | } | ||
| 123 | |||
| 124 |
2/2✓ Branch 0 (4→5) taken 5 times.
✓ Branch 1 (4→6) taken 6 times.
|
11 | if (!u_is_continuation_byte(ch)) { |
| 125 | 5 | goto invalid; | |
| 126 | } | ||
| 127 | |||
| 128 | 6 | CodePoint u = ch & 0x3f; | |
| 129 |
1/2✓ Branch 0 (19→7) taken 15 times.
✗ Branch 1 (19→20) not taken.
|
15 | for (unsigned int count = 1, shift = 6; i > 0; ) { |
| 130 | 15 | ch = str[--i]; | |
| 131 | 15 | unsigned int len = u_seq_len(ch); | |
| 132 | 15 | count++; | |
| 133 |
2/2✓ Branch 0 (8→9) taken 9 times.
✓ Branch 1 (8→12) taken 6 times.
|
15 | if (len == 0) { |
| 134 |
1/2✗ Branch 0 (9→10) not taken.
✓ Branch 1 (9→11) taken 9 times.
|
9 | if (count == 4) { |
| 135 | // Too long sequence | ||
| 136 | break; | ||
| 137 | } | ||
| 138 | 9 | u |= (ch & 0x3f) << shift; | |
| 139 | 9 | shift += 6; | |
| 140 |
1/2✗ Branch 0 (12→13) not taken.
✓ Branch 1 (12→14) taken 6 times.
|
6 | } else if (count != len) { |
| 141 | // Incorrect length | ||
| 142 | break; | ||
| 143 | } else { | ||
| 144 | 6 | u |= (ch & u_get_first_byte_mask(len)) << shift; | |
| 145 |
1/2✗ Branch 0 (16→17) not taken.
✓ Branch 1 (16→18) taken 6 times.
|
6 | if (u_is_illformed(u, len)) { |
| 146 | break; | ||
| 147 | } | ||
| 148 | 6 | *idx = i; | |
| 149 | 6 | return u; | |
| 150 | } | ||
| 151 | } | ||
| 152 | |||
| 153 | ✗ | invalid: | |
| 154 | 5 | *idx = *idx - 1; | |
| 155 | 5 | u = (unsigned char)str[*idx]; | |
| 156 | 5 | return -u; | |
| 157 | } | ||
| 158 | |||
| 159 | 15159 | CodePoint u_get_char(const char *str, size_t size, size_t *idx) | |
| 160 | { | ||
| 161 | 15159 | size_t i = *idx; | |
| 162 | 15159 | CodePoint u = (unsigned char)str[i]; | |
| 163 |
2/2✓ Branch 0 (2→3) taken 15060 times.
✓ Branch 1 (2→4) taken 99 times.
|
15159 | if (likely(u < 0x80)) { |
| 164 | 15060 | *idx = i + 1; | |
| 165 | 15060 | return u; | |
| 166 | } | ||
| 167 | 99 | return u_get_nonascii(str, size, idx); | |
| 168 | } | ||
| 169 | |||
| 170 | 101 | CodePoint u_get_nonascii(const char *str, size_t size, size_t *idx) | |
| 171 | { | ||
| 172 | 101 | size_t i = *idx; | |
| 173 | 101 | unsigned int first = (unsigned char)str[i++]; | |
| 174 | 101 | int seq_len = u_seq_len(first); | |
| 175 |
4/4✓ Branch 0 (3→4) taken 51 times.
✓ Branch 1 (3→5) taken 50 times.
✓ Branch 2 (4→5) taken 1 times.
✓ Branch 3 (4→6) taken 50 times.
|
101 | if (unlikely(seq_len < 2 || seq_len > size - i + 1)) { |
| 176 | 51 | goto invalid; | |
| 177 | } | ||
| 178 | |||
| 179 | 50 | unsigned int count = seq_len - 2; | |
| 180 | 50 | CodePoint u = first & u_get_first_byte_mask(seq_len); | |
| 181 | |||
| 182 | 116 | do { | |
| 183 | 116 | unsigned char ch = str[i++]; | |
| 184 |
2/2✓ Branch 0 (8→9) taken 8 times.
✓ Branch 1 (8→10) taken 108 times.
|
116 | if (!u_is_continuation_byte(ch)) { |
| 185 | 8 | goto invalid; | |
| 186 | } | ||
| 187 | 108 | u = (u << 6) | (ch & 0x3f); | |
| 188 |
2/2✓ Branch 0 (10→8) taken 66 times.
✓ Branch 1 (10→11) taken 42 times.
|
108 | } while (count--); |
| 189 | |||
| 190 |
2/2✓ Branch 0 (12→13) taken 6 times.
✓ Branch 1 (12→14) taken 36 times.
|
42 | if (u_is_illformed(u, seq_len)) { |
| 191 | 6 | goto invalid; | |
| 192 | } | ||
| 193 | |||
| 194 | 36 | *idx = i; | |
| 195 | 36 | return u; | |
| 196 | |||
| 197 | 65 | invalid: | |
| 198 | 65 | *idx += 1; | |
| 199 | 65 | return -first; | |
| 200 | } | ||
| 201 | |||
| 202 | 147 | size_t u_set_char_raw(char *buf, CodePoint u) | |
| 203 | { | ||
| 204 | 147 | unsigned char *ubuf = (unsigned char*)buf; | |
| 205 | 147 | unsigned int prefix = 0; | |
| 206 | 147 | size_t len = u_char_size(u); | |
| 207 | 147 | BUG_ON(len == 0 || len > UTF8_MAX_SEQ_LEN); | |
| 208 | |||
| 209 |
4/4✓ Branch 0 (4→5) taken 8 times.
✓ Branch 1 (4→6) taken 9 times.
✓ Branch 2 (4→7) taken 4 times.
✓ Branch 3 (4→8) taken 126 times.
|
147 | switch (len) { |
| 210 | 8 | case 4: | |
| 211 | 8 | ubuf[3] = (u & 0x3F) | 0x80; | |
| 212 | 8 | u >>= 6; | |
| 213 | 8 | prefix |= 0xF0; | |
| 214 | // Fallthrough | ||
| 215 | 17 | case 3: | |
| 216 | 17 | ubuf[2] = (u & 0x3F) | 0x80; | |
| 217 | 17 | u >>= 6; | |
| 218 | 17 | prefix |= 0xE0; | |
| 219 | // Fallthrough | ||
| 220 | 21 | case 2: | |
| 221 | 21 | ubuf[1] = (u & 0x3F) | 0x80; | |
| 222 | 21 | u >>= 6; | |
| 223 | 21 | prefix |= 0xC0; | |
| 224 | } | ||
| 225 | |||
| 226 | 147 | ubuf[0] = (u & 0xFF) | prefix; | |
| 227 | 147 | return len; | |
| 228 | } | ||
| 229 | |||
| 230 | 8529 | size_t u_set_char(char *buf, CodePoint u) | |
| 231 | { | ||
| 232 |
2/2✓ Branch 0 (2→3) taken 8510 times.
✓ Branch 1 (2→6) taken 19 times.
|
8529 | if (likely(u <= 0x7F)) { |
| 233 | 8510 | size_t i = 0; | |
| 234 |
2/2✓ Branch 0 (3→4) taken 8 times.
✓ Branch 1 (3→5) taken 8502 times.
|
8510 | if (unlikely(ascii_iscntrl(u))) { |
| 235 | // Use caret notation for control chars: | ||
| 236 | 8 | buf[i++] = '^'; | |
| 237 | 8 | u = (u + 64) & 0x7F; | |
| 238 | } | ||
| 239 | 8510 | buf[i++] = u; | |
| 240 | 8510 | return i; | |
| 241 | } | ||
| 242 | |||
| 243 |
2/2✓ Branch 0 (7→8) taken 8 times.
✓ Branch 1 (7→9) taken 11 times.
|
19 | if (u_is_unprintable(u)) { |
| 244 | 8 | return u_set_hex(buf, u); | |
| 245 | } | ||
| 246 | |||
| 247 | 11 | BUG_ON(u > 0x10FFFF); // (implied by !u_is_unprintable(u)) | |
| 248 | 11 | return u_set_char_raw(buf, u); | |
| 249 | } | ||
| 250 | |||
| 251 | 8 | size_t u_set_hex(char buf[static U_SET_HEX_LEN], CodePoint u) | |
| 252 | { | ||
| 253 | 8 | buf[0] = '<'; | |
| 254 |
2/2✓ Branch 0 (2→3) taken 5 times.
✓ Branch 1 (2→4) taken 3 times.
|
8 | if (!u_is_unicode(u)) { |
| 255 | // Invalid byte (negated) | ||
| 256 | 5 | u *= -1; | |
| 257 | 5 | hex_encode_byte(buf + 1, u & 0xFF); | |
| 258 | } else { | ||
| 259 | 3 | buf[1] = '?'; | |
| 260 | 3 | buf[2] = '?'; | |
| 261 | } | ||
| 262 | 8 | buf[3] = '>'; | |
| 263 | 8 | return U_SET_HEX_LEN; | |
| 264 | } | ||
| 265 | |||
// Return the number of bytes that must be skipped at the start of `str`
// in order to trim at least `skip_width` columns of display width (or
// the whole string, if it's narrower than that). This can be used to
// e.g. obtain the longest suffix of `str` that can be displayed in a
// given number of columns.
size_t u_skip_chars(const char *str, unsigned int skip_width)
{
    size_t nbytes = 0;
    unsigned int trimmed = 0;

    for (;;) {
        if (str[nbytes] == '\0' || trimmed >= skip_width) {
            break;
        }
        // u_str_get_char() advances `nbytes` past the decoded character
        trimmed += u_char_width(u_str_get_char(str, &nbytes));
    }

    return nbytes;
}
| 278 |