Line | Branch | Exec | Source |
---|---|---|---|
1 | #ifndef UTIL_UTF8_H | ||
2 | #define UTIL_UTF8_H | ||
3 | |||
4 | #include <stddef.h> | ||
5 | #include "debug.h" | ||
6 | #include "macros.h" | ||
7 | #include "unicode.h" | ||
8 | |||
9 | // Minimum `dest_len` value needed by u_make_printable() to guarantee | ||
10 | // truncation cannot occur: up to U_SET_CHAR_MAXLEN bytes per source byte, plus 1 for the null terminator. `src_len` is parenthesized so that expression arguments (e.g. `a + b`) expand correctly | ||
11 | #define U_MAKE_PRINTABLE_MAXLEN(src_len) ((U_SET_CHAR_MAXLEN * (src_len)) + 1) | ||
12 | |||
13 | enum { | ||
14 | // Longest UTF-8 sequence (in bytes) permitted by RFC 3629 | ||
15 | // (maximum number of bytes written by u_set_char_raw()) | ||
16 | UTF8_MAX_SEQ_LEN = 4, // STRLEN(u8"\U0001F44D") | ||
17 | |||
18 | // Number of bytes written by u_set_hex() (a hex escape of the form "<ff>") | ||
19 | U_SET_HEX_LEN = 4, // STRLEN("<ff>") | ||
20 | |||
21 | // Maximum number of bytes written by u_set_char(); also the per-character bound that u_make_printable() and U_MAKE_PRINTABLE_MAXLEN() reserve in the destination buffer | ||
22 | U_SET_CHAR_MAXLEN = 4, // MAX(UTF8_MAX_SEQ_LEN, U_SET_HEX_LEN) | ||
23 | }; | ||
24 | |||
25 | typedef enum { /* Bit flags for the `flags` argument of u_make_printable() */ | ||
26 | // Replace C0 control characters (and DEL) with Unicode "control picture" | ||
27 | // symbols, instead of using caret notation (which can become | ||
28 | // quite ambiguous when formatting terminal escape sequences) | ||
29 | MPF_C0_SYMBOLS = 1 << 0, | ||
30 | } MakePrintableFlags; | ||
31 | |||
32 | size_t u_str_width(const unsigned char *str) NONNULL_ARGS; | ||
33 | size_t u_skip_chars(const char *str, unsigned int skip_width) NONNULL_ARGS; | ||
34 | CodePoint u_prev_char(const unsigned char *str, size_t *idx) NONNULL_ARGS READWRITE(2); /* may return a value above UNICODE_MAX_VALID_CODEPOINT for an invalid byte (see u_char_size()) */ | ||
35 | CodePoint u_get_char(const unsigned char *str, size_t size, size_t *idx) NONNULL_ARGS READWRITE(3); | ||
36 | CodePoint u_get_nonascii(const unsigned char *str, size_t size, size_t *idx) NONNULL_ARGS READWRITE(3); /* may return a value above UNICODE_MAX_VALID_CODEPOINT for an invalid byte (see u_char_size()) */ | ||
37 | size_t u_set_char_raw(char *buf, CodePoint u) NONNULL_ARGS; /* writes at most UTF8_MAX_SEQ_LEN bytes */ | ||
38 | size_t u_set_char(char *buf, CodePoint u) NONNULL_ARGS; /* writes at most U_SET_CHAR_MAXLEN bytes */ | ||
39 | size_t u_set_hex(char buf[U_SET_HEX_LEN], CodePoint u) NONNULL_ARGS; /* writes U_SET_HEX_LEN bytes */ | ||
40 | |||
41 | 402 | static inline CodePoint u_str_get_char(const unsigned char *str, size_t *idx) | |
42 | { | ||
43 | // Wrapper around u_get_char() for null-terminated strings. The size passed is a dummy (`*idx` plus the longest possible sequence length), since the null terminator in `str` | ||
44 | // guarantees u_get_char() won't read past the end | ||
45 | 402 | return u_get_char(str, *idx + UTF8_MAX_SEQ_LEN, idx); | |
46 | } | ||
47 | |||
48 | // Return the number of bytes needed to encode Unicode codepoint `u` | ||
49 | // in UTF-8, or 1 for codepoints exceeding UNICODE_MAX_VALID_CODEPOINT. | ||
50 | // Those in the latter category may have originated as values returned | ||
51 | // by u_get_nonascii() or u_prev_char() (i.e. invalid bytes in a | ||
52 | // sequence that have been negated). | ||
53 | 258 | static inline size_t u_char_size(CodePoint u) | |
54 | { | ||
55 | // If `u` is invalid, set `adj` to 3 (both low bits set from the 0-or-1 flag `inv`) and | ||
56 | // subtract it from the sum below, so that 1 is returned; for valid `u`, `adj` is 0 | ||
57 | 258 | size_t inv = (u > UNICODE_MAX_VALID_CODEPOINT); | |
58 | 258 | size_t adj = inv | (inv << 1); | |
59 | |||
60 |
2/2✓ Branch 0 (2→3) taken 169 times.
✓ Branch 1 (2→4) taken 89 times.
|
258 | return 1 + (u > 0x7F) + (u > 0x7FF) + (u > 0xFFFF) - adj; |
61 | } | ||
62 | |||
63 | /* | ||
64 | * Copy into `dest` the printable representation of `src`, escaping | ||
65 | * control characters and other unprintable sequences as necessary. | ||
66 | * Bytes >= 0x80 are assumed to be the start of a UTF-8 multi-byte | ||
67 | * sequence, if subsequent bytes result in a valid encoding, or are | ||
68 | * otherwise byte-wise escaped. This is similar in purpose to the | ||
69 | * BSD strnvisx(3) function, but produces a truncated string if the | ||
70 | * destination buffer has insufficient space. If `dest_len` is at | ||
71 | * least `U_MAKE_PRINTABLE_MAXLEN(src_len)`, truncation can never | ||
72 | * happen. `dest_len` must be non-zero (checked with BUG_ON()). If `flags` includes MPF_C0_SYMBOLS, codepoints below 0x20 are first mapped to U+2400..U+241F and 0x7F to U+2421. Returns the index in `dest` at which the null terminator was written. | ||
73 | */ | ||
74 | 13 | static inline size_t u_make_printable ( | |
75 | const char *restrict src, | ||
76 | size_t src_len, | ||
77 | char *restrict dest, | ||
78 | size_t dest_len, | ||
79 | MakePrintableFlags flags | ||
80 | ) { | ||
81 | 13 | BUG_ON(dest_len == 0); | |
82 | 13 | size_t len = 0; | |
83 | |||
84 |
4/4✓ Branch 0 (13→14) taken 8215 times.
✓ Branch 1 (13→15) taken 3 times.
✓ Branch 2 (14→5) taken 8205 times.
✓ Branch 3 (14→15) taken 10 times.
|
8218 | for (size_t i = 0; i < src_len && len + U_SET_CHAR_MAXLEN < dest_len; ) { |
85 | 8205 | CodePoint u = u_get_char(src, src_len, &i); | |
86 |
2/2✓ Branch 0 (6→7) taken 5 times.
✓ Branch 1 (6→11) taken 8200 times.
|
8205 | if (flags & MPF_C0_SYMBOLS) { |
87 |
4/4✓ Branch 0 (7→8) taken 1 times.
✓ Branch 1 (7→9) taken 4 times.
✓ Branch 2 (9→10) taken 1 times.
✓ Branch 3 (9→11) taken 3 times.
|
5 | u = (u < 0x20) ? u + 0x2400 : (u == 0x7F ? 0x2421 : u); |
88 | } | ||
89 | 8205 | len += u_set_char(dest + len, u); | |
90 | } | ||
91 | |||
92 | 13 | dest[len] = '\0'; | |
93 | 13 | return len; | |
94 | } | ||
95 | |||
96 | #endif | ||
97 |