Line | Branch | Exec | Source |
---|---|---|---|
1 | #ifndef UTIL_UTF8_H | ||
2 | #define UTIL_UTF8_H | ||
3 | |||
4 | #include <stddef.h> | ||
5 | #include "debug.h" | ||
6 | #include "macros.h" | ||
7 | #include "unicode.h" | ||
8 | |||
9 | // Minimum `dest_len` value needed by u_make_printable() to guarantee that | ||
10 | // truncation cannot occur (4 bytes per source byte, plus 1 for the NUL terminator). NOTE(review): `src_len` is not parenthesized in the expansion, so pass a simple expression (not e.g. `a + b`) | ||
11 | #define U_MAKE_PRINTABLE_MAXLEN(src_len) ((4 * src_len) + 1) | ||
12 | |||
13 | enum { | ||
14 | // Longest UTF-8 sequence (in bytes) permitted by RFC 3629 | ||
15 | // (also the maximum number of bytes written by u_set_char_raw()) | ||
16 | UTF8_MAX_SEQ_LEN = 4, // STRLEN(u8"\U0001F44D") | ||
17 | |||
18 | // Number of bytes written by u_set_hex() (the size of its `buf` parameter) | ||
19 | U_SET_HEX_LEN = 4, // STRLEN("<ff>") | ||
20 | |||
21 | // Maximum number of bytes written by u_set_char(); u_make_printable() requires more than this much free space in `dest` before each write | ||
22 | U_SET_CHAR_MAXLEN = 4, // MAX(UTF8_MAX_SEQ_LEN, U_SET_HEX_LEN) | ||
23 | }; | ||
24 | |||
25 | typedef enum { | ||
26 | // Replace C0 control characters with Unicode "control picture" | ||
27 | // symbols, instead of using caret notation (which can become | ||
28 | // quite ambiguous when formatting terminal escape sequences). In u_make_printable(), values below 0x20 map to U+2400 + value and 0x7F maps to U+2421. | ||
29 | MPF_C0_SYMBOLS = 1 << 0, | ||
30 | } MakePrintableFlags; | ||
31 | |||
32 | size_t u_str_width(const unsigned char *str); | ||
33 | size_t u_skip_chars(const char *str, int *width); | ||
34 | CodePoint u_prev_char(const unsigned char *str, size_t *idx); | ||
35 | CodePoint u_str_get_char(const unsigned char *str, size_t *idx); | ||
36 | CodePoint u_get_char(const unsigned char *str, size_t size, size_t *idx); | ||
37 | CodePoint u_get_nonascii(const unsigned char *str, size_t size, size_t *idx); | ||
38 | size_t u_set_char_raw(char *buf, CodePoint u); | ||
39 | size_t u_set_char(char *buf, CodePoint u); | ||
40 | size_t u_set_hex(char buf[U_SET_HEX_LEN], CodePoint u); | ||
41 | |||
42 | // Return the number of bytes needed to encode Unicode codepoint `u` | ||
43 | // in UTF-8, or 1 for codepoints exceeding UNICODE_MAX_VALID_CODEPOINT. | ||
44 | // Those in the latter category may have originated as values returned | ||
45 | // by u_get_nonascii() or u_prev_char() (i.e. invalid bytes in a | ||
46 | // sequence that have been negated). | ||
47 | 157 | static inline size_t u_char_size(CodePoint u) | |
48 | { | ||
49 | // `inv` is 1 if `u` is invalid (else 0), which makes `adj` 3 (else 0). | ||
50 | // Subtracting `adj` below cancels the 3 range comparisons, so that 1 is returned for invalid values (assumes UNICODE_MAX_VALID_CODEPOINT >= 0xFFFF). | ||
51 | 157 | size_t inv = (u > UNICODE_MAX_VALID_CODEPOINT); | |
52 | 157 | size_t adj = inv | (inv << 1); | |
53 | |||
54 |
2/2✓ Branch 0 taken 82 times.
✓ Branch 1 taken 75 times.
|
157 | return 1 + (u > 0x7F) + (u > 0x7FF) + (u > 0xFFFF) - adj; |
55 | } | ||
56 | |||
57 | /* | ||
58 | * Copy into `dest` the printable representation of `src`, escaping | ||
59 | * control characters and other unprintable sequences as necessary. | ||
60 | * Bytes >= 0x80 are assumed to be the start of a UTF-8 multi-byte | ||
61 | * sequence if the subsequent bytes result in a valid encoding, and are | ||
62 | * otherwise escaped byte-wise. This is similar in purpose to the | ||
63 | * BSD strnvisx(3) function, but produces a truncated string if the | ||
64 | * destination buffer has insufficient space (conversion stops as soon as `len + U_SET_CHAR_MAXLEN < dest_len` no longer holds). If `dest_len` is at | ||
65 | * least `U_MAKE_PRINTABLE_MAXLEN(src_len)`, truncation can never | ||
66 | * happen. If `flags` contains MPF_C0_SYMBOLS, codepoints below 0x20 are written as U+2400 + value and 0x7F as U+2421. `dest` is always NUL-terminated and the return value is the number of bytes written before the NUL. `dest_len` must be non-zero (checked with BUG_ON()). NOTE(review): `src` is `const char*` but is passed to u_get_char(), whose prototype in this header takes `const unsigned char*`; confirm the build tolerates the pointer-sign mismatch. | ||
67 | */ | ||
68 | 10 | static inline size_t u_make_printable ( | |
69 | const char *restrict src, | ||
70 | size_t src_len, | ||
71 | char *restrict dest, | ||
72 | size_t dest_len, | ||
73 | MakePrintableFlags flags | ||
74 | ) { | ||
75 | 10 | BUG_ON(dest_len == 0); | |
76 | 10 | size_t len = 0; | |
77 | |||
78 |
4/4✓ Branch 0 taken 17 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 9 times.
|
18 | for (size_t i = 0; i < src_len && len + U_SET_CHAR_MAXLEN < dest_len; ) { |
79 | 8 | CodePoint u = u_get_char(src, src_len, &i); | |
80 |
2/2✓ Branch 0 taken 5 times.
✓ Branch 1 taken 3 times.
|
8 | if (flags & MPF_C0_SYMBOLS) { |
81 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 3 times.
|
5 | u = (u < 0x20) ? u + 0x2400 : (u == 0x7F ? 0x2421 : u); |
82 | } | ||
83 | 8 | len += u_set_char(dest + len, u); | |
84 | } | ||
85 | |||
86 | 10 | dest[len] = '\0'; | |
87 | 10 | return len; | |
88 | } | ||
89 | |||
90 | #endif | ||
91 |