| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #ifndef UTIL_UTF8_H | ||
| 2 | #define UTIL_UTF8_H | ||
| 3 | |||
| 4 | #include <stddef.h> | ||
| 5 | #include "debug.h" | ||
| 6 | #include "macros.h" | ||
| 7 | #include "unicode.h" | ||
| 8 | |||
| 9 | // Minimum `dest_len` value needed by u_make_printable() to guarantee | ||
| 10 | // truncation cannot occur (U_SET_CHAR_MAXLEN bytes per source byte, plus the null terminator) | ||
| 11 | #define U_MAKE_PRINTABLE_MAXLEN(src_len) ((U_SET_CHAR_MAXLEN * (src_len)) + 1) | ||
| 12 | |||
| 13 | enum { | ||
| 14 | // Longest UTF-8 sequence (in bytes) permitted by RFC 3629 | ||
| 15 | // (maximum number of bytes written by u_set_char_raw()) | ||
| 16 | UTF8_MAX_SEQ_LEN = 4, // STRLEN(u8"\U0001F44D") | ||
| 17 | |||
| 18 | // Number of bytes written by u_set_hex() | ||
| 19 | U_SET_HEX_LEN = 4, // STRLEN("<ff>") | ||
| 20 | |||
| 21 | // Maximum number of bytes written by u_set_char() | ||
| 22 | U_SET_CHAR_MAXLEN = 4, // MAX(UTF8_MAX_SEQ_LEN, U_SET_HEX_LEN) | ||
| 23 | }; | ||
| 24 | |||
| 25 | typedef enum { | ||
| 26 | // Replace C0 control characters (and DEL) with Unicode "control picture" | ||
| 27 | // symbols (U+2400..U+241F, U+2421), instead of using caret notation (which can become | ||
| 28 | // quite ambiguous when formatting terminal escape sequences) | ||
| 29 | MPF_C0_SYMBOLS = 1 << 0, | ||
| 30 | } MakePrintableFlags; | ||
| 31 | |||
| 32 | size_t u_str_width(const char *str) NONNULL_ARGS; | ||
| 33 | size_t u_skip_chars(const char *str, unsigned int skip_width) NONNULL_ARGS; | ||
| 34 | CodePoint u_prev_char(const char *str, size_t *idx) NONNULL_ARGS READWRITE(2); | ||
| 35 | CodePoint u_get_char(const char *str, size_t size, size_t *idx) NONNULL_ARGS READWRITE(3); | ||
| 36 | CodePoint u_get_nonascii(const char *str, size_t size, size_t *idx) NONNULL_ARGS READWRITE(3); | ||
| 37 | size_t u_set_char_raw(char *buf, CodePoint u) NONNULL_ARGS; | ||
| 38 | size_t u_set_char(char *buf, CodePoint u) NONNULL_ARGS; | ||
| 39 | size_t u_set_hex(char buf[static U_SET_HEX_LEN], CodePoint u) NONNULL_ARGS; | ||
| 40 | |||
| 41 | 402 | static inline CodePoint u_str_get_char(const char *str, size_t *idx) | |
| 42 | { | ||
| 43 | // `str` must be null-terminated, so a dummy size (`*idx` plus the longest possible | ||
| 44 | // sequence) can be used: the terminator guarantees u_get_char() won't read past the end | ||
| 45 | 402 | return u_get_char(str, *idx + UTF8_MAX_SEQ_LEN, idx); | |
| 46 | } | ||
| 47 | |||
| 48 | // Return the number of bytes needed to encode Unicode codepoint `u` | ||
| 49 | // in UTF-8 (1 to 4), or 1 for codepoints exceeding | ||
| 50 | // UNICODE_MAX_VALID_CODEPOINT. Those in the latter category may have | ||
| 51 | // originated as values returned by u_get_nonascii() or u_prev_char() | ||
| 52 | // (i.e. invalid bytes in a sequence that have been negated). | ||
| 53 | 258 | static inline size_t u_char_size(CodePoint u) | |
| 54 | { | ||
| 55 | // If `u` is invalid, `inv` is 1 and `adj` is 3; this cancels the three comparisons | ||
| 56 | // below (all true, assuming UNICODE_MAX_VALID_CODEPOINT >= 0xFFFF) so that 1 is returned | ||
| 57 | 258 | size_t inv = (u > UNICODE_MAX_VALID_CODEPOINT); | |
| 58 | 258 | size_t adj = inv | (inv << 1); | |
| 59 | |||
| 60 |
2/2✓ Branch 2 → 3 taken 169 times.
✓ Branch 2 → 4 taken 89 times.
|
258 | return 1 + (u > 0x7F) + (u > 0x7FF) + (u > 0xFFFF) - adj; |
| 61 | } | ||
| 62 | |||
| 63 | /* | ||
| 64 | * Copy into `dest` the printable representation of `src`, escaping | ||
| 65 | * control characters and other unprintable sequences as necessary. | ||
| 66 | * Bytes >= 0x80 are assumed to be the start of a UTF-8 multi-byte | ||
| 67 | * sequence, if subsequent bytes result in a valid encoding, or are | ||
| 68 | * otherwise byte-wise escaped. Similar in purpose to BSD strnvisx(3), | ||
| 69 | * but produces a truncated string if `dest` has insufficient space; | ||
| 70 | * truncation can never happen if `dest_len` is at least | ||
| 71 | * `U_MAKE_PRINTABLE_MAXLEN(src_len)`. `dest_len` must be non-zero. | ||
| 72 | * Returns the length of `dest`, excluding the null terminator. | ||
| 73 | */ | ||
| 74 | 13 | static inline size_t u_make_printable ( | |
| 75 | const char *restrict src, | ||
| 76 | size_t src_len, | ||
| 77 | char *restrict dest, | ||
| 78 | size_t dest_len, | ||
| 79 | MakePrintableFlags flags | ||
| 80 | ) { | ||
| 81 | 13 | BUG_ON(dest_len == 0); | |
| 82 | 13 | size_t len = 0; | |
| 83 | |||
| 84 |
4/4✓ Branch 13 → 14 taken 8215 times.
✓ Branch 13 → 15 taken 3 times.
✓ Branch 14 → 5 taken 8205 times.
✓ Branch 14 → 15 taken 10 times.
|
8218 | for (size_t i = 0; i < src_len && len + U_SET_CHAR_MAXLEN < dest_len; ) { |
| 85 | 8205 | CodePoint u = u_get_char(src, src_len, &i); | |
| 86 |
2/2✓ Branch 6 → 7 taken 5 times.
✓ Branch 6 → 11 taken 8200 times.
|
8205 | if (flags & MPF_C0_SYMBOLS) { |
| 87 |
4/4✓ Branch 7 → 8 taken 1 time.
✓ Branch 7 → 9 taken 4 times.
✓ Branch 9 → 10 taken 1 time.
✓ Branch 9 → 11 taken 3 times.
|
5 | u = (u < 0x20) ? u + 0x2400 : (u == 0x7F ? 0x2421 : u); |
| 88 | } | ||
| 89 | 8205 | len += u_set_char(dest + len, u); | |
| 90 | } | ||
| 91 | |||
| 92 | 13 | dest[len] = '\0'; | |
| 93 | 13 | return len; | |
| 94 | } | ||
| 95 | |||
| 96 | #endif | ||
| 97 |