dte test coverage


Directory: ./
File: src/util/utf8.h
Date: 2025-05-08 15:05:54
Exec Total Coverage
Lines: 16 16 100.0%
Functions: 3 3 100.0%
Branches: 12 12 100.0%

Line Branch Exec Source
1 #ifndef UTIL_UTF8_H
2 #define UTIL_UTF8_H
3
4 #include <stddef.h>
5 #include "debug.h"
6 #include "macros.h"
7 #include "unicode.h"
8
9 // Minimum `dest_len` value needed by u_make_printable() to guarantee
10 // truncation cannot occur
// (up to U_SET_CHAR_MAXLEN output bytes for each of the `src_len` input
// bytes, plus 1 byte for the null terminator).
// NOTE(review): `src_len` is not parenthesized in the expansion below, so
// an argument such as `a + b` would expand with the wrong precedence;
// pass a simple or already-parenthesized expression.
11 #define U_MAKE_PRINTABLE_MAXLEN(src_len) ((U_SET_CHAR_MAXLEN * src_len) + 1)
12
// Byte-length constants describing how much the u_set_*() functions
// declared in this header may write to their output buffers.
13 enum {
14 // Longest UTF-8 sequence (in bytes) permitted by RFC 3629
15 // (maximum number of bytes written by u_set_char_raw())
16 UTF8_MAX_SEQ_LEN = 4, // STRLEN(u8"\U0001F44D")
17
18 // Number of bytes written by u_set_hex()
19 U_SET_HEX_LEN = 4, // STRLEN("<ff>")
20
21 // Maximum number of bytes written by u_set_char()
// (hard-coded here; must stay equal to the larger of the two values above)
22 U_SET_CHAR_MAXLEN = 4, // MAX(UTF8_MAX_SEQ_LEN, U_SET_HEX_LEN)
23 };
24
// Bit flags accepted by the `flags` parameter of u_make_printable()
25 typedef enum {
26 // Replace C0 control characters with Unicode "control picture"
27 // symbols, instead of using caret notation (which can become
28 // quite ambiguous when formatting terminal escape sequences)
29 MPF_C0_SYMBOLS = 1 << 0,
30 } MakePrintableFlags;
31
// Functions declared here and defined outside this header. The notes
// below are limited to what the other comments in this file state;
// see the definitions for the full contracts.
32 size_t u_str_width(const unsigned char *str) NONNULL_ARGS;
33 size_t u_skip_chars(const char *str, unsigned int skip_width) NONNULL_ARGS;
// May return a value above UNICODE_MAX_VALID_CODEPOINT for an invalid
// byte (see the comment on u_char_size())
34 CodePoint u_prev_char(const unsigned char *str, size_t *idx) NONNULL_ARGS READWRITE(2);
// Reads from `str` at `*idx` and updates `*idx`; `size` bounds the read
// (presumably the total length of `str` in bytes -- confirm at definition)
35 CodePoint u_get_char(const unsigned char *str, size_t size, size_t *idx) NONNULL_ARGS READWRITE(3);
// May return a value above UNICODE_MAX_VALID_CODEPOINT for an invalid
// byte (see the comment on u_char_size())
36 CodePoint u_get_nonascii(const unsigned char *str, size_t size, size_t *idx) NONNULL_ARGS READWRITE(3);
// Writes at most UTF8_MAX_SEQ_LEN bytes to `buf`
37 size_t u_set_char_raw(char *buf, CodePoint u) NONNULL_ARGS;
// Writes at most U_SET_CHAR_MAXLEN bytes to `buf`
38 size_t u_set_char(char *buf, CodePoint u) NONNULL_ARGS;
// Writes U_SET_HEX_LEN bytes to `buf`
39 size_t u_set_hex(char buf[U_SET_HEX_LEN], CodePoint u) NONNULL_ARGS;
40
// Convenience wrapper around u_get_char() for null-terminated strings:
// decodes the character at `str[*idx]`, updates `*idx` via u_get_char()
// and returns the resulting codepoint. `str` must be null-terminated,
// since no real length is passed.
41 402 static inline CodePoint u_str_get_char(const unsigned char *str, size_t *idx)
42 {
43 // We can use a dummy size here, since the null terminator in `str`
44 // guarantees u_get_char() won't read past the end
// (the dummy size is the current index plus the longest possible sequence)
45 402 return u_get_char(str, *idx + UTF8_MAX_SEQ_LEN, idx);
46 }
47
48 // Return the number of bytes needed to encode Unicode codepoint `u`
49 // in UTF-8, or 1 for codepoints exceeding UNICODE_MAX_VALID_CODEPOINT.
50 // Those in the latter category may have originated as values returned
51 // by u_get_nonascii() or u_prev_char() (i.e. invalid bytes in a
52 // sequence that have been negated).
53 258 static inline size_t u_char_size(CodePoint u)
54 {
55 // If `u` is invalid, set `adj` to 3 and use it to adjust the calculation
56 // so that 1 is returned
// (`inv` is 0 or 1, so `inv | (inv << 1)` is 0 or 3)
57 258 size_t inv = (u > UNICODE_MAX_VALID_CODEPOINT);
58 258 size_t adj = inv | (inv << 1);
59

2/2
✓ Branch 0 (2→3) taken 169 times.
✓ Branch 1 (2→4) taken 89 times.
// Each comparison adds 1 when `u` needs another byte: 1 byte up to 0x7F,
// 2 up to 0x7FF, 3 up to 0xFFFF and 4 above that. For an invalid `u` all
// three comparisons are true (assuming UNICODE_MAX_VALID_CODEPOINT is at
// least 0xFFFF), giving 4 - 3 = 1.
258 return 1 + (u > 0x7F) + (u > 0x7FF) + (u > 0xFFFF) - adj;
61 }
62
63 /*
64 * Copy into `dest` the printable representation of `src`, escaping
65 * control characters and other unprintable sequences as necessary.
66 * Bytes >= 0x80 are assumed to be the start of a UTF-8 multi-byte
67 * sequence, if subsequent bytes result in a valid encoding, or are
68 * otherwise byte-wise escaped. This is similar in purpose to the
69 * BSD strnvisx(3) function, but produces a truncated string if the
70 * destination buffer has insufficient space. If `dest_len` is at
71 * least `U_MAKE_PRINTABLE_MAXLEN(src_len)`, truncation can never
72 * happen.
73 */
// Parameters:
//   src      - input bytes (need not be null-terminated; `src_len` bounds it)
//   src_len  - number of bytes in `src`
//   dest     - output buffer; always null-terminated on return
//   dest_len - size of `dest` in bytes; must be non-zero
//   flags    - MakePrintableFlags bits (only MPF_C0_SYMBOLS is tested here)
// Returns the number of bytes written to `dest`, not counting the
// null terminator.
74 13 static inline size_t u_make_printable (
75 const char *restrict src,
76 size_t src_len,
77 char *restrict dest,
78 size_t dest_len,
79 MakePrintableFlags flags
80 ) {
// A zero-length `dest` has no room for the terminator written at the end
// (BUG_ON() comes from debug.h; presumably an assertion -- confirm)
81 13 BUG_ON(dest_len == 0);
82 13 size_t len = 0;
83

4/4
✓ Branch 0 (13→14) taken 8215 times.
✓ Branch 1 (13→15) taken 3 times.
✓ Branch 2 (14→5) taken 8205 times.
✓ Branch 3 (14→15) taken 10 times.
// Stop when the input is exhausted or when `dest` no longer has room for
// a maximum-length u_set_char() output plus the null terminator. `i` is
// not incremented here; it is advanced through the pointer passed to
// u_get_char().
8218 for (size_t i = 0; i < src_len && len + U_SET_CHAR_MAXLEN < dest_len; ) {
// NOTE(review): `src` is `const char *` but u_get_char() is declared with
// a `const unsigned char *` parameter -- confirm the implicit pointer
// conversion is intended and accepted by the project's warning flags.
85 8205 CodePoint u = u_get_char(src, src_len, &i);
86
2/2
✓ Branch 0 (6→7) taken 5 times.
✓ Branch 1 (6→11) taken 8200 times.
8205 if (flags & MPF_C0_SYMBOLS) {
87
4/4
✓ Branch 0 (7→8) taken 1 times.
✓ Branch 1 (7→9) taken 4 times.
✓ Branch 2 (9→10) taken 1 times.
✓ Branch 3 (9→11) taken 3 times.
// Values below 0x20 map to 0x2400 + u and 0x7F maps to 0x2421 (the
// "control picture" symbols); every other value is left unchanged
5 u = (u < 0x20) ? u + 0x2400 : (u == 0x7F ? 0x2421 : u);
88 }
// u_set_char() returns the number of bytes it wrote at `dest + len`
89 8205 len += u_set_char(dest + len, u);
90 }
91
// The loop condition keeps `len` below `dest_len`, so this write is in bounds
92 13 dest[len] = '\0';
93 13 return len;
94 }
95
96 #endif
97