dte test coverage


Directory: ./
File: src/util/utf8.h
Date: 2024-12-21 16:03:22
Exec Total Coverage
Lines: 14 14 100.0%
Functions: 2 2 100.0%
Branches: 12 12 100.0%

Line Branch Exec Source
1 #ifndef UTIL_UTF8_H
2 #define UTIL_UTF8_H
3
4 #include <stddef.h>
5 #include "debug.h"
6 #include "macros.h"
7 #include "unicode.h"
8
9 // Minimum `dest_len` value needed by u_make_printable() to guarantee
10 // truncation cannot occur (`src_len` is parenthesized so that expression arguments expand correctly)
11 #define U_MAKE_PRINTABLE_MAXLEN(src_len) ((4 * (src_len)) + 1)
12
13 enum {
14 // Longest UTF-8 sequence (in bytes) permitted by RFC 3629
15 // (maximum number of bytes written by u_set_char_raw())
16 UTF8_MAX_SEQ_LEN = 4, // STRLEN(u8"\U0001F44D")
17
18 // Number of bytes written by u_set_hex() (also the declared size of its `buf` parameter)
19 U_SET_HEX_LEN = 4, // STRLEN("<ff>")
20
21 // Maximum number of bytes written by u_set_char(); u_make_printable() only calls it while more than this many bytes remain free in `dest`
22 U_SET_CHAR_MAXLEN = 4, // MAX(UTF8_MAX_SEQ_LEN, U_SET_HEX_LEN)
23 };
24
25 typedef enum {
26 // Replace C0 control characters with Unicode "control picture"
27 // symbols, instead of using caret notation (which can become
28 // quite ambiguous when formatting terminal escape sequences)
29 MPF_C0_SYMBOLS = 1 << 0,
30 } MakePrintableFlags; // Bit flags; tested with `&` in u_make_printable()
31
32 size_t u_str_width(const unsigned char *str);
33 size_t u_skip_chars(const char *str, int *width);
34 CodePoint u_prev_char(const unsigned char *str, size_t *idx); // May return a negated invalid byte (see u_char_size())
35 CodePoint u_str_get_char(const unsigned char *str, size_t *idx);
36 CodePoint u_get_char(const unsigned char *str, size_t size, size_t *idx);
37 CodePoint u_get_nonascii(const unsigned char *str, size_t size, size_t *idx); // May return a negated invalid byte (see u_char_size())
38 size_t u_set_char_raw(char *buf, CodePoint u); // Writes at most UTF8_MAX_SEQ_LEN bytes
39 size_t u_set_char(char *buf, CodePoint u); // Writes at most U_SET_CHAR_MAXLEN bytes
40 size_t u_set_hex(char buf[U_SET_HEX_LEN], CodePoint u); // Writes U_SET_HEX_LEN bytes
41
42 // Return the number of bytes needed to encode Unicode codepoint `u`
43 // in UTF-8, or 1 for codepoints exceeding UNICODE_MAX_VALID_CODEPOINT.
44 // Values in the latter category may have originated as values returned
45 // by u_get_nonascii() or u_prev_char() (i.e. invalid bytes in a
46 // sequence that have been negated).
47 157 static inline size_t u_char_size(CodePoint u)
48 {
49 // `inv` is 1 when `u` is invalid (0 otherwise), which makes `adj` 3 (or 0);
50 // subtracting `adj` from the sum below is what yields 1 for invalid values
51 157 size_t inv = (u > UNICODE_MAX_VALID_CODEPOINT);
52 157 size_t adj = inv | (inv << 1); // 0b11 if invalid, else 0
53
54
2/2
✓ Branch 0 taken 82 times.
✓ Branch 1 taken 75 times.
157 return 1 + (u > 0x7F) + (u > 0x7FF) + (u > 0xFFFF) - adj; // 1 byte, plus 1 for each length boundary exceeded
55 }
56
57 /*
58 * Copy into `dest` the printable representation of `src`, escaping
59 * control characters and other unprintable sequences as necessary.
60 * Bytes >= 0x80 are treated as the start of a UTF-8 multi-byte
61 * sequence if the subsequent bytes form a valid encoding, and are
62 * otherwise escaped byte-wise. This is similar in purpose to the BSD
63 * strnvisx(3) function, but produces a truncated string if `dest` has
64 * insufficient space; truncation can never happen if `dest_len` is at
65 * least `U_MAKE_PRINTABLE_MAXLEN(src_len)`. `dest_len` must be non-zero.
66 * Returns the length of the NUL-terminated result, excluding the NUL.
67 */
68 10 static inline size_t u_make_printable (
69 const char *restrict src,
70 size_t src_len,
71 char *restrict dest,
72 size_t dest_len,
73 MakePrintableFlags flags
74 ) {
75 10 BUG_ON(dest_len == 0); // At least 1 byte is needed for the terminator written below
76 10 size_t len = 0;
77
78
4/4
✓ Branch 0 taken 17 times.
✓ Branch 1 taken 1 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 9 times.
18 for (size_t i = 0; i < src_len && len + U_SET_CHAR_MAXLEN < dest_len; ) { // Continue only while `dest` has room for U_SET_CHAR_MAXLEN bytes plus the NUL
79 8 CodePoint u = u_get_char(src, src_len, &i); // Presumably advances `i`; the loop has no increment of its own
80
2/2
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 3 times.
8 if (flags & MPF_C0_SYMBOLS) {
81
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 3 times.
5 u = (u < 0x20) ? u + 0x2400 : (u == 0x7F ? 0x2421 : u); // Map 0x00..0x1F to U+2400..U+241F and 0x7F to U+2421
82 }
83 8 len += u_set_char(dest + len, u);
84 }
85
86 10 dest[len] = '\0';
87 10 return len;
88 }
89
90 #endif
91