dte test coverage


Directory: ./
File: src/encoding.c
Date: 2024-12-21 16:03:22
Exec Total Coverage
Lines: 36 36 100.0%
Functions: 7 7 100.0%
Branches: 25 26 96.2%

Line Branch Exec Source
1 #include "encoding.h"
2 #include "util/ascii.h"
3 #include "util/bsearch.h"
4 #include "util/debug.h"
5 #include "util/intern.h"
6 #include "util/xstring.h"
7
8 typedef struct {
9 const char alias[8];
10 EncodingType encoding;
11 } EncodingAlias;
12
13 static const char encoding_names[][16] = {
14 [UTF8] = "UTF-8",
15 [UTF16BE] = "UTF-16BE",
16 [UTF16LE] = "UTF-16LE",
17 [UTF32BE] = "UTF-32BE",
18 [UTF32LE] = "UTF-32LE",
19 };
20
21 static const EncodingAlias encoding_aliases[] = {
22 {"UCS-2", UTF16BE},
23 {"UCS-2BE", UTF16BE},
24 {"UCS-2LE", UTF16LE},
25 {"UCS-4", UTF32BE},
26 {"UCS-4BE", UTF32BE},
27 {"UCS-4LE", UTF32LE},
28 {"UCS2", UTF16BE},
29 {"UCS4", UTF32BE},
30 {"UTF-16", UTF16BE},
31 {"UTF-32", UTF32BE},
32 {"UTF16", UTF16BE},
33 {"UTF16BE", UTF16BE},
34 {"UTF16LE", UTF16LE},
35 {"UTF32", UTF32BE},
36 {"UTF32BE", UTF32BE},
37 {"UTF32LE", UTF32LE},
38 {"UTF8", UTF8},
39 };
40
41 static const ByteOrderMark boms[] = {
42 [UTF8] = {{0xef, 0xbb, 0xbf}, 3},
43 [UTF16BE] = {{0xfe, 0xff}, 2},
44 [UTF16LE] = {{0xff, 0xfe}, 2},
45 [UTF32BE] = {{0x00, 0x00, 0xfe, 0xff}, 4},
46 [UTF32LE] = {{0xff, 0xfe, 0x00, 0x00}, 4},
47 };
48
49 18 UNITTEST {
50 18 CHECK_BSEARCH_ARRAY(encoding_aliases, alias, ascii_strcmp_icase);
51 18 CHECK_STRING_ARRAY(encoding_names);
52 18 static_assert(ARRAYLEN(encoding_names) == UNKNOWN_ENCODING);
53 18 static_assert(ARRAYLEN(boms) == UNKNOWN_ENCODING);
54 18 }
55
56 70 static int enc_alias_cmp(const void *key, const void *elem)
57 {
58 70 const EncodingAlias *a = key;
59 70 const char *name = elem;
60 70 return ascii_strcmp_icase(a->alias, name);
61 }
62
63 209 EncodingType lookup_encoding(const char *name)
64 {
65
2/2
✓ Branch 0 taken 33 times.
✓ Branch 1 taken 176 times.
209 if (likely(name == encoding_names[UTF8])) {
66 return UTF8;
67 }
68
69
2/2
✓ Branch 0 taken 109 times.
✓ Branch 1 taken 18 times.
127 for (size_t i = 0; i < ARRAYLEN(encoding_names); i++) {
70
2/2
✓ Branch 0 taken 15 times.
✓ Branch 1 taken 94 times.
109 if (ascii_streq_icase(name, encoding_names[i])) {
71 15 return (EncodingType) i;
72 }
73 }
74
75 18 const EncodingAlias *a = BSEARCH(name, encoding_aliases, enc_alias_cmp);
76
2/2
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 6 times.
18 return a ? a->encoding : UNKNOWN_ENCODING;
77 }
78
79 131 const char *encoding_from_type(EncodingType type)
80 {
81 131 BUG_ON(type >= UNKNOWN_ENCODING);
82
83 // There's no need to call str_intern() here; the names in the array
84 // can be considered static interns
85 131 return encoding_names[type];
86 }
87
88 53 const char *encoding_normalize(const char *name)
89 {
90 53 EncodingType type = lookup_encoding(name);
91
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 1 time.
53 if (type != UNKNOWN_ENCODING) {
92 52 return encoding_from_type(type);
93 }
94
95 char upper[256];
96 size_t n;
97
3/4
✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 1 time.
8 for (n = 0; n < sizeof(upper) && name[n]; n++) {
98 7 upper[n] = ascii_toupper(name[n]);
99 }
100
101 1 return mem_intern(upper, n);
102 }
103
104 47 EncodingType detect_encoding_from_bom(const unsigned char *buf, size_t size)
105 {
106 // Skip exhaustive checks if there's clearly no BOM
107
4/4
✓ Branch 0 taken 41 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 13 times.
✓ Branch 3 taken 28 times.
47 if (size < 2 || ((unsigned int)buf[0]) - 1 < 0xEE) {
108 return UNKNOWN_ENCODING;
109 }
110
111 // Iterate array backwards to ensure UTF32LE is checked before UTF16LE
112
2/2
✓ Branch 0 taken 53 times.
✓ Branch 1 taken 7 times.
60 for (size_t n = ARRAYLEN(boms), i = n - 1; i < n; i--) {
113 53 const unsigned int bom_len = boms[i].len;
114 53 BUG_ON(bom_len == 0);
115
4/4
✓ Branch 0 taken 42 times.
✓ Branch 1 taken 11 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 36 times.
53 if (size >= bom_len && mem_equal(buf, boms[i].bytes, bom_len)) {
116 6 return (EncodingType)i;
117 }
118 }
119
120 return UNKNOWN_ENCODING;
121 }
122
123 5 const ByteOrderMark *get_bom_for_encoding(EncodingType type)
124 {
125
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
5 return encoding_type_has_bom(type) ? &boms[type] : NULL;
126 }
127