Line | Branch | Exec | Source |
---|---|---|---|
1 | #include "encoding.h" | ||
2 | #include "util/ascii.h" | ||
3 | #include "util/bsearch.h" | ||
4 | #include "util/debug.h" | ||
5 | #include "util/intern.h" | ||
6 | #include "util/xstring.h" | ||
7 | |||
8 | typedef struct { | ||
9 | const char alias[8]; | ||
10 | EncodingType encoding; | ||
11 | } EncodingAlias; | ||
12 | |||
13 | static const char encoding_names[][16] = { | ||
14 | [UTF8] = "UTF-8", | ||
15 | [UTF16BE] = "UTF-16BE", | ||
16 | [UTF16LE] = "UTF-16LE", | ||
17 | [UTF32BE] = "UTF-32BE", | ||
18 | [UTF32LE] = "UTF-32LE", | ||
19 | }; | ||
20 | |||
21 | static const EncodingAlias encoding_aliases[] = { | ||
22 | {"UCS-2", UTF16BE}, | ||
23 | {"UCS-2BE", UTF16BE}, | ||
24 | {"UCS-2LE", UTF16LE}, | ||
25 | {"UCS-4", UTF32BE}, | ||
26 | {"UCS-4BE", UTF32BE}, | ||
27 | {"UCS-4LE", UTF32LE}, | ||
28 | {"UCS2", UTF16BE}, | ||
29 | {"UCS4", UTF32BE}, | ||
30 | {"UTF-16", UTF16BE}, | ||
31 | {"UTF-32", UTF32BE}, | ||
32 | {"UTF16", UTF16BE}, | ||
33 | {"UTF16BE", UTF16BE}, | ||
34 | {"UTF16LE", UTF16LE}, | ||
35 | {"UTF32", UTF32BE}, | ||
36 | {"UTF32BE", UTF32BE}, | ||
37 | {"UTF32LE", UTF32LE}, | ||
38 | {"UTF8", UTF8}, | ||
39 | }; | ||
40 | |||
41 | static const ByteOrderMark boms[] = { | ||
42 | [UTF8] = {{0xef, 0xbb, 0xbf}, 3}, | ||
43 | [UTF16BE] = {{0xfe, 0xff}, 2}, | ||
44 | [UTF16LE] = {{0xff, 0xfe}, 2}, | ||
45 | [UTF32BE] = {{0x00, 0x00, 0xfe, 0xff}, 4}, | ||
46 | [UTF32LE] = {{0xff, 0xfe, 0x00, 0x00}, 4}, | ||
47 | }; | ||
48 | |||
49 | 18 | UNITTEST { | |
50 | 18 | CHECK_BSEARCH_ARRAY(encoding_aliases, alias, ascii_strcmp_icase); | |
51 | 18 | CHECK_STRING_ARRAY(encoding_names); | |
52 | 18 | static_assert(ARRAYLEN(encoding_names) == UNKNOWN_ENCODING); | |
53 | 18 | static_assert(ARRAYLEN(boms) == UNKNOWN_ENCODING); | |
54 | 18 | } | |
55 | |||
56 | 70 | static int enc_alias_cmp(const void *key, const void *elem) | |
57 | { | ||
58 | 70 | const EncodingAlias *a = key; | |
59 | 70 | const char *name = elem; | |
60 | 70 | return ascii_strcmp_icase(a->alias, name); | |
61 | } | ||
62 | |||
63 | 209 | EncodingType lookup_encoding(const char *name) | |
64 | { | ||
65 |
2/2✓ Branch 0 taken 33 times.
✓ Branch 1 taken 176 times.
|
209 | if (likely(name == encoding_names[UTF8])) { |
66 | return UTF8; | ||
67 | } | ||
68 | |||
69 |
2/2✓ Branch 0 taken 109 times.
✓ Branch 1 taken 18 times.
|
127 | for (size_t i = 0; i < ARRAYLEN(encoding_names); i++) { |
70 |
2/2✓ Branch 0 taken 15 times.
✓ Branch 1 taken 94 times.
|
109 | if (ascii_streq_icase(name, encoding_names[i])) { |
71 | 15 | return (EncodingType) i; | |
72 | } | ||
73 | } | ||
74 | |||
75 | 18 | const EncodingAlias *a = BSEARCH(name, encoding_aliases, enc_alias_cmp); | |
76 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 6 times.
|
18 | return a ? a->encoding : UNKNOWN_ENCODING; |
77 | } | ||
78 | |||
79 | 131 | const char *encoding_from_type(EncodingType type) | |
80 | { | ||
81 | 131 | BUG_ON(type >= UNKNOWN_ENCODING); | |
82 | |||
83 | // There's no need to call str_intern() here; the names in the array | ||
84 | // can be considered static interns | ||
85 | 131 | return encoding_names[type]; | |
86 | } | ||
87 | |||
88 | 53 | const char *encoding_normalize(const char *name) | |
89 | { | ||
90 | 53 | EncodingType type = lookup_encoding(name); | |
91 |
2/2✓ Branch 0 taken 52 times.
✓ Branch 1 taken 1 times.
|
53 | if (type != UNKNOWN_ENCODING) { |
92 | 52 | return encoding_from_type(type); | |
93 | } | ||
94 | |||
95 | char upper[256]; | ||
96 | size_t n; | ||
97 |
3/4✓ Branch 0 taken 8 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 7 times.
✓ Branch 3 taken 1 times.
|
8 | for (n = 0; n < sizeof(upper) && name[n]; n++) { |
98 | 7 | upper[n] = ascii_toupper(name[n]); | |
99 | } | ||
100 | |||
101 | 1 | return mem_intern(upper, n); | |
102 | } | ||
103 | |||
104 | 47 | EncodingType detect_encoding_from_bom(const unsigned char *buf, size_t size) | |
105 | { | ||
106 | // Skip exhaustive checks if there's clearly no BOM | ||
107 |
4/4✓ Branch 0 taken 41 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 13 times.
✓ Branch 3 taken 28 times.
|
47 | if (size < 2 || ((unsigned int)buf[0]) - 1 < 0xEE) { |
108 | return UNKNOWN_ENCODING; | ||
109 | } | ||
110 | |||
111 | // Iterate array backwards to ensure UTF32LE is checked before UTF16LE | ||
112 |
2/2✓ Branch 0 taken 53 times.
✓ Branch 1 taken 7 times.
|
60 | for (size_t n = ARRAYLEN(boms), i = n - 1; i < n; i--) { |
113 | 53 | const unsigned int bom_len = boms[i].len; | |
114 | 53 | BUG_ON(bom_len == 0); | |
115 |
4/4✓ Branch 0 taken 42 times.
✓ Branch 1 taken 11 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 36 times.
|
53 | if (size >= bom_len && mem_equal(buf, boms[i].bytes, bom_len)) { |
116 | 6 | return (EncodingType)i; | |
117 | } | ||
118 | } | ||
119 | |||
120 | return UNKNOWN_ENCODING; | ||
121 | } | ||
122 | |||
123 | 5 | const ByteOrderMark *get_bom_for_encoding(EncodingType type) | |
124 | { | ||
125 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 2 times.
|
5 | return encoding_type_has_bom(type) ? &boms[type] : NULL; |
126 | } | ||
127 |