| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include "encoding.h" | ||
| 2 | #include "util/ascii.h" | ||
| 3 | #include "util/bsearch.h" | ||
| 4 | #include "util/debug.h" | ||
| 5 | #include "util/intern.h" | ||
| 6 | #include "util/xstring.h" | ||
| 7 | |||
| 8 | typedef struct { | ||
| 9 | const char alias[8]; | ||
| 10 | EncodingType encoding; | ||
| 11 | } EncodingAlias; | ||
| 12 | |||
| 13 | static const char encoding_names[][16] = { | ||
| 14 | [UTF8] = "UTF-8", | ||
| 15 | [UTF16BE] = "UTF-16BE", | ||
| 16 | [UTF16LE] = "UTF-16LE", | ||
| 17 | [UTF32BE] = "UTF-32BE", | ||
| 18 | [UTF32LE] = "UTF-32LE", | ||
| 19 | }; | ||
| 20 | |||
| 21 | static const EncodingAlias encoding_aliases[] = { | ||
| 22 | {"UCS-2", UTF16BE}, | ||
| 23 | {"UCS-2BE", UTF16BE}, | ||
| 24 | {"UCS-2LE", UTF16LE}, | ||
| 25 | {"UCS-4", UTF32BE}, | ||
| 26 | {"UCS-4BE", UTF32BE}, | ||
| 27 | {"UCS-4LE", UTF32LE}, | ||
| 28 | {"UCS2", UTF16BE}, | ||
| 29 | {"UCS4", UTF32BE}, | ||
| 30 | {"UTF-16", UTF16BE}, | ||
| 31 | {"UTF-32", UTF32BE}, | ||
| 32 | {"UTF16", UTF16BE}, | ||
| 33 | {"UTF16BE", UTF16BE}, | ||
| 34 | {"UTF16LE", UTF16LE}, | ||
| 35 | {"UTF32", UTF32BE}, | ||
| 36 | {"UTF32BE", UTF32BE}, | ||
| 37 | {"UTF32LE", UTF32LE}, | ||
| 38 | {"UTF8", UTF8}, | ||
| 39 | }; | ||
| 40 | |||
| 41 | static const ByteOrderMark boms[] = { | ||
| 42 | [UTF8] = {{0xef, 0xbb, 0xbf}, 3}, | ||
| 43 | [UTF16BE] = {{0xfe, 0xff}, 2}, | ||
| 44 | [UTF16LE] = {{0xff, 0xfe}, 2}, | ||
| 45 | [UTF32BE] = {{0x00, 0x00, 0xfe, 0xff}, 4}, | ||
| 46 | [UTF32LE] = {{0xff, 0xfe, 0x00, 0x00}, 4}, | ||
| 47 | }; | ||
| 48 | |||
| 49 | 24 | UNITTEST { | |
| 50 | 24 | CHECK_BSEARCH_ARRAY_ICASE(encoding_aliases, alias); | |
| 51 | 24 | CHECK_STRING_ARRAY(encoding_names); | |
| 52 | 24 | static_assert(ARRAYLEN(encoding_names) == UNKNOWN_ENCODING); | |
| 53 | 24 | static_assert(ARRAYLEN(boms) == UNKNOWN_ENCODING); | |
| 54 | 24 | } | |
| 55 | |||
| 56 | 70 | static int enc_alias_cmp(const void *key, const void *elem) | |
| 57 | { | ||
| 58 | 70 | const EncodingAlias *a = key; | |
| 59 | 70 | const char *name = elem; | |
| 60 | 70 | return ascii_strcmp_icase(a->alias, name); | |
| 61 | } | ||
| 62 | |||
| 63 | 243 | EncodingType lookup_encoding(const char *name) | |
| 64 | { | ||
| 65 |
2/2✓ Branch 0 (2→7) taken 36 times.
✓ Branch 1 (2→11) taken 207 times.
|
243 | if (likely(name == encoding_names[UTF8])) { |
| 66 | return UTF8; | ||
| 67 | } | ||
| 68 | |||
| 69 |
2/2✓ Branch 0 (7→3) taken 112 times.
✓ Branch 1 (7→8) taken 18 times.
|
130 | for (size_t i = 0; i < ARRAYLEN(encoding_names); i++) { |
| 70 |
2/2✓ Branch 0 (4→5) taken 18 times.
✓ Branch 1 (4→6) taken 94 times.
|
112 | if (ascii_streq_icase(name, encoding_names[i])) { |
| 71 | 18 | return (EncodingType) i; | |
| 72 | } | ||
| 73 | } | ||
| 74 | |||
| 75 | 18 | const EncodingAlias *a = BSEARCH(name, encoding_aliases, enc_alias_cmp); | |
| 76 |
2/2✓ Branch 0 (9→10) taken 12 times.
✓ Branch 1 (9→11) taken 6 times.
|
18 | return a ? a->encoding : UNKNOWN_ENCODING; |
| 77 | } | ||
| 78 | |||
| 79 | 150 | const char *encoding_from_type(EncodingType type) | |
| 80 | { | ||
| 81 | 150 | BUG_ON(type >= UNKNOWN_ENCODING); | |
| 82 | |||
| 83 | // There's no need to call str_intern() here; the names in the array | ||
| 84 | // can be considered static interns | ||
| 85 | 150 | return encoding_names[type]; | |
| 86 | } | ||
| 87 | |||
| 88 | 62 | const char *encoding_normalize(const char *name) | |
| 89 | { | ||
| 90 | 62 | EncodingType type = lookup_encoding(name); | |
| 91 |
2/2✓ Branch 0 (3→4) taken 61 times.
✓ Branch 1 (3→5) taken 1 times.
|
62 | if (type != UNKNOWN_ENCODING) { |
| 92 | 61 | return encoding_from_type(type); | |
| 93 | } | ||
| 94 | |||
| 95 | 1 | char upper[256]; | |
| 96 | 1 | size_t n; | |
| 97 |
3/4✓ Branch 0 (7→8) taken 8 times.
✗ Branch 1 (7→9) not taken.
✓ Branch 2 (8→6) taken 7 times.
✓ Branch 3 (8→9) taken 1 times.
|
8 | for (n = 0; n < sizeof(upper) && name[n]; n++) { |
| 98 | 7 | upper[n] = ascii_toupper(name[n]); | |
| 99 | } | ||
| 100 | |||
| 101 | 1 | return mem_intern(upper, n); | |
| 102 | } | ||
| 103 | |||
| 104 | 51 | EncodingType detect_encoding_from_bom(const char *buf, size_t size) | |
| 105 | { | ||
| 106 | // Skip exhaustive checks if there's clearly no BOM | ||
| 107 |
4/4✓ Branch 0 (2→3) taken 42 times.
✓ Branch 1 (2→12) taken 9 times.
✓ Branch 2 (3→11) taken 13 times.
✓ Branch 3 (3→12) taken 29 times.
|
51 | if (size < 2 || ((unsigned int)(unsigned char)buf[0]) - 1 < 0xEE) { |
| 108 | return UNKNOWN_ENCODING; | ||
| 109 | } | ||
| 110 | |||
| 111 | // Iterate array backwards to ensure UTF32LE is checked before UTF16LE | ||
| 112 |
2/2✓ Branch 0 (11→4) taken 53 times.
✓ Branch 1 (11→12) taken 7 times.
|
60 | for (size_t n = ARRAYLEN(boms), i = n - 1; i < n; i--) { |
| 113 | 53 | const unsigned int bom_len = boms[i].len; | |
| 114 | 53 | BUG_ON(bom_len == 0); | |
| 115 |
4/4✓ Branch 0 (6→7) taken 42 times.
✓ Branch 1 (6→10) taken 11 times.
✓ Branch 2 (8→9) taken 6 times.
✓ Branch 3 (8→10) taken 36 times.
|
53 | if (size >= bom_len && mem_equal(buf, boms[i].bytes, bom_len)) { |
| 116 | 6 | return (EncodingType)i; | |
| 117 | } | ||
| 118 | } | ||
| 119 | |||
| 120 | return UNKNOWN_ENCODING; | ||
| 121 | } | ||
| 122 | |||
| 123 | 5 | const ByteOrderMark *get_bom_for_encoding(EncodingType type) | |
| 124 | { | ||
| 125 |
2/2✓ Branch 0 (2→3) taken 3 times.
✓ Branch 1 (2→4) taken 2 times.
|
5 | return encoding_type_has_bom(type) ? &boms[type] : NULL; |
| 126 | } | ||
| 127 |