src/convert.c
| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include <errno.h> | ||
| 2 | #include <inttypes.h> | ||
| 3 | #include <stdlib.h> | ||
| 4 | #include <string.h> | ||
| 5 | #include "convert.h" | ||
| 6 | #include "block.h" | ||
| 7 | #include "buildvar-iconv.h" | ||
| 8 | #include "encoding.h" | ||
| 9 | #include "util/arith.h" | ||
| 10 | #include "util/debug.h" | ||
| 11 | #include "util/list.h" | ||
| 12 | #include "util/log.h" | ||
| 13 | #include "util/numtostr.h" | ||
| 14 | #include "util/str-util.h" | ||
| 15 | #include "util/utf8.h" | ||
| 16 | #include "util/xmalloc.h" | ||
| 17 | #include "util/xreadwrite.h" | ||
| 18 | |||
| 19 | typedef struct { | ||
| 20 | StringView text; | ||
| 21 | size_t ipos; | ||
| 22 | struct CharsetConverter *cconv; | ||
| 23 | } FileDecoder; | ||
| 24 | |||
| 25 | 56 | static void add_block(Buffer *buffer, Block *blk) | |
| 26 | { | ||
| 27 | 56 | buffer->nl += blk->nl; | |
| 28 | 56 | list_insert_before(&blk->node, &buffer->blocks); | |
| 29 | 56 | } | |
| 30 | |||
| 31 | 7713 | static Block *add_utf8_line ( | |
| 32 | Buffer *buffer, | ||
| 33 | const GlobalOptions *gopts, | ||
| 34 | ErrorBuffer *errbuf, | ||
| 35 | Block *blk, | ||
| 36 | StringView line | ||
| 37 | ) { | ||
| 38 | 7713 | size_t slimit = gopts->syntax_line_limit; | |
| 39 | 7713 | size_t size = line.length + 1; | |
| 40 | |||
| 41 |
2/2✓ Branch 2 → 3 taken 7685 times.
✓ Branch 2 → 6 taken 28 times.
|
7713 | if (blk) { |
| 42 | 7685 | size_t avail = blk->alloc - blk->size; | |
| 43 |
2/2✓ Branch 3 → 4 taken 7657 times.
✓ Branch 3 → 5 taken 28 times.
|
7685 | if (size <= avail) { |
| 44 | 7657 | goto copy; | |
| 45 | } | ||
| 46 | 28 | add_block(buffer, blk); | |
| 47 | } | ||
| 48 | |||
| 49 | 56 | size = MAX(size, 8192); | |
| 50 | 56 | blk = block_new(size); | |
| 51 | |||
| 52 | 7713 | copy: | |
| 53 |
2/6✓ Branch 7 → 8 taken 7713 times.
✗ Branch 7 → 13 not taken.
✗ Branch 8 → 9 not taken.
✓ Branch 8 → 13 taken 7713 times.
✗ Branch 9 → 10 not taken.
✗ Branch 9 → 13 not taken.
|
7713 | if (unlikely(slimit && line.length > slimit && buffer->options.syntax)) { |
| 54 | ✗ | char limit_str[PRECISE_FILESIZE_STR_MAX]; | |
| 55 | ✗ | filesize_to_str_precise(slimit, limit_str); | |
| 56 | ✗ | error_msg ( | |
| 57 | errbuf, | ||
| 58 | "line length (%zu) exceeds 'syntax-line-limit' option (%s); " | ||
| 59 | "disabling syntax highlighting", | ||
| 60 | line.length, limit_str | ||
| 61 | ); | ||
| 62 | ✗ | buffer->options.syntax = false; | |
| 63 | } | ||
| 64 | |||
| 65 | 7713 | memcpy(blk->data + blk->size, line.data, line.length); | |
| 66 | 7713 | blk->size += line.length; | |
| 67 | 7713 | blk->data[blk->size++] = '\n'; | |
| 68 | 7713 | blk->nl++; | |
| 69 | 7713 | return blk; | |
| 70 | } | ||
| 71 | |||
| 72 | 7747 | static bool read_utf8_line(FileDecoder *dec, StringView *linep) | |
| 73 | { | ||
| 74 | 7747 | size_t len = dec->text.length; | |
| 75 |
2/2✓ Branch 2 → 3 taken 34 times.
✓ Branch 2 → 5 taken 7713 times.
|
7747 | if (dec->ipos >= len) { |
| 76 | 34 | BUG_ON(dec->ipos > len); | |
| 77 | return false; | ||
| 78 | } | ||
| 79 | |||
| 80 | 7713 | *linep = get_delim(dec->text.data, &dec->ipos, len, '\n'); | |
| 81 | 7713 | return true; | |
| 82 | } | ||
| 83 | |||
| 84 | 34 | static bool file_decoder_read_utf8 ( | |
| 85 | Buffer *buffer, | ||
| 86 | const GlobalOptions *gopts, | ||
| 87 | ErrorBuffer *errbuf, | ||
| 88 | StringView text | ||
| 89 | ) { | ||
| 90 |
1/2✗ Branch 3 → 4 not taken.
✓ Branch 3 → 5 taken 34 times.
|
34 | if (unlikely(!encoding_is_utf8(buffer->encoding))) { |
| 91 | ✗ | errno = EINVAL; | |
| 92 | ✗ | return false; | |
| 93 | } | ||
| 94 | |||
| 95 | 34 | FileDecoder dec = {.text = text}; | |
| 96 | 34 | StringView line; | |
| 97 |
2/2✓ Branch 6 → 7 taken 28 times.
✓ Branch 6 → 21 taken 6 times.
|
34 | if (!read_utf8_line(&dec, &line)) { |
| 98 | return true; | ||
| 99 | } | ||
| 100 | |||
| 101 |
2/2✓ Branch 8 → 9 taken 1 time.
✓ Branch 8 → 10 taken 27 times.
|
28 | if (strview_remove_matching_suffix(&line, "\r")) { |
| 102 | 1 | buffer->crlf_newlines = true; | |
| 103 | } | ||
| 104 | |||
| 105 | 28 | Block *blk = add_utf8_line(buffer, gopts, errbuf, NULL, line); | |
| 106 | |||
| 107 |
2/2✓ Branch 11 → 14 taken 1 time.
✓ Branch 11 → 17 taken 27 times.
|
28 | if (unlikely(buffer->crlf_newlines)) { |
| 108 |
2/2✓ Branch 15 → 12 taken 270 times.
✓ Branch 15 → 19 taken 1 time.
|
271 | while (read_utf8_line(&dec, &line)) { |
| 109 | 270 | strview_remove_matching_suffix(&line, "\r"); | |
| 110 | 270 | blk = add_utf8_line(buffer, gopts, errbuf, blk, line); | |
| 111 | } | ||
| 112 | } else { | ||
| 113 |
2/2✓ Branch 18 → 16 taken 7415 times.
✓ Branch 18 → 19 taken 27 times.
|
7442 | while (read_utf8_line(&dec, &line)) { |
| 114 | 7415 | blk = add_utf8_line(buffer, gopts, errbuf, blk, line); | |
| 115 | } | ||
| 116 | } | ||
| 117 | |||
| 118 |
1/2✓ Branch 19 → 20 taken 28 times.
✗ Branch 19 → 21 not taken.
|
28 | if (blk) { |
| 119 | 28 | add_block(buffer, blk); | |
| 120 | } | ||
| 121 | |||
| 122 | return true; | ||
| 123 | } | ||
| 124 | |||
| 125 | 1 | static size_t unix_to_dos(FileEncoder *enc, StringView text, size_t nr_newlines) | |
| 126 | { | ||
| 127 | 1 | BUG_ON(text.length && !strview_has_suffix(text, "\n")); // See sanity_check_blocks() | |
| 128 | 1 | BUG_ON(nr_newlines > text.length); | |
| 129 | |||
| 130 | 1 | const size_t new_len = text.length + nr_newlines; | |
| 131 |
1/2✓ Branch 8 → 9 taken 1 time.
✗ Branch 8 → 18 not taken.
|
1 | if (enc->nsize < new_len) { |
| 132 | 1 | enc->nsize = xmul(text.length, 2); | |
| 133 | 1 | enc->nbuf = xrealloc(enc->nbuf, enc->nsize); | |
| 134 | } | ||
| 135 | |||
| 136 | size_t seen_nl = 0; | ||
| 137 | size_t dest_pos = 0; | ||
| 138 | |||
| 139 |
2/2✓ Branch 19 → 12 taken 3 times.
✓ Branch 19 → 20 taken 1 time.
|
4 | for (size_t src_pos = 0; src_pos < text.length; ) { |
| 140 | 3 | const char *src = text.data + src_pos; | |
| 141 | 3 | char *dest = enc->nbuf + dest_pos; | |
| 142 | 3 | char *end = memccpy(dest, src, '\n', text.length - src_pos); | |
| 143 | 3 | BUG_ON(!end); // Loop condition prevents this | |
| 144 | |||
| 145 | 3 | size_t line_len = (size_t)(end - dest); | |
| 146 | 3 | src_pos += line_len; | |
| 147 | 3 | BUG_ON(src_pos > text.length); | |
| 148 | |||
| 149 | 3 | end[-1] = '\r'; | |
| 150 | 3 | end[0] = '\n'; | |
| 151 | 3 | dest_pos += line_len + 1; | |
| 152 | 3 | seen_nl++; | |
| 153 | } | ||
| 154 | |||
| 155 | 1 | BUG_ON(seen_nl != nr_newlines); | |
| 156 | 1 | BUG_ON(dest_pos != new_len); | |
| 157 | 1 | return dest_pos; | |
| 158 | } | ||
| 159 | |||
| 160 | #if ICONV_DISABLE == 1 // iconv not available; use basic, UTF-8 implementation: | ||
| 161 | |||
| 162 | bool conversion_supported_by_iconv ( | ||
| 163 | const char* UNUSED_ARG(from), | ||
| 164 | const char* UNUSED_ARG(to) | ||
| 165 | ) { | ||
| 166 | errno = EINVAL; | ||
| 167 | return false; | ||
| 168 | } | ||
| 169 | |||
| 170 | FileEncoder file_encoder(const char *encoding, bool crlf, int fd) | ||
| 171 | { | ||
| 172 | if (unlikely(!encoding_is_utf8(encoding))) { | ||
| 173 | BUG("unsupported conversion; should have been handled earlier"); | ||
| 174 | } | ||
| 175 | |||
| 176 | return (FileEncoder) { | ||
| 177 | .crlf = crlf, | ||
| 178 | .fd = fd, | ||
| 179 | }; | ||
| 180 | } | ||
| 181 | |||
| 182 | void file_encoder_free(FileEncoder *enc) | ||
| 183 | { | ||
| 184 | free(enc->nbuf); | ||
| 185 | } | ||
| 186 | |||
| 187 | ssize_t file_encoder_write ( | ||
| 188 | FileEncoder *enc, | ||
| 189 | const char *buf, | ||
| 190 | size_t size, | ||
| 191 | size_t nr_newlines | ||
| 192 | ) { | ||
| 193 | if (unlikely(enc->crlf)) { | ||
| 194 | size = unix_to_dos(enc, string_view(buf, size), nr_newlines); | ||
| 195 | buf = enc->nbuf; | ||
| 196 | } | ||
| 197 | return xwrite_all(enc->fd, buf, size); | ||
| 198 | } | ||
| 199 | |||
| 200 | size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc)) | ||
| 201 | { | ||
| 202 | return 0; | ||
| 203 | } | ||
| 204 | |||
| 205 | bool file_decoder_read ( | ||
| 206 | Buffer *buffer, | ||
| 207 | const GlobalOptions *gopts, | ||
| 208 | ErrorBuffer *errbuf, | ||
| 209 | StringView text | ||
| 210 | ) { | ||
| 211 | return file_decoder_read_utf8(buffer, gopts, errbuf, text); | ||
| 212 | } | ||
| 213 | |||
| 214 | #else // ICONV_DISABLE != 1; use full iconv implementation: | ||
| 215 | |||
| 216 | #include <iconv.h> | ||
| 217 | |||
| 218 | // UTF-8 encoding of U+00BF (inverted question mark; "¿") | ||
| 219 | #define REPLACEMENT "\xc2\xbf" | ||
| 220 | |||
// State for an incremental iconv(3) conversion
typedef struct CharsetConverter {
    // Conversion descriptor from iconv_open()
    iconv_t cd;
    // Heap-allocated output buffer (grown by resize_obuf())
    char *obuf;
    // Allocated size of `obuf`, in bytes
    size_t osize;
    // Number of bytes written to `obuf` so far
    size_t opos;
    // Number of leading `obuf` bytes already handed out by the
    // cconv_consume_*() functions
    size_t consumed;
    // Running error count (EILSEQ occurrences, plus the non-error
    // return values of iconv())
    size_t errors;

    // Temporary input buffer
    // (bytes of an incomplete character, carried over between calls)
    char tbuf[16];
    size_t tcount;

    // REPLACEMENT character, in target encoding
    char rbuf[4];
    size_t rcount;

    // Input character size in bytes, or zero for UTF-8
    size_t char_size;
} CharsetConverter;
| 240 | |||
| 241 | 1 | static CharsetConverter *create(iconv_t cd) | |
| 242 | { | ||
| 243 | 1 | CharsetConverter *c = xcalloc1(sizeof(*c)); | |
| 244 | 1 | c->cd = cd; | |
| 245 | 1 | c->osize = 8192; | |
| 246 | 1 | c->obuf = xmalloc(c->osize); | |
| 247 | 1 | return c; | |
| 248 | } | ||
| 249 | |||
// Thin wrapper around iconv(3) that accepts a const-correct input
// pointer, regardless of how the platform declares the second parameter
static size_t iconv_wrapper (
    iconv_t cd,
    const char **restrict inbuf,
    size_t *restrict inbytesleft,
    char **restrict outbuf,
    size_t *restrict outbytesleft
) {
    // POSIX defines the second parameter of iconv(3) as "char **restrict"
    // but NetBSD declares it as "const char **restrict"
#ifdef __NetBSD__
    return iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft);
#else
    return iconv(cd, (char **)inbuf, inbytesleft, outbuf, outbytesleft);
#endif
}
| 267 | |||
| 268 | ✗ | static void resize_obuf(CharsetConverter *c) | |
| 269 | { | ||
| 270 | ✗ | c->osize = xmul(2, c->osize); | |
| 271 | ✗ | c->obuf = xrealloc(c->obuf, c->osize); | |
| 272 | ✗ | } | |
| 273 | |||
| 274 | ✗ | static void add_replacement(CharsetConverter *c) | |
| 275 | { | ||
| 276 | ✗ | if (c->osize - c->opos < 4) { | |
| 277 | ✗ | resize_obuf(c); | |
| 278 | } | ||
| 279 | |||
| 280 | ✗ | memcpy(c->obuf + c->opos, c->rbuf, c->rcount); | |
| 281 | ✗ | c->opos += c->rcount; | |
| 282 | ✗ | } | |
| 283 | |||
| 284 | ✗ | static size_t handle_invalid(CharsetConverter *c, const char *buf, size_t count) | |
| 285 | { | ||
| 286 | ✗ | LOG_DEBUG("%zu %zu", c->char_size, count); | |
| 287 | ✗ | add_replacement(c); | |
| 288 | ✗ | if (c->char_size == 0) { | |
| 289 | // Converting from UTF-8 | ||
| 290 | ✗ | size_t idx = 0; | |
| 291 | ✗ | CodePoint u = u_get_char(buf, count, &idx); | |
| 292 | ✗ | LOG_DEBUG("U+%04" PRIX32, u); | |
| 293 | ✗ | return idx; | |
| 294 | } | ||
| 295 | ✗ | if (c->char_size > count) { | |
| 296 | // wtf | ||
| 297 | ✗ | return 1; | |
| 298 | } | ||
| 299 | return c->char_size; | ||
| 300 | } | ||
| 301 | |||
| 302 | 1 | static int xiconv(CharsetConverter *c, const char **ib, size_t *ic) | |
| 303 | { | ||
| 304 | 1 | while (1) { | |
| 305 | 1 | char *ob = c->obuf + c->opos; | |
| 306 | 1 | size_t oc = c->osize - c->opos; | |
| 307 | 1 | size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc); | |
| 308 | 1 | c->opos = ob - c->obuf; | |
| 309 |
1/2✗ Branch 4 → 5 not taken.
✓ Branch 4 → 12 taken 1 time.
|
1 | if (rc == (size_t)-1) { |
| 310 | ✗ | switch (errno) { | |
| 311 | ✗ | case EILSEQ: | |
| 312 | ✗ | c->errors++; | |
| 313 | // Reset | ||
| 314 | ✗ | iconv(c->cd, NULL, NULL, NULL, NULL); | |
| 315 | ✗ | return errno; | |
| 316 | case EINVAL: | ||
| 317 | return errno; | ||
| 318 | ✗ | case E2BIG: | |
| 319 | ✗ | resize_obuf(c); | |
| 320 | ✗ | continue; | |
| 321 | ✗ | default: | |
| 322 | − | BUG("iconv: %s", strerror(errno)); | |
| 323 | } | ||
| 324 | } else { | ||
| 325 | 1 | c->errors += rc; | |
| 326 | } | ||
| 327 | 1 | return 0; | |
| 328 | } | ||
| 329 | } | ||
| 330 | |||
| 331 | ✗ | static size_t convert_incomplete(CharsetConverter *c, const char *input, size_t len) | |
| 332 | { | ||
| 333 | ✗ | size_t ipos = 0; | |
| 334 | ✗ | while (c->tcount < sizeof(c->tbuf) && ipos < len) { | |
| 335 | ✗ | c->tbuf[c->tcount++] = input[ipos++]; | |
| 336 | ✗ | const char *ib = c->tbuf; | |
| 337 | ✗ | size_t ic = c->tcount; | |
| 338 | ✗ | int rc = xiconv(c, &ib, &ic); | |
| 339 | ✗ | if (ic > 0) { | |
| 340 | ✗ | memmove(c->tbuf, ib, ic); | |
| 341 | } | ||
| 342 | ✗ | c->tcount = ic; | |
| 343 | ✗ | if (rc == EINVAL) { | |
| 344 | // Incomplete character at end of input buffer; try again | ||
| 345 | // with more input data | ||
| 346 | ✗ | continue; | |
| 347 | } | ||
| 348 | ✗ | if (rc == EILSEQ) { | |
| 349 | // Invalid multibyte sequence | ||
| 350 | ✗ | size_t skip = handle_invalid(c, c->tbuf, c->tcount); | |
| 351 | ✗ | c->tcount -= skip; | |
| 352 | ✗ | if (c->tcount > 0) { | |
| 353 | ✗ | LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip); | |
| 354 | ✗ | memmove(c->tbuf, c->tbuf + skip, c->tcount); | |
| 355 | ✗ | continue; | |
| 356 | } | ||
| 357 | ✗ | return ipos; | |
| 358 | } | ||
| 359 | ✗ | break; | |
| 360 | } | ||
| 361 | |||
| 362 | ✗ | LOG_DEBUG("%zu %zu", ipos, c->tcount); | |
| 363 | ✗ | return ipos; | |
| 364 | } | ||
| 365 | |||
| 366 | 1 | static void cconv_process(CharsetConverter *c, const char *input, size_t len) | |
| 367 | { | ||
| 368 |
1/2✗ Branch 2 → 3 not taken.
✓ Branch 2 → 4 taken 1 time.
|
1 | if (c->consumed > 0) { |
| 369 | ✗ | size_t fill = c->opos - c->consumed; | |
| 370 | ✗ | memmove(c->obuf, c->obuf + c->consumed, fill); | |
| 371 | ✗ | c->opos = fill; | |
| 372 | ✗ | c->consumed = 0; | |
| 373 | } | ||
| 374 | |||
| 375 |
1/2✗ Branch 4 → 5 not taken.
✓ Branch 4 → 7 taken 1 time.
|
1 | if (c->tcount > 0) { |
| 376 | ✗ | size_t ipos = convert_incomplete(c, input, len); | |
| 377 | ✗ | input += ipos; | |
| 378 | ✗ | len -= ipos; | |
| 379 | } | ||
| 380 | |||
| 381 | 1 | const char *ib = input; | |
| 382 |
2/2✓ Branch 17 → 8 taken 1 time.
✓ Branch 17 → 18 taken 1 time.
|
2 | for (size_t ic = len; ic > 0; ) { |
| 383 | 1 | int r = xiconv(c, &ib, &ic); | |
| 384 |
1/2✗ Branch 9 → 10 not taken.
✓ Branch 9 → 13 taken 1 time.
|
1 | if (r == EINVAL) { |
| 385 | // Incomplete character at end of input buffer | ||
| 386 | ✗ | if (ic < sizeof(c->tbuf)) { | |
| 387 | ✗ | memcpy(c->tbuf, ib, ic); | |
| 388 | ✗ | c->tcount = ic; | |
| 389 | } else { | ||
| 390 | // FIXME | ||
| 391 | ✗ | } | |
| 392 | ✗ | ic = 0; | |
| 393 | ✗ | continue; | |
| 394 | } | ||
| 395 |
1/2✗ Branch 13 → 14 not taken.
✓ Branch 13 → 16 taken 1 time.
|
1 | if (r == EILSEQ) { |
| 396 | // Invalid multibyte sequence | ||
| 397 | ✗ | size_t skip = handle_invalid(c, ib, ic); | |
| 398 | ✗ | ic -= skip; | |
| 399 | ✗ | ib += skip; | |
| 400 | ✗ | continue; | |
| 401 | } | ||
| 402 | } | ||
| 403 | 1 | } | |
| 404 | |||
| 405 | ✗ | static CharsetConverter *cconv_to_utf8(const char *encoding) | |
| 406 | { | ||
| 407 | ✗ | iconv_t cd = iconv_open("UTF-8", encoding); | |
| 408 | ✗ | if (cd == (iconv_t)-1) { | |
| 409 | return NULL; | ||
| 410 | } | ||
| 411 | |||
| 412 | ✗ | CharsetConverter *c = create(cd); | |
| 413 | ✗ | c->rcount = copyliteral(c->rbuf, REPLACEMENT); | |
| 414 | |||
| 415 | ✗ | if (str_has_prefix(encoding, "UTF-16")) { | |
| 416 | ✗ | c->char_size = 2; | |
| 417 | ✗ | } else if (str_has_prefix(encoding, "UTF-32")) { | |
| 418 | ✗ | c->char_size = 4; | |
| 419 | } else { | ||
| 420 | ✗ | c->char_size = 1; | |
| 421 | } | ||
| 422 | |||
| 423 | return c; | ||
| 424 | } | ||
| 425 | |||
| 426 | 1 | static void encode_replacement(CharsetConverter *c) | |
| 427 | { | ||
| 428 | 1 | static const char rep[] = REPLACEMENT; | |
| 429 | 1 | const char *ib = rep; | |
| 430 | 1 | char *ob = c->rbuf; | |
| 431 | 1 | size_t ic = STRLEN(REPLACEMENT); | |
| 432 | 1 | size_t oc = sizeof(c->rbuf); | |
| 433 | 1 | size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc); | |
| 434 | |||
| 435 |
1/2✓ Branch 3 → 4 taken 1 time.
✗ Branch 3 → 5 not taken.
|
1 | if (rc == (size_t)-1) { |
| 436 | 1 | c->rbuf[0] = '\xbf'; | |
| 437 | 1 | c->rcount = 1; | |
| 438 | } else { | ||
| 439 | ✗ | c->rcount = ob - c->rbuf; | |
| 440 | } | ||
| 441 | 1 | } | |
| 442 | |||
| 443 | 1 | static CharsetConverter *cconv_from_utf8(const char *encoding) | |
| 444 | { | ||
| 445 | 1 | iconv_t cd = iconv_open(encoding, "UTF-8"); | |
| 446 |
1/2✓ Branch 3 → 4 taken 1 time.
✗ Branch 3 → 7 not taken.
|
1 | if (cd == (iconv_t)-1) { |
| 447 | return NULL; | ||
| 448 | } | ||
| 449 | 1 | CharsetConverter *c = create(cd); | |
| 450 | 1 | encode_replacement(c); | |
| 451 | 1 | return c; | |
| 452 | } | ||
| 453 | |||
| 454 | 1 | static void cconv_flush(CharsetConverter *c) | |
| 455 | { | ||
| 456 |
1/2✗ Branch 2 → 3 not taken.
✓ Branch 2 → 6 taken 1 time.
|
1 | if (c->tcount > 0) { |
| 457 | // Replace incomplete character at end of input buffer | ||
| 458 | ✗ | LOG_DEBUG("incomplete character at EOF"); | |
| 459 | ✗ | add_replacement(c); | |
| 460 | ✗ | c->tcount = 0; | |
| 461 | } | ||
| 462 | 1 | } | |
| 463 | |||
| 464 | ✗ | static char *cconv_consume_line(CharsetConverter *c, size_t *len) | |
| 465 | { | ||
| 466 | ✗ | char *line = c->obuf + c->consumed; | |
| 467 | ✗ | char *nl = memchr(line, '\n', c->opos - c->consumed); | |
| 468 | ✗ | if (!nl) { | |
| 469 | ✗ | *len = 0; | |
| 470 | ✗ | return NULL; | |
| 471 | } | ||
| 472 | |||
| 473 | ✗ | size_t n = nl - line + 1; | |
| 474 | ✗ | c->consumed += n; | |
| 475 | ✗ | *len = n; | |
| 476 | ✗ | return line; | |
| 477 | } | ||
| 478 | |||
| 479 | 1 | static char *cconv_consume_all(CharsetConverter *c, size_t *len) | |
| 480 | { | ||
| 481 | 1 | char *buf = c->obuf + c->consumed; | |
| 482 | 1 | *len = c->opos - c->consumed; | |
| 483 | 1 | c->consumed = c->opos; | |
| 484 | 1 | return buf; | |
| 485 | } | ||
| 486 | |||
| 487 | 1 | static void cconv_free(CharsetConverter *c) | |
| 488 | { | ||
| 489 | 1 | BUG_ON(!c); | |
| 490 | 1 | iconv_close(c->cd); | |
| 491 | 1 | free(c->obuf); | |
| 492 | 1 | free(c); | |
| 493 | 1 | } | |
| 494 | |||
// Check whether iconv can convert from encoding `from` to encoding `to`.
// Returns false with errno set to EINVAL if either name is empty, or
// false with errno as left by iconv_open() if opening fails.
bool conversion_supported_by_iconv(const char *from, const char *to)
{
    if (unlikely(!(from[0] != '\0' && to[0] != '\0'))) {
        errno = EINVAL;
        return false;
    }

    iconv_t cd = iconv_open(to, from);
    const bool supported = (cd != (iconv_t)-1);
    if (supported) {
        // Only opened as a probe; not needed any further
        iconv_close(cd);
    }

    return supported;
}
| 510 | |||
| 511 | 22 | FileEncoder file_encoder(const char *encoding, bool crlf, int fd) | |
| 512 | { | ||
| 513 | 22 | CharsetConverter *cconv = NULL; | |
| 514 |
2/2✓ Branch 3 → 4 taken 1 time.
✓ Branch 3 → 7 taken 21 times.
|
22 | if (unlikely(!encoding_is_utf8(encoding))) { |
| 515 | 1 | cconv = cconv_from_utf8(encoding); | |
| 516 |
1/2✗ Branch 5 → 6 not taken.
✓ Branch 5 → 7 taken 1 time.
|
1 | if (!cconv) { |
| 517 | − | BUG("unsupported conversion; should have been handled earlier"); | |
| 518 | } | ||
| 519 | } | ||
| 520 | |||
| 521 | 22 | return (FileEncoder) { | |
| 522 | .cconv = cconv, | ||
| 523 | .crlf = crlf, | ||
| 524 | .fd = fd, | ||
| 525 | }; | ||
| 526 | } | ||
| 527 | |||
| 528 | 22 | void file_encoder_free(FileEncoder *enc) | |
| 529 | { | ||
| 530 |
2/2✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 4 taken 21 times.
|
22 | if (enc->cconv) { |
| 531 | 1 | cconv_free(enc->cconv); | |
| 532 | } | ||
| 533 | 22 | free(enc->nbuf); | |
| 534 | 22 | } | |
| 535 | |||
| 536 | // NOTE: buf must contain whole characters! | ||
| 537 | 22 | ssize_t file_encoder_write ( | |
| 538 | FileEncoder *enc, | ||
| 539 | const char *buf, | ||
| 540 | size_t size, | ||
| 541 | size_t nr_newlines | ||
| 542 | ) { | ||
| 543 |
2/2✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 5 taken 21 times.
|
22 | if (unlikely(enc->crlf)) { |
| 544 | 1 | size = unix_to_dos(enc, string_view(buf, size), nr_newlines); | |
| 545 | 1 | buf = enc->nbuf; | |
| 546 | } | ||
| 547 |
2/2✓ Branch 5 → 6 taken 1 time.
✓ Branch 5 → 9 taken 21 times.
|
22 | if (unlikely(enc->cconv)) { |
| 548 | 1 | cconv_process(enc->cconv, buf, size); | |
| 549 | 1 | cconv_flush(enc->cconv); | |
| 550 | 1 | buf = cconv_consume_all(enc->cconv, &size); | |
| 551 | } | ||
| 552 | 22 | return xwrite_all(enc->fd, buf, size); | |
| 553 | } | ||
| 554 | |||
| 555 | 22 | size_t file_encoder_get_nr_errors(const FileEncoder *enc) | |
| 556 | { | ||
| 557 |
2/2✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 4 taken 21 times.
|
22 | return enc->cconv ? enc->cconv->errors : 0; |
| 558 | } | ||
| 559 | |||
| 560 | ✗ | static bool fill(FileDecoder *dec) | |
| 561 | { | ||
| 562 | ✗ | StringView text = dec->text; | |
| 563 | ✗ | if (dec->ipos == text.length) { | |
| 564 | return false; | ||
| 565 | } | ||
| 566 | |||
| 567 | // Smaller than cconv.obuf to make realloc less likely | ||
| 568 | ✗ | size_t max = 7 * 1024; | |
| 569 | |||
| 570 | ✗ | size_t icount = MIN(text.length - dec->ipos, max); | |
| 571 | ✗ | cconv_process(dec->cconv, text.data + dec->ipos, icount); | |
| 572 | ✗ | dec->ipos += icount; | |
| 573 | ✗ | if (dec->ipos == text.length) { | |
| 574 | // Must be flushed after all input has been fed | ||
| 575 | ✗ | cconv_flush(dec->cconv); | |
| 576 | } | ||
| 577 | return true; | ||
| 578 | } | ||
| 579 | |||
| 580 | ✗ | static bool decode_and_read_line(FileDecoder *dec, StringView *linep) | |
| 581 | { | ||
| 582 | ✗ | char *line; | |
| 583 | ✗ | size_t len; | |
| 584 | ✗ | while (1) { | |
| 585 | ✗ | line = cconv_consume_line(dec->cconv, &len); | |
| 586 | ✗ | if (line || !fill(dec)) { | |
| 587 | break; | ||
| 588 | } | ||
| 589 | } | ||
| 590 | |||
| 591 | ✗ | if (line) { | |
| 592 | // Newline not wanted | ||
| 593 | ✗ | len--; | |
| 594 | } else { | ||
| 595 | ✗ | line = cconv_consume_all(dec->cconv, &len); | |
| 596 | ✗ | if (len == 0) { | |
| 597 | return false; | ||
| 598 | } | ||
| 599 | } | ||
| 600 | |||
| 601 | ✗ | *linep = string_view(line, len); | |
| 602 | ✗ | return true; | |
| 603 | } | ||
| 604 | |||
| 605 | 34 | bool file_decoder_read ( | |
| 606 | Buffer *buffer, | ||
| 607 | const GlobalOptions *gopts, | ||
| 608 | ErrorBuffer *errbuf, | ||
| 609 | StringView text | ||
| 610 | ) { | ||
| 611 |
1/2✓ Branch 3 → 4 taken 34 times.
✗ Branch 3 → 5 not taken.
|
34 | if (encoding_is_utf8(buffer->encoding)) { |
| 612 | 34 | return file_decoder_read_utf8(buffer, gopts, errbuf, text); | |
| 613 | } | ||
| 614 | |||
| 615 | ✗ | CharsetConverter *cconv = cconv_to_utf8(buffer->encoding); | |
| 616 | ✗ | if (!cconv) { | |
| 617 | return false; | ||
| 618 | } | ||
| 619 | |||
| 620 | ✗ | FileDecoder dec = { | |
| 621 | .text = text, | ||
| 622 | .cconv = cconv, | ||
| 623 | }; | ||
| 624 | |||
| 625 | ✗ | StringView line; | |
| 626 | ✗ | if (decode_and_read_line(&dec, &line)) { | |
| 627 | ✗ | if (strview_remove_matching_suffix(&line, "\r")) { | |
| 628 | ✗ | buffer->crlf_newlines = true; | |
| 629 | } | ||
| 630 | |||
| 631 | ✗ | Block *blk = add_utf8_line(buffer, gopts, errbuf, NULL, line); | |
| 632 | ✗ | while (decode_and_read_line(&dec, &line)) { | |
| 633 | ✗ | if (buffer->crlf_newlines) { | |
| 634 | ✗ | strview_remove_matching_suffix(&line, "\r"); | |
| 635 | } | ||
| 636 | ✗ | blk = add_utf8_line(buffer, gopts, errbuf, blk, line); | |
| 637 | } | ||
| 638 | |||
| 639 | ✗ | if (blk) { | |
| 640 | ✗ | add_block(buffer, blk); | |
| 641 | } | ||
| 642 | } | ||
| 643 | |||
| 644 | ✗ | cconv_free(cconv); | |
| 645 | ✗ | return true; | |
| 646 | } | ||
| 647 | |||
| 648 | #endif | ||
| 649 |