src/convert.c
| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #include <errno.h> | ||
| 2 | #include <inttypes.h> | ||
| 3 | #include <stdlib.h> | ||
| 4 | #include <string.h> | ||
| 5 | #include "convert.h" | ||
| 6 | #include "block.h" | ||
| 7 | #include "buildvar-iconv.h" | ||
| 8 | #include "encoding.h" | ||
| 9 | #include "util/arith.h" | ||
| 10 | #include "util/debug.h" | ||
| 11 | #include "util/list.h" | ||
| 12 | #include "util/log.h" | ||
| 13 | #include "util/str-util.h" | ||
| 14 | #include "util/utf8.h" | ||
| 15 | #include "util/xmalloc.h" | ||
| 16 | #include "util/xreadwrite.h" | ||
| 17 | |||
enum {
    // Maximum line length (in bytes) tolerated while loading a file;
    // add_utf8_line() turns syntax highlighting off for the buffer as
    // soon as a longer line is encountered
    SYN_HIGHLIGHT_MAX_LINE_LEN = 512u * 1024u, // 512KiB
};
| 23 | |||
// Read cursor over an in-memory input buffer, optionally paired with a
// charset converter (left NULL when the input is read as UTF-8 directly)
typedef struct {
    const char *ibuf; // Start of the input text
    ssize_t ipos; // Offset of the next unread byte in ibuf
    ssize_t isize; // Total number of bytes in ibuf
    struct CharsetConverter *cconv; // Converter fed from ibuf, if any
} FileDecoder;
| 30 | |||
| 31 | 56 | static void add_block(Buffer *buffer, Block *blk) | |
| 32 | { | ||
| 33 | 56 | buffer->nl += blk->nl; | |
| 34 | 56 | list_insert_before(&blk->node, &buffer->blocks); | |
| 35 | 56 | } | |
| 36 | |||
| 37 | 7671 | static Block *add_utf8_line ( | |
| 38 | Buffer *buffer, | ||
| 39 | ErrorBuffer *errbuf, | ||
| 40 | Block *blk, | ||
| 41 | const char *line, | ||
| 42 | size_t len | ||
| 43 | ) { | ||
| 44 | 7671 | size_t size = len + 1; | |
| 45 |
2/2✓ Branch 2 → 3 taken 7643 times.
✓ Branch 2 → 6 taken 28 times.
|
7671 | if (blk) { |
| 46 | 7643 | size_t avail = blk->alloc - blk->size; | |
| 47 |
2/2✓ Branch 3 → 4 taken 7615 times.
✓ Branch 3 → 5 taken 28 times.
|
7643 | if (size <= avail) { |
| 48 | 7615 | goto copy; | |
| 49 | } | ||
| 50 | 28 | add_block(buffer, blk); | |
| 51 | } | ||
| 52 | 56 | size = MAX(size, 8192); | |
| 53 | 56 | blk = block_new(size); | |
| 54 | |||
| 55 | 7671 | copy: | |
| 56 |
1/4✗ Branch 7 → 8 not taken.
✓ Branch 7 → 11 taken 7671 times.
✗ Branch 8 → 9 not taken.
✗ Branch 8 → 11 not taken.
|
7671 | if (unlikely(len > SYN_HIGHLIGHT_MAX_LINE_LEN && buffer->options.syntax)) { |
| 57 | // TODO: Make this limit configurable and add documentation | ||
| 58 | ✗ | error_msg ( | |
| 59 | errbuf, | ||
| 60 | "line length (%zu) exceeded limit (%ju); disabling syntax highlighting", | ||
| 61 | len, (uintmax_t)SYN_HIGHLIGHT_MAX_LINE_LEN | ||
| 62 | ); | ||
| 63 | ✗ | buffer->options.syntax = false; | |
| 64 | } | ||
| 65 | |||
| 66 | 7671 | memcpy(blk->data + blk->size, line, len); | |
| 67 | 7671 | blk->size += len; | |
| 68 | 7671 | blk->data[blk->size++] = '\n'; | |
| 69 | 7671 | blk->nl++; | |
| 70 | 7671 | return blk; | |
| 71 | } | ||
| 72 | |||
| 73 | 7705 | static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp) | |
| 74 | { | ||
| 75 | 7705 | const char *line = dec->ibuf + dec->ipos; | |
| 76 | 7705 | const char *nl = memchr(line, '\n', dec->isize - dec->ipos); | |
| 77 | 7705 | size_t len; | |
| 78 | |||
| 79 |
2/2✓ Branch 2 → 3 taken 7669 times.
✓ Branch 2 → 4 taken 36 times.
|
7705 | if (nl) { |
| 80 | 7669 | len = nl - line; | |
| 81 | 7669 | dec->ipos += len + 1; | |
| 82 | } else { | ||
| 83 | 36 | len = dec->isize - dec->ipos; | |
| 84 |
2/2✓ Branch 4 → 5 taken 2 times.
✓ Branch 4 → 7 taken 34 times.
|
36 | if (len == 0) { |
| 85 | return false; | ||
| 86 | } | ||
| 87 | 2 | dec->ipos += len; | |
| 88 | } | ||
| 89 | |||
| 90 | 7671 | *linep = line; | |
| 91 | 7671 | *lenp = len; | |
| 92 | 7671 | return true; | |
| 93 | } | ||
| 94 | |||
| 95 | 34 | static bool file_decoder_read_utf8(Buffer *buffer, ErrorBuffer *errbuf, const char *text, size_t text_len) | |
| 96 | { | ||
| 97 |
1/2✗ Branch 3 → 4 not taken.
✓ Branch 3 → 5 taken 34 times.
|
34 | if (unlikely(!encoding_is_utf8(buffer->encoding))) { |
| 98 | ✗ | errno = EINVAL; | |
| 99 | ✗ | return false; | |
| 100 | } | ||
| 101 | |||
| 102 | 34 | FileDecoder dec = { | |
| 103 | .ibuf = text, | ||
| 104 | .isize = text_len, | ||
| 105 | }; | ||
| 106 | |||
| 107 | 34 | const char *line; | |
| 108 | 34 | size_t len; | |
| 109 | |||
| 110 |
2/2✓ Branch 6 → 7 taken 28 times.
✓ Branch 6 → 23 taken 6 times.
|
34 | if (!read_utf8_line(&dec, &line, &len)) { |
| 111 | return true; | ||
| 112 | } | ||
| 113 | |||
| 114 |
3/4✓ Branch 7 → 8 taken 28 times.
✗ Branch 7 → 10 not taken.
✓ Branch 8 → 9 taken 1 time.
✓ Branch 8 → 10 taken 27 times.
|
28 | if (len && line[len - 1] == '\r') { |
| 115 | 1 | buffer->crlf_newlines = true; | |
| 116 | 1 | len--; | |
| 117 | } | ||
| 118 | |||
| 119 | 28 | Block *blk = add_utf8_line(buffer, errbuf, NULL, line, len); | |
| 120 | |||
| 121 |
2/2✓ Branch 11 → 16 taken 1 time.
✓ Branch 11 → 19 taken 27 times.
|
28 | if (unlikely(buffer->crlf_newlines)) { |
| 122 |
2/2✓ Branch 17 → 12 taken 270 times.
✓ Branch 17 → 21 taken 1 time.
|
271 | while (read_utf8_line(&dec, &line, &len)) { |
| 123 |
4/4✓ Branch 12 → 13 taken 268 times.
✓ Branch 12 → 15 taken 2 times.
✓ Branch 13 → 14 taken 1 time.
✓ Branch 13 → 15 taken 267 times.
|
270 | if (len && line[len - 1] == '\r') { |
| 124 | 1 | len--; | |
| 125 | } | ||
| 126 | 270 | blk = add_utf8_line(buffer, errbuf, blk, line, len); | |
| 127 | } | ||
| 128 | } else { | ||
| 129 |
2/2✓ Branch 20 → 18 taken 7373 times.
✓ Branch 20 → 21 taken 27 times.
|
7400 | while (read_utf8_line(&dec, &line, &len)) { |
| 130 | 7373 | blk = add_utf8_line(buffer, errbuf, blk, line, len); | |
| 131 | } | ||
| 132 | } | ||
| 133 | |||
| 134 |
1/2✓ Branch 21 → 22 taken 28 times.
✗ Branch 21 → 23 not taken.
|
28 | if (blk) { |
| 135 | 28 | add_block(buffer, blk); | |
| 136 | } | ||
| 137 | |||
| 138 | return true; | ||
| 139 | } | ||
| 140 | |||
| 141 | 1 | static size_t unix_to_dos ( | |
| 142 | FileEncoder *enc, | ||
| 143 | const char *text, | ||
| 144 | size_t text_len, | ||
| 145 | size_t nr_newlines | ||
| 146 | ) { | ||
| 147 | 1 | BUG_ON(text_len && text[text_len - 1] != '\n'); // See sanity_check_blocks() | |
| 148 | 1 | BUG_ON(nr_newlines > text_len); | |
| 149 | |||
| 150 | 1 | const size_t new_len = text_len + nr_newlines; | |
| 151 |
1/2✓ Branch 7 → 8 taken 1 time.
✗ Branch 7 → 17 not taken.
|
1 | if (enc->nsize < new_len) { |
| 152 | 1 | enc->nsize = xmul(text_len, 2); | |
| 153 | 1 | enc->nbuf = xrealloc(enc->nbuf, enc->nsize); | |
| 154 | } | ||
| 155 | |||
| 156 | size_t seen_nl = 0; | ||
| 157 | size_t dest_pos = 0; | ||
| 158 | |||
| 159 |
2/2✓ Branch 18 → 11 taken 3 times.
✓ Branch 18 → 19 taken 1 time.
|
4 | for (size_t src_pos = 0; src_pos < text_len; ) { |
| 160 | 3 | const char *src = text + src_pos; | |
| 161 | 3 | char *dest = enc->nbuf + dest_pos; | |
| 162 | 3 | char *end = memccpy(dest, src, '\n', text_len - src_pos); | |
| 163 | 3 | BUG_ON(!end); // Loop condition prevents this | |
| 164 | |||
| 165 | 3 | size_t line_len = (size_t)(end - dest); | |
| 166 | 3 | src_pos += line_len; | |
| 167 | 3 | BUG_ON(src_pos > text_len); | |
| 168 | |||
| 169 | 3 | end[-1] = '\r'; | |
| 170 | 3 | end[0] = '\n'; | |
| 171 | 3 | dest_pos += line_len + 1; | |
| 172 | 3 | seen_nl++; | |
| 173 | } | ||
| 174 | |||
| 175 | 1 | BUG_ON(seen_nl != nr_newlines); | |
| 176 | 1 | BUG_ON(dest_pos != new_len); | |
| 177 | 1 | return dest_pos; | |
| 178 | } | ||
| 179 | |||
| 180 | #if ICONV_DISABLE == 1 // iconv not available; use basic, UTF-8 implementation: | ||
| 181 | |||
| 182 | bool conversion_supported_by_iconv ( | ||
| 183 | const char* UNUSED_ARG(from), | ||
| 184 | const char* UNUSED_ARG(to) | ||
| 185 | ) { | ||
| 186 | errno = EINVAL; | ||
| 187 | return false; | ||
| 188 | } | ||
| 189 | |||
| 190 | FileEncoder file_encoder(const char *encoding, bool crlf, int fd) | ||
| 191 | { | ||
| 192 | if (unlikely(!encoding_is_utf8(encoding))) { | ||
| 193 | BUG("unsupported conversion; should have been handled earlier"); | ||
| 194 | } | ||
| 195 | |||
| 196 | return (FileEncoder) { | ||
| 197 | .crlf = crlf, | ||
| 198 | .fd = fd, | ||
| 199 | }; | ||
| 200 | } | ||
| 201 | |||
| 202 | void file_encoder_free(FileEncoder *enc) | ||
| 203 | { | ||
| 204 | free(enc->nbuf); | ||
| 205 | } | ||
| 206 | |||
| 207 | ssize_t file_encoder_write ( | ||
| 208 | FileEncoder *enc, | ||
| 209 | const char *buf, | ||
| 210 | size_t size, | ||
| 211 | size_t nr_newlines | ||
| 212 | ) { | ||
| 213 | if (unlikely(enc->crlf)) { | ||
| 214 | size = unix_to_dos(enc, buf, size, nr_newlines); | ||
| 215 | buf = enc->nbuf; | ||
| 216 | } | ||
| 217 | return xwrite_all(enc->fd, buf, size); | ||
| 218 | } | ||
| 219 | |||
| 220 | size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc)) | ||
| 221 | { | ||
| 222 | return 0; | ||
| 223 | } | ||
| 224 | |||
| 225 | bool file_decoder_read(Buffer *buffer, ErrorBuffer *errbuf, const char *text, size_t text_len) | ||
| 226 | { | ||
| 227 | return file_decoder_read_utf8(buffer, errbuf, text, text_len); | ||
| 228 | } | ||
| 229 | |||
| 230 | #else // ICONV_DISABLE != 1; use full iconv implementation: | ||
| 231 | |||
| 232 | #include <iconv.h> | ||
| 233 | |||
| 234 | // UTF-8 encoding of U+00BF (inverted question mark; "¿") | ||
| 235 | #define REPLACEMENT "\xc2\xbf" | ||
| 236 | |||
// State for one iconv-based conversion stream
typedef struct CharsetConverter {
    iconv_t cd; // iconv conversion descriptor
    char *obuf; // Output buffer (heap-allocated, grown on demand)
    size_t osize; // Allocated size of obuf
    size_t opos; // Number of bytes written to obuf so far
    size_t consumed; // Number of bytes of obuf already handed to the caller
    size_t errors; // Count of conversion errors encountered

    // Temporary input buffer, holding the bytes of an incomplete
    // character left over from the end of the previous input chunk
    char tbuf[16];
    size_t tcount;

    // REPLACEMENT character, in target encoding
    char rbuf[4];
    size_t rcount;

    // Input character size in bytes, or zero for UTF-8
    size_t char_size;
} CharsetConverter;
| 256 | |||
| 257 | 1 | static CharsetConverter *create(iconv_t cd) | |
| 258 | { | ||
| 259 | 1 | CharsetConverter *c = xcalloc1(sizeof(*c)); | |
| 260 | 1 | c->cd = cd; | |
| 261 | 1 | c->osize = 8192; | |
| 262 | 1 | c->obuf = xmalloc(c->osize); | |
| 263 | 1 | return c; | |
| 264 | } | ||
| 265 | |||
// Thin wrapper around iconv(3) that accepts a const-qualified input
// pointer, papering over platform differences in the declared type of
// the second parameter. Arguments and return value are otherwise
// exactly those of iconv(3).
static size_t iconv_wrapper (
    iconv_t cd,
    const char **restrict inbuf,
    size_t *restrict inbytesleft,
    char **restrict outbuf,
    size_t *restrict outbytesleft
) {
    // POSIX defines the second parameter of iconv(3) as "char **restrict"
    // but NetBSD declares it as "const char **restrict"
#ifdef __NetBSD__
    return iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft);
#else
    return iconv(cd, (char **)inbuf, inbytesleft, outbuf, outbytesleft);
#endif
}
| 283 | |||
| 284 | ✗ | static void resize_obuf(CharsetConverter *c) | |
| 285 | { | ||
| 286 | ✗ | c->osize = xmul(2, c->osize); | |
| 287 | ✗ | c->obuf = xrealloc(c->obuf, c->osize); | |
| 288 | ✗ | } | |
| 289 | |||
| 290 | ✗ | static void add_replacement(CharsetConverter *c) | |
| 291 | { | ||
| 292 | ✗ | if (c->osize - c->opos < 4) { | |
| 293 | ✗ | resize_obuf(c); | |
| 294 | } | ||
| 295 | |||
| 296 | ✗ | memcpy(c->obuf + c->opos, c->rbuf, c->rcount); | |
| 297 | ✗ | c->opos += c->rcount; | |
| 298 | ✗ | } | |
| 299 | |||
| 300 | ✗ | static size_t handle_invalid(CharsetConverter *c, const char *buf, size_t count) | |
| 301 | { | ||
| 302 | ✗ | LOG_DEBUG("%zu %zu", c->char_size, count); | |
| 303 | ✗ | add_replacement(c); | |
| 304 | ✗ | if (c->char_size == 0) { | |
| 305 | // Converting from UTF-8 | ||
| 306 | ✗ | size_t idx = 0; | |
| 307 | ✗ | CodePoint u = u_get_char(buf, count, &idx); | |
| 308 | ✗ | LOG_DEBUG("U+%04" PRIX32, u); | |
| 309 | ✗ | return idx; | |
| 310 | } | ||
| 311 | ✗ | if (c->char_size > count) { | |
| 312 | // wtf | ||
| 313 | ✗ | return 1; | |
| 314 | } | ||
| 315 | return c->char_size; | ||
| 316 | } | ||
| 317 | |||
| 318 | 1 | static int xiconv(CharsetConverter *c, const char **ib, size_t *ic) | |
| 319 | { | ||
| 320 | 1 | while (1) { | |
| 321 | 1 | char *ob = c->obuf + c->opos; | |
| 322 | 1 | size_t oc = c->osize - c->opos; | |
| 323 | 1 | size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc); | |
| 324 | 1 | c->opos = ob - c->obuf; | |
| 325 |
1/2✗ Branch 4 → 5 not taken.
✓ Branch 4 → 12 taken 1 time.
|
1 | if (rc == (size_t)-1) { |
| 326 | ✗ | switch (errno) { | |
| 327 | ✗ | case EILSEQ: | |
| 328 | ✗ | c->errors++; | |
| 329 | // Reset | ||
| 330 | ✗ | iconv(c->cd, NULL, NULL, NULL, NULL); | |
| 331 | ✗ | return errno; | |
| 332 | case EINVAL: | ||
| 333 | return errno; | ||
| 334 | ✗ | case E2BIG: | |
| 335 | ✗ | resize_obuf(c); | |
| 336 | ✗ | continue; | |
| 337 | ✗ | default: | |
| 338 | − | BUG("iconv: %s", strerror(errno)); | |
| 339 | } | ||
| 340 | } else { | ||
| 341 | 1 | c->errors += rc; | |
| 342 | } | ||
| 343 | 1 | return 0; | |
| 344 | } | ||
| 345 | } | ||
| 346 | |||
| 347 | ✗ | static size_t convert_incomplete(CharsetConverter *c, const char *input, size_t len) | |
| 348 | { | ||
| 349 | ✗ | size_t ipos = 0; | |
| 350 | ✗ | while (c->tcount < sizeof(c->tbuf) && ipos < len) { | |
| 351 | ✗ | c->tbuf[c->tcount++] = input[ipos++]; | |
| 352 | ✗ | const char *ib = c->tbuf; | |
| 353 | ✗ | size_t ic = c->tcount; | |
| 354 | ✗ | int rc = xiconv(c, &ib, &ic); | |
| 355 | ✗ | if (ic > 0) { | |
| 356 | ✗ | memmove(c->tbuf, ib, ic); | |
| 357 | } | ||
| 358 | ✗ | c->tcount = ic; | |
| 359 | ✗ | if (rc == EINVAL) { | |
| 360 | // Incomplete character at end of input buffer; try again | ||
| 361 | // with more input data | ||
| 362 | ✗ | continue; | |
| 363 | } | ||
| 364 | ✗ | if (rc == EILSEQ) { | |
| 365 | // Invalid multibyte sequence | ||
| 366 | ✗ | size_t skip = handle_invalid(c, c->tbuf, c->tcount); | |
| 367 | ✗ | c->tcount -= skip; | |
| 368 | ✗ | if (c->tcount > 0) { | |
| 369 | ✗ | LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip); | |
| 370 | ✗ | memmove(c->tbuf, c->tbuf + skip, c->tcount); | |
| 371 | ✗ | continue; | |
| 372 | } | ||
| 373 | ✗ | return ipos; | |
| 374 | } | ||
| 375 | ✗ | break; | |
| 376 | } | ||
| 377 | |||
| 378 | ✗ | LOG_DEBUG("%zu %zu", ipos, c->tcount); | |
| 379 | ✗ | return ipos; | |
| 380 | } | ||
| 381 | |||
| 382 | 1 | static void cconv_process(CharsetConverter *c, const char *input, size_t len) | |
| 383 | { | ||
| 384 |
1/2✗ Branch 2 → 3 not taken.
✓ Branch 2 → 4 taken 1 time.
|
1 | if (c->consumed > 0) { |
| 385 | ✗ | size_t fill = c->opos - c->consumed; | |
| 386 | ✗ | memmove(c->obuf, c->obuf + c->consumed, fill); | |
| 387 | ✗ | c->opos = fill; | |
| 388 | ✗ | c->consumed = 0; | |
| 389 | } | ||
| 390 | |||
| 391 |
1/2✗ Branch 4 → 5 not taken.
✓ Branch 4 → 7 taken 1 time.
|
1 | if (c->tcount > 0) { |
| 392 | ✗ | size_t ipos = convert_incomplete(c, input, len); | |
| 393 | ✗ | input += ipos; | |
| 394 | ✗ | len -= ipos; | |
| 395 | } | ||
| 396 | |||
| 397 | 1 | const char *ib = input; | |
| 398 |
2/2✓ Branch 17 → 8 taken 1 time.
✓ Branch 17 → 18 taken 1 time.
|
2 | for (size_t ic = len; ic > 0; ) { |
| 399 | 1 | int r = xiconv(c, &ib, &ic); | |
| 400 |
1/2✗ Branch 9 → 10 not taken.
✓ Branch 9 → 13 taken 1 time.
|
1 | if (r == EINVAL) { |
| 401 | // Incomplete character at end of input buffer | ||
| 402 | ✗ | if (ic < sizeof(c->tbuf)) { | |
| 403 | ✗ | memcpy(c->tbuf, ib, ic); | |
| 404 | ✗ | c->tcount = ic; | |
| 405 | } else { | ||
| 406 | // FIXME | ||
| 407 | ✗ | } | |
| 408 | ✗ | ic = 0; | |
| 409 | ✗ | continue; | |
| 410 | } | ||
| 411 |
1/2✗ Branch 13 → 14 not taken.
✓ Branch 13 → 16 taken 1 time.
|
1 | if (r == EILSEQ) { |
| 412 | // Invalid multibyte sequence | ||
| 413 | ✗ | size_t skip = handle_invalid(c, ib, ic); | |
| 414 | ✗ | ic -= skip; | |
| 415 | ✗ | ib += skip; | |
| 416 | ✗ | continue; | |
| 417 | } | ||
| 418 | } | ||
| 419 | 1 | } | |
| 420 | |||
| 421 | ✗ | static CharsetConverter *cconv_to_utf8(const char *encoding) | |
| 422 | { | ||
| 423 | ✗ | iconv_t cd = iconv_open("UTF-8", encoding); | |
| 424 | ✗ | if (cd == (iconv_t)-1) { | |
| 425 | return NULL; | ||
| 426 | } | ||
| 427 | |||
| 428 | ✗ | CharsetConverter *c = create(cd); | |
| 429 | ✗ | c->rcount = copyliteral(c->rbuf, REPLACEMENT); | |
| 430 | |||
| 431 | ✗ | if (str_has_prefix(encoding, "UTF-16")) { | |
| 432 | ✗ | c->char_size = 2; | |
| 433 | ✗ | } else if (str_has_prefix(encoding, "UTF-32")) { | |
| 434 | ✗ | c->char_size = 4; | |
| 435 | } else { | ||
| 436 | ✗ | c->char_size = 1; | |
| 437 | } | ||
| 438 | |||
| 439 | return c; | ||
| 440 | } | ||
| 441 | |||
| 442 | 1 | static void encode_replacement(CharsetConverter *c) | |
| 443 | { | ||
| 444 | 1 | static const char rep[] = REPLACEMENT; | |
| 445 | 1 | const char *ib = rep; | |
| 446 | 1 | char *ob = c->rbuf; | |
| 447 | 1 | size_t ic = STRLEN(REPLACEMENT); | |
| 448 | 1 | size_t oc = sizeof(c->rbuf); | |
| 449 | 1 | size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc); | |
| 450 | |||
| 451 |
1/2✓ Branch 3 → 4 taken 1 time.
✗ Branch 3 → 5 not taken.
|
1 | if (rc == (size_t)-1) { |
| 452 | 1 | c->rbuf[0] = '\xbf'; | |
| 453 | 1 | c->rcount = 1; | |
| 454 | } else { | ||
| 455 | ✗ | c->rcount = ob - c->rbuf; | |
| 456 | } | ||
| 457 | 1 | } | |
| 458 | |||
| 459 | 1 | static CharsetConverter *cconv_from_utf8(const char *encoding) | |
| 460 | { | ||
| 461 | 1 | iconv_t cd = iconv_open(encoding, "UTF-8"); | |
| 462 |
1/2✓ Branch 3 → 4 taken 1 time.
✗ Branch 3 → 7 not taken.
|
1 | if (cd == (iconv_t)-1) { |
| 463 | return NULL; | ||
| 464 | } | ||
| 465 | 1 | CharsetConverter *c = create(cd); | |
| 466 | 1 | encode_replacement(c); | |
| 467 | 1 | return c; | |
| 468 | } | ||
| 469 | |||
| 470 | 1 | static void cconv_flush(CharsetConverter *c) | |
| 471 | { | ||
| 472 |
1/2✗ Branch 2 → 3 not taken.
✓ Branch 2 → 6 taken 1 time.
|
1 | if (c->tcount > 0) { |
| 473 | // Replace incomplete character at end of input buffer | ||
| 474 | ✗ | LOG_DEBUG("incomplete character at EOF"); | |
| 475 | ✗ | add_replacement(c); | |
| 476 | ✗ | c->tcount = 0; | |
| 477 | } | ||
| 478 | 1 | } | |
| 479 | |||
| 480 | ✗ | static char *cconv_consume_line(CharsetConverter *c, size_t *len) | |
| 481 | { | ||
| 482 | ✗ | char *line = c->obuf + c->consumed; | |
| 483 | ✗ | char *nl = memchr(line, '\n', c->opos - c->consumed); | |
| 484 | ✗ | if (!nl) { | |
| 485 | ✗ | *len = 0; | |
| 486 | ✗ | return NULL; | |
| 487 | } | ||
| 488 | |||
| 489 | ✗ | size_t n = nl - line + 1; | |
| 490 | ✗ | c->consumed += n; | |
| 491 | ✗ | *len = n; | |
| 492 | ✗ | return line; | |
| 493 | } | ||
| 494 | |||
| 495 | 1 | static char *cconv_consume_all(CharsetConverter *c, size_t *len) | |
| 496 | { | ||
| 497 | 1 | char *buf = c->obuf + c->consumed; | |
| 498 | 1 | *len = c->opos - c->consumed; | |
| 499 | 1 | c->consumed = c->opos; | |
| 500 | 1 | return buf; | |
| 501 | } | ||
| 502 | |||
| 503 | 1 | static void cconv_free(CharsetConverter *c) | |
| 504 | { | ||
| 505 | 1 | BUG_ON(!c); | |
| 506 | 1 | iconv_close(c->cd); | |
| 507 | 1 | free(c->obuf); | |
| 508 | 1 | free(c); | |
| 509 | 1 | } | |
| 510 | |||
// Check whether iconv can convert from encoding `from` to encoding
// `to`, by attempting to open (and immediately closing) a conversion
// descriptor. Returns false with errno set to EINVAL if either name is
// an empty string, or false with errno as left by iconv_open(3) if the
// conversion can't be opened.
bool conversion_supported_by_iconv(const char *from, const char *to)
{
    const bool empty_name = (from[0] == '\0' || to[0] == '\0');
    if (unlikely(empty_name)) {
        errno = EINVAL;
        return false;
    }

    iconv_t cd = iconv_open(to, from);
    const bool supported = (cd != (iconv_t)-1);
    if (supported) {
        iconv_close(cd);
    }

    return supported;
}
| 526 | |||
| 527 | 22 | FileEncoder file_encoder(const char *encoding, bool crlf, int fd) | |
| 528 | { | ||
| 529 | 22 | CharsetConverter *cconv = NULL; | |
| 530 |
2/2✓ Branch 3 → 4 taken 1 time.
✓ Branch 3 → 7 taken 21 times.
|
22 | if (unlikely(!encoding_is_utf8(encoding))) { |
| 531 | 1 | cconv = cconv_from_utf8(encoding); | |
| 532 |
1/2✗ Branch 5 → 6 not taken.
✓ Branch 5 → 7 taken 1 time.
|
1 | if (!cconv) { |
| 533 | − | BUG("unsupported conversion; should have been handled earlier"); | |
| 534 | } | ||
| 535 | } | ||
| 536 | |||
| 537 | 22 | return (FileEncoder) { | |
| 538 | .cconv = cconv, | ||
| 539 | .crlf = crlf, | ||
| 540 | .fd = fd, | ||
| 541 | }; | ||
| 542 | } | ||
| 543 | |||
| 544 | 22 | void file_encoder_free(FileEncoder *enc) | |
| 545 | { | ||
| 546 |
2/2✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 4 taken 21 times.
|
22 | if (enc->cconv) { |
| 547 | 1 | cconv_free(enc->cconv); | |
| 548 | } | ||
| 549 | 22 | free(enc->nbuf); | |
| 550 | 22 | } | |
| 551 | |||
| 552 | // NOTE: buf must contain whole characters! | ||
| 553 | 22 | ssize_t file_encoder_write ( | |
| 554 | FileEncoder *enc, | ||
| 555 | const char *buf, | ||
| 556 | size_t size, | ||
| 557 | size_t nr_newlines | ||
| 558 | ) { | ||
| 559 |
2/2✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 5 taken 21 times.
|
22 | if (unlikely(enc->crlf)) { |
| 560 | 1 | size = unix_to_dos(enc, buf, size, nr_newlines); | |
| 561 | 1 | buf = enc->nbuf; | |
| 562 | } | ||
| 563 |
2/2✓ Branch 5 → 6 taken 1 time.
✓ Branch 5 → 9 taken 21 times.
|
22 | if (unlikely(enc->cconv)) { |
| 564 | 1 | cconv_process(enc->cconv, buf, size); | |
| 565 | 1 | cconv_flush(enc->cconv); | |
| 566 | 1 | buf = cconv_consume_all(enc->cconv, &size); | |
| 567 | } | ||
| 568 | 22 | return xwrite_all(enc->fd, buf, size); | |
| 569 | } | ||
| 570 | |||
| 571 | 22 | size_t file_encoder_get_nr_errors(const FileEncoder *enc) | |
| 572 | { | ||
| 573 |
2/2✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 4 taken 21 times.
|
22 | return enc->cconv ? enc->cconv->errors : 0; |
| 574 | } | ||
| 575 | |||
| 576 | ✗ | static bool fill(FileDecoder *dec) | |
| 577 | { | ||
| 578 | ✗ | if (dec->ipos == dec->isize) { | |
| 579 | return false; | ||
| 580 | } | ||
| 581 | |||
| 582 | // Smaller than cconv.obuf to make realloc less likely | ||
| 583 | ✗ | size_t max = 7 * 1024; | |
| 584 | |||
| 585 | ✗ | size_t icount = MIN(dec->isize - dec->ipos, max); | |
| 586 | ✗ | cconv_process(dec->cconv, dec->ibuf + dec->ipos, icount); | |
| 587 | ✗ | dec->ipos += icount; | |
| 588 | ✗ | if (dec->ipos == dec->isize) { | |
| 589 | // Must be flushed after all input has been fed | ||
| 590 | ✗ | cconv_flush(dec->cconv); | |
| 591 | } | ||
| 592 | return true; | ||
| 593 | } | ||
| 594 | |||
| 595 | ✗ | static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp) | |
| 596 | { | ||
| 597 | ✗ | char *line; | |
| 598 | ✗ | size_t len; | |
| 599 | ✗ | while (1) { | |
| 600 | ✗ | line = cconv_consume_line(dec->cconv, &len); | |
| 601 | ✗ | if (line || !fill(dec)) { | |
| 602 | break; | ||
| 603 | } | ||
| 604 | } | ||
| 605 | |||
| 606 | ✗ | if (line) { | |
| 607 | // Newline not wanted | ||
| 608 | ✗ | len--; | |
| 609 | } else { | ||
| 610 | ✗ | line = cconv_consume_all(dec->cconv, &len); | |
| 611 | ✗ | if (len == 0) { | |
| 612 | return false; | ||
| 613 | } | ||
| 614 | } | ||
| 615 | |||
| 616 | ✗ | *linep = line; | |
| 617 | ✗ | *lenp = len; | |
| 618 | ✗ | return true; | |
| 619 | } | ||
| 620 | |||
| 621 | 34 | bool file_decoder_read(Buffer *buffer, ErrorBuffer *errbuf, const char *text, size_t text_len) | |
| 622 | { | ||
| 623 |
1/2✓ Branch 3 → 4 taken 34 times.
✗ Branch 3 → 5 not taken.
|
34 | if (encoding_is_utf8(buffer->encoding)) { |
| 624 | 34 | return file_decoder_read_utf8(buffer, errbuf, text, text_len); | |
| 625 | } | ||
| 626 | |||
| 627 | ✗ | CharsetConverter *cconv = cconv_to_utf8(buffer->encoding); | |
| 628 | ✗ | if (!cconv) { | |
| 629 | return false; | ||
| 630 | } | ||
| 631 | |||
| 632 | ✗ | FileDecoder dec = { | |
| 633 | .ibuf = text, | ||
| 634 | .isize = text_len, | ||
| 635 | .cconv = cconv, | ||
| 636 | }; | ||
| 637 | |||
| 638 | ✗ | const char *line; | |
| 639 | ✗ | size_t len; | |
| 640 | |||
| 641 | ✗ | if (decode_and_read_line(&dec, &line, &len)) { | |
| 642 | ✗ | if (len && line[len - 1] == '\r') { | |
| 643 | ✗ | buffer->crlf_newlines = true; | |
| 644 | ✗ | len--; | |
| 645 | } | ||
| 646 | ✗ | Block *blk = add_utf8_line(buffer, errbuf, NULL, line, len); | |
| 647 | ✗ | while (decode_and_read_line(&dec, &line, &len)) { | |
| 648 | ✗ | if (buffer->crlf_newlines && len && line[len - 1] == '\r') { | |
| 649 | ✗ | len--; | |
| 650 | } | ||
| 651 | ✗ | blk = add_utf8_line(buffer, errbuf, blk, line, len); | |
| 652 | } | ||
| 653 | ✗ | if (blk) { | |
| 654 | ✗ | add_block(buffer, blk); | |
| 655 | } | ||
| 656 | } | ||
| 657 | |||
| 658 | ✗ | cconv_free(cconv); | |
| 659 | ✗ | return true; | |
| 660 | } | ||
| 661 | |||
| 662 | #endif | ||
| 663 |