dte test coverage


Directory: ./
Coverage: low: ≥ 0% medium: ≥ 50.0% high: ≥ 85.0%
Coverage Exec / Excl / Total
Lines: 53.6% 162 / 2 / 304
Functions: 71.4% 20 / 0 / 28
Branches: 39.0% 46 / 18 / 136

src/convert.c
Line Branch Exec Source
1 #include <errno.h>
2 #include <inttypes.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "convert.h"
6 #include "block.h"
7 #include "buildvar-iconv.h"
8 #include "encoding.h"
9 #include "load-save.h"
10 #include "util/arith.h"
11 #include "util/debug.h"
12 #include "util/list.h"
13 #include "util/log.h"
14 #include "util/numtostr.h"
15 #include "util/str-util.h"
16 #include "util/utf8.h"
17 #include "util/xmalloc.h"
18 #include "util/xreadwrite.h"
19
20 typedef struct {
21 StringView text;
22 size_t ipos;
23 struct CharsetConverter *cconv;
24 } FileDecoder;
25
26 56 static void add_block(Buffer *buffer, Block *blk)
27 {
28 56 buffer->nl += blk->nl;
29 56 list_insert_before(&blk->node, &buffer->blocks);
30 56 }
31
32 7780 static Block *add_utf8_line(Buffer *buffer, Block *blk, StringView line)
33 {
34 7780 const size_t len = line.length;
35 7780 size_t size = len + 1;
36
2/2
✓ Branch 2 → 3 taken 7752 times.
✓ Branch 2 → 6 taken 28 times.
7780 if (blk) {
37 7752 size_t avail = blk->alloc - blk->size;
38
2/2
✓ Branch 3 → 4 taken 7724 times.
✓ Branch 3 → 5 taken 28 times.
7752 if (size <= avail) {
39 7724 goto copy;
40 }
41 28 add_block(buffer, blk);
42 }
43
44 56 size = MAX(size, 8192);
45 56 blk = block_new(size);
46
47 7780 copy:
48 7780 memcpy(blk->data + blk->size, line.data, len);
49 7780 blk->size += len;
50 7780 blk->data[blk->size++] = '\n';
51 7780 blk->nl++;
52 7780 return blk;
53 }
54
55 7814 static bool read_utf8_line(FileDecoder *dec, StringView *linep)
56 {
57 7814 size_t len = dec->text.length;
58
2/2
✓ Branch 2 → 3 taken 34 times.
✓ Branch 2 → 5 taken 7780 times.
7814 if (dec->ipos >= len) {
59 34 BUG_ON(dec->ipos > len);
60 return false;
61 }
62
63 7780 *linep = get_delim(dec->text.data, &dec->ipos, len, '\n');
64 7780 return true;
65 }
66
67 34 static bool file_decoder_read_utf8(Buffer *buffer, StringView text, size_t *longest_line)
68 {
69
1/2
✗ Branch 3 → 4 not taken.
✓ Branch 3 → 5 taken 34 times.
34 if (unlikely(!encoding_is_utf8(buffer->encoding))) {
70 errno = EINVAL;
71 return false;
72 }
73
74 34 FileDecoder dec = {.text = text};
75 34 StringView line;
76
2/2
✓ Branch 6 → 7 taken 6 times.
✓ Branch 6 → 8 taken 28 times.
34 if (!read_utf8_line(&dec, &line)) {
77 6 *longest_line = 0;
78 6 return true;
79 }
80
81
2/2
✓ Branch 9 → 10 taken 1 time.
✓ Branch 9 → 11 taken 27 times.
28 if (strview_remove_matching_suffix(&line, "\r")) {
82 1 buffer->crlf_newlines = true;
83 }
84
85 28 Block *blk = add_utf8_line(buffer, NULL, line);
86 28 size_t maxline = line.length;
87
88
2/2
✓ Branch 12 → 16 taken 1 time.
✓ Branch 12 → 20 taken 27 times.
28 if (unlikely(buffer->crlf_newlines)) {
89
2/2
✓ Branch 17 → 13 taken 270 times.
✓ Branch 17 → 22 taken 1 time.
271 while (read_utf8_line(&dec, &line)) {
90 270 strview_remove_matching_suffix(&line, "\r");
91 270 blk = add_utf8_line(buffer, blk, line);
92 270 maxline = MAX(maxline, line.length);
93 }
94 } else {
95
2/2
✓ Branch 21 → 18 taken 7482 times.
✓ Branch 21 → 22 taken 27 times.
7509 while (read_utf8_line(&dec, &line)) {
96 7482 blk = add_utf8_line(buffer, blk, line);
97 7482 maxline = MAX(maxline, line.length);
98 }
99 }
100
101
1/2
✓ Branch 22 → 23 taken 28 times.
✗ Branch 22 → 24 not taken.
28 if (blk) {
102 28 add_block(buffer, blk);
103 }
104
105 28 *longest_line = maxline;
106 28 return true;
107 }
108
109 1 static size_t unix_to_dos(FileEncoder *enc, StringView text, size_t nr_newlines)
110 {
111 1 BUG_ON(text.length && !strview_has_suffix(text, "\n")); // See sanity_check_blocks()
112 1 BUG_ON(nr_newlines > text.length);
113
114 1 const size_t new_len = text.length + nr_newlines;
115
1/2
✓ Branch 8 → 9 taken 1 time.
✗ Branch 8 → 12 not taken.
1 if (enc->nsize < new_len) {
116 1 enc->nsize = xmul(text.length, 2);
117 1 enc->nbuf = xrealloc(enc->nbuf, enc->nsize);
118 }
119
120 1 size_t seen_nl = 0;
121 1 size_t dest_pos = 0;
122
123
2/2
✓ Branch 19 → 13 taken 3 times.
✓ Branch 19 → 20 taken 1 time.
4 for (size_t src_pos = 0; src_pos < text.length; ) {
124 3 const char *src = text.data + src_pos;
125 3 char *dest = enc->nbuf + dest_pos;
126 3 char *end = memccpy(dest, src, '\n', text.length - src_pos);
127 3 BUG_ON(!end); // Loop condition prevents this
128
129 3 size_t line_len = (size_t)(end - dest);
130 3 src_pos += line_len;
131 3 BUG_ON(src_pos > text.length);
132
133 3 end[-1] = '\r';
134 3 end[0] = '\n';
135 3 dest_pos += line_len + 1;
136 3 seen_nl++;
137 }
138
139 1 BUG_ON(seen_nl != nr_newlines);
140 1 BUG_ON(dest_pos != new_len);
141 1 return dest_pos;
142 }
143
144 #if ICONV_DISABLE == 1 // iconv not available; use basic, UTF-8 implementation:
145
146 bool conversion_supported_by_iconv (
147 const char* UNUSED_ARG(from),
148 const char* UNUSED_ARG(to)
149 ) {
150 errno = EINVAL;
151 return false;
152 }
153
154 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
155 {
156 if (unlikely(!encoding_is_utf8(encoding))) {
157 BUG("unsupported conversion; should have been handled earlier");
158 }
159
160 return (FileEncoder) {
161 .crlf = crlf,
162 .fd = fd,
163 };
164 }
165
166 void file_encoder_free(FileEncoder *enc)
167 {
168 free(enc->nbuf);
169 }
170
171 ssize_t file_encoder_write (
172 FileEncoder *enc,
173 const char *buf,
174 size_t size,
175 size_t nr_newlines
176 ) {
177 if (unlikely(enc->crlf)) {
178 size = unix_to_dos(enc, string_view(buf, size), nr_newlines);
179 buf = enc->nbuf;
180 }
181 return xwrite_all(enc->fd, buf, size);
182 }
183
184 size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc))
185 {
186 return 0;
187 }
188
189 bool file_decoder_read(Buffer *buffer, StringView text, size_t *longest_line)
190 {
191 return file_decoder_read_utf8(buffer, text, longest_line);
192 }
193
194 #else // ICONV_DISABLE != 1; use full iconv implementation:
195
196 #include <iconv.h>
197
198 // UTF-8 encoding of U+00BF (inverted question mark; "¿")
199 #define REPLACEMENT "\xc2\xbf"
200
201 typedef struct CharsetConverter {
202 iconv_t cd;
203 char *obuf;
204 size_t osize;
205 size_t opos;
206 size_t consumed;
207 size_t errors;
208
209 // Temporary input buffer
210 char tbuf[16];
211 size_t tcount;
212
213 // REPLACEMENT character, in target encoding
214 char rbuf[4];
215 size_t rcount;
216
217 // Input character size in bytes, or zero for UTF-8
218 size_t char_size;
219 } CharsetConverter;
220
221 1 static CharsetConverter *create(iconv_t cd)
222 {
223 1 CharsetConverter *c = xcalloc1(sizeof(*c));
224 1 c->cd = cd;
225 1 c->osize = 8192;
226 1 c->obuf = xmalloc(c->osize);
227 1 return c;
228 }
229
230 2 static size_t iconv_wrapper (
231 iconv_t cd,
232 const char **restrict inbuf,
233 size_t *restrict inbytesleft,
234 char **restrict outbuf,
235 size_t *restrict outbytesleft
236 ) {
237 // POSIX defines the second parameter of iconv(3) as "char **restrict"
238 // but NetBSD declares it as "const char **restrict"
239 #ifdef __NetBSD__
240 const char **restrict in = inbuf;
241 #else
242 2 char **restrict in = (char **restrict)inbuf;
243 #endif
244
245 2 return iconv(cd, in, inbytesleft, outbuf, outbytesleft);
246 }
247
248 static void resize_obuf(CharsetConverter *c)
249 {
250 c->osize = xmul(2, c->osize);
251 c->obuf = xrealloc(c->obuf, c->osize);
252 }
253
254 static void add_replacement(CharsetConverter *c)
255 {
256 if (c->osize - c->opos < 4) {
257 resize_obuf(c);
258 }
259
260 memcpy(c->obuf + c->opos, c->rbuf, c->rcount);
261 c->opos += c->rcount;
262 }
263
264 static size_t handle_invalid(CharsetConverter *c, const char *buf, size_t count)
265 {
266 LOG_DEBUG("%zu %zu", c->char_size, count);
267 add_replacement(c);
268 if (c->char_size == 0) {
269 // Converting from UTF-8
270 size_t idx = 0;
271 CodePoint u = u_get_char(buf, count, &idx);
272 LOG_DEBUG("U+%04" PRIX32, u);
273 return idx;
274 }
275 if (c->char_size > count) {
276 // wtf
277 return 1;
278 }
279 return c->char_size;
280 }
281
282 1 static int xiconv(CharsetConverter *c, const char **ib, size_t *ic)
283 {
284 1 while (1) {
285 1 char *ob = c->obuf + c->opos;
286 1 size_t oc = c->osize - c->opos;
287 1 size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc);
288 1 c->opos = ob - c->obuf;
289
1/2
✗ Branch 4 → 5 not taken.
✓ Branch 4 → 12 taken 1 time.
1 if (rc == (size_t)-1) {
290 switch (errno) {
291 case EILSEQ:
292 c->errors++;
293 // Reset
294 iconv(c->cd, NULL, NULL, NULL, NULL);
295 return errno;
296 case EINVAL:
297 return errno;
298 case E2BIG:
299 resize_obuf(c);
300 continue;
301 default:
302 BUG("iconv: %s", strerror(errno));
303 }
304 } else {
305 1 c->errors += rc;
306 }
307 1 return 0;
308 }
309 }
310
311 static size_t convert_incomplete(CharsetConverter *c, const char *input, size_t len)
312 {
313 size_t ipos = 0;
314 while (c->tcount < sizeof(c->tbuf) && ipos < len) {
315 c->tbuf[c->tcount++] = input[ipos++];
316 const char *ib = c->tbuf;
317 size_t ic = c->tcount;
318 int rc = xiconv(c, &ib, &ic);
319 if (ic > 0) {
320 memmove(c->tbuf, ib, ic);
321 }
322 c->tcount = ic;
323 if (rc == EINVAL) {
324 // Incomplete character at end of input buffer; try again
325 // with more input data
326 continue;
327 }
328 if (rc == EILSEQ) {
329 // Invalid multibyte sequence
330 size_t skip = handle_invalid(c, c->tbuf, c->tcount);
331 c->tcount -= skip;
332 if (c->tcount > 0) {
333 LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip);
334 memmove(c->tbuf, c->tbuf + skip, c->tcount);
335 continue;
336 }
337 return ipos;
338 }
339 break;
340 }
341
342 LOG_DEBUG("%zu %zu", ipos, c->tcount);
343 return ipos;
344 }
345
346 1 static void cconv_process(CharsetConverter *c, const char *input, size_t len)
347 {
348
1/2
✗ Branch 2 → 3 not taken.
✓ Branch 2 → 4 taken 1 time.
1 if (c->consumed > 0) {
349 size_t fill = c->opos - c->consumed;
350 memmove(c->obuf, c->obuf + c->consumed, fill);
351 c->opos = fill;
352 c->consumed = 0;
353 }
354
355
1/2
✗ Branch 4 → 5 not taken.
✓ Branch 4 → 7 taken 1 time.
1 if (c->tcount > 0) {
356 size_t ipos = convert_incomplete(c, input, len);
357 input += ipos;
358 len -= ipos;
359 }
360
361 1 const char *ib = input;
362
2/2
✓ Branch 17 → 8 taken 1 time.
✓ Branch 17 → 18 taken 1 time.
2 for (size_t ic = len; ic > 0; ) {
363 1 int r = xiconv(c, &ib, &ic);
364
1/2
✗ Branch 9 → 10 not taken.
✓ Branch 9 → 13 taken 1 time.
1 if (r == EINVAL) {
365 // Incomplete character at end of input buffer
366 if (ic < sizeof(c->tbuf)) {
367 memcpy(c->tbuf, ib, ic);
368 c->tcount = ic;
369 } else {
370 // FIXME
371 }
372 ic = 0;
373 continue;
374 }
375
1/2
✗ Branch 13 → 14 not taken.
✓ Branch 13 → 16 taken 1 time.
1 if (r == EILSEQ) {
376 // Invalid multibyte sequence
377 size_t skip = handle_invalid(c, ib, ic);
378 ic -= skip;
379 ib += skip;
380 continue;
381 }
382 }
383 1 }
384
385 static CharsetConverter *cconv_to_utf8(const char *encoding)
386 {
387 iconv_t cd = iconv_open("UTF-8", encoding);
388 if (cd == (iconv_t)-1) {
389 return NULL;
390 }
391
392 CharsetConverter *c = create(cd);
393 c->rcount = copyliteral(c->rbuf, REPLACEMENT);
394
395 if (str_has_prefix(encoding, "UTF-16")) {
396 c->char_size = 2;
397 } else if (str_has_prefix(encoding, "UTF-32")) {
398 c->char_size = 4;
399 } else {
400 c->char_size = 1;
401 }
402
403 return c;
404 }
405
406 1 static void encode_replacement(CharsetConverter *c)
407 {
408 1 static const char rep[] = REPLACEMENT;
409 1 const char *ib = rep;
410 1 char *ob = c->rbuf;
411 1 size_t ic = STRLEN(REPLACEMENT);
412 1 size_t oc = sizeof(c->rbuf);
413 1 size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc);
414
415
1/2
✓ Branch 3 → 4 taken 1 time.
✗ Branch 3 → 5 not taken.
1 if (rc == (size_t)-1) {
416 1 c->rbuf[0] = '\xbf';
417 1 c->rcount = 1;
418 } else {
419 c->rcount = ob - c->rbuf;
420 }
421 1 }
422
423 1 static CharsetConverter *cconv_from_utf8(const char *encoding)
424 {
425 1 iconv_t cd = iconv_open(encoding, "UTF-8");
426
1/2
✓ Branch 3 → 4 taken 1 time.
✗ Branch 3 → 7 not taken.
1 if (cd == (iconv_t)-1) {
427 return NULL;
428 }
429 1 CharsetConverter *c = create(cd);
430 1 encode_replacement(c);
431 1 return c;
432 }
433
434 1 static void cconv_flush(CharsetConverter *c)
435 {
436
1/2
✗ Branch 2 → 3 not taken.
✓ Branch 2 → 6 taken 1 time.
1 if (c->tcount > 0) {
437 // Replace incomplete character at end of input buffer
438 LOG_DEBUG("incomplete character at EOF");
439 add_replacement(c);
440 c->tcount = 0;
441 }
442 1 }
443
444 static char *cconv_consume_line(CharsetConverter *c, size_t *len)
445 {
446 char *line = c->obuf + c->consumed;
447 char *nl = memchr(line, '\n', c->opos - c->consumed);
448 if (!nl) {
449 *len = 0;
450 return NULL;
451 }
452
453 size_t n = nl - line + 1;
454 c->consumed += n;
455 *len = n;
456 return line;
457 }
458
459 1 static char *cconv_consume_all(CharsetConverter *c, size_t *len)
460 {
461 1 char *buf = c->obuf + c->consumed;
462 1 *len = c->opos - c->consumed;
463 1 c->consumed = c->opos;
464 1 return buf;
465 }
466
467 1 static void cconv_free(CharsetConverter *c)
468 {
469 1 BUG_ON(!c);
470 1 iconv_close(c->cd);
471 1 free(c->obuf);
472 1 free(c);
473 1 }
474
475 2 bool conversion_supported_by_iconv(const char *from, const char *to)
476 {
477
2/4
✓ Branch 2 → 3 taken 2 times.
✗ Branch 2 → 4 not taken.
✗ Branch 3 → 4 not taken.
✓ Branch 3 → 5 taken 2 times.
2 if (unlikely(from[0] == '\0' || to[0] == '\0')) {
478 errno = EINVAL;
479 return false;
480 }
481
482 2 iconv_t cd = iconv_open(to, from);
483
1/2
✓ Branch 6 → 7 taken 2 times.
✗ Branch 6 → 9 not taken.
2 if (cd == (iconv_t)-1) {
484 return false;
485 }
486
487 2 iconv_close(cd);
488 2 return true;
489 }
490
491 22 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
492 {
493 22 CharsetConverter *cconv = NULL;
494
2/2
✓ Branch 3 → 4 taken 1 time.
✓ Branch 3 → 7 taken 21 times.
22 if (unlikely(!encoding_is_utf8(encoding))) {
495 1 cconv = cconv_from_utf8(encoding);
496
1/2
✗ Branch 5 → 6 not taken.
✓ Branch 5 → 7 taken 1 time.
1 if (!cconv) {
497 BUG("unsupported conversion; should have been handled earlier");
498 }
499 }
500
501 22 return (FileEncoder) {
502 .cconv = cconv,
503 .crlf = crlf,
504 .fd = fd,
505 };
506 }
507
508 22 void file_encoder_free(FileEncoder *enc)
509 {
510
2/2
✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 4 taken 21 times.
22 if (enc->cconv) {
511 1 cconv_free(enc->cconv);
512 }
513 22 free(enc->nbuf);
514 22 }
515
516 // NOTE: buf must contain whole characters!
517 22 ssize_t file_encoder_write (
518 FileEncoder *enc,
519 const char *buf,
520 size_t size,
521 size_t nr_newlines
522 ) {
523
2/2
✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 5 taken 21 times.
22 if (unlikely(enc->crlf)) {
524 1 size = unix_to_dos(enc, string_view(buf, size), nr_newlines);
525 1 buf = enc->nbuf;
526 }
527
2/2
✓ Branch 5 → 6 taken 1 time.
✓ Branch 5 → 9 taken 21 times.
22 if (unlikely(enc->cconv)) {
528 1 cconv_process(enc->cconv, buf, size);
529 1 cconv_flush(enc->cconv);
530 1 buf = cconv_consume_all(enc->cconv, &size);
531 }
532 22 return xwrite_all(enc->fd, buf, size);
533 }
534
535 22 size_t file_encoder_get_nr_errors(const FileEncoder *enc)
536 {
537
2/2
✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 4 taken 21 times.
22 return enc->cconv ? enc->cconv->errors : 0;
538 }
539
540 static bool fill(FileDecoder *dec)
541 {
542 StringView text = dec->text;
543 if (dec->ipos == text.length) {
544 return false;
545 }
546
547 // Smaller than cconv.obuf to make realloc less likely
548 size_t max = 7 * 1024;
549
550 size_t icount = MIN(text.length - dec->ipos, max);
551 cconv_process(dec->cconv, text.data + dec->ipos, icount);
552 dec->ipos += icount;
553 if (dec->ipos == text.length) {
554 // Must be flushed after all input has been fed
555 cconv_flush(dec->cconv);
556 }
557 return true;
558 }
559
560 static bool decode_and_read_line(FileDecoder *dec, StringView *linep)
561 {
562 char *line;
563 size_t len;
564 while (1) {
565 line = cconv_consume_line(dec->cconv, &len);
566 if (line || !fill(dec)) {
567 break;
568 }
569 }
570
571 if (line) {
572 // Newline not wanted
573 len--;
574 } else {
575 line = cconv_consume_all(dec->cconv, &len);
576 if (len == 0) {
577 return false;
578 }
579 }
580
581 *linep = string_view(line, len);
582 return true;
583 }
584
585 34 bool file_decoder_read(Buffer *buffer, StringView text, size_t *longest_line)
586 {
587
1/2
✓ Branch 3 → 4 taken 34 times.
✗ Branch 3 → 5 not taken.
34 if (encoding_is_utf8(buffer->encoding)) {
588 34 return file_decoder_read_utf8(buffer, text, longest_line);
589 }
590
591 CharsetConverter *cconv = cconv_to_utf8(buffer->encoding);
592 if (!cconv) {
593 return false;
594 }
595
596 FileDecoder dec = {.text = text, .cconv = cconv};
597 StringView line;
598 if (!decode_and_read_line(&dec, &line)) {
599 *longest_line = 0;
600 cconv_free(cconv);
601 return true;
602 }
603
604 if (strview_remove_matching_suffix(&line, "\r")) {
605 buffer->crlf_newlines = true;
606 }
607
608 Block *blk = add_utf8_line(buffer, NULL, line);
609 size_t maxline = line.length;
610 while (decode_and_read_line(&dec, &line)) {
611 if (buffer->crlf_newlines) {
612 strview_remove_matching_suffix(&line, "\r");
613 }
614 blk = add_utf8_line(buffer, blk, line);
615 maxline = MAX(maxline, line.length);
616 }
617
618 if (blk) {
619 add_block(buffer, blk);
620 }
621
622 *longest_line = maxline;
623 cconv_free(cconv);
624 return true;
625 }
626
627 #endif
628