dte test coverage


Directory: ./
Coverage: low: ≥ 0% medium: ≥ 50.0% high: ≥ 85.0%
Coverage Exec / Excl / Total
Lines: 52.7% 154 / 2 / 294
Functions: 71.4% 20 / 0 / 28
Branches: 38.7% 48 / 18 / 142

src/convert.c
Line Branch Exec Source
1 #include <errno.h>
2 #include <inttypes.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "convert.h"
6 #include "block.h"
7 #include "buildvar-iconv.h"
8 #include "encoding.h"
9 #include "util/arith.h"
10 #include "util/debug.h"
11 #include "util/list.h"
12 #include "util/log.h"
13 #include "util/numtostr.h"
14 #include "util/str-util.h"
15 #include "util/utf8.h"
16 #include "util/xmalloc.h"
17 #include "util/xreadwrite.h"
18
19 typedef struct {
20 StringView text;
21 size_t ipos;
22 struct CharsetConverter *cconv;
23 } FileDecoder;
24
25 56 static void add_block(Buffer *buffer, Block *blk)
26 {
27 56 buffer->nl += blk->nl;
28 56 list_insert_before(&blk->node, &buffer->blocks);
29 56 }
30
31 7713 static Block *add_utf8_line (
32 Buffer *buffer,
33 const GlobalOptions *gopts,
34 ErrorBuffer *errbuf,
35 Block *blk,
36 StringView line
37 ) {
38 7713 size_t slimit = gopts->syntax_line_limit;
39 7713 size_t size = line.length + 1;
40
41
2/2
✓ Branch 2 → 3 taken 7685 times.
✓ Branch 2 → 6 taken 28 times.
7713 if (blk) {
42 7685 size_t avail = blk->alloc - blk->size;
43
2/2
✓ Branch 3 → 4 taken 7657 times.
✓ Branch 3 → 5 taken 28 times.
7685 if (size <= avail) {
44 7657 goto copy;
45 }
46 28 add_block(buffer, blk);
47 }
48
49 56 size = MAX(size, 8192);
50 56 blk = block_new(size);
51
52 7713 copy:
53
2/6
✓ Branch 7 → 8 taken 7713 times.
✗ Branch 7 → 13 not taken.
✗ Branch 8 → 9 not taken.
✓ Branch 8 → 13 taken 7713 times.
✗ Branch 9 → 10 not taken.
✗ Branch 9 → 13 not taken.
7713 if (unlikely(slimit && line.length > slimit && buffer->options.syntax)) {
54 char limit_str[PRECISE_FILESIZE_STR_MAX];
55 filesize_to_str_precise(slimit, limit_str);
56 error_msg (
57 errbuf,
58 "line length (%zu) exceeds 'syntax-line-limit' option (%s); "
59 "disabling syntax highlighting",
60 line.length, limit_str
61 );
62 buffer->options.syntax = false;
63 }
64
65 7713 memcpy(blk->data + blk->size, line.data, line.length);
66 7713 blk->size += line.length;
67 7713 blk->data[blk->size++] = '\n';
68 7713 blk->nl++;
69 7713 return blk;
70 }
71
72 7747 static bool read_utf8_line(FileDecoder *dec, StringView *linep)
73 {
74 7747 size_t len = dec->text.length;
75
2/2
✓ Branch 2 → 3 taken 34 times.
✓ Branch 2 → 5 taken 7713 times.
7747 if (dec->ipos >= len) {
76 34 BUG_ON(dec->ipos > len);
77 return false;
78 }
79
80 7713 *linep = get_delim(dec->text.data, &dec->ipos, len, '\n');
81 7713 return true;
82 }
83
84 34 static bool file_decoder_read_utf8 (
85 Buffer *buffer,
86 const GlobalOptions *gopts,
87 ErrorBuffer *errbuf,
88 StringView text
89 ) {
90
1/2
✗ Branch 3 → 4 not taken.
✓ Branch 3 → 5 taken 34 times.
34 if (unlikely(!encoding_is_utf8(buffer->encoding))) {
91 errno = EINVAL;
92 return false;
93 }
94
95 34 FileDecoder dec = {.text = text};
96 34 StringView line;
97
2/2
✓ Branch 6 → 7 taken 28 times.
✓ Branch 6 → 21 taken 6 times.
34 if (!read_utf8_line(&dec, &line)) {
98 return true;
99 }
100
101
2/2
✓ Branch 8 → 9 taken 1 time.
✓ Branch 8 → 10 taken 27 times.
28 if (strview_remove_matching_suffix(&line, "\r")) {
102 1 buffer->crlf_newlines = true;
103 }
104
105 28 Block *blk = add_utf8_line(buffer, gopts, errbuf, NULL, line);
106
107
2/2
✓ Branch 11 → 14 taken 1 time.
✓ Branch 11 → 17 taken 27 times.
28 if (unlikely(buffer->crlf_newlines)) {
108
2/2
✓ Branch 15 → 12 taken 270 times.
✓ Branch 15 → 19 taken 1 time.
271 while (read_utf8_line(&dec, &line)) {
109 270 strview_remove_matching_suffix(&line, "\r");
110 270 blk = add_utf8_line(buffer, gopts, errbuf, blk, line);
111 }
112 } else {
113
2/2
✓ Branch 18 → 16 taken 7415 times.
✓ Branch 18 → 19 taken 27 times.
7442 while (read_utf8_line(&dec, &line)) {
114 7415 blk = add_utf8_line(buffer, gopts, errbuf, blk, line);
115 }
116 }
117
118
1/2
✓ Branch 19 → 20 taken 28 times.
✗ Branch 19 → 21 not taken.
28 if (blk) {
119 28 add_block(buffer, blk);
120 }
121
122 return true;
123 }
124
125 1 static size_t unix_to_dos(FileEncoder *enc, StringView text, size_t nr_newlines)
126 {
127 1 BUG_ON(text.length && !strview_has_suffix(text, "\n")); // See sanity_check_blocks()
128 1 BUG_ON(nr_newlines > text.length);
129
130 1 const size_t new_len = text.length + nr_newlines;
131
1/2
✓ Branch 8 → 9 taken 1 time.
✗ Branch 8 → 18 not taken.
1 if (enc->nsize < new_len) {
132 1 enc->nsize = xmul(text.length, 2);
133 1 enc->nbuf = xrealloc(enc->nbuf, enc->nsize);
134 }
135
136 size_t seen_nl = 0;
137 size_t dest_pos = 0;
138
139
2/2
✓ Branch 19 → 12 taken 3 times.
✓ Branch 19 → 20 taken 1 time.
4 for (size_t src_pos = 0; src_pos < text.length; ) {
140 3 const char *src = text.data + src_pos;
141 3 char *dest = enc->nbuf + dest_pos;
142 3 char *end = memccpy(dest, src, '\n', text.length - src_pos);
143 3 BUG_ON(!end); // Loop condition prevents this
144
145 3 size_t line_len = (size_t)(end - dest);
146 3 src_pos += line_len;
147 3 BUG_ON(src_pos > text.length);
148
149 3 end[-1] = '\r';
150 3 end[0] = '\n';
151 3 dest_pos += line_len + 1;
152 3 seen_nl++;
153 }
154
155 1 BUG_ON(seen_nl != nr_newlines);
156 1 BUG_ON(dest_pos != new_len);
157 1 return dest_pos;
158 }
159
160 #if ICONV_DISABLE == 1 // iconv not available; use basic, UTF-8 implementation:
161
162 bool conversion_supported_by_iconv (
163 const char* UNUSED_ARG(from),
164 const char* UNUSED_ARG(to)
165 ) {
166 errno = EINVAL;
167 return false;
168 }
169
170 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
171 {
172 if (unlikely(!encoding_is_utf8(encoding))) {
173 BUG("unsupported conversion; should have been handled earlier");
174 }
175
176 return (FileEncoder) {
177 .crlf = crlf,
178 .fd = fd,
179 };
180 }
181
182 void file_encoder_free(FileEncoder *enc)
183 {
184 free(enc->nbuf);
185 }
186
187 ssize_t file_encoder_write (
188 FileEncoder *enc,
189 const char *buf,
190 size_t size,
191 size_t nr_newlines
192 ) {
193 if (unlikely(enc->crlf)) {
194 size = unix_to_dos(enc, string_view(buf, size), nr_newlines);
195 buf = enc->nbuf;
196 }
197 return xwrite_all(enc->fd, buf, size);
198 }
199
200 size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc))
201 {
202 return 0;
203 }
204
205 bool file_decoder_read (
206 Buffer *buffer,
207 const GlobalOptions *gopts,
208 ErrorBuffer *errbuf,
209 StringView text
210 ) {
211 return file_decoder_read_utf8(buffer, gopts, errbuf, text);
212 }
213
214 #else // ICONV_DISABLE != 1; use full iconv implementation:
215
216 #include <iconv.h>
217
218 // UTF-8 encoding of U+00BF (inverted question mark; "¿")
219 #define REPLACEMENT "\xc2\xbf"
220
// State for an incremental iconv(3) conversion into a growable output
// buffer
typedef struct CharsetConverter {
    // Conversion descriptor, as returned by iconv_open()
    iconv_t cd;

    // Output buffer, its allocated size and the number of bytes written
    char *obuf;
    size_t osize;
    size_t opos;

    // Number of bytes at the start of obuf already handed to the caller
    size_t consumed;

    // Running total of invalid input sequences plus the conversion
    // counts returned by successful iconv() calls
    size_t errors;

    // Temporary input buffer; holds an incomplete character left over
    // from the end of the previous input chunk
    char tbuf[16];
    size_t tcount;

    // REPLACEMENT character, in target encoding
    char rbuf[4];
    size_t rcount;

    // Input character size in bytes, or zero for UTF-8
    size_t char_size;
} CharsetConverter;
240
241 1 static CharsetConverter *create(iconv_t cd)
242 {
243 1 CharsetConverter *c = xcalloc1(sizeof(*c));
244 1 c->cd = cd;
245 1 c->osize = 8192;
246 1 c->obuf = xmalloc(c->osize);
247 1 return c;
248 }
249
// Thin wrapper around iconv(3) that takes the input pointer as
// `const char**` and papers over the platform difference in the type of
// the second parameter. Arguments and return value are exactly those of
// iconv(3).
static size_t iconv_wrapper (
    iconv_t cd,
    const char **restrict inbuf,
    size_t *restrict inbytesleft,
    char **restrict outbuf,
    size_t *restrict outbytesleft
) {
#ifdef __NetBSD__
    // NetBSD declares the second parameter as "const char **restrict",
    // so no cast is needed
    return iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft);
#else
    // POSIX defines the second parameter as "char **restrict"
    return iconv(cd, (char **restrict)inbuf, inbytesleft, outbuf, outbytesleft);
#endif
}
267
268 static void resize_obuf(CharsetConverter *c)
269 {
270 c->osize = xmul(2, c->osize);
271 c->obuf = xrealloc(c->obuf, c->osize);
272 }
273
274 static void add_replacement(CharsetConverter *c)
275 {
276 if (c->osize - c->opos < 4) {
277 resize_obuf(c);
278 }
279
280 memcpy(c->obuf + c->opos, c->rbuf, c->rcount);
281 c->opos += c->rcount;
282 }
283
284 static size_t handle_invalid(CharsetConverter *c, const char *buf, size_t count)
285 {
286 LOG_DEBUG("%zu %zu", c->char_size, count);
287 add_replacement(c);
288 if (c->char_size == 0) {
289 // Converting from UTF-8
290 size_t idx = 0;
291 CodePoint u = u_get_char(buf, count, &idx);
292 LOG_DEBUG("U+%04" PRIX32, u);
293 return idx;
294 }
295 if (c->char_size > count) {
296 // wtf
297 return 1;
298 }
299 return c->char_size;
300 }
301
302 1 static int xiconv(CharsetConverter *c, const char **ib, size_t *ic)
303 {
304 1 while (1) {
305 1 char *ob = c->obuf + c->opos;
306 1 size_t oc = c->osize - c->opos;
307 1 size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc);
308 1 c->opos = ob - c->obuf;
309
1/2
✗ Branch 4 → 5 not taken.
✓ Branch 4 → 12 taken 1 time.
1 if (rc == (size_t)-1) {
310 switch (errno) {
311 case EILSEQ:
312 c->errors++;
313 // Reset
314 iconv(c->cd, NULL, NULL, NULL, NULL);
315 return errno;
316 case EINVAL:
317 return errno;
318 case E2BIG:
319 resize_obuf(c);
320 continue;
321 default:
322 BUG("iconv: %s", strerror(errno));
323 }
324 } else {
325 1 c->errors += rc;
326 }
327 1 return 0;
328 }
329 }
330
331 static size_t convert_incomplete(CharsetConverter *c, const char *input, size_t len)
332 {
333 size_t ipos = 0;
334 while (c->tcount < sizeof(c->tbuf) && ipos < len) {
335 c->tbuf[c->tcount++] = input[ipos++];
336 const char *ib = c->tbuf;
337 size_t ic = c->tcount;
338 int rc = xiconv(c, &ib, &ic);
339 if (ic > 0) {
340 memmove(c->tbuf, ib, ic);
341 }
342 c->tcount = ic;
343 if (rc == EINVAL) {
344 // Incomplete character at end of input buffer; try again
345 // with more input data
346 continue;
347 }
348 if (rc == EILSEQ) {
349 // Invalid multibyte sequence
350 size_t skip = handle_invalid(c, c->tbuf, c->tcount);
351 c->tcount -= skip;
352 if (c->tcount > 0) {
353 LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip);
354 memmove(c->tbuf, c->tbuf + skip, c->tcount);
355 continue;
356 }
357 return ipos;
358 }
359 break;
360 }
361
362 LOG_DEBUG("%zu %zu", ipos, c->tcount);
363 return ipos;
364 }
365
366 1 static void cconv_process(CharsetConverter *c, const char *input, size_t len)
367 {
368
1/2
✗ Branch 2 → 3 not taken.
✓ Branch 2 → 4 taken 1 time.
1 if (c->consumed > 0) {
369 size_t fill = c->opos - c->consumed;
370 memmove(c->obuf, c->obuf + c->consumed, fill);
371 c->opos = fill;
372 c->consumed = 0;
373 }
374
375
1/2
✗ Branch 4 → 5 not taken.
✓ Branch 4 → 7 taken 1 time.
1 if (c->tcount > 0) {
376 size_t ipos = convert_incomplete(c, input, len);
377 input += ipos;
378 len -= ipos;
379 }
380
381 1 const char *ib = input;
382
2/2
✓ Branch 17 → 8 taken 1 time.
✓ Branch 17 → 18 taken 1 time.
2 for (size_t ic = len; ic > 0; ) {
383 1 int r = xiconv(c, &ib, &ic);
384
1/2
✗ Branch 9 → 10 not taken.
✓ Branch 9 → 13 taken 1 time.
1 if (r == EINVAL) {
385 // Incomplete character at end of input buffer
386 if (ic < sizeof(c->tbuf)) {
387 memcpy(c->tbuf, ib, ic);
388 c->tcount = ic;
389 } else {
390 // FIXME
391 }
392 ic = 0;
393 continue;
394 }
395
1/2
✗ Branch 13 → 14 not taken.
✓ Branch 13 → 16 taken 1 time.
1 if (r == EILSEQ) {
396 // Invalid multibyte sequence
397 size_t skip = handle_invalid(c, ib, ic);
398 ic -= skip;
399 ib += skip;
400 continue;
401 }
402 }
403 1 }
404
405 static CharsetConverter *cconv_to_utf8(const char *encoding)
406 {
407 iconv_t cd = iconv_open("UTF-8", encoding);
408 if (cd == (iconv_t)-1) {
409 return NULL;
410 }
411
412 CharsetConverter *c = create(cd);
413 c->rcount = copyliteral(c->rbuf, REPLACEMENT);
414
415 if (str_has_prefix(encoding, "UTF-16")) {
416 c->char_size = 2;
417 } else if (str_has_prefix(encoding, "UTF-32")) {
418 c->char_size = 4;
419 } else {
420 c->char_size = 1;
421 }
422
423 return c;
424 }
425
426 1 static void encode_replacement(CharsetConverter *c)
427 {
428 1 static const char rep[] = REPLACEMENT;
429 1 const char *ib = rep;
430 1 char *ob = c->rbuf;
431 1 size_t ic = STRLEN(REPLACEMENT);
432 1 size_t oc = sizeof(c->rbuf);
433 1 size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc);
434
435
1/2
✓ Branch 3 → 4 taken 1 time.
✗ Branch 3 → 5 not taken.
1 if (rc == (size_t)-1) {
436 1 c->rbuf[0] = '\xbf';
437 1 c->rcount = 1;
438 } else {
439 c->rcount = ob - c->rbuf;
440 }
441 1 }
442
443 1 static CharsetConverter *cconv_from_utf8(const char *encoding)
444 {
445 1 iconv_t cd = iconv_open(encoding, "UTF-8");
446
1/2
✓ Branch 3 → 4 taken 1 time.
✗ Branch 3 → 7 not taken.
1 if (cd == (iconv_t)-1) {
447 return NULL;
448 }
449 1 CharsetConverter *c = create(cd);
450 1 encode_replacement(c);
451 1 return c;
452 }
453
454 1 static void cconv_flush(CharsetConverter *c)
455 {
456
1/2
✗ Branch 2 → 3 not taken.
✓ Branch 2 → 6 taken 1 time.
1 if (c->tcount > 0) {
457 // Replace incomplete character at end of input buffer
458 LOG_DEBUG("incomplete character at EOF");
459 add_replacement(c);
460 c->tcount = 0;
461 }
462 1 }
463
464 static char *cconv_consume_line(CharsetConverter *c, size_t *len)
465 {
466 char *line = c->obuf + c->consumed;
467 char *nl = memchr(line, '\n', c->opos - c->consumed);
468 if (!nl) {
469 *len = 0;
470 return NULL;
471 }
472
473 size_t n = nl - line + 1;
474 c->consumed += n;
475 *len = n;
476 return line;
477 }
478
479 1 static char *cconv_consume_all(CharsetConverter *c, size_t *len)
480 {
481 1 char *buf = c->obuf + c->consumed;
482 1 *len = c->opos - c->consumed;
483 1 c->consumed = c->opos;
484 1 return buf;
485 }
486
487 1 static void cconv_free(CharsetConverter *c)
488 {
489 1 BUG_ON(!c);
490 1 iconv_close(c->cd);
491 1 free(c->obuf);
492 1 free(c);
493 1 }
494
// Check whether iconv can convert from encoding `from` to encoding `to`,
// by opening (and immediately closing) a conversion descriptor.
//
// Returns false with errno set to EINVAL if either name is an empty
// string, or false with errno as left by iconv_open() if the conversion
// isn't available.
bool conversion_supported_by_iconv(const char *from, const char *to)
{
    if (unlikely(from[0] == '\0' || to[0] == '\0')) {
        errno = EINVAL;
        return false;
    }

    iconv_t cd = iconv_open(to, from);
    const bool supported = (cd != (iconv_t)-1);
    if (supported) {
        iconv_close(cd);
    }
    return supported;
}
510
511 22 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
512 {
513 22 CharsetConverter *cconv = NULL;
514
2/2
✓ Branch 3 → 4 taken 1 time.
✓ Branch 3 → 7 taken 21 times.
22 if (unlikely(!encoding_is_utf8(encoding))) {
515 1 cconv = cconv_from_utf8(encoding);
516
1/2
✗ Branch 5 → 6 not taken.
✓ Branch 5 → 7 taken 1 time.
1 if (!cconv) {
517 BUG("unsupported conversion; should have been handled earlier");
518 }
519 }
520
521 22 return (FileEncoder) {
522 .cconv = cconv,
523 .crlf = crlf,
524 .fd = fd,
525 };
526 }
527
528 22 void file_encoder_free(FileEncoder *enc)
529 {
530
2/2
✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 4 taken 21 times.
22 if (enc->cconv) {
531 1 cconv_free(enc->cconv);
532 }
533 22 free(enc->nbuf);
534 22 }
535
536 // NOTE: buf must contain whole characters!
537 22 ssize_t file_encoder_write (
538 FileEncoder *enc,
539 const char *buf,
540 size_t size,
541 size_t nr_newlines
542 ) {
543
2/2
✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 5 taken 21 times.
22 if (unlikely(enc->crlf)) {
544 1 size = unix_to_dos(enc, string_view(buf, size), nr_newlines);
545 1 buf = enc->nbuf;
546 }
547
2/2
✓ Branch 5 → 6 taken 1 time.
✓ Branch 5 → 9 taken 21 times.
22 if (unlikely(enc->cconv)) {
548 1 cconv_process(enc->cconv, buf, size);
549 1 cconv_flush(enc->cconv);
550 1 buf = cconv_consume_all(enc->cconv, &size);
551 }
552 22 return xwrite_all(enc->fd, buf, size);
553 }
554
555 22 size_t file_encoder_get_nr_errors(const FileEncoder *enc)
556 {
557
2/2
✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 4 taken 21 times.
22 return enc->cconv ? enc->cconv->errors : 0;
558 }
559
560 static bool fill(FileDecoder *dec)
561 {
562 StringView text = dec->text;
563 if (dec->ipos == text.length) {
564 return false;
565 }
566
567 // Smaller than cconv.obuf to make realloc less likely
568 size_t max = 7 * 1024;
569
570 size_t icount = MIN(text.length - dec->ipos, max);
571 cconv_process(dec->cconv, text.data + dec->ipos, icount);
572 dec->ipos += icount;
573 if (dec->ipos == text.length) {
574 // Must be flushed after all input has been fed
575 cconv_flush(dec->cconv);
576 }
577 return true;
578 }
579
580 static bool decode_and_read_line(FileDecoder *dec, StringView *linep)
581 {
582 char *line;
583 size_t len;
584 while (1) {
585 line = cconv_consume_line(dec->cconv, &len);
586 if (line || !fill(dec)) {
587 break;
588 }
589 }
590
591 if (line) {
592 // Newline not wanted
593 len--;
594 } else {
595 line = cconv_consume_all(dec->cconv, &len);
596 if (len == 0) {
597 return false;
598 }
599 }
600
601 *linep = string_view(line, len);
602 return true;
603 }
604
605 34 bool file_decoder_read (
606 Buffer *buffer,
607 const GlobalOptions *gopts,
608 ErrorBuffer *errbuf,
609 StringView text
610 ) {
611
1/2
✓ Branch 3 → 4 taken 34 times.
✗ Branch 3 → 5 not taken.
34 if (encoding_is_utf8(buffer->encoding)) {
612 34 return file_decoder_read_utf8(buffer, gopts, errbuf, text);
613 }
614
615 CharsetConverter *cconv = cconv_to_utf8(buffer->encoding);
616 if (!cconv) {
617 return false;
618 }
619
620 FileDecoder dec = {
621 .text = text,
622 .cconv = cconv,
623 };
624
625 StringView line;
626 if (decode_and_read_line(&dec, &line)) {
627 if (strview_remove_matching_suffix(&line, "\r")) {
628 buffer->crlf_newlines = true;
629 }
630
631 Block *blk = add_utf8_line(buffer, gopts, errbuf, NULL, line);
632 while (decode_and_read_line(&dec, &line)) {
633 if (buffer->crlf_newlines) {
634 strview_remove_matching_suffix(&line, "\r");
635 }
636 blk = add_utf8_line(buffer, gopts, errbuf, blk, line);
637 }
638
639 if (blk) {
640 add_block(buffer, blk);
641 }
642 }
643
644 cconv_free(cconv);
645 return true;
646 }
647
648 #endif
649