dte test coverage


Directory: ./
Coverage: low: ≥ 0% medium: ≥ 50.0% high: ≥ 85.0%
Coverage Exec / Excl / Total
Lines: 54.2% 163 / 2 / 303
Functions: 71.4% 20 / 0 / 28
Branches: 39.7% 54 / 16 / 152

src/convert.c
Line Branch Exec Source
1 #include <errno.h>
2 #include <inttypes.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "convert.h"
6 #include "block.h"
7 #include "buildvar-iconv.h"
8 #include "encoding.h"
9 #include "util/arith.h"
10 #include "util/debug.h"
11 #include "util/list.h"
12 #include "util/log.h"
13 #include "util/str-util.h"
14 #include "util/utf8.h"
15 #include "util/xmalloc.h"
16 #include "util/xreadwrite.h"
17
18 enum {
19 // If any line exceeds this length when reading a file, syntax
20 // highlighting will be automatically disabled
21 SYN_HIGHLIGHT_MAX_LINE_LEN = 512u << 10, // 512KiB
22 };
23
24 typedef struct {
25 const char *ibuf;
26 ssize_t ipos;
27 ssize_t isize;
28 struct CharsetConverter *cconv;
29 } FileDecoder;
30
31 56 static void add_block(Buffer *buffer, Block *blk)
32 {
33 56 buffer->nl += blk->nl;
34 56 list_insert_before(&blk->node, &buffer->blocks);
35 56 }
36
37 7671 static Block *add_utf8_line (
38 Buffer *buffer,
39 ErrorBuffer *errbuf,
40 Block *blk,
41 const char *line,
42 size_t len
43 ) {
44 7671 size_t size = len + 1;
45
2/2
✓ Branch 2 → 3 taken 7643 times.
✓ Branch 2 → 6 taken 28 times.
7671 if (blk) {
46 7643 size_t avail = blk->alloc - blk->size;
47
2/2
✓ Branch 3 → 4 taken 7615 times.
✓ Branch 3 → 5 taken 28 times.
7643 if (size <= avail) {
48 7615 goto copy;
49 }
50 28 add_block(buffer, blk);
51 }
52 56 size = MAX(size, 8192);
53 56 blk = block_new(size);
54
55 7671 copy:
56
1/4
✗ Branch 7 → 8 not taken.
✓ Branch 7 → 11 taken 7671 times.
✗ Branch 8 → 9 not taken.
✗ Branch 8 → 11 not taken.
7671 if (unlikely(len > SYN_HIGHLIGHT_MAX_LINE_LEN && buffer->options.syntax)) {
57 // TODO: Make this limit configurable and add documentation
58 error_msg (
59 errbuf,
60 "line length (%zu) exceeded limit (%ju); disabling syntax highlighting",
61 len, (uintmax_t)SYN_HIGHLIGHT_MAX_LINE_LEN
62 );
63 buffer->options.syntax = false;
64 }
65
66 7671 memcpy(blk->data + blk->size, line, len);
67 7671 blk->size += len;
68 7671 blk->data[blk->size++] = '\n';
69 7671 blk->nl++;
70 7671 return blk;
71 }
72
73 7705 static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp)
74 {
75 7705 const char *line = dec->ibuf + dec->ipos;
76 7705 const char *nl = memchr(line, '\n', dec->isize - dec->ipos);
77 7705 size_t len;
78
79
2/2
✓ Branch 2 → 3 taken 7669 times.
✓ Branch 2 → 4 taken 36 times.
7705 if (nl) {
80 7669 len = nl - line;
81 7669 dec->ipos += len + 1;
82 } else {
83 36 len = dec->isize - dec->ipos;
84
2/2
✓ Branch 4 → 5 taken 2 times.
✓ Branch 4 → 7 taken 34 times.
36 if (len == 0) {
85 return false;
86 }
87 2 dec->ipos += len;
88 }
89
90 7671 *linep = line;
91 7671 *lenp = len;
92 7671 return true;
93 }
94
95 34 static bool file_decoder_read_utf8(Buffer *buffer, ErrorBuffer *errbuf, const char *text, size_t text_len)
96 {
97
1/2
✗ Branch 3 → 4 not taken.
✓ Branch 3 → 5 taken 34 times.
34 if (unlikely(!encoding_is_utf8(buffer->encoding))) {
98 errno = EINVAL;
99 return false;
100 }
101
102 34 FileDecoder dec = {
103 .ibuf = text,
104 .isize = text_len,
105 };
106
107 34 const char *line;
108 34 size_t len;
109
110
2/2
✓ Branch 6 → 7 taken 28 times.
✓ Branch 6 → 23 taken 6 times.
34 if (!read_utf8_line(&dec, &line, &len)) {
111 return true;
112 }
113
114
3/4
✓ Branch 7 → 8 taken 28 times.
✗ Branch 7 → 10 not taken.
✓ Branch 8 → 9 taken 1 time.
✓ Branch 8 → 10 taken 27 times.
28 if (len && line[len - 1] == '\r') {
115 1 buffer->crlf_newlines = true;
116 1 len--;
117 }
118
119 28 Block *blk = add_utf8_line(buffer, errbuf, NULL, line, len);
120
121
2/2
✓ Branch 11 → 16 taken 1 time.
✓ Branch 11 → 19 taken 27 times.
28 if (unlikely(buffer->crlf_newlines)) {
122
2/2
✓ Branch 17 → 12 taken 270 times.
✓ Branch 17 → 21 taken 1 time.
271 while (read_utf8_line(&dec, &line, &len)) {
123
4/4
✓ Branch 12 → 13 taken 268 times.
✓ Branch 12 → 15 taken 2 times.
✓ Branch 13 → 14 taken 1 time.
✓ Branch 13 → 15 taken 267 times.
270 if (len && line[len - 1] == '\r') {
124 1 len--;
125 }
126 270 blk = add_utf8_line(buffer, errbuf, blk, line, len);
127 }
128 } else {
129
2/2
✓ Branch 20 → 18 taken 7373 times.
✓ Branch 20 → 21 taken 27 times.
7400 while (read_utf8_line(&dec, &line, &len)) {
130 7373 blk = add_utf8_line(buffer, errbuf, blk, line, len);
131 }
132 }
133
134
1/2
✓ Branch 21 → 22 taken 28 times.
✗ Branch 21 → 23 not taken.
28 if (blk) {
135 28 add_block(buffer, blk);
136 }
137
138 return true;
139 }
140
141 1 static size_t unix_to_dos (
142 FileEncoder *enc,
143 const char *text,
144 size_t text_len,
145 size_t nr_newlines
146 ) {
147 1 BUG_ON(text_len && text[text_len - 1] != '\n'); // See sanity_check_blocks()
148 1 BUG_ON(nr_newlines > text_len);
149
150 1 const size_t new_len = text_len + nr_newlines;
151
1/2
✓ Branch 7 → 8 taken 1 time.
✗ Branch 7 → 17 not taken.
1 if (enc->nsize < new_len) {
152 1 enc->nsize = xmul(text_len, 2);
153 1 enc->nbuf = xrealloc(enc->nbuf, enc->nsize);
154 }
155
156 size_t seen_nl = 0;
157 size_t dest_pos = 0;
158
159
2/2
✓ Branch 18 → 11 taken 3 times.
✓ Branch 18 → 19 taken 1 time.
4 for (size_t src_pos = 0; src_pos < text_len; ) {
160 3 const char *src = text + src_pos;
161 3 char *dest = enc->nbuf + dest_pos;
162 3 char *end = memccpy(dest, src, '\n', text_len - src_pos);
163 3 BUG_ON(!end); // Loop condition prevents this
164
165 3 size_t line_len = (size_t)(end - dest);
166 3 src_pos += line_len;
167 3 BUG_ON(src_pos > text_len);
168
169 3 end[-1] = '\r';
170 3 end[0] = '\n';
171 3 dest_pos += line_len + 1;
172 3 seen_nl++;
173 }
174
175 1 BUG_ON(seen_nl != nr_newlines);
176 1 BUG_ON(dest_pos != new_len);
177 1 return dest_pos;
178 }
179
180 #if ICONV_DISABLE == 1 // iconv not available; use basic, UTF-8 implementation:
181
182 bool conversion_supported_by_iconv (
183 const char* UNUSED_ARG(from),
184 const char* UNUSED_ARG(to)
185 ) {
186 errno = EINVAL;
187 return false;
188 }
189
190 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
191 {
192 if (unlikely(!encoding_is_utf8(encoding))) {
193 BUG("unsupported conversion; should have been handled earlier");
194 }
195
196 return (FileEncoder) {
197 .crlf = crlf,
198 .fd = fd,
199 };
200 }
201
202 void file_encoder_free(FileEncoder *enc)
203 {
204 free(enc->nbuf);
205 }
206
207 ssize_t file_encoder_write (
208 FileEncoder *enc,
209 const char *buf,
210 size_t size,
211 size_t nr_newlines
212 ) {
213 if (unlikely(enc->crlf)) {
214 size = unix_to_dos(enc, buf, size, nr_newlines);
215 buf = enc->nbuf;
216 }
217 return xwrite_all(enc->fd, buf, size);
218 }
219
220 size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc))
221 {
222 return 0;
223 }
224
225 bool file_decoder_read(Buffer *buffer, ErrorBuffer *errbuf, const char *text, size_t text_len)
226 {
227 return file_decoder_read_utf8(buffer, errbuf, text, text_len);
228 }
229
230 #else // ICONV_DISABLE != 1; use full iconv implementation:
231
232 #include <iconv.h>
233
234 // UTF-8 encoding of U+00BF (inverted question mark; "¿")
235 #define REPLACEMENT "\xc2\xbf"
236
237 typedef struct CharsetConverter {
238 iconv_t cd;
239 char *obuf;
240 size_t osize;
241 size_t opos;
242 size_t consumed;
243 size_t errors;
244
245 // Temporary input buffer
246 char tbuf[16];
247 size_t tcount;
248
249 // REPLACEMENT character, in target encoding
250 char rbuf[4];
251 size_t rcount;
252
253 // Input character size in bytes, or zero for UTF-8
254 size_t char_size;
255 } CharsetConverter;
256
257 1 static CharsetConverter *create(iconv_t cd)
258 {
259 1 CharsetConverter *c = xcalloc1(sizeof(*c));
260 1 c->cd = cd;
261 1 c->osize = 8192;
262 1 c->obuf = xmalloc(c->osize);
263 1 return c;
264 }
265
266 2 static size_t iconv_wrapper (
267 iconv_t cd,
268 const char **restrict inbuf,
269 size_t *restrict inbytesleft,
270 char **restrict outbuf,
271 size_t *restrict outbytesleft
272 ) {
273 // POSIX defines the second parameter of iconv(3) as "char **restrict"
274 // but NetBSD declares it as "const char **restrict"
275 #ifdef __NetBSD__
276 const char **restrict in = inbuf;
277 #else
278 2 char **restrict in = (char **restrict)inbuf;
279 #endif
280
281 2 return iconv(cd, in, inbytesleft, outbuf, outbytesleft);
282 }
283
284 static void resize_obuf(CharsetConverter *c)
285 {
286 c->osize = xmul(2, c->osize);
287 c->obuf = xrealloc(c->obuf, c->osize);
288 }
289
290 static void add_replacement(CharsetConverter *c)
291 {
292 if (c->osize - c->opos < 4) {
293 resize_obuf(c);
294 }
295
296 memcpy(c->obuf + c->opos, c->rbuf, c->rcount);
297 c->opos += c->rcount;
298 }
299
300 static size_t handle_invalid(CharsetConverter *c, const char *buf, size_t count)
301 {
302 LOG_DEBUG("%zu %zu", c->char_size, count);
303 add_replacement(c);
304 if (c->char_size == 0) {
305 // Converting from UTF-8
306 size_t idx = 0;
307 CodePoint u = u_get_char(buf, count, &idx);
308 LOG_DEBUG("U+%04" PRIX32, u);
309 return idx;
310 }
311 if (c->char_size > count) {
312 // wtf
313 return 1;
314 }
315 return c->char_size;
316 }
317
318 1 static int xiconv(CharsetConverter *c, const char **ib, size_t *ic)
319 {
320 1 while (1) {
321 1 char *ob = c->obuf + c->opos;
322 1 size_t oc = c->osize - c->opos;
323 1 size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc);
324 1 c->opos = ob - c->obuf;
325
1/2
✗ Branch 4 → 5 not taken.
✓ Branch 4 → 12 taken 1 time.
1 if (rc == (size_t)-1) {
326 switch (errno) {
327 case EILSEQ:
328 c->errors++;
329 // Reset
330 iconv(c->cd, NULL, NULL, NULL, NULL);
331 return errno;
332 case EINVAL:
333 return errno;
334 case E2BIG:
335 resize_obuf(c);
336 continue;
337 default:
338 BUG("iconv: %s", strerror(errno));
339 }
340 } else {
341 1 c->errors += rc;
342 }
343 1 return 0;
344 }
345 }
346
347 static size_t convert_incomplete(CharsetConverter *c, const char *input, size_t len)
348 {
349 size_t ipos = 0;
350 while (c->tcount < sizeof(c->tbuf) && ipos < len) {
351 c->tbuf[c->tcount++] = input[ipos++];
352 const char *ib = c->tbuf;
353 size_t ic = c->tcount;
354 int rc = xiconv(c, &ib, &ic);
355 if (ic > 0) {
356 memmove(c->tbuf, ib, ic);
357 }
358 c->tcount = ic;
359 if (rc == EINVAL) {
360 // Incomplete character at end of input buffer; try again
361 // with more input data
362 continue;
363 }
364 if (rc == EILSEQ) {
365 // Invalid multibyte sequence
366 size_t skip = handle_invalid(c, c->tbuf, c->tcount);
367 c->tcount -= skip;
368 if (c->tcount > 0) {
369 LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip);
370 memmove(c->tbuf, c->tbuf + skip, c->tcount);
371 continue;
372 }
373 return ipos;
374 }
375 break;
376 }
377
378 LOG_DEBUG("%zu %zu", ipos, c->tcount);
379 return ipos;
380 }
381
382 1 static void cconv_process(CharsetConverter *c, const char *input, size_t len)
383 {
384
1/2
✗ Branch 2 → 3 not taken.
✓ Branch 2 → 4 taken 1 time.
1 if (c->consumed > 0) {
385 size_t fill = c->opos - c->consumed;
386 memmove(c->obuf, c->obuf + c->consumed, fill);
387 c->opos = fill;
388 c->consumed = 0;
389 }
390
391
1/2
✗ Branch 4 → 5 not taken.
✓ Branch 4 → 7 taken 1 time.
1 if (c->tcount > 0) {
392 size_t ipos = convert_incomplete(c, input, len);
393 input += ipos;
394 len -= ipos;
395 }
396
397 1 const char *ib = input;
398
2/2
✓ Branch 17 → 8 taken 1 time.
✓ Branch 17 → 18 taken 1 time.
2 for (size_t ic = len; ic > 0; ) {
399 1 int r = xiconv(c, &ib, &ic);
400
1/2
✗ Branch 9 → 10 not taken.
✓ Branch 9 → 13 taken 1 time.
1 if (r == EINVAL) {
401 // Incomplete character at end of input buffer
402 if (ic < sizeof(c->tbuf)) {
403 memcpy(c->tbuf, ib, ic);
404 c->tcount = ic;
405 } else {
406 // FIXME
407 }
408 ic = 0;
409 continue;
410 }
411
1/2
✗ Branch 13 → 14 not taken.
✓ Branch 13 → 16 taken 1 time.
1 if (r == EILSEQ) {
412 // Invalid multibyte sequence
413 size_t skip = handle_invalid(c, ib, ic);
414 ic -= skip;
415 ib += skip;
416 continue;
417 }
418 }
419 1 }
420
421 static CharsetConverter *cconv_to_utf8(const char *encoding)
422 {
423 iconv_t cd = iconv_open("UTF-8", encoding);
424 if (cd == (iconv_t)-1) {
425 return NULL;
426 }
427
428 CharsetConverter *c = create(cd);
429 c->rcount = copyliteral(c->rbuf, REPLACEMENT);
430
431 if (str_has_prefix(encoding, "UTF-16")) {
432 c->char_size = 2;
433 } else if (str_has_prefix(encoding, "UTF-32")) {
434 c->char_size = 4;
435 } else {
436 c->char_size = 1;
437 }
438
439 return c;
440 }
441
442 1 static void encode_replacement(CharsetConverter *c)
443 {
444 1 static const char rep[] = REPLACEMENT;
445 1 const char *ib = rep;
446 1 char *ob = c->rbuf;
447 1 size_t ic = STRLEN(REPLACEMENT);
448 1 size_t oc = sizeof(c->rbuf);
449 1 size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc);
450
451
1/2
✓ Branch 3 → 4 taken 1 time.
✗ Branch 3 → 5 not taken.
1 if (rc == (size_t)-1) {
452 1 c->rbuf[0] = '\xbf';
453 1 c->rcount = 1;
454 } else {
455 c->rcount = ob - c->rbuf;
456 }
457 1 }
458
459 1 static CharsetConverter *cconv_from_utf8(const char *encoding)
460 {
461 1 iconv_t cd = iconv_open(encoding, "UTF-8");
462
1/2
✓ Branch 3 → 4 taken 1 time.
✗ Branch 3 → 7 not taken.
1 if (cd == (iconv_t)-1) {
463 return NULL;
464 }
465 1 CharsetConverter *c = create(cd);
466 1 encode_replacement(c);
467 1 return c;
468 }
469
470 1 static void cconv_flush(CharsetConverter *c)
471 {
472
1/2
✗ Branch 2 → 3 not taken.
✓ Branch 2 → 6 taken 1 time.
1 if (c->tcount > 0) {
473 // Replace incomplete character at end of input buffer
474 LOG_DEBUG("incomplete character at EOF");
475 add_replacement(c);
476 c->tcount = 0;
477 }
478 1 }
479
480 static char *cconv_consume_line(CharsetConverter *c, size_t *len)
481 {
482 char *line = c->obuf + c->consumed;
483 char *nl = memchr(line, '\n', c->opos - c->consumed);
484 if (!nl) {
485 *len = 0;
486 return NULL;
487 }
488
489 size_t n = nl - line + 1;
490 c->consumed += n;
491 *len = n;
492 return line;
493 }
494
495 1 static char *cconv_consume_all(CharsetConverter *c, size_t *len)
496 {
497 1 char *buf = c->obuf + c->consumed;
498 1 *len = c->opos - c->consumed;
499 1 c->consumed = c->opos;
500 1 return buf;
501 }
502
503 1 static void cconv_free(CharsetConverter *c)
504 {
505 1 BUG_ON(!c);
506 1 iconv_close(c->cd);
507 1 free(c->obuf);
508 1 free(c);
509 1 }
510
511 2 bool conversion_supported_by_iconv(const char *from, const char *to)
512 {
513
2/4
✓ Branch 2 → 3 taken 2 times.
✗ Branch 2 → 4 not taken.
✗ Branch 3 → 4 not taken.
✓ Branch 3 → 5 taken 2 times.
2 if (unlikely(from[0] == '\0' || to[0] == '\0')) {
514 errno = EINVAL;
515 return false;
516 }
517
518 2 iconv_t cd = iconv_open(to, from);
519
1/2
✓ Branch 6 → 7 taken 2 times.
✗ Branch 6 → 9 not taken.
2 if (cd == (iconv_t)-1) {
520 return false;
521 }
522
523 2 iconv_close(cd);
524 2 return true;
525 }
526
527 22 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
528 {
529 22 CharsetConverter *cconv = NULL;
530
2/2
✓ Branch 3 → 4 taken 1 time.
✓ Branch 3 → 7 taken 21 times.
22 if (unlikely(!encoding_is_utf8(encoding))) {
531 1 cconv = cconv_from_utf8(encoding);
532
1/2
✗ Branch 5 → 6 not taken.
✓ Branch 5 → 7 taken 1 time.
1 if (!cconv) {
533 BUG("unsupported conversion; should have been handled earlier");
534 }
535 }
536
537 22 return (FileEncoder) {
538 .cconv = cconv,
539 .crlf = crlf,
540 .fd = fd,
541 };
542 }
543
544 22 void file_encoder_free(FileEncoder *enc)
545 {
546
2/2
✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 4 taken 21 times.
22 if (enc->cconv) {
547 1 cconv_free(enc->cconv);
548 }
549 22 free(enc->nbuf);
550 22 }
551
552 // NOTE: buf must contain whole characters!
553 22 ssize_t file_encoder_write (
554 FileEncoder *enc,
555 const char *buf,
556 size_t size,
557 size_t nr_newlines
558 ) {
559
2/2
✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 5 taken 21 times.
22 if (unlikely(enc->crlf)) {
560 1 size = unix_to_dos(enc, buf, size, nr_newlines);
561 1 buf = enc->nbuf;
562 }
563
2/2
✓ Branch 5 → 6 taken 1 time.
✓ Branch 5 → 9 taken 21 times.
22 if (unlikely(enc->cconv)) {
564 1 cconv_process(enc->cconv, buf, size);
565 1 cconv_flush(enc->cconv);
566 1 buf = cconv_consume_all(enc->cconv, &size);
567 }
568 22 return xwrite_all(enc->fd, buf, size);
569 }
570
571 22 size_t file_encoder_get_nr_errors(const FileEncoder *enc)
572 {
573
2/2
✓ Branch 2 → 3 taken 1 time.
✓ Branch 2 → 4 taken 21 times.
22 return enc->cconv ? enc->cconv->errors : 0;
574 }
575
576 static bool fill(FileDecoder *dec)
577 {
578 if (dec->ipos == dec->isize) {
579 return false;
580 }
581
582 // Smaller than cconv.obuf to make realloc less likely
583 size_t max = 7 * 1024;
584
585 size_t icount = MIN(dec->isize - dec->ipos, max);
586 cconv_process(dec->cconv, dec->ibuf + dec->ipos, icount);
587 dec->ipos += icount;
588 if (dec->ipos == dec->isize) {
589 // Must be flushed after all input has been fed
590 cconv_flush(dec->cconv);
591 }
592 return true;
593 }
594
595 static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
596 {
597 char *line;
598 size_t len;
599 while (1) {
600 line = cconv_consume_line(dec->cconv, &len);
601 if (line || !fill(dec)) {
602 break;
603 }
604 }
605
606 if (line) {
607 // Newline not wanted
608 len--;
609 } else {
610 line = cconv_consume_all(dec->cconv, &len);
611 if (len == 0) {
612 return false;
613 }
614 }
615
616 *linep = line;
617 *lenp = len;
618 return true;
619 }
620
621 34 bool file_decoder_read(Buffer *buffer, ErrorBuffer *errbuf, const char *text, size_t text_len)
622 {
623
1/2
✓ Branch 3 → 4 taken 34 times.
✗ Branch 3 → 5 not taken.
34 if (encoding_is_utf8(buffer->encoding)) {
624 34 return file_decoder_read_utf8(buffer, errbuf, text, text_len);
625 }
626
627 CharsetConverter *cconv = cconv_to_utf8(buffer->encoding);
628 if (!cconv) {
629 return false;
630 }
631
632 FileDecoder dec = {
633 .ibuf = text,
634 .isize = text_len,
635 .cconv = cconv,
636 };
637
638 const char *line;
639 size_t len;
640
641 if (decode_and_read_line(&dec, &line, &len)) {
642 if (len && line[len - 1] == '\r') {
643 buffer->crlf_newlines = true;
644 len--;
645 }
646 Block *blk = add_utf8_line(buffer, errbuf, NULL, line, len);
647 while (decode_and_read_line(&dec, &line, &len)) {
648 if (buffer->crlf_newlines && len && line[len - 1] == '\r') {
649 len--;
650 }
651 blk = add_utf8_line(buffer, errbuf, blk, line, len);
652 }
653 if (blk) {
654 add_block(buffer, blk);
655 }
656 }
657
658 cconv_free(cconv);
659 return true;
660 }
661
662 #endif
663