dte test coverage


Directory: ./
File: src/convert.c
Date: 2025-07-01 18:22:46
Exec Total Coverage
Lines: 151 289 52.2%
Functions: 20 28 71.4%
Branches: 56 138 40.6%

Line Branch Exec Source
1 #include <errno.h>
2 #include <inttypes.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "convert.h"
6 #include "block.h"
7 #include "buildvar-iconv.h"
8 #include "encoding.h"
9 #include "util/arith.h"
10 #include "util/debug.h"
11 #include "util/intern.h"
12 #include "util/list.h"
13 #include "util/log.h"
14 #include "util/str-util.h"
15 #include "util/utf8.h"
16 #include "util/xmalloc.h"
17 #include "util/xreadwrite.h"
18
19 enum {
20 // If any line exceeds this length when reading a file, syntax
21 // highlighting will be automatically disabled
22 SYN_HIGHLIGHT_MAX_LINE_LEN = 512u << 10, // 512KiB
23 };
24
25 typedef struct {
26 const unsigned char *ibuf;
27 ssize_t ipos;
28 ssize_t isize;
29 struct cconv *cconv;
30 } FileDecoder;
31
32 56 static void add_block(Buffer *buffer, Block *blk)
33 {
34 56 buffer->nl += blk->nl;
35 56 list_insert_before(&blk->node, &buffer->blocks);
36 56 }
37
38 7590 static Block *add_utf8_line (
39 Buffer *buffer,
40 Block *blk,
41 const unsigned char *line,
42 size_t len
43 ) {
44 7590 size_t size = len + 1;
45
2/2
✓ Branch 0 (2→3) taken 7562 times.
✓ Branch 1 (2→6) taken 28 times.
7590 if (blk) {
46 7562 size_t avail = blk->alloc - blk->size;
47
2/2
✓ Branch 0 (3→4) taken 7534 times.
✓ Branch 1 (3→5) taken 28 times.
7562 if (size <= avail) {
48 7534 goto copy;
49 }
50 28 add_block(buffer, blk);
51 }
52 56 size = MAX(size, 8192);
53 56 blk = block_new(size);
54
55 7590 copy:
56
1/4
✗ Branch 0 (7→8) not taken.
✓ Branch 1 (7→11) taken 7590 times.
✗ Branch 2 (8→9) not taken.
✗ Branch 3 (8→11) not taken.
7590 if (unlikely(len > SYN_HIGHLIGHT_MAX_LINE_LEN && buffer->options.syntax)) {
57 // TODO: Make the limit configurable and add documentation
58 // TODO: Pass in an ErrorBuffer* and use error_msg() instead of LOG_NOTICE()
59 LOG_NOTICE (
60 "line length (%zu) exceeded limit (%ju); disabling syntax highlighting",
61 len, (uintmax_t)SYN_HIGHLIGHT_MAX_LINE_LEN
62 );
63 buffer->options.syntax = false;
64 }
65
66 7590 memcpy(blk->data + blk->size, line, len);
67 7590 blk->size += len;
68 7590 blk->data[blk->size++] = '\n';
69 7590 blk->nl++;
70 7590 return blk;
71 }
72
73 7624 static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp)
74 {
75 7624 const char *line = dec->ibuf + dec->ipos;
76 7624 const char *nl = memchr(line, '\n', dec->isize - dec->ipos);
77 7624 size_t len;
78
79
2/2
✓ Branch 0 (2→3) taken 7588 times.
✓ Branch 1 (2→4) taken 36 times.
7624 if (nl) {
80 7588 len = nl - line;
81 7588 dec->ipos += len + 1;
82 } else {
83 36 len = dec->isize - dec->ipos;
84
2/2
✓ Branch 0 (4→5) taken 2 times.
✓ Branch 1 (4→7) taken 34 times.
36 if (len == 0) {
85 return false;
86 }
87 2 dec->ipos += len;
88 }
89
90 7590 *linep = line;
91 7590 *lenp = len;
92 7590 return true;
93 }
94
95 34 static bool file_decoder_read_utf8(Buffer *buffer, const unsigned char *buf, size_t size)
96 {
97
1/2
✗ Branch 0 (3→4) not taken.
✓ Branch 1 (3→5) taken 34 times.
34 if (unlikely(!encoding_is_utf8(buffer->encoding))) {
98 errno = EINVAL;
99 return false;
100 }
101
102 34 FileDecoder dec = {
103 .ibuf = buf,
104 .isize = size,
105 };
106
107 34 const char *line;
108 34 size_t len;
109
110
2/2
✓ Branch 0 (6→7) taken 28 times.
✓ Branch 1 (6→23) taken 6 times.
34 if (!read_utf8_line(&dec, &line, &len)) {
111 return true;
112 }
113
114
3/4
✓ Branch 0 (7→8) taken 28 times.
✗ Branch 1 (7→10) not taken.
✓ Branch 2 (8→9) taken 1 times.
✓ Branch 3 (8→10) taken 27 times.
28 if (len && line[len - 1] == '\r') {
115 1 buffer->crlf_newlines = true;
116 1 len--;
117 }
118
119 28 Block *blk = add_utf8_line(buffer, NULL, line, len);
120
121
2/2
✓ Branch 0 (11→16) taken 1 times.
✓ Branch 1 (11→19) taken 27 times.
28 if (unlikely(buffer->crlf_newlines)) {
122
2/2
✓ Branch 0 (17→12) taken 270 times.
✓ Branch 1 (17→21) taken 1 times.
271 while (read_utf8_line(&dec, &line, &len)) {
123
4/4
✓ Branch 0 (12→13) taken 268 times.
✓ Branch 1 (12→15) taken 2 times.
✓ Branch 2 (13→14) taken 1 times.
✓ Branch 3 (13→15) taken 267 times.
270 if (len && line[len - 1] == '\r') {
124 1 len--;
125 }
126 270 blk = add_utf8_line(buffer, blk, line, len);
127 }
128 } else {
129
2/2
✓ Branch 0 (20→18) taken 7292 times.
✓ Branch 1 (20→21) taken 27 times.
7319 while (read_utf8_line(&dec, &line, &len)) {
130 7292 blk = add_utf8_line(buffer, blk, line, len);
131 }
132 }
133
134
1/2
✓ Branch 0 (21→22) taken 28 times.
✗ Branch 1 (21→23) not taken.
28 if (blk) {
135 28 add_block(buffer, blk);
136 }
137
138 return true;
139 }
140
141 1 static size_t unix_to_dos (
142 FileEncoder *enc,
143 const unsigned char *buf,
144 size_t size
145 ) {
146 // TODO: Pass in Buffer::nl and make this size adjustment more conservative
147 // (it's resized to handle the worst possible case, despite the fact that we
148 // already have the number of newlines pre-computed)
149
1/2
✓ Branch 0 (2→3) taken 1 times.
✗ Branch 1 (2→8) not taken.
1 if (enc->nsize < size * 2) {
150 1 enc->nsize = size * 2;
151 1 enc->nbuf = xrealloc(enc->nbuf, enc->nsize);
152 }
153
154 // TODO: Optimize this loop, by making use of memccpy(3)
155 size_t d = 0;
156
2/2
✓ Branch 0 (9→5) taken 21 times.
✓ Branch 1 (9→10) taken 1 times.
22 for (size_t s = 0; s < size; s++) {
157 21 unsigned char ch = buf[s];
158
2/2
✓ Branch 0 (5→6) taken 3 times.
✓ Branch 1 (5→7) taken 18 times.
21 if (ch == '\n') {
159 3 enc->nbuf[d++] = '\r';
160 }
161 21 enc->nbuf[d++] = ch;
162 }
163
164 1 return d;
165 }
166
167 #if ICONV_DISABLE == 1 // iconv not available; use basic, UTF-8 implementation:
168
169 bool conversion_supported_by_iconv (
170 const char* UNUSED_ARG(from),
171 const char* UNUSED_ARG(to)
172 ) {
173 errno = EINVAL;
174 return false;
175 }
176
177 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
178 {
179 if (unlikely(!encoding_is_utf8(encoding))) {
180 BUG("unsupported conversion; should have been handled earlier");
181 }
182
183 return (FileEncoder) {
184 .crlf = crlf,
185 .fd = fd,
186 };
187 }
188
189 void file_encoder_free(FileEncoder *enc)
190 {
191 free(enc->nbuf);
192 }
193
194 ssize_t file_encoder_write(FileEncoder *enc, const unsigned char *buf, size_t n)
195 {
196 if (unlikely(enc->crlf)) {
197 n = unix_to_dos(enc, buf, n);
198 buf = enc->nbuf;
199 }
200 return xwrite_all(enc->fd, buf, n);
201 }
202
203 size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc))
204 {
205 return 0;
206 }
207
208 bool file_decoder_read(Buffer *buffer, const unsigned char *buf, size_t size)
209 {
210 return file_decoder_read_utf8(buffer, buf, size);
211 }
212
213 #else // ICONV_DISABLE != 1; use full iconv implementation:
214
215 #include <iconv.h>
216
217 // UTF-8 encoding of U+00BF (inverted question mark; "¿")
218 #define REPLACEMENT "\xc2\xbf"
219
220 struct cconv {
221 iconv_t cd;
222 char *obuf;
223 size_t osize;
224 size_t opos;
225 size_t consumed;
226 size_t errors;
227
228 // Temporary input buffer
229 char tbuf[16];
230 size_t tcount;
231
232 // REPLACEMENT character, in target encoding
233 char rbuf[4];
234 size_t rcount;
235
236 // Input character size in bytes, or zero for UTF-8
237 size_t char_size;
238 };
239
240 1 static struct cconv *create(iconv_t cd)
241 {
242 1 struct cconv *c = xcalloc1(sizeof(*c));
243 1 c->cd = cd;
244 1 c->osize = 8192;
245 1 c->obuf = xmalloc(c->osize);
246 1 return c;
247 }
248
249 2 static size_t iconv_wrapper (
250 iconv_t cd,
251 const char **restrict inbuf,
252 size_t *restrict inbytesleft,
253 char **restrict outbuf,
254 size_t *restrict outbytesleft
255 ) {
256 // POSIX defines the second parameter of iconv(3) as "char **restrict"
257 // but NetBSD declares it as "const char **restrict"
258 #ifdef __NetBSD__
259 const char **restrict in = inbuf;
260 #else
261 2 char **restrict in = (char **restrict)inbuf;
262 #endif
263
264 2 return iconv(cd, in, inbytesleft, outbuf, outbytesleft);
265 }
266
267 static void resize_obuf(struct cconv *c)
268 {
269 c->osize = xmul(2, c->osize);
270 c->obuf = xrealloc(c->obuf, c->osize);
271 }
272
273 static void add_replacement(struct cconv *c)
274 {
275 if (c->osize - c->opos < 4) {
276 resize_obuf(c);
277 }
278
279 memcpy(c->obuf + c->opos, c->rbuf, c->rcount);
280 c->opos += c->rcount;
281 }
282
283 static size_t handle_invalid(struct cconv *c, const char *buf, size_t count)
284 {
285 LOG_DEBUG("%zu %zu", c->char_size, count);
286 add_replacement(c);
287 if (c->char_size == 0) {
288 // Converting from UTF-8
289 size_t idx = 0;
290 CodePoint u = u_get_char(buf, count, &idx);
291 LOG_DEBUG("U+%04" PRIX32, u);
292 return idx;
293 }
294 if (c->char_size > count) {
295 // wtf
296 return 1;
297 }
298 return c->char_size;
299 }
300
301 1 static int xiconv(struct cconv *c, const char **ib, size_t *ic)
302 {
303 1 while (1) {
304 1 char *ob = c->obuf + c->opos;
305 1 size_t oc = c->osize - c->opos;
306 1 size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc);
307 1 c->opos = ob - c->obuf;
308
1/2
✗ Branch 0 (4→5) not taken.
✓ Branch 1 (4→12) taken 1 times.
1 if (rc == (size_t)-1) {
309 switch (errno) {
310 case EILSEQ:
311 c->errors++;
312 // Reset
313 iconv(c->cd, NULL, NULL, NULL, NULL);
314 return errno;
315 case EINVAL:
316 return errno;
317 case E2BIG:
318 resize_obuf(c);
319 continue;
320 default:
321 BUG("iconv: %s", strerror(errno));
322 }
323 } else {
324 1 c->errors += rc;
325 }
326 1 return 0;
327 }
328 }
329
330 static size_t convert_incomplete(struct cconv *c, const char *input, size_t len)
331 {
332 size_t ipos = 0;
333 while (c->tcount < sizeof(c->tbuf) && ipos < len) {
334 c->tbuf[c->tcount++] = input[ipos++];
335 const char *ib = c->tbuf;
336 size_t ic = c->tcount;
337 int rc = xiconv(c, &ib, &ic);
338 if (ic > 0) {
339 memmove(c->tbuf, ib, ic);
340 }
341 c->tcount = ic;
342 if (rc == EINVAL) {
343 // Incomplete character at end of input buffer; try again
344 // with more input data
345 continue;
346 }
347 if (rc == EILSEQ) {
348 // Invalid multibyte sequence
349 size_t skip = handle_invalid(c, c->tbuf, c->tcount);
350 c->tcount -= skip;
351 if (c->tcount > 0) {
352 LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip);
353 memmove(c->tbuf, c->tbuf + skip, c->tcount);
354 continue;
355 }
356 return ipos;
357 }
358 break;
359 }
360
361 LOG_DEBUG("%zu %zu", ipos, c->tcount);
362 return ipos;
363 }
364
365 1 static void cconv_process(struct cconv *c, const char *input, size_t len)
366 {
367
1/2
✗ Branch 0 (2→3) not taken.
✓ Branch 1 (2→4) taken 1 times.
1 if (c->consumed > 0) {
368 size_t fill = c->opos - c->consumed;
369 memmove(c->obuf, c->obuf + c->consumed, fill);
370 c->opos = fill;
371 c->consumed = 0;
372 }
373
374
1/2
✗ Branch 0 (4→5) not taken.
✓ Branch 1 (4→7) taken 1 times.
1 if (c->tcount > 0) {
375 size_t ipos = convert_incomplete(c, input, len);
376 input += ipos;
377 len -= ipos;
378 }
379
380 1 const char *ib = input;
381
2/2
✓ Branch 0 (17→8) taken 1 times.
✓ Branch 1 (17→18) taken 1 times.
2 for (size_t ic = len; ic > 0; ) {
382 1 int r = xiconv(c, &ib, &ic);
383
1/2
✗ Branch 0 (9→10) not taken.
✓ Branch 1 (9→13) taken 1 times.
1 if (r == EINVAL) {
384 // Incomplete character at end of input buffer
385 if (ic < sizeof(c->tbuf)) {
386 memcpy(c->tbuf, ib, ic);
387 c->tcount = ic;
388 } else {
389 // FIXME
390 }
391 ic = 0;
392 continue;
393 }
394
1/2
✗ Branch 0 (13→14) not taken.
✓ Branch 1 (13→16) taken 1 times.
1 if (r == EILSEQ) {
395 // Invalid multibyte sequence
396 size_t skip = handle_invalid(c, ib, ic);
397 ic -= skip;
398 ib += skip;
399 continue;
400 }
401 }
402 1 }
403
404 static struct cconv *cconv_to_utf8(const char *encoding)
405 {
406 iconv_t cd = iconv_open("UTF-8", encoding);
407 if (cd == (iconv_t)-1) {
408 return NULL;
409 }
410
411 struct cconv *c = create(cd);
412 c->rcount = copyliteral(c->rbuf, REPLACEMENT);
413
414 if (str_has_prefix(encoding, "UTF-16")) {
415 c->char_size = 2;
416 } else if (str_has_prefix(encoding, "UTF-32")) {
417 c->char_size = 4;
418 } else {
419 c->char_size = 1;
420 }
421
422 return c;
423 }
424
425 1 static void encode_replacement(struct cconv *c)
426 {
427 1 static const unsigned char rep[] = REPLACEMENT;
428 1 const char *ib = rep;
429 1 char *ob = c->rbuf;
430 1 size_t ic = STRLEN(REPLACEMENT);
431 1 size_t oc = sizeof(c->rbuf);
432 1 size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc);
433
434
1/2
✓ Branch 0 (3→4) taken 1 times.
✗ Branch 1 (3→5) not taken.
1 if (rc == (size_t)-1) {
435 1 c->rbuf[0] = '\xbf';
436 1 c->rcount = 1;
437 } else {
438 c->rcount = ob - c->rbuf;
439 }
440 1 }
441
442 1 static struct cconv *cconv_from_utf8(const char *encoding)
443 {
444 1 iconv_t cd = iconv_open(encoding, "UTF-8");
445
1/2
✓ Branch 0 (3→4) taken 1 times.
✗ Branch 1 (3→7) not taken.
1 if (cd == (iconv_t)-1) {
446 return NULL;
447 }
448 1 struct cconv *c = create(cd);
449 1 encode_replacement(c);
450 1 return c;
451 }
452
453 1 static void cconv_flush(struct cconv *c)
454 {
455
1/2
✗ Branch 0 (2→3) not taken.
✓ Branch 1 (2→6) taken 1 times.
1 if (c->tcount > 0) {
456 // Replace incomplete character at end of input buffer
457 LOG_DEBUG("incomplete character at EOF");
458 add_replacement(c);
459 c->tcount = 0;
460 }
461 1 }
462
463 static char *cconv_consume_line(struct cconv *c, size_t *len)
464 {
465 char *line = c->obuf + c->consumed;
466 char *nl = memchr(line, '\n', c->opos - c->consumed);
467 if (!nl) {
468 *len = 0;
469 return NULL;
470 }
471
472 size_t n = nl - line + 1;
473 c->consumed += n;
474 *len = n;
475 return line;
476 }
477
478 1 static char *cconv_consume_all(struct cconv *c, size_t *len)
479 {
480 1 char *buf = c->obuf + c->consumed;
481 1 *len = c->opos - c->consumed;
482 1 c->consumed = c->opos;
483 1 return buf;
484 }
485
486 1 static void cconv_free(struct cconv *c)
487 {
488 1 BUG_ON(!c);
489 1 iconv_close(c->cd);
490 1 free(c->obuf);
491 1 free(c);
492 1 }
493
494 2 bool conversion_supported_by_iconv(const char *from, const char *to)
495 {
496
2/4
✓ Branch 0 (2→3) taken 2 times.
✗ Branch 1 (2→4) not taken.
✗ Branch 2 (3→4) not taken.
✓ Branch 3 (3→5) taken 2 times.
2 if (unlikely(from[0] == '\0' || to[0] == '\0')) {
497 errno = EINVAL;
498 return false;
499 }
500
501 2 iconv_t cd = iconv_open(to, from);
502
1/2
✓ Branch 0 (6→7) taken 2 times.
✗ Branch 1 (6→9) not taken.
2 if (cd == (iconv_t)-1) {
503 return false;
504 }
505
506 2 iconv_close(cd);
507 2 return true;
508 }
509
510 21 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
511 {
512 21 struct cconv *cconv = NULL;
513
2/2
✓ Branch 0 (3→4) taken 1 times.
✓ Branch 1 (3→7) taken 20 times.
21 if (unlikely(!encoding_is_utf8(encoding))) {
514 1 cconv = cconv_from_utf8(encoding);
515
1/2
✗ Branch 0 (5→6) not taken.
✓ Branch 1 (5→7) taken 1 times.
1 if (!cconv) {
516 BUG("unsupported conversion; should have been handled earlier");
517 }
518 }
519
520 21 return (FileEncoder) {
521 .cconv = cconv,
522 .crlf = crlf,
523 .fd = fd,
524 };
525 }
526
527 21 void file_encoder_free(FileEncoder *enc)
528 {
529
2/2
✓ Branch 0 (2→3) taken 1 times.
✓ Branch 1 (2→4) taken 20 times.
21 if (enc->cconv) {
530 1 cconv_free(enc->cconv);
531 }
532 21 free(enc->nbuf);
533 21 }
534
535 // NOTE: buf must contain whole characters!
536 21 ssize_t file_encoder_write (
537 FileEncoder *enc,
538 const unsigned char *buf,
539 size_t size
540 ) {
541
2/2
✓ Branch 0 (2→3) taken 1 times.
✓ Branch 1 (2→5) taken 20 times.
21 if (unlikely(enc->crlf)) {
542 1 size = unix_to_dos(enc, buf, size);
543 1 buf = enc->nbuf;
544 }
545
2/2
✓ Branch 0 (5→6) taken 1 times.
✓ Branch 1 (5→9) taken 20 times.
21 if (unlikely(enc->cconv)) {
546 1 cconv_process(enc->cconv, buf, size);
547 1 cconv_flush(enc->cconv);
548 1 buf = cconv_consume_all(enc->cconv, &size);
549 }
550 21 return xwrite_all(enc->fd, buf, size);
551 }
552
553 21 size_t file_encoder_get_nr_errors(const FileEncoder *enc)
554 {
555
2/2
✓ Branch 0 (2→3) taken 1 times.
✓ Branch 1 (2→4) taken 20 times.
21 return enc->cconv ? enc->cconv->errors : 0;
556 }
557
558 static bool fill(FileDecoder *dec)
559 {
560 if (dec->ipos == dec->isize) {
561 return false;
562 }
563
564 // Smaller than cconv.obuf to make realloc less likely
565 size_t max = 7 * 1024;
566
567 size_t icount = MIN(dec->isize - dec->ipos, max);
568 cconv_process(dec->cconv, dec->ibuf + dec->ipos, icount);
569 dec->ipos += icount;
570 if (dec->ipos == dec->isize) {
571 // Must be flushed after all input has been fed
572 cconv_flush(dec->cconv);
573 }
574 return true;
575 }
576
577 static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
578 {
579 char *line;
580 size_t len;
581 while (1) {
582 line = cconv_consume_line(dec->cconv, &len);
583 if (line || !fill(dec)) {
584 break;
585 }
586 }
587
588 if (line) {
589 // Newline not wanted
590 len--;
591 } else {
592 line = cconv_consume_all(dec->cconv, &len);
593 if (len == 0) {
594 return false;
595 }
596 }
597
598 *linep = line;
599 *lenp = len;
600 return true;
601 }
602
603 34 bool file_decoder_read(Buffer *buffer, const unsigned char *buf, size_t size)
604 {
605
1/2
✓ Branch 0 (3→4) taken 34 times.
✗ Branch 1 (3→5) not taken.
34 if (encoding_is_utf8(buffer->encoding)) {
606 34 return file_decoder_read_utf8(buffer, buf, size);
607 }
608
609 struct cconv *cconv = cconv_to_utf8(buffer->encoding);
610 if (!cconv) {
611 return false;
612 }
613
614 FileDecoder dec = {
615 .ibuf = buf,
616 .isize = size,
617 .cconv = cconv,
618 };
619
620 const char *line;
621 size_t len;
622
623 if (decode_and_read_line(&dec, &line, &len)) {
624 if (len && line[len - 1] == '\r') {
625 buffer->crlf_newlines = true;
626 len--;
627 }
628 Block *blk = add_utf8_line(buffer, NULL, line, len);
629 while (decode_and_read_line(&dec, &line, &len)) {
630 if (buffer->crlf_newlines && len && line[len - 1] == '\r') {
631 len--;
632 }
633 blk = add_utf8_line(buffer, blk, line, len);
634 }
635 if (blk) {
636 add_block(buffer, blk);
637 }
638 }
639
640 cconv_free(cconv);
641 return true;
642 }
643
644 #endif
645