dte test coverage


Directory: ./
File: src/convert.c
Date: 2025-02-14 16:55:22
Exec Total Coverage
Lines: 151 289 52.2%
Functions: 20 28 71.4%
Branches: 56 138 40.6%

Line Branch Exec Source
1 #include <errno.h>
2 #include <inttypes.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "convert.h"
6 #include "block.h"
7 #include "buildvar-iconv.h"
8 #include "encoding.h"
9 #include "util/debug.h"
10 #include "util/intern.h"
11 #include "util/list.h"
12 #include "util/log.h"
13 #include "util/str-util.h"
14 #include "util/utf8.h"
15 #include "util/xmalloc.h"
16 #include "util/xreadwrite.h"
17
18 enum {
19 // If any line exceeds this length when reading a file, syntax
20 // highlighting will be automatically disabled
21 SYN_HIGHLIGHT_MAX_LINE_LEN = 512u << 10, // 512KiB
22 };
23
24 typedef struct {
25 const unsigned char *ibuf;
26 ssize_t ipos;
27 ssize_t isize;
28 struct cconv *cconv;
29 } FileDecoder;
30
31 56 static void add_block(Buffer *buffer, Block *blk)
32 {
33 56 buffer->nl += blk->nl;
34 56 list_insert_before(&blk->node, &buffer->blocks);
35 56 }
36
37 7316 static Block *add_utf8_line (
38 Buffer *buffer,
39 Block *blk,
40 const unsigned char *line,
41 size_t len
42 ) {
43 7316 size_t size = len + 1;
44
2/2
✓ Branch 0 (2→3) taken 7288 times.
✓ Branch 1 (2→6) taken 28 times.
7316 if (blk) {
45 7288 size_t avail = blk->alloc - blk->size;
46
2/2
✓ Branch 0 (3→4) taken 7260 times.
✓ Branch 1 (3→5) taken 28 times.
7288 if (size <= avail) {
47 7260 goto copy;
48 }
49 28 add_block(buffer, blk);
50 }
51 56 size = MAX(size, 8192);
52 56 blk = block_new(size);
53
54 7316 copy:
55
1/4
✗ Branch 0 (7→8) not taken.
✓ Branch 1 (7→11) taken 7316 times.
✗ Branch 2 (8→9) not taken.
✗ Branch 3 (8→11) not taken.
7316 if (unlikely(len > SYN_HIGHLIGHT_MAX_LINE_LEN && buffer->options.syntax)) {
56 // TODO: Make the limit configurable and add documentation
57 // TODO: Pass in an ErrorBuffer* and use error_msg() instead of LOG_NOTICE()
58 LOG_NOTICE (
59 "line length (%zu) exceeded limit (%ju); disabling syntax highlighting",
60 len, (uintmax_t)SYN_HIGHLIGHT_MAX_LINE_LEN
61 );
62 buffer->options.syntax = false;
63 }
64
65 7316 memcpy(blk->data + blk->size, line, len);
66 7316 blk->size += len;
67 7316 blk->data[blk->size++] = '\n';
68 7316 blk->nl++;
69 7316 return blk;
70 }
71
72 7347 static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp)
73 {
74 7347 const char *line = dec->ibuf + dec->ipos;
75 7347 const char *nl = memchr(line, '\n', dec->isize - dec->ipos);
76 7347 size_t len;
77
78
2/2
✓ Branch 0 (2→3) taken 7314 times.
✓ Branch 1 (2→4) taken 33 times.
7347 if (nl) {
79 7314 len = nl - line;
80 7314 dec->ipos += len + 1;
81 } else {
82 33 len = dec->isize - dec->ipos;
83
2/2
✓ Branch 0 (4→5) taken 2 times.
✓ Branch 1 (4→7) taken 31 times.
33 if (len == 0) {
84 return false;
85 }
86 2 dec->ipos += len;
87 }
88
89 7316 *linep = line;
90 7316 *lenp = len;
91 7316 return true;
92 }
93
94 31 static bool file_decoder_read_utf8(Buffer *buffer, const unsigned char *buf, size_t size)
95 {
96
1/2
✗ Branch 0 (3→4) not taken.
✓ Branch 1 (3→5) taken 31 times.
31 if (unlikely(!encoding_is_utf8(buffer->encoding))) {
97 errno = EINVAL;
98 return false;
99 }
100
101 31 FileDecoder dec = {
102 .ibuf = buf,
103 .isize = size,
104 };
105
106 31 const char *line;
107 31 size_t len;
108
109
2/2
✓ Branch 0 (6→7) taken 28 times.
✓ Branch 1 (6→23) taken 3 times.
31 if (!read_utf8_line(&dec, &line, &len)) {
110 return true;
111 }
112
113
3/4
✓ Branch 0 (7→8) taken 28 times.
✗ Branch 1 (7→10) not taken.
✓ Branch 2 (8→9) taken 1 times.
✓ Branch 3 (8→10) taken 27 times.
28 if (len && line[len - 1] == '\r') {
114 1 buffer->crlf_newlines = true;
115 1 len--;
116 }
117
118 28 Block *blk = add_utf8_line(buffer, NULL, line, len);
119
120
2/2
✓ Branch 0 (11→16) taken 1 times.
✓ Branch 1 (11→19) taken 27 times.
28 if (unlikely(buffer->crlf_newlines)) {
121
2/2
✓ Branch 0 (17→12) taken 270 times.
✓ Branch 1 (17→21) taken 1 times.
271 while (read_utf8_line(&dec, &line, &len)) {
122
4/4
✓ Branch 0 (12→13) taken 268 times.
✓ Branch 1 (12→15) taken 2 times.
✓ Branch 2 (13→14) taken 1 times.
✓ Branch 3 (13→15) taken 267 times.
270 if (len && line[len - 1] == '\r') {
123 1 len--;
124 }
125 270 blk = add_utf8_line(buffer, blk, line, len);
126 }
127 } else {
128
2/2
✓ Branch 0 (20→18) taken 7018 times.
✓ Branch 1 (20→21) taken 27 times.
7045 while (read_utf8_line(&dec, &line, &len)) {
129 7018 blk = add_utf8_line(buffer, blk, line, len);
130 }
131 }
132
133
1/2
✓ Branch 0 (21→22) taken 28 times.
✗ Branch 1 (21→23) not taken.
28 if (blk) {
134 28 add_block(buffer, blk);
135 }
136
137 return true;
138 }
139
140 1 static size_t unix_to_dos (
141 FileEncoder *enc,
142 const unsigned char *buf,
143 size_t size
144 ) {
145 // TODO: Pass in Buffer::nl and make this size adjustment more conservative
146 // (it's resized to handle the worst possible case, despite the fact that we
147 // already have the number of newlines pre-computed)
148
1/2
✓ Branch 0 (2→3) taken 1 times.
✗ Branch 1 (2→8) not taken.
1 if (enc->nsize < size * 2) {
149 1 enc->nsize = size * 2;
150 1 enc->nbuf = xrealloc(enc->nbuf, enc->nsize);
151 }
152
153 // TODO: Optimize this loop, by making use of memccpy(3)
154 size_t d = 0;
155
2/2
✓ Branch 0 (9→5) taken 21 times.
✓ Branch 1 (9→10) taken 1 times.
22 for (size_t s = 0; s < size; s++) {
156 21 unsigned char ch = buf[s];
157
2/2
✓ Branch 0 (5→6) taken 3 times.
✓ Branch 1 (5→7) taken 18 times.
21 if (ch == '\n') {
158 3 enc->nbuf[d++] = '\r';
159 }
160 21 enc->nbuf[d++] = ch;
161 }
162
163 1 return d;
164 }
165
166 #if ICONV_DISABLE == 1 // iconv not available; use basic, UTF-8 implementation:
167
168 bool conversion_supported_by_iconv (
169 const char* UNUSED_ARG(from),
170 const char* UNUSED_ARG(to)
171 ) {
172 errno = EINVAL;
173 return false;
174 }
175
176 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
177 {
178 if (unlikely(!encoding_is_utf8(encoding))) {
179 BUG("unsupported conversion; should have been handled earlier");
180 }
181
182 return (FileEncoder) {
183 .crlf = crlf,
184 .fd = fd,
185 };
186 }
187
188 void file_encoder_free(FileEncoder *enc)
189 {
190 free(enc->nbuf);
191 }
192
193 ssize_t file_encoder_write(FileEncoder *enc, const unsigned char *buf, size_t n)
194 {
195 if (unlikely(enc->crlf)) {
196 n = unix_to_dos(enc, buf, n);
197 buf = enc->nbuf;
198 }
199 return xwrite_all(enc->fd, buf, n);
200 }
201
202 size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc))
203 {
204 return 0;
205 }
206
207 bool file_decoder_read(Buffer *buffer, const unsigned char *buf, size_t size)
208 {
209 return file_decoder_read_utf8(buffer, buf, size);
210 }
211
212 #else // ICONV_DISABLE != 1; use full iconv implementation:
213
214 #include <iconv.h>
215
216 // UTF-8 encoding of U+00BF (inverted question mark; "¿")
217 #define REPLACEMENT "\xc2\xbf"
218
219 struct cconv {
220 iconv_t cd;
221 char *obuf;
222 size_t osize;
223 size_t opos;
224 size_t consumed;
225 size_t errors;
226
227 // Temporary input buffer
228 char tbuf[16];
229 size_t tcount;
230
231 // REPLACEMENT character, in target encoding
232 char rbuf[4];
233 size_t rcount;
234
235 // Input character size in bytes, or zero for UTF-8
236 size_t char_size;
237 };
238
239 1 static struct cconv *create(iconv_t cd)
240 {
241 1 struct cconv *c = xnew0(struct cconv, 1);
242 1 c->cd = cd;
243 1 c->osize = 8192;
244 1 c->obuf = xmalloc(c->osize);
245 1 return c;
246 }
247
248 2 static size_t iconv_wrapper (
249 iconv_t cd,
250 const char **restrict inbuf,
251 size_t *restrict inbytesleft,
252 char **restrict outbuf,
253 size_t *restrict outbytesleft
254 ) {
255 // POSIX defines the second parameter of iconv(3) as "char **restrict"
256 // but NetBSD declares it as "const char **restrict"
257 #ifdef __NetBSD__
258 const char **restrict in = inbuf;
259 #else
260 2 char **restrict in = (char **restrict)inbuf;
261 #endif
262
263 2 return iconv(cd, in, inbytesleft, outbuf, outbytesleft);
264 }
265
266 static void resize_obuf(struct cconv *c)
267 {
268 c->osize = xmul(2, c->osize);
269 c->obuf = xrealloc(c->obuf, c->osize);
270 }
271
272 static void add_replacement(struct cconv *c)
273 {
274 if (c->osize - c->opos < 4) {
275 resize_obuf(c);
276 }
277
278 memcpy(c->obuf + c->opos, c->rbuf, c->rcount);
279 c->opos += c->rcount;
280 }
281
282 static size_t handle_invalid(struct cconv *c, const char *buf, size_t count)
283 {
284 LOG_DEBUG("%zu %zu", c->char_size, count);
285 add_replacement(c);
286 if (c->char_size == 0) {
287 // Converting from UTF-8
288 size_t idx = 0;
289 CodePoint u = u_get_char(buf, count, &idx);
290 LOG_DEBUG("U+%04" PRIX32, u);
291 return idx;
292 }
293 if (c->char_size > count) {
294 // wtf
295 return 1;
296 }
297 return c->char_size;
298 }
299
300 1 static int xiconv(struct cconv *c, const char **ib, size_t *ic)
301 {
302 1 while (1) {
303 1 char *ob = c->obuf + c->opos;
304 1 size_t oc = c->osize - c->opos;
305 1 size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc);
306 1 c->opos = ob - c->obuf;
307
1/2
✗ Branch 0 (4→5) not taken.
✓ Branch 1 (4→12) taken 1 times.
1 if (rc == (size_t)-1) {
308 switch (errno) {
309 case EILSEQ:
310 c->errors++;
311 // Reset
312 iconv(c->cd, NULL, NULL, NULL, NULL);
313 return errno;
314 case EINVAL:
315 return errno;
316 case E2BIG:
317 resize_obuf(c);
318 continue;
319 default:
320 BUG("iconv: %s", strerror(errno));
321 }
322 } else {
323 1 c->errors += rc;
324 }
325 1 return 0;
326 }
327 }
328
329 static size_t convert_incomplete(struct cconv *c, const char *input, size_t len)
330 {
331 size_t ipos = 0;
332 while (c->tcount < sizeof(c->tbuf) && ipos < len) {
333 c->tbuf[c->tcount++] = input[ipos++];
334 const char *ib = c->tbuf;
335 size_t ic = c->tcount;
336 int rc = xiconv(c, &ib, &ic);
337 if (ic > 0) {
338 memmove(c->tbuf, ib, ic);
339 }
340 c->tcount = ic;
341 if (rc == EINVAL) {
342 // Incomplete character at end of input buffer; try again
343 // with more input data
344 continue;
345 }
346 if (rc == EILSEQ) {
347 // Invalid multibyte sequence
348 size_t skip = handle_invalid(c, c->tbuf, c->tcount);
349 c->tcount -= skip;
350 if (c->tcount > 0) {
351 LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip);
352 memmove(c->tbuf, c->tbuf + skip, c->tcount);
353 continue;
354 }
355 return ipos;
356 }
357 break;
358 }
359
360 LOG_DEBUG("%zu %zu", ipos, c->tcount);
361 return ipos;
362 }
363
364 1 static void cconv_process(struct cconv *c, const char *input, size_t len)
365 {
366
1/2
✗ Branch 0 (2→3) not taken.
✓ Branch 1 (2→4) taken 1 times.
1 if (c->consumed > 0) {
367 size_t fill = c->opos - c->consumed;
368 memmove(c->obuf, c->obuf + c->consumed, fill);
369 c->opos = fill;
370 c->consumed = 0;
371 }
372
373
1/2
✗ Branch 0 (4→5) not taken.
✓ Branch 1 (4→7) taken 1 times.
1 if (c->tcount > 0) {
374 size_t ipos = convert_incomplete(c, input, len);
375 input += ipos;
376 len -= ipos;
377 }
378
379 1 const char *ib = input;
380
2/2
✓ Branch 0 (17→8) taken 1 times.
✓ Branch 1 (17→18) taken 1 times.
2 for (size_t ic = len; ic > 0; ) {
381 1 int r = xiconv(c, &ib, &ic);
382
1/2
✗ Branch 0 (9→10) not taken.
✓ Branch 1 (9→13) taken 1 times.
1 if (r == EINVAL) {
383 // Incomplete character at end of input buffer
384 if (ic < sizeof(c->tbuf)) {
385 memcpy(c->tbuf, ib, ic);
386 c->tcount = ic;
387 } else {
388 // FIXME
389 }
390 ic = 0;
391 continue;
392 }
393
1/2
✗ Branch 0 (13→14) not taken.
✓ Branch 1 (13→16) taken 1 times.
1 if (r == EILSEQ) {
394 // Invalid multibyte sequence
395 size_t skip = handle_invalid(c, ib, ic);
396 ic -= skip;
397 ib += skip;
398 continue;
399 }
400 }
401 1 }
402
403 static struct cconv *cconv_to_utf8(const char *encoding)
404 {
405 iconv_t cd = iconv_open("UTF-8", encoding);
406 if (cd == (iconv_t)-1) {
407 return NULL;
408 }
409
410 struct cconv *c = create(cd);
411 c->rcount = copyliteral(c->rbuf, REPLACEMENT);
412
413 if (str_has_prefix(encoding, "UTF-16")) {
414 c->char_size = 2;
415 } else if (str_has_prefix(encoding, "UTF-32")) {
416 c->char_size = 4;
417 } else {
418 c->char_size = 1;
419 }
420
421 return c;
422 }
423
424 1 static void encode_replacement(struct cconv *c)
425 {
426 1 static const unsigned char rep[] = REPLACEMENT;
427 1 const char *ib = rep;
428 1 char *ob = c->rbuf;
429 1 size_t ic = STRLEN(REPLACEMENT);
430 1 size_t oc = sizeof(c->rbuf);
431 1 size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc);
432
433
1/2
✓ Branch 0 (3→4) taken 1 times.
✗ Branch 1 (3→5) not taken.
1 if (rc == (size_t)-1) {
434 1 c->rbuf[0] = '\xbf';
435 1 c->rcount = 1;
436 } else {
437 c->rcount = ob - c->rbuf;
438 }
439 1 }
440
441 1 static struct cconv *cconv_from_utf8(const char *encoding)
442 {
443 1 iconv_t cd = iconv_open(encoding, "UTF-8");
444
1/2
✓ Branch 0 (3→4) taken 1 times.
✗ Branch 1 (3→7) not taken.
1 if (cd == (iconv_t)-1) {
445 return NULL;
446 }
447 1 struct cconv *c = create(cd);
448 1 encode_replacement(c);
449 1 return c;
450 }
451
452 1 static void cconv_flush(struct cconv *c)
453 {
454
1/2
✗ Branch 0 (2→3) not taken.
✓ Branch 1 (2→6) taken 1 times.
1 if (c->tcount > 0) {
455 // Replace incomplete character at end of input buffer
456 LOG_DEBUG("incomplete character at EOF");
457 add_replacement(c);
458 c->tcount = 0;
459 }
460 1 }
461
462 static char *cconv_consume_line(struct cconv *c, size_t *len)
463 {
464 char *line = c->obuf + c->consumed;
465 char *nl = memchr(line, '\n', c->opos - c->consumed);
466 if (!nl) {
467 *len = 0;
468 return NULL;
469 }
470
471 size_t n = nl - line + 1;
472 c->consumed += n;
473 *len = n;
474 return line;
475 }
476
477 1 static char *cconv_consume_all(struct cconv *c, size_t *len)
478 {
479 1 char *buf = c->obuf + c->consumed;
480 1 *len = c->opos - c->consumed;
481 1 c->consumed = c->opos;
482 1 return buf;
483 }
484
485 1 static void cconv_free(struct cconv *c)
486 {
487 1 BUG_ON(!c);
488 1 iconv_close(c->cd);
489 1 free(c->obuf);
490 1 free(c);
491 1 }
492
493 2 bool conversion_supported_by_iconv(const char *from, const char *to)
494 {
495
2/4
✓ Branch 0 (2→3) taken 2 times.
✗ Branch 1 (2→4) not taken.
✗ Branch 2 (3→4) not taken.
✓ Branch 3 (3→5) taken 2 times.
2 if (unlikely(from[0] == '\0' || to[0] == '\0')) {
496 errno = EINVAL;
497 return false;
498 }
499
500 2 iconv_t cd = iconv_open(to, from);
501
1/2
✓ Branch 0 (6→7) taken 2 times.
✗ Branch 1 (6→9) not taken.
2 if (cd == (iconv_t)-1) {
502 return false;
503 }
504
505 2 iconv_close(cd);
506 2 return true;
507 }
508
509 21 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
510 {
511 21 struct cconv *cconv = NULL;
512
2/2
✓ Branch 0 (3→4) taken 1 times.
✓ Branch 1 (3→7) taken 20 times.
21 if (unlikely(!encoding_is_utf8(encoding))) {
513 1 cconv = cconv_from_utf8(encoding);
514
1/2
✗ Branch 0 (5→6) not taken.
✓ Branch 1 (5→7) taken 1 times.
1 if (!cconv) {
515 BUG("unsupported conversion; should have been handled earlier");
516 }
517 }
518
519 21 return (FileEncoder) {
520 .cconv = cconv,
521 .crlf = crlf,
522 .fd = fd,
523 };
524 }
525
526 21 void file_encoder_free(FileEncoder *enc)
527 {
528
2/2
✓ Branch 0 (2→3) taken 1 times.
✓ Branch 1 (2→4) taken 20 times.
21 if (enc->cconv) {
529 1 cconv_free(enc->cconv);
530 }
531 21 free(enc->nbuf);
532 21 }
533
534 // NOTE: buf must contain whole characters!
535 21 ssize_t file_encoder_write (
536 FileEncoder *enc,
537 const unsigned char *buf,
538 size_t size
539 ) {
540
2/2
✓ Branch 0 (2→3) taken 1 times.
✓ Branch 1 (2→5) taken 20 times.
21 if (unlikely(enc->crlf)) {
541 1 size = unix_to_dos(enc, buf, size);
542 1 buf = enc->nbuf;
543 }
544
2/2
✓ Branch 0 (5→6) taken 1 times.
✓ Branch 1 (5→9) taken 20 times.
21 if (unlikely(enc->cconv)) {
545 1 cconv_process(enc->cconv, buf, size);
546 1 cconv_flush(enc->cconv);
547 1 buf = cconv_consume_all(enc->cconv, &size);
548 }
549 21 return xwrite_all(enc->fd, buf, size);
550 }
551
552 21 size_t file_encoder_get_nr_errors(const FileEncoder *enc)
553 {
554
2/2
✓ Branch 0 (2→3) taken 1 times.
✓ Branch 1 (2→4) taken 20 times.
21 return enc->cconv ? enc->cconv->errors : 0;
555 }
556
557 static bool fill(FileDecoder *dec)
558 {
559 if (dec->ipos == dec->isize) {
560 return false;
561 }
562
563 // Smaller than cconv.obuf to make realloc less likely
564 size_t max = 7 * 1024;
565
566 size_t icount = MIN(dec->isize - dec->ipos, max);
567 cconv_process(dec->cconv, dec->ibuf + dec->ipos, icount);
568 dec->ipos += icount;
569 if (dec->ipos == dec->isize) {
570 // Must be flushed after all input has been fed
571 cconv_flush(dec->cconv);
572 }
573 return true;
574 }
575
576 static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
577 {
578 char *line;
579 size_t len;
580 while (1) {
581 line = cconv_consume_line(dec->cconv, &len);
582 if (line || !fill(dec)) {
583 break;
584 }
585 }
586
587 if (line) {
588 // Newline not wanted
589 len--;
590 } else {
591 line = cconv_consume_all(dec->cconv, &len);
592 if (len == 0) {
593 return false;
594 }
595 }
596
597 *linep = line;
598 *lenp = len;
599 return true;
600 }
601
602 31 bool file_decoder_read(Buffer *buffer, const unsigned char *buf, size_t size)
603 {
604
1/2
✓ Branch 0 (3→4) taken 31 times.
✗ Branch 1 (3→5) not taken.
31 if (encoding_is_utf8(buffer->encoding)) {
605 31 return file_decoder_read_utf8(buffer, buf, size);
606 }
607
608 struct cconv *cconv = cconv_to_utf8(buffer->encoding);
609 if (!cconv) {
610 return false;
611 }
612
613 FileDecoder dec = {
614 .ibuf = buf,
615 .isize = size,
616 .cconv = cconv,
617 };
618
619 const char *line;
620 size_t len;
621
622 if (decode_and_read_line(&dec, &line, &len)) {
623 if (len && line[len - 1] == '\r') {
624 buffer->crlf_newlines = true;
625 len--;
626 }
627 Block *blk = add_utf8_line(buffer, NULL, line, len);
628 while (decode_and_read_line(&dec, &line, &len)) {
629 if (buffer->crlf_newlines && len && line[len - 1] == '\r') {
630 len--;
631 }
632 blk = add_utf8_line(buffer, blk, line, len);
633 }
634 if (blk) {
635 add_block(buffer, blk);
636 }
637 }
638
639 cconv_free(cconv);
640 return true;
641 }
642
643 #endif
644