dte test coverage


Directory: ./
File: src/convert.c
Date: 2024-12-21 16:03:22
Exec Total Coverage
Lines: 151 289 52.2%
Functions: 20 28 71.4%
Branches: 56 138 40.6%

Line Branch Exec Source
1 #include <errno.h>
2 #include <inttypes.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include "convert.h"
6 #include "block.h"
7 #include "buildvar-iconv.h"
8 #include "encoding.h"
9 #include "error.h"
10 #include "util/debug.h"
11 #include "util/intern.h"
12 #include "util/list.h"
13 #include "util/log.h"
14 #include "util/str-util.h"
15 #include "util/utf8.h"
16 #include "util/xmalloc.h"
17 #include "util/xreadwrite.h"
18
19 enum {
20 // If any line exceeds this length when reading a file, syntax
21 // highlighting will be automatically disabled
22 SYN_HIGHLIGHT_MAX_LINE_LEN = 512u << 10, // 512KiB
23 };
24
25 typedef struct {
26 const unsigned char *ibuf;
27 ssize_t ipos;
28 ssize_t isize;
29 struct cconv *cconv;
30 } FileDecoder;
31
32 55 static void add_block(Buffer *buffer, Block *blk)
33 {
34 55 buffer->nl += blk->nl;
35 55 list_insert_before(&blk->node, &buffer->blocks);
36 55 }
37
38 7290 static Block *add_utf8_line (
39 Buffer *buffer,
40 Block *blk,
41 const unsigned char *line,
42 size_t len
43 ) {
44 7290 size_t size = len + 1;
45
2/2
✓ Branch 0 taken 7263 times.
✓ Branch 1 taken 27 times.
7290 if (blk) {
46 7263 size_t avail = blk->alloc - blk->size;
47
2/2
✓ Branch 0 taken 7235 times.
✓ Branch 1 taken 28 times.
7263 if (size <= avail) {
48 7235 goto copy;
49 }
50 28 add_block(buffer, blk);
51 }
52 55 size = MAX(size, 8192);
53 55 blk = block_new(size);
54
55 7290 copy:
56
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 7290 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
7290 if (unlikely(len > SYN_HIGHLIGHT_MAX_LINE_LEN && buffer->options.syntax)) {
57 error_msg (
58 "line length (%zu) exceeded limit (%ju); disabling syntax highlighting",
59 len, (uintmax_t)SYN_HIGHLIGHT_MAX_LINE_LEN
60 );
61 buffer->options.syntax = false;
62 }
63
64 7290 memcpy(blk->data + blk->size, line, len);
65 7290 blk->size += len;
66 7290 blk->data[blk->size++] = '\n';
67 7290 blk->nl++;
68 7290 return blk;
69 }
70
71 7320 static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp)
72 {
73 7320 const char *line = dec->ibuf + dec->ipos;
74 7320 const char *nl = memchr(line, '\n', dec->isize - dec->ipos);
75 7320 size_t len;
76
77
2/2
✓ Branch 0 taken 7288 times.
✓ Branch 1 taken 32 times.
7320 if (nl) {
78 7288 len = nl - line;
79 7288 dec->ipos += len + 1;
80 } else {
81 32 len = dec->isize - dec->ipos;
82
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 30 times.
32 if (len == 0) {
83 return false;
84 }
85 2 dec->ipos += len;
86 }
87
88 7290 *linep = line;
89 7290 *lenp = len;
90 7290 return true;
91 }
92
93 30 static bool file_decoder_read_utf8(Buffer *buffer, const unsigned char *buf, size_t size)
94 {
95
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.
30 if (unlikely(!encoding_is_utf8(buffer->encoding))) {
96 errno = EINVAL;
97 return false;
98 }
99
100 30 FileDecoder dec = {
101 .ibuf = buf,
102 .isize = size,
103 };
104
105 30 const char *line;
106 30 size_t len;
107
108
2/2
✓ Branch 0 taken 27 times.
✓ Branch 1 taken 3 times.
30 if (!read_utf8_line(&dec, &line, &len)) {
109 return true;
110 }
111
112
3/4
✓ Branch 0 taken 27 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 26 times.
27 if (len && line[len - 1] == '\r') {
113 1 buffer->crlf_newlines = true;
114 1 len--;
115 }
116
117 27 Block *blk = add_utf8_line(buffer, NULL, line, len);
118
119
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 26 times.
27 if (unlikely(buffer->crlf_newlines)) {
120
2/2
✓ Branch 0 taken 270 times.
✓ Branch 1 taken 1 times.
271 while (read_utf8_line(&dec, &line, &len)) {
121
4/4
✓ Branch 0 taken 268 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 267 times.
270 if (len && line[len - 1] == '\r') {
122 1 len--;
123 }
124 270 blk = add_utf8_line(buffer, blk, line, len);
125 }
126 } else {
127
2/2
✓ Branch 0 taken 6993 times.
✓ Branch 1 taken 26 times.
7019 while (read_utf8_line(&dec, &line, &len)) {
128 6993 blk = add_utf8_line(buffer, blk, line, len);
129 }
130 }
131
132
1/2
✓ Branch 0 taken 27 times.
✗ Branch 1 not taken.
27 if (blk) {
133 27 add_block(buffer, blk);
134 }
135
136 return true;
137 }
138
139 1 static size_t unix_to_dos (
140 FileEncoder *enc,
141 const unsigned char *buf,
142 size_t size
143 ) {
144 // TODO: Pass in Buffer::nl and make this size adjustment more conservative
145 // (it's resized to handle the worst possible case, despite the fact that we
146 // already have the number of newlines pre-computed)
147
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (enc->nsize < size * 2) {
148 1 enc->nsize = size * 2;
149 1 enc->nbuf = xrealloc(enc->nbuf, enc->nsize);
150 }
151
152 // TODO: Optimize this loop, by making use of memccpy(3)
153 size_t d = 0;
154
2/2
✓ Branch 0 taken 21 times.
✓ Branch 1 taken 1 times.
22 for (size_t s = 0; s < size; s++) {
155 21 unsigned char ch = buf[s];
156
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 18 times.
21 if (ch == '\n') {
157 3 enc->nbuf[d++] = '\r';
158 }
159 21 enc->nbuf[d++] = ch;
160 }
161
162 1 return d;
163 }
164
165 #if ICONV_DISABLE == 1 // iconv not available; use basic, UTF-8 implementation:
166
167 bool conversion_supported_by_iconv (
168 const char* UNUSED_ARG(from),
169 const char* UNUSED_ARG(to)
170 ) {
171 errno = EINVAL;
172 return false;
173 }
174
175 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
176 {
177 if (unlikely(!encoding_is_utf8(encoding))) {
178 BUG("unsupported conversion; should have been handled earlier");
179 }
180
181 return (FileEncoder) {
182 .crlf = crlf,
183 .fd = fd,
184 };
185 }
186
187 void file_encoder_free(FileEncoder *enc)
188 {
189 free(enc->nbuf);
190 }
191
192 ssize_t file_encoder_write(FileEncoder *enc, const unsigned char *buf, size_t n)
193 {
194 if (unlikely(enc->crlf)) {
195 n = unix_to_dos(enc, buf, n);
196 buf = enc->nbuf;
197 }
198 return xwrite_all(enc->fd, buf, n);
199 }
200
201 size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc))
202 {
203 return 0;
204 }
205
206 bool file_decoder_read(Buffer *buffer, const unsigned char *buf, size_t size)
207 {
208 return file_decoder_read_utf8(buffer, buf, size);
209 }
210
211 #else // ICONV_DISABLE != 1; use full iconv implementation:
212
213 #include <iconv.h>
214
215 // UTF-8 encoding of U+00BF (inverted question mark; "¿")
216 #define REPLACEMENT "\xc2\xbf"
217
218 struct cconv {
219 iconv_t cd;
220 char *obuf;
221 size_t osize;
222 size_t opos;
223 size_t consumed;
224 size_t errors;
225
226 // Temporary input buffer
227 char tbuf[16];
228 size_t tcount;
229
230 // REPLACEMENT character, in target encoding
231 char rbuf[4];
232 size_t rcount;
233
234 // Input character size in bytes, or zero for UTF-8
235 size_t char_size;
236 };
237
238 1 static struct cconv *create(iconv_t cd)
239 {
240 1 struct cconv *c = xnew0(struct cconv, 1);
241 1 c->cd = cd;
242 1 c->osize = 8192;
243 1 c->obuf = xmalloc(c->osize);
244 1 return c;
245 }
246
247 2 static size_t iconv_wrapper (
248 iconv_t cd,
249 const char **restrict inbuf,
250 size_t *restrict inbytesleft,
251 char **restrict outbuf,
252 size_t *restrict outbytesleft
253 ) {
254 // POSIX defines the second parameter of iconv(3) as "char **restrict"
255 // but NetBSD declares it as "const char **restrict"
256 #ifdef __NetBSD__
257 const char **restrict in = inbuf;
258 #else
259 2 char **restrict in = (char **restrict)inbuf;
260 #endif
261
262 2 return iconv(cd, in, inbytesleft, outbuf, outbytesleft);
263 }
264
265 static void resize_obuf(struct cconv *c)
266 {
267 c->osize = xmul(2, c->osize);
268 c->obuf = xrealloc(c->obuf, c->osize);
269 }
270
271 static void add_replacement(struct cconv *c)
272 {
273 if (c->osize - c->opos < 4) {
274 resize_obuf(c);
275 }
276
277 memcpy(c->obuf + c->opos, c->rbuf, c->rcount);
278 c->opos += c->rcount;
279 }
280
281 static size_t handle_invalid(struct cconv *c, const char *buf, size_t count)
282 {
283 LOG_DEBUG("%zu %zu", c->char_size, count);
284 add_replacement(c);
285 if (c->char_size == 0) {
286 // Converting from UTF-8
287 size_t idx = 0;
288 CodePoint u = u_get_char(buf, count, &idx);
289 LOG_DEBUG("U+%04" PRIX32, u);
290 return idx;
291 }
292 if (c->char_size > count) {
293 // wtf
294 return 1;
295 }
296 return c->char_size;
297 }
298
299 1 static int xiconv(struct cconv *c, const char **ib, size_t *ic)
300 {
301 1 while (1) {
302 1 char *ob = c->obuf + c->opos;
303 1 size_t oc = c->osize - c->opos;
304 1 size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc);
305 1 c->opos = ob - c->obuf;
306
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (rc == (size_t)-1) {
307 switch (errno) {
308 case EILSEQ:
309 c->errors++;
310 // Reset
311 iconv(c->cd, NULL, NULL, NULL, NULL);
312 return errno;
313 case EINVAL:
314 return errno;
315 case E2BIG:
316 resize_obuf(c);
317 continue;
318 default:
319 BUG("iconv: %s", strerror(errno));
320 }
321 } else {
322 1 c->errors += rc;
323 }
324 1 return 0;
325 }
326 }
327
328 static size_t convert_incomplete(struct cconv *c, const char *input, size_t len)
329 {
330 size_t ipos = 0;
331 while (c->tcount < sizeof(c->tbuf) && ipos < len) {
332 c->tbuf[c->tcount++] = input[ipos++];
333 const char *ib = c->tbuf;
334 size_t ic = c->tcount;
335 int rc = xiconv(c, &ib, &ic);
336 if (ic > 0) {
337 memmove(c->tbuf, ib, ic);
338 }
339 c->tcount = ic;
340 if (rc == EINVAL) {
341 // Incomplete character at end of input buffer; try again
342 // with more input data
343 continue;
344 }
345 if (rc == EILSEQ) {
346 // Invalid multibyte sequence
347 size_t skip = handle_invalid(c, c->tbuf, c->tcount);
348 c->tcount -= skip;
349 if (c->tcount > 0) {
350 LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip);
351 memmove(c->tbuf, c->tbuf + skip, c->tcount);
352 continue;
353 }
354 return ipos;
355 }
356 break;
357 }
358
359 LOG_DEBUG("%zu %zu", ipos, c->tcount);
360 return ipos;
361 }
362
363 1 static void cconv_process(struct cconv *c, const char *input, size_t len)
364 {
365
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (c->consumed > 0) {
366 size_t fill = c->opos - c->consumed;
367 memmove(c->obuf, c->obuf + c->consumed, fill);
368 c->opos = fill;
369 c->consumed = 0;
370 }
371
372
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (c->tcount > 0) {
373 size_t ipos = convert_incomplete(c, input, len);
374 input += ipos;
375 len -= ipos;
376 }
377
378 1 const char *ib = input;
379
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
2 for (size_t ic = len; ic > 0; ) {
380 1 int r = xiconv(c, &ib, &ic);
381
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (r == EINVAL) {
382 // Incomplete character at end of input buffer
383 if (ic < sizeof(c->tbuf)) {
384 memcpy(c->tbuf, ib, ic);
385 c->tcount = ic;
386 } else {
387 // FIXME
388 }
389 ic = 0;
390 continue;
391 }
392
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (r == EILSEQ) {
393 // Invalid multibyte sequence
394 size_t skip = handle_invalid(c, ib, ic);
395 ic -= skip;
396 ib += skip;
397 continue;
398 }
399 }
400 1 }
401
402 static struct cconv *cconv_to_utf8(const char *encoding)
403 {
404 iconv_t cd = iconv_open("UTF-8", encoding);
405 if (cd == (iconv_t)-1) {
406 return NULL;
407 }
408
409 struct cconv *c = create(cd);
410 c->rcount = copyliteral(c->rbuf, REPLACEMENT);
411
412 if (str_has_prefix(encoding, "UTF-16")) {
413 c->char_size = 2;
414 } else if (str_has_prefix(encoding, "UTF-32")) {
415 c->char_size = 4;
416 } else {
417 c->char_size = 1;
418 }
419
420 return c;
421 }
422
423 1 static void encode_replacement(struct cconv *c)
424 {
425 1 static const unsigned char rep[] = REPLACEMENT;
426 1 const char *ib = rep;
427 1 char *ob = c->rbuf;
428 1 size_t ic = STRLEN(REPLACEMENT);
429 1 size_t oc = sizeof(c->rbuf);
430 1 size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc);
431
432
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (rc == (size_t)-1) {
433 1 c->rbuf[0] = '\xbf';
434 1 c->rcount = 1;
435 } else {
436 c->rcount = ob - c->rbuf;
437 }
438 1 }
439
440 1 static struct cconv *cconv_from_utf8(const char *encoding)
441 {
442 1 iconv_t cd = iconv_open(encoding, "UTF-8");
443
1/2
✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
1 if (cd == (iconv_t)-1) {
444 return NULL;
445 }
446 1 struct cconv *c = create(cd);
447 1 encode_replacement(c);
448 1 return c;
449 }
450
451 1 static void cconv_flush(struct cconv *c)
452 {
453
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (c->tcount > 0) {
454 // Replace incomplete character at end of input buffer
455 LOG_DEBUG("incomplete character at EOF");
456 add_replacement(c);
457 c->tcount = 0;
458 }
459 1 }
460
461 static char *cconv_consume_line(struct cconv *c, size_t *len)
462 {
463 char *line = c->obuf + c->consumed;
464 char *nl = memchr(line, '\n', c->opos - c->consumed);
465 if (!nl) {
466 *len = 0;
467 return NULL;
468 }
469
470 size_t n = nl - line + 1;
471 c->consumed += n;
472 *len = n;
473 return line;
474 }
475
476 1 static char *cconv_consume_all(struct cconv *c, size_t *len)
477 {
478 1 char *buf = c->obuf + c->consumed;
479 1 *len = c->opos - c->consumed;
480 1 c->consumed = c->opos;
481 1 return buf;
482 }
483
484 1 static void cconv_free(struct cconv *c)
485 {
486 1 BUG_ON(!c);
487 1 iconv_close(c->cd);
488 1 free(c->obuf);
489 1 free(c);
490 1 }
491
492 2 bool conversion_supported_by_iconv(const char *from, const char *to)
493 {
494
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 2 times.
2 if (unlikely(from[0] == '\0' || to[0] == '\0')) {
495 errno = EINVAL;
496 return false;
497 }
498
499 2 iconv_t cd = iconv_open(to, from);
500
1/2
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
2 if (cd == (iconv_t)-1) {
501 return false;
502 }
503
504 2 iconv_close(cd);
505 2 return true;
506 }
507
508 16 FileEncoder file_encoder(const char *encoding, bool crlf, int fd)
509 {
510 16 struct cconv *cconv = NULL;
511
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 15 times.
16 if (unlikely(!encoding_is_utf8(encoding))) {
512 1 cconv = cconv_from_utf8(encoding);
513
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
1 if (!cconv) {
514 BUG("unsupported conversion; should have been handled earlier");
515 }
516 }
517
518 16 return (FileEncoder) {
519 .cconv = cconv,
520 .crlf = crlf,
521 .fd = fd,
522 };
523 }
524
525 16 void file_encoder_free(FileEncoder *enc)
526 {
527
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 15 times.
16 if (enc->cconv) {
528 1 cconv_free(enc->cconv);
529 }
530 16 free(enc->nbuf);
531 16 }
532
533 // NOTE: buf must contain whole characters!
534 16 ssize_t file_encoder_write (
535 FileEncoder *enc,
536 const unsigned char *buf,
537 size_t size
538 ) {
539
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 15 times.
16 if (unlikely(enc->crlf)) {
540 1 size = unix_to_dos(enc, buf, size);
541 1 buf = enc->nbuf;
542 }
543
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 15 times.
16 if (unlikely(enc->cconv)) {
544 1 cconv_process(enc->cconv, buf, size);
545 1 cconv_flush(enc->cconv);
546 1 buf = cconv_consume_all(enc->cconv, &size);
547 }
548 16 return xwrite_all(enc->fd, buf, size);
549 }
550
551 16 size_t file_encoder_get_nr_errors(const FileEncoder *enc)
552 {
553
2/2
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 15 times.
16 return enc->cconv ? enc->cconv->errors : 0;
554 }
555
556 static bool fill(FileDecoder *dec)
557 {
558 if (dec->ipos == dec->isize) {
559 return false;
560 }
561
562 // Smaller than cconv.obuf to make realloc less likely
563 size_t max = 7 * 1024;
564
565 size_t icount = MIN(dec->isize - dec->ipos, max);
566 cconv_process(dec->cconv, dec->ibuf + dec->ipos, icount);
567 dec->ipos += icount;
568 if (dec->ipos == dec->isize) {
569 // Must be flushed after all input has been fed
570 cconv_flush(dec->cconv);
571 }
572 return true;
573 }
574
575 static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp)
576 {
577 char *line;
578 size_t len;
579 while (1) {
580 line = cconv_consume_line(dec->cconv, &len);
581 if (line || !fill(dec)) {
582 break;
583 }
584 }
585
586 if (line) {
587 // Newline not wanted
588 len--;
589 } else {
590 line = cconv_consume_all(dec->cconv, &len);
591 if (len == 0) {
592 return false;
593 }
594 }
595
596 *linep = line;
597 *lenp = len;
598 return true;
599 }
600
601 30 bool file_decoder_read(Buffer *buffer, const unsigned char *buf, size_t size)
602 {
603
1/2
✓ Branch 0 taken 30 times.
✗ Branch 1 not taken.
30 if (encoding_is_utf8(buffer->encoding)) {
604 30 return file_decoder_read_utf8(buffer, buf, size);
605 }
606
607 struct cconv *cconv = cconv_to_utf8(buffer->encoding);
608 if (!cconv) {
609 return false;
610 }
611
612 FileDecoder dec = {
613 .ibuf = buf,
614 .isize = size,
615 .cconv = cconv,
616 };
617
618 const char *line;
619 size_t len;
620
621 if (decode_and_read_line(&dec, &line, &len)) {
622 if (len && line[len - 1] == '\r') {
623 buffer->crlf_newlines = true;
624 len--;
625 }
626 Block *blk = add_utf8_line(buffer, NULL, line, len);
627 while (decode_and_read_line(&dec, &line, &len)) {
628 if (buffer->crlf_newlines && len && line[len - 1] == '\r') {
629 len--;
630 }
631 blk = add_utf8_line(buffer, blk, line, len);
632 }
633 if (blk) {
634 add_block(buffer, blk);
635 }
636 }
637
638 cconv_free(cconv);
639 return true;
640 }
641
642 #endif
643