Line | Branch | Exec | Source |
---|---|---|---|
1 | #include <errno.h> | ||
2 | #include <inttypes.h> | ||
3 | #include <stdlib.h> | ||
4 | #include <string.h> | ||
5 | #include "convert.h" | ||
6 | #include "block.h" | ||
7 | #include "buildvar-iconv.h" | ||
8 | #include "encoding.h" | ||
9 | #include "util/debug.h" | ||
10 | #include "util/intern.h" | ||
11 | #include "util/list.h" | ||
12 | #include "util/log.h" | ||
13 | #include "util/str-util.h" | ||
14 | #include "util/utf8.h" | ||
15 | #include "util/xmalloc.h" | ||
16 | #include "util/xreadwrite.h" | ||
17 | |||
// Limits applied while loading file contents
enum {
    // If any line exceeds this length when reading a file, syntax
    // highlighting will be automatically disabled
    SYN_HIGHLIGHT_MAX_LINE_LEN = 512u << 10, // 512KiB
};
23 | |||
// Read cursor over an in-memory input buffer, plus optional iconv state
typedef struct {
    const unsigned char *ibuf; // Input bytes being decoded
    ssize_t ipos; // Offset of the next unread byte in ibuf
    ssize_t isize; // Total number of bytes in ibuf
    struct cconv *cconv; // Converter state; not set by the UTF-8 reader
} FileDecoder;
30 | |||
31 | 56 | static void add_block(Buffer *buffer, Block *blk) | |
32 | { | ||
33 | 56 | buffer->nl += blk->nl; | |
34 | 56 | list_insert_before(&blk->node, &buffer->blocks); | |
35 | 56 | } | |
36 | |||
37 | 7316 | static Block *add_utf8_line ( | |
38 | Buffer *buffer, | ||
39 | Block *blk, | ||
40 | const unsigned char *line, | ||
41 | size_t len | ||
42 | ) { | ||
43 | 7316 | size_t size = len + 1; | |
44 |
2/2✓ Branch 0 (2→3) taken 7288 times.
✓ Branch 1 (2→6) taken 28 times.
|
7316 | if (blk) { |
45 | 7288 | size_t avail = blk->alloc - blk->size; | |
46 |
2/2✓ Branch 0 (3→4) taken 7260 times.
✓ Branch 1 (3→5) taken 28 times.
|
7288 | if (size <= avail) { |
47 | 7260 | goto copy; | |
48 | } | ||
49 | 28 | add_block(buffer, blk); | |
50 | } | ||
51 | 56 | size = MAX(size, 8192); | |
52 | 56 | blk = block_new(size); | |
53 | |||
54 | 7316 | copy: | |
55 |
1/4✗ Branch 0 (7→8) not taken.
✓ Branch 1 (7→11) taken 7316 times.
✗ Branch 2 (8→9) not taken.
✗ Branch 3 (8→11) not taken.
|
7316 | if (unlikely(len > SYN_HIGHLIGHT_MAX_LINE_LEN && buffer->options.syntax)) { |
56 | // TODO: Make the limit configurable and add documentation | ||
57 | // TODO: Pass in an ErrorBuffer* and use error_msg() instead of LOG_NOTICE() | ||
58 | ✗ | LOG_NOTICE ( | |
59 | "line length (%zu) exceeded limit (%ju); disabling syntax highlighting", | ||
60 | len, (uintmax_t)SYN_HIGHLIGHT_MAX_LINE_LEN | ||
61 | ); | ||
62 | ✗ | buffer->options.syntax = false; | |
63 | } | ||
64 | |||
65 | 7316 | memcpy(blk->data + blk->size, line, len); | |
66 | 7316 | blk->size += len; | |
67 | 7316 | blk->data[blk->size++] = '\n'; | |
68 | 7316 | blk->nl++; | |
69 | 7316 | return blk; | |
70 | } | ||
71 | |||
72 | 7347 | static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp) | |
73 | { | ||
74 | 7347 | const char *line = dec->ibuf + dec->ipos; | |
75 | 7347 | const char *nl = memchr(line, '\n', dec->isize - dec->ipos); | |
76 | 7347 | size_t len; | |
77 | |||
78 |
2/2✓ Branch 0 (2→3) taken 7314 times.
✓ Branch 1 (2→4) taken 33 times.
|
7347 | if (nl) { |
79 | 7314 | len = nl - line; | |
80 | 7314 | dec->ipos += len + 1; | |
81 | } else { | ||
82 | 33 | len = dec->isize - dec->ipos; | |
83 |
2/2✓ Branch 0 (4→5) taken 2 times.
✓ Branch 1 (4→7) taken 31 times.
|
33 | if (len == 0) { |
84 | return false; | ||
85 | } | ||
86 | 2 | dec->ipos += len; | |
87 | } | ||
88 | |||
89 | 7316 | *linep = line; | |
90 | 7316 | *lenp = len; | |
91 | 7316 | return true; | |
92 | } | ||
93 | |||
94 | 31 | static bool file_decoder_read_utf8(Buffer *buffer, const unsigned char *buf, size_t size) | |
95 | { | ||
96 |
1/2✗ Branch 0 (3→4) not taken.
✓ Branch 1 (3→5) taken 31 times.
|
31 | if (unlikely(!encoding_is_utf8(buffer->encoding))) { |
97 | ✗ | errno = EINVAL; | |
98 | ✗ | return false; | |
99 | } | ||
100 | |||
101 | 31 | FileDecoder dec = { | |
102 | .ibuf = buf, | ||
103 | .isize = size, | ||
104 | }; | ||
105 | |||
106 | 31 | const char *line; | |
107 | 31 | size_t len; | |
108 | |||
109 |
2/2✓ Branch 0 (6→7) taken 28 times.
✓ Branch 1 (6→23) taken 3 times.
|
31 | if (!read_utf8_line(&dec, &line, &len)) { |
110 | return true; | ||
111 | } | ||
112 | |||
113 |
3/4✓ Branch 0 (7→8) taken 28 times.
✗ Branch 1 (7→10) not taken.
✓ Branch 2 (8→9) taken 1 times.
✓ Branch 3 (8→10) taken 27 times.
|
28 | if (len && line[len - 1] == '\r') { |
114 | 1 | buffer->crlf_newlines = true; | |
115 | 1 | len--; | |
116 | } | ||
117 | |||
118 | 28 | Block *blk = add_utf8_line(buffer, NULL, line, len); | |
119 | |||
120 |
2/2✓ Branch 0 (11→16) taken 1 times.
✓ Branch 1 (11→19) taken 27 times.
|
28 | if (unlikely(buffer->crlf_newlines)) { |
121 |
2/2✓ Branch 0 (17→12) taken 270 times.
✓ Branch 1 (17→21) taken 1 times.
|
271 | while (read_utf8_line(&dec, &line, &len)) { |
122 |
4/4✓ Branch 0 (12→13) taken 268 times.
✓ Branch 1 (12→15) taken 2 times.
✓ Branch 2 (13→14) taken 1 times.
✓ Branch 3 (13→15) taken 267 times.
|
270 | if (len && line[len - 1] == '\r') { |
123 | 1 | len--; | |
124 | } | ||
125 | 270 | blk = add_utf8_line(buffer, blk, line, len); | |
126 | } | ||
127 | } else { | ||
128 |
2/2✓ Branch 0 (20→18) taken 7018 times.
✓ Branch 1 (20→21) taken 27 times.
|
7045 | while (read_utf8_line(&dec, &line, &len)) { |
129 | 7018 | blk = add_utf8_line(buffer, blk, line, len); | |
130 | } | ||
131 | } | ||
132 | |||
133 |
1/2✓ Branch 0 (21→22) taken 28 times.
✗ Branch 1 (21→23) not taken.
|
28 | if (blk) { |
134 | 28 | add_block(buffer, blk); | |
135 | } | ||
136 | |||
137 | return true; | ||
138 | } | ||
139 | |||
140 | 1 | static size_t unix_to_dos ( | |
141 | FileEncoder *enc, | ||
142 | const unsigned char *buf, | ||
143 | size_t size | ||
144 | ) { | ||
145 | // TODO: Pass in Buffer::nl and make this size adjustment more conservative | ||
146 | // (it's resized to handle the worst possible case, despite the fact that we | ||
147 | // already have the number of newlines pre-computed) | ||
148 |
1/2✓ Branch 0 (2→3) taken 1 times.
✗ Branch 1 (2→8) not taken.
|
1 | if (enc->nsize < size * 2) { |
149 | 1 | enc->nsize = size * 2; | |
150 | 1 | enc->nbuf = xrealloc(enc->nbuf, enc->nsize); | |
151 | } | ||
152 | |||
153 | // TODO: Optimize this loop, by making use of memccpy(3) | ||
154 | size_t d = 0; | ||
155 |
2/2✓ Branch 0 (9→5) taken 21 times.
✓ Branch 1 (9→10) taken 1 times.
|
22 | for (size_t s = 0; s < size; s++) { |
156 | 21 | unsigned char ch = buf[s]; | |
157 |
2/2✓ Branch 0 (5→6) taken 3 times.
✓ Branch 1 (5→7) taken 18 times.
|
21 | if (ch == '\n') { |
158 | 3 | enc->nbuf[d++] = '\r'; | |
159 | } | ||
160 | 21 | enc->nbuf[d++] = ch; | |
161 | } | ||
162 | |||
163 | 1 | return d; | |
164 | } | ||
165 | |||
166 | #if ICONV_DISABLE == 1 // iconv not available; use basic, UTF-8 implementation: | ||
167 | |||
168 | bool conversion_supported_by_iconv ( | ||
169 | const char* UNUSED_ARG(from), | ||
170 | const char* UNUSED_ARG(to) | ||
171 | ) { | ||
172 | errno = EINVAL; | ||
173 | return false; | ||
174 | } | ||
175 | |||
176 | FileEncoder file_encoder(const char *encoding, bool crlf, int fd) | ||
177 | { | ||
178 | if (unlikely(!encoding_is_utf8(encoding))) { | ||
179 | BUG("unsupported conversion; should have been handled earlier"); | ||
180 | } | ||
181 | |||
182 | return (FileEncoder) { | ||
183 | .crlf = crlf, | ||
184 | .fd = fd, | ||
185 | }; | ||
186 | } | ||
187 | |||
188 | void file_encoder_free(FileEncoder *enc) | ||
189 | { | ||
190 | free(enc->nbuf); | ||
191 | } | ||
192 | |||
193 | ssize_t file_encoder_write(FileEncoder *enc, const unsigned char *buf, size_t n) | ||
194 | { | ||
195 | if (unlikely(enc->crlf)) { | ||
196 | n = unix_to_dos(enc, buf, n); | ||
197 | buf = enc->nbuf; | ||
198 | } | ||
199 | return xwrite_all(enc->fd, buf, n); | ||
200 | } | ||
201 | |||
202 | size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc)) | ||
203 | { | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | bool file_decoder_read(Buffer *buffer, const unsigned char *buf, size_t size) | ||
208 | { | ||
209 | return file_decoder_read_utf8(buffer, buf, size); | ||
210 | } | ||
211 | |||
212 | #else // ICONV_DISABLE != 1; use full iconv implementation: | ||
213 | |||
214 | #include <iconv.h> | ||
215 | |||
216 | // UTF-8 encoding of U+00BF (inverted question mark; "¿") | ||
217 | #define REPLACEMENT "\xc2\xbf" | ||
218 | |||
// State for one iconv-based character set conversion
struct cconv {
    iconv_t cd; // iconv conversion descriptor
    char *obuf; // Output buffer (grown by resize_obuf())
    size_t osize; // Allocated size of obuf
    size_t opos; // Number of bytes of converted output currently in obuf
    size_t consumed; // Offset in obuf up to which output has been handed out
    size_t errors; // Running count of conversion errors

    // Temporary input buffer
    // (holds an incomplete trailing character until more input arrives)
    char tbuf[16];
    size_t tcount;

    // REPLACEMENT character, in target encoding
    char rbuf[4];
    size_t rcount;

    // Input character size in bytes, or zero for UTF-8
    size_t char_size;
};
238 | |||
239 | 1 | static struct cconv *create(iconv_t cd) | |
240 | { | ||
241 | 1 | struct cconv *c = xnew0(struct cconv, 1); | |
242 | 1 | c->cd = cd; | |
243 | 1 | c->osize = 8192; | |
244 | 1 | c->obuf = xmalloc(c->osize); | |
245 | 1 | return c; | |
246 | } | ||
247 | |||
// Thin wrapper around iconv(3) that accepts a `const char**` input
// pointer on every platform.
//
// POSIX defines the second parameter of iconv(3) as "char **restrict"
// but NetBSD declares it as "const char **restrict", so the cast is
// only applied where it's needed.
static size_t iconv_wrapper (
    iconv_t cd,
    const char **restrict inbuf,
    size_t *restrict inbytesleft,
    char **restrict outbuf,
    size_t *restrict outbytesleft
) {
#ifdef __NetBSD__
    return iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft);
#else
    return iconv(cd, (char**)inbuf, inbytesleft, outbuf, outbytesleft);
#endif
}
265 | |||
266 | ✗ | static void resize_obuf(struct cconv *c) | |
267 | { | ||
268 | ✗ | c->osize = xmul(2, c->osize); | |
269 | ✗ | c->obuf = xrealloc(c->obuf, c->osize); | |
270 | ✗ | } | |
271 | |||
272 | ✗ | static void add_replacement(struct cconv *c) | |
273 | { | ||
274 | ✗ | if (c->osize - c->opos < 4) { | |
275 | ✗ | resize_obuf(c); | |
276 | } | ||
277 | |||
278 | ✗ | memcpy(c->obuf + c->opos, c->rbuf, c->rcount); | |
279 | ✗ | c->opos += c->rcount; | |
280 | ✗ | } | |
281 | |||
282 | ✗ | static size_t handle_invalid(struct cconv *c, const char *buf, size_t count) | |
283 | { | ||
284 | ✗ | LOG_DEBUG("%zu %zu", c->char_size, count); | |
285 | ✗ | add_replacement(c); | |
286 | ✗ | if (c->char_size == 0) { | |
287 | // Converting from UTF-8 | ||
288 | ✗ | size_t idx = 0; | |
289 | ✗ | CodePoint u = u_get_char(buf, count, &idx); | |
290 | ✗ | LOG_DEBUG("U+%04" PRIX32, u); | |
291 | ✗ | return idx; | |
292 | } | ||
293 | ✗ | if (c->char_size > count) { | |
294 | // wtf | ||
295 | ✗ | return 1; | |
296 | } | ||
297 | return c->char_size; | ||
298 | } | ||
299 | |||
300 | 1 | static int xiconv(struct cconv *c, const char **ib, size_t *ic) | |
301 | { | ||
302 | 1 | while (1) { | |
303 | 1 | char *ob = c->obuf + c->opos; | |
304 | 1 | size_t oc = c->osize - c->opos; | |
305 | 1 | size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc); | |
306 | 1 | c->opos = ob - c->obuf; | |
307 |
1/2✗ Branch 0 (4→5) not taken.
✓ Branch 1 (4→12) taken 1 times.
|
1 | if (rc == (size_t)-1) { |
308 | ✗ | switch (errno) { | |
309 | ✗ | case EILSEQ: | |
310 | ✗ | c->errors++; | |
311 | // Reset | ||
312 | ✗ | iconv(c->cd, NULL, NULL, NULL, NULL); | |
313 | ✗ | return errno; | |
314 | case EINVAL: | ||
315 | return errno; | ||
316 | ✗ | case E2BIG: | |
317 | ✗ | resize_obuf(c); | |
318 | ✗ | continue; | |
319 | ✗ | default: | |
320 | − | BUG("iconv: %s", strerror(errno)); | |
321 | } | ||
322 | } else { | ||
323 | 1 | c->errors += rc; | |
324 | } | ||
325 | 1 | return 0; | |
326 | } | ||
327 | } | ||
328 | |||
// Try to finish converting a character whose first bytes were left in
// c->tbuf by an earlier call. Bytes are moved from `input` into c->tbuf
// one at a time and the conversion is re-attempted after each one.
// Returns the number of bytes taken from `input`.
static size_t convert_incomplete(struct cconv *c, const char *input, size_t len)
{
    size_t ipos = 0;
    while (c->tcount < sizeof(c->tbuf) && ipos < len) {
        c->tbuf[c->tcount++] = input[ipos++];
        const char *ib = c->tbuf;
        size_t ic = c->tcount;
        int rc = xiconv(c, &ib, &ic);
        // Keep only the bytes that xiconv() left unconverted
        if (ic > 0) {
            memmove(c->tbuf, ib, ic);
        }
        c->tcount = ic;
        if (rc == EINVAL) {
            // Incomplete character at end of input buffer; try again
            // with more input data
            continue;
        }
        if (rc == EILSEQ) {
            // Invalid multibyte sequence
            size_t skip = handle_invalid(c, c->tbuf, c->tcount);
            c->tcount -= skip;
            if (c->tcount > 0) {
                // Bytes remain after the skipped ones; shift them to the
                // front of tbuf and keep going
                LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip);
                memmove(c->tbuf, c->tbuf + skip, c->tcount);
                continue;
            }
            return ipos;
        }
        // Conversion succeeded
        break;
    }

    LOG_DEBUG("%zu %zu", ipos, c->tcount);
    return ipos;
}
363 | |||
364 | 1 | static void cconv_process(struct cconv *c, const char *input, size_t len) | |
365 | { | ||
366 |
1/2✗ Branch 0 (2→3) not taken.
✓ Branch 1 (2→4) taken 1 times.
|
1 | if (c->consumed > 0) { |
367 | ✗ | size_t fill = c->opos - c->consumed; | |
368 | ✗ | memmove(c->obuf, c->obuf + c->consumed, fill); | |
369 | ✗ | c->opos = fill; | |
370 | ✗ | c->consumed = 0; | |
371 | } | ||
372 | |||
373 |
1/2✗ Branch 0 (4→5) not taken.
✓ Branch 1 (4→7) taken 1 times.
|
1 | if (c->tcount > 0) { |
374 | ✗ | size_t ipos = convert_incomplete(c, input, len); | |
375 | ✗ | input += ipos; | |
376 | ✗ | len -= ipos; | |
377 | } | ||
378 | |||
379 | 1 | const char *ib = input; | |
380 |
2/2✓ Branch 0 (17→8) taken 1 times.
✓ Branch 1 (17→18) taken 1 times.
|
2 | for (size_t ic = len; ic > 0; ) { |
381 | 1 | int r = xiconv(c, &ib, &ic); | |
382 |
1/2✗ Branch 0 (9→10) not taken.
✓ Branch 1 (9→13) taken 1 times.
|
1 | if (r == EINVAL) { |
383 | // Incomplete character at end of input buffer | ||
384 | ✗ | if (ic < sizeof(c->tbuf)) { | |
385 | ✗ | memcpy(c->tbuf, ib, ic); | |
386 | ✗ | c->tcount = ic; | |
387 | } else { | ||
388 | // FIXME | ||
389 | ✗ | } | |
390 | ✗ | ic = 0; | |
391 | ✗ | continue; | |
392 | } | ||
393 |
1/2✗ Branch 0 (13→14) not taken.
✓ Branch 1 (13→16) taken 1 times.
|
1 | if (r == EILSEQ) { |
394 | // Invalid multibyte sequence | ||
395 | ✗ | size_t skip = handle_invalid(c, ib, ic); | |
396 | ✗ | ic -= skip; | |
397 | ✗ | ib += skip; | |
398 | ✗ | continue; | |
399 | } | ||
400 | } | ||
401 | 1 | } | |
402 | |||
403 | ✗ | static struct cconv *cconv_to_utf8(const char *encoding) | |
404 | { | ||
405 | ✗ | iconv_t cd = iconv_open("UTF-8", encoding); | |
406 | ✗ | if (cd == (iconv_t)-1) { | |
407 | return NULL; | ||
408 | } | ||
409 | |||
410 | ✗ | struct cconv *c = create(cd); | |
411 | ✗ | c->rcount = copyliteral(c->rbuf, REPLACEMENT); | |
412 | |||
413 | ✗ | if (str_has_prefix(encoding, "UTF-16")) { | |
414 | ✗ | c->char_size = 2; | |
415 | ✗ | } else if (str_has_prefix(encoding, "UTF-32")) { | |
416 | ✗ | c->char_size = 4; | |
417 | } else { | ||
418 | ✗ | c->char_size = 1; | |
419 | } | ||
420 | |||
421 | return c; | ||
422 | } | ||
423 | |||
424 | 1 | static void encode_replacement(struct cconv *c) | |
425 | { | ||
426 | 1 | static const unsigned char rep[] = REPLACEMENT; | |
427 | 1 | const char *ib = rep; | |
428 | 1 | char *ob = c->rbuf; | |
429 | 1 | size_t ic = STRLEN(REPLACEMENT); | |
430 | 1 | size_t oc = sizeof(c->rbuf); | |
431 | 1 | size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc); | |
432 | |||
433 |
1/2✓ Branch 0 (3→4) taken 1 times.
✗ Branch 1 (3→5) not taken.
|
1 | if (rc == (size_t)-1) { |
434 | 1 | c->rbuf[0] = '\xbf'; | |
435 | 1 | c->rcount = 1; | |
436 | } else { | ||
437 | ✗ | c->rcount = ob - c->rbuf; | |
438 | } | ||
439 | 1 | } | |
440 | |||
// Create a converter from UTF-8 to `encoding`, or return NULL if
// iconv_open(3) doesn't support the conversion
static struct cconv *cconv_from_utf8(const char *encoding)
{
    iconv_t cd = iconv_open(encoding, "UTF-8");
    const bool supported = (cd != (iconv_t)-1);
    if (!supported) {
        return NULL;
    }

    struct cconv *conv = create(cd);
    // Pre-compute the replacement character in the target encoding
    encode_replacement(conv);
    return conv;
}
451 | |||
452 | 1 | static void cconv_flush(struct cconv *c) | |
453 | { | ||
454 |
1/2✗ Branch 0 (2→3) not taken.
✓ Branch 1 (2→6) taken 1 times.
|
1 | if (c->tcount > 0) { |
455 | // Replace incomplete character at end of input buffer | ||
456 | ✗ | LOG_DEBUG("incomplete character at EOF"); | |
457 | ✗ | add_replacement(c); | |
458 | ✗ | c->tcount = 0; | |
459 | } | ||
460 | 1 | } | |
461 | |||
462 | ✗ | static char *cconv_consume_line(struct cconv *c, size_t *len) | |
463 | { | ||
464 | ✗ | char *line = c->obuf + c->consumed; | |
465 | ✗ | char *nl = memchr(line, '\n', c->opos - c->consumed); | |
466 | ✗ | if (!nl) { | |
467 | ✗ | *len = 0; | |
468 | ✗ | return NULL; | |
469 | } | ||
470 | |||
471 | ✗ | size_t n = nl - line + 1; | |
472 | ✗ | c->consumed += n; | |
473 | ✗ | *len = n; | |
474 | ✗ | return line; | |
475 | } | ||
476 | |||
477 | 1 | static char *cconv_consume_all(struct cconv *c, size_t *len) | |
478 | { | ||
479 | 1 | char *buf = c->obuf + c->consumed; | |
480 | 1 | *len = c->opos - c->consumed; | |
481 | 1 | c->consumed = c->opos; | |
482 | 1 | return buf; | |
483 | } | ||
484 | |||
485 | 1 | static void cconv_free(struct cconv *c) | |
486 | { | ||
487 | 1 | BUG_ON(!c); | |
488 | 1 | iconv_close(c->cd); | |
489 | 1 | free(c->obuf); | |
490 | 1 | free(c); | |
491 | 1 | } | |
492 | |||
// Check whether iconv can convert from encoding `from` to encoding `to`,
// by opening (and immediately closing) a conversion descriptor.
// Returns false if either name is empty (errno is set to EINVAL) or if
// iconv_open(3) fails.
bool conversion_supported_by_iconv(const char *from, const char *to)
{
    const bool empty_name = (from[0] == '\0' || to[0] == '\0');
    if (unlikely(empty_name)) {
        errno = EINVAL;
        return false;
    }

    iconv_t cd = iconv_open(to, from);
    if (cd != (iconv_t)-1) {
        iconv_close(cd);
        return true;
    }

    return false;
}
508 | |||
509 | 21 | FileEncoder file_encoder(const char *encoding, bool crlf, int fd) | |
510 | { | ||
511 | 21 | struct cconv *cconv = NULL; | |
512 |
2/2✓ Branch 0 (3→4) taken 1 times.
✓ Branch 1 (3→7) taken 20 times.
|
21 | if (unlikely(!encoding_is_utf8(encoding))) { |
513 | 1 | cconv = cconv_from_utf8(encoding); | |
514 |
1/2✗ Branch 0 (5→6) not taken.
✓ Branch 1 (5→7) taken 1 times.
|
1 | if (!cconv) { |
515 | − | BUG("unsupported conversion; should have been handled earlier"); | |
516 | } | ||
517 | } | ||
518 | |||
519 | 21 | return (FileEncoder) { | |
520 | .cconv = cconv, | ||
521 | .crlf = crlf, | ||
522 | .fd = fd, | ||
523 | }; | ||
524 | } | ||
525 | |||
526 | 21 | void file_encoder_free(FileEncoder *enc) | |
527 | { | ||
528 |
2/2✓ Branch 0 (2→3) taken 1 times.
✓ Branch 1 (2→4) taken 20 times.
|
21 | if (enc->cconv) { |
529 | 1 | cconv_free(enc->cconv); | |
530 | } | ||
531 | 21 | free(enc->nbuf); | |
532 | 21 | } | |
533 | |||
534 | // NOTE: buf must contain whole characters! | ||
535 | 21 | ssize_t file_encoder_write ( | |
536 | FileEncoder *enc, | ||
537 | const unsigned char *buf, | ||
538 | size_t size | ||
539 | ) { | ||
540 |
2/2✓ Branch 0 (2→3) taken 1 times.
✓ Branch 1 (2→5) taken 20 times.
|
21 | if (unlikely(enc->crlf)) { |
541 | 1 | size = unix_to_dos(enc, buf, size); | |
542 | 1 | buf = enc->nbuf; | |
543 | } | ||
544 |
2/2✓ Branch 0 (5→6) taken 1 times.
✓ Branch 1 (5→9) taken 20 times.
|
21 | if (unlikely(enc->cconv)) { |
545 | 1 | cconv_process(enc->cconv, buf, size); | |
546 | 1 | cconv_flush(enc->cconv); | |
547 | 1 | buf = cconv_consume_all(enc->cconv, &size); | |
548 | } | ||
549 | 21 | return xwrite_all(enc->fd, buf, size); | |
550 | } | ||
551 | |||
552 | 21 | size_t file_encoder_get_nr_errors(const FileEncoder *enc) | |
553 | { | ||
554 |
2/2✓ Branch 0 (2→3) taken 1 times.
✓ Branch 1 (2→4) taken 20 times.
|
21 | return enc->cconv ? enc->cconv->errors : 0; |
555 | } | ||
556 | |||
557 | ✗ | static bool fill(FileDecoder *dec) | |
558 | { | ||
559 | ✗ | if (dec->ipos == dec->isize) { | |
560 | return false; | ||
561 | } | ||
562 | |||
563 | // Smaller than cconv.obuf to make realloc less likely | ||
564 | ✗ | size_t max = 7 * 1024; | |
565 | |||
566 | ✗ | size_t icount = MIN(dec->isize - dec->ipos, max); | |
567 | ✗ | cconv_process(dec->cconv, dec->ibuf + dec->ipos, icount); | |
568 | ✗ | dec->ipos += icount; | |
569 | ✗ | if (dec->ipos == dec->isize) { | |
570 | // Must be flushed after all input has been fed | ||
571 | ✗ | cconv_flush(dec->cconv); | |
572 | } | ||
573 | return true; | ||
574 | } | ||
575 | |||
576 | ✗ | static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp) | |
577 | { | ||
578 | ✗ | char *line; | |
579 | ✗ | size_t len; | |
580 | ✗ | while (1) { | |
581 | ✗ | line = cconv_consume_line(dec->cconv, &len); | |
582 | ✗ | if (line || !fill(dec)) { | |
583 | break; | ||
584 | } | ||
585 | } | ||
586 | |||
587 | ✗ | if (line) { | |
588 | // Newline not wanted | ||
589 | ✗ | len--; | |
590 | } else { | ||
591 | ✗ | line = cconv_consume_all(dec->cconv, &len); | |
592 | ✗ | if (len == 0) { | |
593 | return false; | ||
594 | } | ||
595 | } | ||
596 | |||
597 | ✗ | *linep = line; | |
598 | ✗ | *lenp = len; | |
599 | ✗ | return true; | |
600 | } | ||
601 | |||
602 | 31 | bool file_decoder_read(Buffer *buffer, const unsigned char *buf, size_t size) | |
603 | { | ||
604 |
1/2✓ Branch 0 (3→4) taken 31 times.
✗ Branch 1 (3→5) not taken.
|
31 | if (encoding_is_utf8(buffer->encoding)) { |
605 | 31 | return file_decoder_read_utf8(buffer, buf, size); | |
606 | } | ||
607 | |||
608 | ✗ | struct cconv *cconv = cconv_to_utf8(buffer->encoding); | |
609 | ✗ | if (!cconv) { | |
610 | return false; | ||
611 | } | ||
612 | |||
613 | ✗ | FileDecoder dec = { | |
614 | .ibuf = buf, | ||
615 | .isize = size, | ||
616 | .cconv = cconv, | ||
617 | }; | ||
618 | |||
619 | ✗ | const char *line; | |
620 | ✗ | size_t len; | |
621 | |||
622 | ✗ | if (decode_and_read_line(&dec, &line, &len)) { | |
623 | ✗ | if (len && line[len - 1] == '\r') { | |
624 | ✗ | buffer->crlf_newlines = true; | |
625 | ✗ | len--; | |
626 | } | ||
627 | ✗ | Block *blk = add_utf8_line(buffer, NULL, line, len); | |
628 | ✗ | while (decode_and_read_line(&dec, &line, &len)) { | |
629 | ✗ | if (buffer->crlf_newlines && len && line[len - 1] == '\r') { | |
630 | ✗ | len--; | |
631 | } | ||
632 | ✗ | blk = add_utf8_line(buffer, blk, line, len); | |
633 | } | ||
634 | ✗ | if (blk) { | |
635 | ✗ | add_block(buffer, blk); | |
636 | } | ||
637 | } | ||
638 | |||
639 | ✗ | cconv_free(cconv); | |
640 | ✗ | return true; | |
641 | } | ||
642 | |||
643 | #endif | ||
644 |