Line | Branch | Exec | Source |
---|---|---|---|
1 | #include <errno.h> | ||
2 | #include <inttypes.h> | ||
3 | #include <stdlib.h> | ||
4 | #include <string.h> | ||
5 | #include "convert.h" | ||
6 | #include "block.h" | ||
7 | #include "buildvar-iconv.h" | ||
8 | #include "encoding.h" | ||
9 | #include "error.h" | ||
10 | #include "util/debug.h" | ||
11 | #include "util/intern.h" | ||
12 | #include "util/list.h" | ||
13 | #include "util/log.h" | ||
14 | #include "util/str-util.h" | ||
15 | #include "util/utf8.h" | ||
16 | #include "util/xmalloc.h" | ||
17 | #include "util/xreadwrite.h" | ||
18 | |||
enum {
    // Longest line (in bytes) that may be loaded while keeping syntax
    // highlighting enabled; reading a longer line switches highlighting
    // off for the buffer
    SYN_HIGHLIGHT_MAX_LINE_LEN = 512u * 1024u, // 512KiB
};
24 | |||
// Cursor over a raw input buffer that is being split into lines
typedef struct {
    const unsigned char *ibuf; // Start of the raw input bytes
    ssize_t ipos;              // Offset of the next unread byte in ibuf
    ssize_t isize;             // Total number of bytes in ibuf
    struct cconv *cconv;       // Charset converter; left unset (NULL) on the plain UTF-8 path
} FileDecoder;
31 | |||
32 | 55 | static void add_block(Buffer *buffer, Block *blk) | |
33 | { | ||
34 | 55 | buffer->nl += blk->nl; | |
35 | 55 | list_insert_before(&blk->node, &buffer->blocks); | |
36 | 55 | } | |
37 | |||
38 | 7290 | static Block *add_utf8_line ( | |
39 | Buffer *buffer, | ||
40 | Block *blk, | ||
41 | const unsigned char *line, | ||
42 | size_t len | ||
43 | ) { | ||
44 | 7290 | size_t size = len + 1; | |
45 |
2/2✓ Branch 0 taken 7263 times.
✓ Branch 1 taken 27 times.
|
7290 | if (blk) { |
46 | 7263 | size_t avail = blk->alloc - blk->size; | |
47 |
2/2✓ Branch 0 taken 7235 times.
✓ Branch 1 taken 28 times.
|
7263 | if (size <= avail) { |
48 | 7235 | goto copy; | |
49 | } | ||
50 | 28 | add_block(buffer, blk); | |
51 | } | ||
52 | 55 | size = MAX(size, 8192); | |
53 | 55 | blk = block_new(size); | |
54 | |||
55 | 7290 | copy: | |
56 |
1/4✗ Branch 0 not taken.
✓ Branch 1 taken 7290 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
7290 | if (unlikely(len > SYN_HIGHLIGHT_MAX_LINE_LEN && buffer->options.syntax)) { |
57 | ✗ | error_msg ( | |
58 | "line length (%zu) exceeded limit (%ju); disabling syntax highlighting", | ||
59 | len, (uintmax_t)SYN_HIGHLIGHT_MAX_LINE_LEN | ||
60 | ); | ||
61 | ✗ | buffer->options.syntax = false; | |
62 | } | ||
63 | |||
64 | 7290 | memcpy(blk->data + blk->size, line, len); | |
65 | 7290 | blk->size += len; | |
66 | 7290 | blk->data[blk->size++] = '\n'; | |
67 | 7290 | blk->nl++; | |
68 | 7290 | return blk; | |
69 | } | ||
70 | |||
71 | 7320 | static bool read_utf8_line(FileDecoder *dec, const char **linep, size_t *lenp) | |
72 | { | ||
73 | 7320 | const char *line = dec->ibuf + dec->ipos; | |
74 | 7320 | const char *nl = memchr(line, '\n', dec->isize - dec->ipos); | |
75 | 7320 | size_t len; | |
76 | |||
77 |
2/2✓ Branch 0 taken 7288 times.
✓ Branch 1 taken 32 times.
|
7320 | if (nl) { |
78 | 7288 | len = nl - line; | |
79 | 7288 | dec->ipos += len + 1; | |
80 | } else { | ||
81 | 32 | len = dec->isize - dec->ipos; | |
82 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 30 times.
|
32 | if (len == 0) { |
83 | return false; | ||
84 | } | ||
85 | 2 | dec->ipos += len; | |
86 | } | ||
87 | |||
88 | 7290 | *linep = line; | |
89 | 7290 | *lenp = len; | |
90 | 7290 | return true; | |
91 | } | ||
92 | |||
93 | 30 | static bool file_decoder_read_utf8(Buffer *buffer, const unsigned char *buf, size_t size) | |
94 | { | ||
95 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 30 times.
|
30 | if (unlikely(!encoding_is_utf8(buffer->encoding))) { |
96 | ✗ | errno = EINVAL; | |
97 | ✗ | return false; | |
98 | } | ||
99 | |||
100 | 30 | FileDecoder dec = { | |
101 | .ibuf = buf, | ||
102 | .isize = size, | ||
103 | }; | ||
104 | |||
105 | 30 | const char *line; | |
106 | 30 | size_t len; | |
107 | |||
108 |
2/2✓ Branch 0 taken 27 times.
✓ Branch 1 taken 3 times.
|
30 | if (!read_utf8_line(&dec, &line, &len)) { |
109 | return true; | ||
110 | } | ||
111 | |||
112 |
3/4✓ Branch 0 taken 27 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 26 times.
|
27 | if (len && line[len - 1] == '\r') { |
113 | 1 | buffer->crlf_newlines = true; | |
114 | 1 | len--; | |
115 | } | ||
116 | |||
117 | 27 | Block *blk = add_utf8_line(buffer, NULL, line, len); | |
118 | |||
119 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 26 times.
|
27 | if (unlikely(buffer->crlf_newlines)) { |
120 |
2/2✓ Branch 0 taken 270 times.
✓ Branch 1 taken 1 times.
|
271 | while (read_utf8_line(&dec, &line, &len)) { |
121 |
4/4✓ Branch 0 taken 268 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 267 times.
|
270 | if (len && line[len - 1] == '\r') { |
122 | 1 | len--; | |
123 | } | ||
124 | 270 | blk = add_utf8_line(buffer, blk, line, len); | |
125 | } | ||
126 | } else { | ||
127 |
2/2✓ Branch 0 taken 6993 times.
✓ Branch 1 taken 26 times.
|
7019 | while (read_utf8_line(&dec, &line, &len)) { |
128 | 6993 | blk = add_utf8_line(buffer, blk, line, len); | |
129 | } | ||
130 | } | ||
131 | |||
132 |
1/2✓ Branch 0 taken 27 times.
✗ Branch 1 not taken.
|
27 | if (blk) { |
133 | 27 | add_block(buffer, blk); | |
134 | } | ||
135 | |||
136 | return true; | ||
137 | } | ||
138 | |||
139 | 1 | static size_t unix_to_dos ( | |
140 | FileEncoder *enc, | ||
141 | const unsigned char *buf, | ||
142 | size_t size | ||
143 | ) { | ||
144 | // TODO: Pass in Buffer::nl and make this size adjustment more conservative | ||
145 | // (it's resized to handle the worst possible case, despite the fact that we | ||
146 | // already have the number of newlines pre-computed) | ||
147 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (enc->nsize < size * 2) { |
148 | 1 | enc->nsize = size * 2; | |
149 | 1 | enc->nbuf = xrealloc(enc->nbuf, enc->nsize); | |
150 | } | ||
151 | |||
152 | // TODO: Optimize this loop, by making use of memccpy(3) | ||
153 | size_t d = 0; | ||
154 |
2/2✓ Branch 0 taken 21 times.
✓ Branch 1 taken 1 times.
|
22 | for (size_t s = 0; s < size; s++) { |
155 | 21 | unsigned char ch = buf[s]; | |
156 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 18 times.
|
21 | if (ch == '\n') { |
157 | 3 | enc->nbuf[d++] = '\r'; | |
158 | } | ||
159 | 21 | enc->nbuf[d++] = ch; | |
160 | } | ||
161 | |||
162 | 1 | return d; | |
163 | } | ||
164 | |||
165 | #if ICONV_DISABLE == 1 // iconv not available; use basic, UTF-8 implementation: | ||
166 | |||
167 | bool conversion_supported_by_iconv ( | ||
168 | const char* UNUSED_ARG(from), | ||
169 | const char* UNUSED_ARG(to) | ||
170 | ) { | ||
171 | errno = EINVAL; | ||
172 | return false; | ||
173 | } | ||
174 | |||
175 | FileEncoder file_encoder(const char *encoding, bool crlf, int fd) | ||
176 | { | ||
177 | if (unlikely(!encoding_is_utf8(encoding))) { | ||
178 | BUG("unsupported conversion; should have been handled earlier"); | ||
179 | } | ||
180 | |||
181 | return (FileEncoder) { | ||
182 | .crlf = crlf, | ||
183 | .fd = fd, | ||
184 | }; | ||
185 | } | ||
186 | |||
187 | void file_encoder_free(FileEncoder *enc) | ||
188 | { | ||
189 | free(enc->nbuf); | ||
190 | } | ||
191 | |||
192 | ssize_t file_encoder_write(FileEncoder *enc, const unsigned char *buf, size_t n) | ||
193 | { | ||
194 | if (unlikely(enc->crlf)) { | ||
195 | n = unix_to_dos(enc, buf, n); | ||
196 | buf = enc->nbuf; | ||
197 | } | ||
198 | return xwrite_all(enc->fd, buf, n); | ||
199 | } | ||
200 | |||
201 | size_t file_encoder_get_nr_errors(const FileEncoder* UNUSED_ARG(enc)) | ||
202 | { | ||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | bool file_decoder_read(Buffer *buffer, const unsigned char *buf, size_t size) | ||
207 | { | ||
208 | return file_decoder_read_utf8(buffer, buf, size); | ||
209 | } | ||
210 | |||
211 | #else // ICONV_DISABLE != 1; use full iconv implementation: | ||
212 | |||
213 | #include <iconv.h> | ||
214 | |||
// Stand-in emitted for input that cannot be converted: U+00BF
// (inverted question mark; "¿"), encoded as UTF-8
#define REPLACEMENT "\xc2\xbf"
217 | |||
// State of one iconv-based character set conversion
struct cconv {
    iconv_t cd;      // Conversion descriptor
    char *obuf;      // Output buffer (heap allocated)
    size_t osize;    // Allocated size of obuf
    size_t opos;     // Number of converted bytes currently in obuf
    size_t consumed; // Number of bytes of obuf already handed to the caller
    size_t errors;   // Count of conversion errors seen so far

    // Temporary input buffer (bytes of an incomplete character, carried
    // over until more input arrives)
    char tbuf[16];
    size_t tcount;

    // REPLACEMENT character, in target encoding
    char rbuf[4];
    size_t rcount;

    // Input character size in bytes, or zero for UTF-8
    size_t char_size;
};
237 | |||
238 | 1 | static struct cconv *create(iconv_t cd) | |
239 | { | ||
240 | 1 | struct cconv *c = xnew0(struct cconv, 1); | |
241 | 1 | c->cd = cd; | |
242 | 1 | c->osize = 8192; | |
243 | 1 | c->obuf = xmalloc(c->osize); | |
244 | 1 | return c; | |
245 | } | ||
246 | |||
// Thin wrapper around iconv(3) that accepts a "const char **" input
// pointer on every platform. Arguments and return value are those of
// iconv(3) itself.
static size_t iconv_wrapper (
    iconv_t cd,
    const char **restrict inbuf,
    size_t *restrict inbytesleft,
    char **restrict outbuf,
    size_t *restrict outbytesleft
) {
    // POSIX defines the second parameter of iconv(3) as "char **restrict"
    // but NetBSD declares it as "const char **restrict"
#ifdef __NetBSD__
    return iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft);
#else
    return iconv(cd, (char **restrict)inbuf, inbytesleft, outbuf, outbytesleft);
#endif
}
264 | |||
265 | ✗ | static void resize_obuf(struct cconv *c) | |
266 | { | ||
267 | ✗ | c->osize = xmul(2, c->osize); | |
268 | ✗ | c->obuf = xrealloc(c->obuf, c->osize); | |
269 | ✗ | } | |
270 | |||
271 | ✗ | static void add_replacement(struct cconv *c) | |
272 | { | ||
273 | ✗ | if (c->osize - c->opos < 4) { | |
274 | ✗ | resize_obuf(c); | |
275 | } | ||
276 | |||
277 | ✗ | memcpy(c->obuf + c->opos, c->rbuf, c->rcount); | |
278 | ✗ | c->opos += c->rcount; | |
279 | ✗ | } | |
280 | |||
281 | ✗ | static size_t handle_invalid(struct cconv *c, const char *buf, size_t count) | |
282 | { | ||
283 | ✗ | LOG_DEBUG("%zu %zu", c->char_size, count); | |
284 | ✗ | add_replacement(c); | |
285 | ✗ | if (c->char_size == 0) { | |
286 | // Converting from UTF-8 | ||
287 | ✗ | size_t idx = 0; | |
288 | ✗ | CodePoint u = u_get_char(buf, count, &idx); | |
289 | ✗ | LOG_DEBUG("U+%04" PRIX32, u); | |
290 | ✗ | return idx; | |
291 | } | ||
292 | ✗ | if (c->char_size > count) { | |
293 | // wtf | ||
294 | ✗ | return 1; | |
295 | } | ||
296 | return c->char_size; | ||
297 | } | ||
298 | |||
299 | 1 | static int xiconv(struct cconv *c, const char **ib, size_t *ic) | |
300 | { | ||
301 | 1 | while (1) { | |
302 | 1 | char *ob = c->obuf + c->opos; | |
303 | 1 | size_t oc = c->osize - c->opos; | |
304 | 1 | size_t rc = iconv_wrapper(c->cd, ib, ic, &ob, &oc); | |
305 | 1 | c->opos = ob - c->obuf; | |
306 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (rc == (size_t)-1) { |
307 | ✗ | switch (errno) { | |
308 | ✗ | case EILSEQ: | |
309 | ✗ | c->errors++; | |
310 | // Reset | ||
311 | ✗ | iconv(c->cd, NULL, NULL, NULL, NULL); | |
312 | ✗ | return errno; | |
313 | case EINVAL: | ||
314 | return errno; | ||
315 | ✗ | case E2BIG: | |
316 | ✗ | resize_obuf(c); | |
317 | ✗ | continue; | |
318 | ✗ | default: | |
319 | − | BUG("iconv: %s", strerror(errno)); | |
320 | } | ||
321 | } else { | ||
322 | 1 | c->errors += rc; | |
323 | } | ||
324 | 1 | return 0; | |
325 | } | ||
326 | } | ||
327 | |||
328 | ✗ | static size_t convert_incomplete(struct cconv *c, const char *input, size_t len) | |
329 | { | ||
330 | ✗ | size_t ipos = 0; | |
331 | ✗ | while (c->tcount < sizeof(c->tbuf) && ipos < len) { | |
332 | ✗ | c->tbuf[c->tcount++] = input[ipos++]; | |
333 | ✗ | const char *ib = c->tbuf; | |
334 | ✗ | size_t ic = c->tcount; | |
335 | ✗ | int rc = xiconv(c, &ib, &ic); | |
336 | ✗ | if (ic > 0) { | |
337 | ✗ | memmove(c->tbuf, ib, ic); | |
338 | } | ||
339 | ✗ | c->tcount = ic; | |
340 | ✗ | if (rc == EINVAL) { | |
341 | // Incomplete character at end of input buffer; try again | ||
342 | // with more input data | ||
343 | ✗ | continue; | |
344 | } | ||
345 | ✗ | if (rc == EILSEQ) { | |
346 | // Invalid multibyte sequence | ||
347 | ✗ | size_t skip = handle_invalid(c, c->tbuf, c->tcount); | |
348 | ✗ | c->tcount -= skip; | |
349 | ✗ | if (c->tcount > 0) { | |
350 | ✗ | LOG_DEBUG("tcount=%zu, skip=%zu", c->tcount, skip); | |
351 | ✗ | memmove(c->tbuf, c->tbuf + skip, c->tcount); | |
352 | ✗ | continue; | |
353 | } | ||
354 | ✗ | return ipos; | |
355 | } | ||
356 | ✗ | break; | |
357 | } | ||
358 | |||
359 | ✗ | LOG_DEBUG("%zu %zu", ipos, c->tcount); | |
360 | ✗ | return ipos; | |
361 | } | ||
362 | |||
363 | 1 | static void cconv_process(struct cconv *c, const char *input, size_t len) | |
364 | { | ||
365 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (c->consumed > 0) { |
366 | ✗ | size_t fill = c->opos - c->consumed; | |
367 | ✗ | memmove(c->obuf, c->obuf + c->consumed, fill); | |
368 | ✗ | c->opos = fill; | |
369 | ✗ | c->consumed = 0; | |
370 | } | ||
371 | |||
372 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (c->tcount > 0) { |
373 | ✗ | size_t ipos = convert_incomplete(c, input, len); | |
374 | ✗ | input += ipos; | |
375 | ✗ | len -= ipos; | |
376 | } | ||
377 | |||
378 | 1 | const char *ib = input; | |
379 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | for (size_t ic = len; ic > 0; ) { |
380 | 1 | int r = xiconv(c, &ib, &ic); | |
381 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (r == EINVAL) { |
382 | // Incomplete character at end of input buffer | ||
383 | ✗ | if (ic < sizeof(c->tbuf)) { | |
384 | ✗ | memcpy(c->tbuf, ib, ic); | |
385 | ✗ | c->tcount = ic; | |
386 | } else { | ||
387 | // FIXME | ||
388 | ✗ | } | |
389 | ✗ | ic = 0; | |
390 | ✗ | continue; | |
391 | } | ||
392 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (r == EILSEQ) { |
393 | // Invalid multibyte sequence | ||
394 | ✗ | size_t skip = handle_invalid(c, ib, ic); | |
395 | ✗ | ic -= skip; | |
396 | ✗ | ib += skip; | |
397 | ✗ | continue; | |
398 | } | ||
399 | } | ||
400 | 1 | } | |
401 | |||
402 | ✗ | static struct cconv *cconv_to_utf8(const char *encoding) | |
403 | { | ||
404 | ✗ | iconv_t cd = iconv_open("UTF-8", encoding); | |
405 | ✗ | if (cd == (iconv_t)-1) { | |
406 | return NULL; | ||
407 | } | ||
408 | |||
409 | ✗ | struct cconv *c = create(cd); | |
410 | ✗ | c->rcount = copyliteral(c->rbuf, REPLACEMENT); | |
411 | |||
412 | ✗ | if (str_has_prefix(encoding, "UTF-16")) { | |
413 | ✗ | c->char_size = 2; | |
414 | ✗ | } else if (str_has_prefix(encoding, "UTF-32")) { | |
415 | ✗ | c->char_size = 4; | |
416 | } else { | ||
417 | ✗ | c->char_size = 1; | |
418 | } | ||
419 | |||
420 | return c; | ||
421 | } | ||
422 | |||
423 | 1 | static void encode_replacement(struct cconv *c) | |
424 | { | ||
425 | 1 | static const unsigned char rep[] = REPLACEMENT; | |
426 | 1 | const char *ib = rep; | |
427 | 1 | char *ob = c->rbuf; | |
428 | 1 | size_t ic = STRLEN(REPLACEMENT); | |
429 | 1 | size_t oc = sizeof(c->rbuf); | |
430 | 1 | size_t rc = iconv_wrapper(c->cd, &ib, &ic, &ob, &oc); | |
431 | |||
432 |
1/2✓ Branch 0 taken 1 times.
✗ Branch 1 not taken.
|
1 | if (rc == (size_t)-1) { |
433 | 1 | c->rbuf[0] = '\xbf'; | |
434 | 1 | c->rcount = 1; | |
435 | } else { | ||
436 | ✗ | c->rcount = ob - c->rbuf; | |
437 | } | ||
438 | 1 | } | |
439 | |||
// Create a converter from UTF-8 to `encoding`, with the replacement
// character pre-encoded for the target. Returns NULL if iconv_open(3)
// rejects the conversion.
static struct cconv *cconv_from_utf8(const char *encoding)
{
    iconv_t cd = iconv_open(encoding, "UTF-8");
    if (cd == (iconv_t)-1) {
        return NULL;
    }

    struct cconv *conv = create(cd);
    encode_replacement(conv);
    return conv;
}
450 | |||
451 | 1 | static void cconv_flush(struct cconv *c) | |
452 | { | ||
453 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (c->tcount > 0) { |
454 | // Replace incomplete character at end of input buffer | ||
455 | ✗ | LOG_DEBUG("incomplete character at EOF"); | |
456 | ✗ | add_replacement(c); | |
457 | ✗ | c->tcount = 0; | |
458 | } | ||
459 | 1 | } | |
460 | |||
461 | ✗ | static char *cconv_consume_line(struct cconv *c, size_t *len) | |
462 | { | ||
463 | ✗ | char *line = c->obuf + c->consumed; | |
464 | ✗ | char *nl = memchr(line, '\n', c->opos - c->consumed); | |
465 | ✗ | if (!nl) { | |
466 | ✗ | *len = 0; | |
467 | ✗ | return NULL; | |
468 | } | ||
469 | |||
470 | ✗ | size_t n = nl - line + 1; | |
471 | ✗ | c->consumed += n; | |
472 | ✗ | *len = n; | |
473 | ✗ | return line; | |
474 | } | ||
475 | |||
476 | 1 | static char *cconv_consume_all(struct cconv *c, size_t *len) | |
477 | { | ||
478 | 1 | char *buf = c->obuf + c->consumed; | |
479 | 1 | *len = c->opos - c->consumed; | |
480 | 1 | c->consumed = c->opos; | |
481 | 1 | return buf; | |
482 | } | ||
483 | |||
484 | 1 | static void cconv_free(struct cconv *c) | |
485 | { | ||
486 | 1 | BUG_ON(!c); | |
487 | 1 | iconv_close(c->cd); | |
488 | 1 | free(c->obuf); | |
489 | 1 | free(c); | |
490 | 1 | } | |
491 | |||
// Check whether iconv can convert from encoding `from` to encoding `to`,
// by opening (and immediately closing) a conversion descriptor.
// Returns false with errno set to EINVAL if either name is empty, or
// false with errno as left by iconv_open(3) if the open fails.
bool conversion_supported_by_iconv(const char *from, const char *to)
{
    if (unlikely(from[0] == '\0' || to[0] == '\0')) {
        errno = EINVAL;
        return false;
    }

    iconv_t cd = iconv_open(to, from);
    const bool supported = (cd != (iconv_t)-1);
    if (supported) {
        iconv_close(cd);
    }
    return supported;
}
507 | |||
508 | 16 | FileEncoder file_encoder(const char *encoding, bool crlf, int fd) | |
509 | { | ||
510 | 16 | struct cconv *cconv = NULL; | |
511 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 15 times.
|
16 | if (unlikely(!encoding_is_utf8(encoding))) { |
512 | 1 | cconv = cconv_from_utf8(encoding); | |
513 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (!cconv) { |
514 | − | BUG("unsupported conversion; should have been handled earlier"); | |
515 | } | ||
516 | } | ||
517 | |||
518 | 16 | return (FileEncoder) { | |
519 | .cconv = cconv, | ||
520 | .crlf = crlf, | ||
521 | .fd = fd, | ||
522 | }; | ||
523 | } | ||
524 | |||
525 | 16 | void file_encoder_free(FileEncoder *enc) | |
526 | { | ||
527 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 15 times.
|
16 | if (enc->cconv) { |
528 | 1 | cconv_free(enc->cconv); | |
529 | } | ||
530 | 16 | free(enc->nbuf); | |
531 | 16 | } | |
532 | |||
533 | // NOTE: buf must contain whole characters! | ||
534 | 16 | ssize_t file_encoder_write ( | |
535 | FileEncoder *enc, | ||
536 | const unsigned char *buf, | ||
537 | size_t size | ||
538 | ) { | ||
539 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 15 times.
|
16 | if (unlikely(enc->crlf)) { |
540 | 1 | size = unix_to_dos(enc, buf, size); | |
541 | 1 | buf = enc->nbuf; | |
542 | } | ||
543 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 15 times.
|
16 | if (unlikely(enc->cconv)) { |
544 | 1 | cconv_process(enc->cconv, buf, size); | |
545 | 1 | cconv_flush(enc->cconv); | |
546 | 1 | buf = cconv_consume_all(enc->cconv, &size); | |
547 | } | ||
548 | 16 | return xwrite_all(enc->fd, buf, size); | |
549 | } | ||
550 | |||
551 | 16 | size_t file_encoder_get_nr_errors(const FileEncoder *enc) | |
552 | { | ||
553 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 15 times.
|
16 | return enc->cconv ? enc->cconv->errors : 0; |
554 | } | ||
555 | |||
556 | ✗ | static bool fill(FileDecoder *dec) | |
557 | { | ||
558 | ✗ | if (dec->ipos == dec->isize) { | |
559 | return false; | ||
560 | } | ||
561 | |||
562 | // Smaller than cconv.obuf to make realloc less likely | ||
563 | ✗ | size_t max = 7 * 1024; | |
564 | |||
565 | ✗ | size_t icount = MIN(dec->isize - dec->ipos, max); | |
566 | ✗ | cconv_process(dec->cconv, dec->ibuf + dec->ipos, icount); | |
567 | ✗ | dec->ipos += icount; | |
568 | ✗ | if (dec->ipos == dec->isize) { | |
569 | // Must be flushed after all input has been fed | ||
570 | ✗ | cconv_flush(dec->cconv); | |
571 | } | ||
572 | return true; | ||
573 | } | ||
574 | |||
575 | ✗ | static bool decode_and_read_line(FileDecoder *dec, const char **linep, size_t *lenp) | |
576 | { | ||
577 | ✗ | char *line; | |
578 | ✗ | size_t len; | |
579 | ✗ | while (1) { | |
580 | ✗ | line = cconv_consume_line(dec->cconv, &len); | |
581 | ✗ | if (line || !fill(dec)) { | |
582 | break; | ||
583 | } | ||
584 | } | ||
585 | |||
586 | ✗ | if (line) { | |
587 | // Newline not wanted | ||
588 | ✗ | len--; | |
589 | } else { | ||
590 | ✗ | line = cconv_consume_all(dec->cconv, &len); | |
591 | ✗ | if (len == 0) { | |
592 | return false; | ||
593 | } | ||
594 | } | ||
595 | |||
596 | ✗ | *linep = line; | |
597 | ✗ | *lenp = len; | |
598 | ✗ | return true; | |
599 | } | ||
600 | |||
601 | 30 | bool file_decoder_read(Buffer *buffer, const unsigned char *buf, size_t size) | |
602 | { | ||
603 |
1/2✓ Branch 0 taken 30 times.
✗ Branch 1 not taken.
|
30 | if (encoding_is_utf8(buffer->encoding)) { |
604 | 30 | return file_decoder_read_utf8(buffer, buf, size); | |
605 | } | ||
606 | |||
607 | ✗ | struct cconv *cconv = cconv_to_utf8(buffer->encoding); | |
608 | ✗ | if (!cconv) { | |
609 | return false; | ||
610 | } | ||
611 | |||
612 | ✗ | FileDecoder dec = { | |
613 | .ibuf = buf, | ||
614 | .isize = size, | ||
615 | .cconv = cconv, | ||
616 | }; | ||
617 | |||
618 | ✗ | const char *line; | |
619 | ✗ | size_t len; | |
620 | |||
621 | ✗ | if (decode_and_read_line(&dec, &line, &len)) { | |
622 | ✗ | if (len && line[len - 1] == '\r') { | |
623 | ✗ | buffer->crlf_newlines = true; | |
624 | ✗ | len--; | |
625 | } | ||
626 | ✗ | Block *blk = add_utf8_line(buffer, NULL, line, len); | |
627 | ✗ | while (decode_and_read_line(&dec, &line, &len)) { | |
628 | ✗ | if (buffer->crlf_newlines && len && line[len - 1] == '\r') { | |
629 | ✗ | len--; | |
630 | } | ||
631 | ✗ | blk = add_utf8_line(buffer, blk, line, len); | |
632 | } | ||
633 | ✗ | if (blk) { | |
634 | ✗ | add_block(buffer, blk); | |
635 | } | ||
636 | } | ||
637 | |||
638 | ✗ | cconv_free(cconv); | |
639 | ✗ | return true; | |
640 | } | ||
641 | |||
642 | #endif | ||
643 |