Libparserutils
codec_utf16.c
Go to the documentation of this file.
1 /*
2  * This file is part of LibParserUtils.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6  */
7 
8 #include <assert.h>
9 #include <stdlib.h>
10 #include <string.h>
11 
14 
16 #include "utils/endian.h"
17 #include "utils/utils.h"
18 
/**
 * UTF-16 charset codec state.
 *
 * NOTE(review): the members base, inval_buf and write_buf, which the code
 * below uses, and the closing brace of this struct are not visible in this
 * listing.
 */
22 typedef struct charset_utf16_codec {
/* Size in bytes of inval_buf, the buffer for fixing up incomplete input
 * sequences */
25 #define INVAL_BUFSIZE (32)
29  size_t inval_len; /**< Byte length of inval_buf */
30 
/* Number of uint32_t entries in read_buf */
31 #define READ_BUFSIZE (8)
32  uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial output sequences (decode) (host-endian) */
35  size_t read_len; /**< Character length of read_buf */
/* Number of uint32_t entries in write_buf */
37 #define WRITE_BUFSIZE (8)
41  size_t write_len; /**< Character length of write_buf */
44 
45 static bool charset_utf16_codec_handles_charset(const char *charset);
46 static parserutils_error charset_utf16_codec_create(const char *charset,
52  const uint8_t **source, size_t *sourcelen,
53  uint8_t **dest, size_t *destlen);
56  const uint8_t **source, size_t *sourcelen,
57  uint8_t **dest, size_t *destlen);
62  const uint8_t **source, size_t *sourcelen,
63  uint8_t **dest, size_t *destlen);
66  uint32_t ucs4, uint8_t **dest, size_t *destlen);
67 
/**
 * Determine whether this codec handles a specific charset.
 *
 * The decision is made on MIB enum values, not on the spelling of the name:
 * the supplied name and the literal "UTF-16" are both looked up and the two
 * results compared.
 *
 * \param charset  Charset name to test. It is passed to strlen(), so it
 *                 must be a valid NUL-terminated string.
 * \return true if the name maps to the same MIB enum value as "UTF-16",
 *         false otherwise
 */
74 bool charset_utf16_codec_handles_charset(const char *charset)
75 {
76  return parserutils_charset_mibenum_from_name(charset, strlen(charset))
77  ==
78  parserutils_charset_mibenum_from_name("UTF-16", SLEN("UTF-16"));
79 }
80 
/*
 * Create a UTF-16 codec.
 *
 * Allocates the codec state with malloc(), marks the incomplete-input buffer
 * and both pending-output buffers as empty, and returns the new object
 * through *codec.
 *
 * Returns PARSERUTILS_NOMEM if the allocation fails, PARSERUTILS_OK
 * otherwise.
 *
 * NOTE(review): the signature, the declaration of c and the vtable
 * assignments announced by the "populate vtable" comment are not visible in
 * this listing.
 */
92 {
94 
   /* The charset name is not consulted when building the codec */
95  UNUSED(charset);
96 
97  c = malloc(sizeof(charset_utf16_codec));
98  if (c == NULL)
99  return PARSERUTILS_NOMEM;
100 
   /* Only the first element of each buffer is cleared; the zeroed length
    * fields are what the encode/decode paths test to see whether anything
    * is pending. */
101  c->inval_buf[0] = '\0';
102  c->inval_len = 0;
103 
104  c->read_buf[0] = 0;
105  c->read_len = 0;
106 
107  c->write_buf[0] = 0;
108  c->write_len = 0;
109 
110  /* Finally, populate vtable */
115 
   /* Hand the codec back as a pointer to the generic codec type */
116  *codec = (parserutils_charset_codec *) c;
117 
118  return PARSERUTILS_OK;
119 }
120 
/*
 * Destroy a UTF-16 codec.
 *
 * The body shown here releases nothing: it ignores its argument and always
 * returns PARSERUTILS_OK.
 *
 * NOTE(review): presumably the memory obtained in create is freed by the
 * caller of this handler -- confirm. The signature line is not visible in
 * this listing.
 */
128 {
129  UNUSED(codec);
130 
131  return PARSERUTILS_OK;
132 }
133 
/*
 * Encode a chunk of UCS-4 (big endian) data into UTF-16.
 *
 * source     Pointer to the input cursor; advanced by 4 bytes for every
 *            character consumed.
 * sourcelen  Pointer to the remaining input byte count; reduced by 4 for
 *            every character consumed.
 * dest       Pointer to the output cursor; advanced past each sequence
 *            written.
 * destlen    Pointer to the remaining output byte count; reduced by the
 *            length of each sequence written.
 *
 * Returns PARSERUTILS_OK once all input has been consumed, or
 * PARSERUTILS_NOMEM if the output buffer fills up first. In the NOMEM case
 * the character that did not fit is saved in write_buf and counted as
 * consumed; it is emitted at the start of the next call.
 *
 * NOTE(review): the first signature line, the declaration of c and the two
 * conversion call lines (their argument lists match
 * parserutils_charset_utf16_from_ucs4) are not visible in this listing.
 */
162  const uint8_t **source, size_t *sourcelen,
163  uint8_t **dest, size_t *destlen)
164 {
166  uint32_t ucs4;
167  uint32_t *towrite;
168  size_t towritelen;
169  parserutils_error error;
170 
171  /* Process any outstanding characters from the previous call */
172  if (c->write_len > 0) {
173  uint32_t *pwrite = c->write_buf;
174  uint8_t buf[4];
175  size_t len;
176 
177  while (c->write_len > 0) {
179  pwrite[0], buf, &len);
180  assert(error == PARSERUTILS_OK);
181 
182  if (*destlen < len) {
183  /* Insufficient output buffer space */
     /* Move the entries not yet written down to the start of
      * write_buf so the next call finds them at index 0 */
184  for (len = 0; len < c->write_len; len++)
185  c->write_buf[len] = pwrite[len];
186 
187  return PARSERUTILS_NOMEM;
188  }
189 
190  memcpy(*dest, buf, len);
191 
192  *dest += len;
193  *destlen -= len;
194 
195  pwrite++;
196  c->write_len--;
197  }
198  }
199 
200  /* Now process the characters for this call */
201  while (*sourcelen > 0) {
    /* Load one 32-bit big-endian value straight from the input.
     * NOTE(review): this dereferences *source as a uint32_t and the loop
     * subtracts 4 from *sourcelen unconditionally, so it assumes the
     * input is suitably aligned and a multiple of 4 bytes long --
     * confirm against callers. */
202  ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
203  towrite = &ucs4;
204  towritelen = 1;
205 
206  /* Output current characters */
207  while (towritelen > 0) {
208  uint8_t buf[4];
209  size_t len;
210 
212  towrite[0], buf, &len);
213  assert(error == PARSERUTILS_OK);
214 
215  if (*destlen < len) {
216  /* Insufficient output space */
217  assert(towritelen < WRITE_BUFSIZE);
218 
219  c->write_len = towritelen;
220 
221  /* Copy pending chars to save area, for
222  * processing next call. */
223  for (len = 0; len < towritelen; len++)
224  c->write_buf[len] = towrite[len];
225 
226  /* Claim character we've just buffered,
227  * so it's not reprocessed */
228  *source += 4;
229  *sourcelen -= 4;
230 
231  return PARSERUTILS_NOMEM;
232  }
233 
234  memcpy(*dest, buf, len);
235 
236  *dest += len;
237  *destlen -= len;
238 
239  towrite++;
240  towritelen--;
241  }
242 
243  *source += 4;
244  *sourcelen -= 4;
245  }
246 
   /* error is otherwise only read inside assert(); presumably this keeps
    * compilers quiet when assertions are compiled out */
247  (void) error;
248 
249  return PARSERUTILS_OK;
250 }
251 
/*
 * Decode a chunk of UTF-16 data into UCS-4 (big endian).
 *
 * Works in three stages:
 *   1. write out any decoded characters left in read_buf by an earlier call;
 *   2. if the earlier call ended in an incomplete sequence, top up inval_buf
 *      from the start of the new chunk and decode from there;
 *   3. decode the rest of the input one character at a time.
 *
 * source     Pointer to the input cursor; advanced past consumed bytes.
 * sourcelen  Pointer to the remaining input byte count.
 * dest       Pointer to the output cursor; advanced by 4 per character.
 * destlen    Pointer to the remaining output byte count.
 *
 * Returns PARSERUTILS_OK when all input has been consumed,
 * PARSERUTILS_NOMEM when the output buffer is too small, or whatever other
 * error the per-character reader reports.
 *
 * NOTE(review): the first signature line, the declaration of c and the two
 * reader call lines (their argument lists match
 * charset_utf16_codec_read_char) are not visible in this listing.
 */
294  const uint8_t **source, size_t *sourcelen,
295  uint8_t **dest, size_t *destlen)
296 {
298  parserutils_error error;
299 
300  if (c->read_len > 0) {
301  /* Output left over from last decode */
302  uint32_t *pread = c->read_buf;
303 
    /* The test compares *destlen with the size of the whole remaining
     * backlog, and both shrink by one character per pass, so the
     * backlog is written either completely or not at all */
304  while (c->read_len > 0 && *destlen >= c->read_len * 4) {
305  *((uint32_t *) (void *) *dest) =
306  endian_host_to_big(pread[0]);
307 
308  *dest += 4;
309  *destlen -= 4;
310 
311  pread++;
312  c->read_len--;
313  }
314 
315  if (*destlen < c->read_len * 4) {
316  /* Ran out of output buffer */
317  size_t i;
318 
319  /* Shuffle remaining output down */
320  for (i = 0; i < c->read_len; i++)
321  c->read_buf[i] = pread[i];
322 
323  return PARSERUTILS_NOMEM;
324  }
325  }
326 
327  if (c->inval_len > 0) {
328  /* The last decode ended in an incomplete sequence.
329  * Fill up inval_buf with data from the start of the
330  * new chunk and process it. */
331  uint8_t *in = c->inval_buf;
332  size_t ol = c->inval_len;
    /* Copy at most INVAL_BUFSIZE - ol - 1 bytes, which keeps the combined
     * length below INVAL_BUFSIZE (the reader asserts this before it
     * re-buffers an incomplete sequence) */
333  size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
334  size_t orig_l = l;
335 
336  memcpy(c->inval_buf + ol, *source, l);
337 
    /* l becomes the total byte count in inval_buf: old bytes plus the
     * ones just copied */
338  l += c->inval_len;
339 
341  (const uint8_t **) &in, &l, dest, destlen);
342  if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
343  return error;
344  }
345 
346  /* And now, fix up source pointers */
    /* l is now the count of inval_buf bytes the reader left unconsumed,
     * so orig_l - l is how many bytes of the new chunk it used; the
     * signed cast and max() clamp that at zero */
347  *source += max((signed) (orig_l - l), 0);
348  *sourcelen -= max((signed) (orig_l - l), 0);
349 
350  /* Failed to resolve an incomplete character and
351  * ran out of buffer space. No recovery strategy
352  * possible, so explode everywhere. */
353  assert((orig_l + ol) - l != 0);
354 
355  /* Report memory exhaustion case from above */
356  if (error != PARSERUTILS_OK)
357  return error;
358  }
359 
360  /* Finally, the "normal" case; process all outstanding characters */
361  while (*sourcelen > 0) {
363  source, sourcelen, dest, destlen);
364  if (error != PARSERUTILS_OK) {
365  return error;
366  }
367  }
368 
369  return PARSERUTILS_OK;
370 }
371 
/*
 * Clear a UTF-16 codec's encoding state.
 *
 * Discards any buffered incomplete input and any pending decoded or encoded
 * output by zeroing the three length fields (and the first element of each
 * buffer). Always returns PARSERUTILS_OK.
 *
 * NOTE(review): the signature line and the declaration of c are not visible
 * in this listing.
 */
379 {
381 
382  c->inval_buf[0] = '\0';
383  c->inval_len = 0;
384 
385  c->read_buf[0] = 0;
386  c->read_len = 0;
387 
388  c->write_buf[0] = 0;
389  c->write_len = 0;
390 
391  return PARSERUTILS_OK;
392 }
393 
394 
/*
 * Read a character from the UTF-16 to UCS-4 (big endian).
 *
 * Converts one UTF-16 sequence from *source and writes it to *dest. The
 * outcome depends on what the converter reports:
 *   - PARSERUTILS_OK:       the character is output and the input cursor
 *                           moves past it;
 *   - PARSERUTILS_NEEDDATA: the remaining input is saved in inval_buf, all
 *                           of it is marked consumed, and PARSERUTILS_OK is
 *                           returned;
 *   - PARSERUTILS_INVALID:  in strict error mode PARSERUTILS_INVALID is
 *                           returned; otherwise U+FFFD is output and the
 *                           cursor skips to the next valid sequence.
 *
 * Returns PARSERUTILS_NOMEM when the output buffer cannot take the
 * character; the input cursor is still advanced in that case.
 *
 * NOTE(review): the first signature line, the two output call lines, the
 * right-hand side of the errormode comparison and the next-character lookup
 * call line are not visible in this listing.
 */
424  const uint8_t **source, size_t *sourcelen,
425  uint8_t **dest, size_t *destlen)
426 {
427  uint32_t ucs4;
428  size_t sucs4;
429  parserutils_error error;
430 
431  /* Convert a single character */
432  error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen,
433  &ucs4, &sucs4);
434  if (error == PARSERUTILS_OK) {
435  /* Read a character */
437  ucs4, dest, destlen);
    /* NOMEM also counts as consumed: the output routine has parked the
     * character in read_buf, so it must not be decoded a second time */
438  if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
439  /* output succeeded; update source pointers */
440  *source += sucs4;
441  *sourcelen -= sucs4;
442  }
443 
444  /* Clear inval buffer */
445  c->inval_buf[0] = '\0';
446  c->inval_len = 0;
447 
448  return error;
449  } else if (error == PARSERUTILS_NEEDDATA) {
450  /* Incomplete input sequence */
451  assert(*sourcelen < INVAL_BUFSIZE);
452 
    /* memmove rather than memcpy: when called from the decode fix-up
     * path, *source already points into inval_buf */
453  memmove(c->inval_buf, *source, *sourcelen);
454  c->inval_buf[*sourcelen] = '\0';
455  c->inval_len = *sourcelen;
456 
    /* Everything left has been buffered, so report it all as consumed */
457  *source += *sourcelen;
458  *sourcelen = 0;
459 
460  return PARSERUTILS_OK;
461  } else if (error == PARSERUTILS_INVALID) {
462  /* Illegal input sequence */
463  uint32_t nextchar;
464 
465  /* Clear inval buffer */
466  c->inval_buf[0] = '\0';
467  c->inval_len = 0;
468 
469  /* Strict errormode; simply flag invalid character */
470  if (c->base.errormode ==
472  return PARSERUTILS_INVALID;
473  }
474 
475  /* Find next valid UTF-16 sequence.
476  * We're processing client-provided data, so let's
477  * be paranoid about its validity. */
479  *source, *sourcelen, 0, &nextchar);
480  if (error != PARSERUTILS_OK) {
481  if (error == PARSERUTILS_NEEDDATA) {
482  /* Need more data to be sure */
483  assert(*sourcelen < INVAL_BUFSIZE);
484 
485  memmove(c->inval_buf, *source, *sourcelen);
486  c->inval_buf[*sourcelen] = '\0';
487  c->inval_len = *sourcelen;
488 
489  *source += *sourcelen;
490  *sourcelen = 0;
491 
      /* The input is already fully consumed; zero the offset so the
       * cursor update below does not advance it again */
492  nextchar = 0;
493  } else {
494  return error;
495  }
496  }
497 
498  /* output U+FFFD and continue processing. */
500  0xFFFD, dest, destlen);
501  if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
502  /* output succeeded; update source pointers */
503  *source += nextchar;
504  *sourcelen -= nextchar;
505  }
506 
507  return error;
508  }
509 
   /* Any other converter result falls through and is reported as success */
510  return PARSERUTILS_OK;
511 }
512 
/*
 * Output a UCS-4 character (big endian).
 *
 * ucs4     Character to write, in host byte order.
 * dest     Pointer to the output cursor; advanced by 4 on success.
 * destlen  Pointer to the remaining output byte count; reduced by 4 on
 *          success.
 *
 * Returns PARSERUTILS_OK when the character was written. Returns
 * PARSERUTILS_NOMEM when fewer than 4 bytes of output remain; the character
 * is then stored in read_buf so decode can write it on a later call.
 *
 * NOTE(review): the first signature line is not visible in this listing.
 */
524  uint32_t ucs4, uint8_t **dest, size_t *destlen)
525 {
526  if (*destlen < 4) {
527  /* Run out of output buffer */
    /* Stored as the only pending entry: read_len is set to 1, not
     * incremented */
528  c->read_len = 1;
529  c->read_buf[0] = ucs4;
530 
531  return PARSERUTILS_NOMEM;
532  }
533 
   /* Convert to big-endian and store through a uint32_t pointer.
    * NOTE(review): assumes *dest is suitably aligned for uint32_t --
    * confirm against callers. */
534  *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
535  *dest += 4;
536  *destlen -= 4;
537 
538  return PARSERUTILS_OK;
539 }
540 
541 
545 };
@ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
Abort processing if unrepresentable character encountered.
Definition: codec.h:64
size_t len
Definition: codec_8859.c:23
static parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec)
Clear a UTF-16 codec's encoding state.
Definition: codec_utf16.c:378
#define INVAL_BUFSIZE
Definition: codec_utf16.c:25
struct charset_utf16_codec charset_utf16_codec
UTF-16 charset codec.
const parserutils_charset_handler charset_utf16_codec_handler
Definition: codec_utf16.c:542
static parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Read a character from the UTF-16 to UCS-4 (big endian)
Definition: codec_utf16.c:423
static parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
Output a UCS-4 character (big endian)
Definition: codec_utf16.c:523
#define READ_BUFSIZE
Definition: codec_utf16.c:31
static parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Encode a chunk of UCS-4 (big endian) data into UTF-16.
Definition: codec_utf16.c:161
static parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Decode a chunk of UTF-16 data into UCS-4 (big endian)
Definition: codec_utf16.c:293
static bool charset_utf16_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition: codec_utf16.c:74
static parserutils_error charset_utf16_codec_destroy(parserutils_charset_codec *codec)
Destroy a UTF-16 codec.
Definition: codec_utf16.c:127
static parserutils_error charset_utf16_codec_create(const char *charset, parserutils_charset_codec **codec)
Create a UTF-16 codec.
Definition: codec_utf16.c:90
#define WRITE_BUFSIZE
Definition: codec_utf16.c:37
static uint32_t endian_host_to_big(uint32_t host)
Definition: endian.h:24
static uint32_t endian_big_to_host(uint32_t big)
Definition: endian.h:32
parserutils_error
Definition: errors.h:18
@ PARSERUTILS_OK
Definition: errors.h:19
@ PARSERUTILS_NEEDDATA
Definition: errors.h:25
@ PARSERUTILS_INVALID
Definition: errors.h:23
@ PARSERUTILS_NOMEM
Definition: errors.h:21
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition: aliases.c:107
UTF-16 charset codec.
Definition: codec_utf16.c:22
parserutils_charset_codec base
Base class.
Definition: codec_utf16.c:23
uint32_t write_buf[WRITE_BUFSIZE]
Buffer for partial output sequences (encode) (host-endian)
Definition: codec_utf16.c:38
size_t read_len
Character length of read_buf.
Definition: codec_utf16.c:35
uint8_t inval_buf[INVAL_BUFSIZE]
Buffer for fixing up incomplete input sequences.
Definition: codec_utf16.c:26
uint32_t read_buf[READ_BUFSIZE]
Buffer for partial output sequences (decode) (host-endian)
Definition: codec_utf16.c:32
size_t write_len
Character length of write_buf.
Definition: codec_utf16.c:41
Core charset codec definition; implementations extend this.
Definition: codec_impl.h:19
parserutils_charset_codec_errormode errormode
error mode
Definition: codec_impl.h:22
parserutils_error(* encode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:26
parserutils_error(* destroy)(parserutils_charset_codec *codec)
Definition: codec_impl.h:25
parserutils_error(* decode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:29
parserutils_error(* reset)(parserutils_charset_codec *codec)
Definition: codec_impl.h:32
struct parserutils_charset_codec::@3 handler
Vtable for handler code.
Codec factory component definition.
Definition: codec_impl.h:39
UTF-16 manipulation functions (interface).
parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, size_t len, uint32_t *ucs4, size_t *clen)
Convert a UTF-16 sequence into a single UCS-4 character.
Definition: utf16.c:27
parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s, uint32_t len, uint32_t off, uint32_t *nextoff)
Find next legal UTF-16 char in string.
Definition: utf16.c:214
parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, size_t *len)
Convert a single UCS-4 character into a UTF-16 sequence.
Definition: utf16.c:70
#define UNUSED(x)
Definition: utils.h:25
#define min(a, b)
Definition: utils.h:16
#define SLEN(s)
Definition: utils.h:21
#define max(a, b)
Definition: utils.h:12