From 55d55d9d9b881b2ec09fa76515cdd1cf6f0e2442 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Fri, 31 Oct 2008 15:41:34 +1100 Subject: finished adding UTF16_MUNGED charset Changed the approach for the charset to go via utf16, which makes a bit more sense to read. Added a testsuite for UTF16_MUNGED as part of LOCAL-ICONV --- lib/util/charset/iconv.c | 111 +++++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 70 deletions(-) (limited to 'lib/util/charset/iconv.c') diff --git a/lib/util/charset/iconv.c b/lib/util/charset/iconv.c index 10b3a6488b..b6842a49aa 100644 --- a/lib/util/charset/iconv.c +++ b/lib/util/charset/iconv.c @@ -51,7 +51,7 @@ static size_t ascii_pull (void *,const char **, size_t *, char **, size_t *); static size_t ascii_push (void *,const char **, size_t *, char **, size_t *); static size_t utf8_pull (void *,const char **, size_t *, char **, size_t *); static size_t utf8_push (void *,const char **, size_t *, char **, size_t *); -static size_t utf8_munged_push(void *,const char **, size_t *, char **, size_t *); +static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *); static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *); static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *); static size_t iconv_copy (void *,const char **, size_t *, char **, size_t *); @@ -69,7 +69,7 @@ static const struct charset_functions builtin_functions[] = { {"UTF-8", utf8_pull, utf8_push}, /* this handles the munging needed for String2Key */ - {"UTF8_MUNGED", utf8_pull, utf8_munged_push}, + {"UTF16_MUNGED", utf16_munged_pull, iconv_copy}, {"ASCII", ascii_pull, ascii_push}, {"UCS2-HEX", ucs2hex_pull, ucs2hex_push} @@ -713,103 +713,74 @@ error: /* - this takes a UTF16 sequence, munges it according to the string2key - rules, and produces a UTF8 sequence + this takes a UTF16 munged sequence, modifies it according to the + string2key rules, and produces a UTF16 sequence The 
rules are: - 1) convert any instance of 0xD800 - 0xDBFF (high surrogate) + 1) any 0x0000 characters are mapped to 0x0001 + + 2) convert any instance of 0xD800 - 0xDBFF (high surrogate) without an immediately following 0xDC00 - 0x0xDFFF (low surrogate) to U+FFFD (OBJECT REPLACEMENT CHARACTER). - 2) the same for any low surrogate that was not preceded by a high surrogate. + 3) the same for any low surrogate that was not preceded by a high surrogate. + */ -static size_t utf8_munged_push(void *cd, const char **inbuf, size_t *inbytesleft, +static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { size_t in_left=*inbytesleft, out_left=*outbytesleft; uint8_t *c = (uint8_t *)*outbuf; const uint8_t *uc = (const uint8_t *)*inbuf; - while (in_left >= 2 && out_left >= 1) { - unsigned int codepoint; + while (in_left >= 2 && out_left >= 2) { + unsigned int codepoint = uc[0] | (uc[1]<<8); - if (uc[1] == 0 && !(uc[0] & 0x80)) { - /* simplest case */ - c[0] = uc[0]; - in_left -= 2; - out_left -= 1; - uc += 2; - c += 1; - continue; + if (codepoint == 0) { + codepoint = 1; } - if ((uc[1]&0xf8) == 0) { - /* next simplest case */ - if (out_left < 2) { + if ((codepoint & 0xfc00) == 0xd800) { + /* a high surrogate */ + unsigned int codepoint2; + if (in_left < 4) { + codepoint = 0xfffd; + goto codepoint16; + } + codepoint2 = uc[2] | (uc[3]<<8); + if ((codepoint2 & 0xfc00) != 0xdc00) { + /* high surrogate not followed by low + surrogate: convert to 0xfffd */ + codepoint = 0xfffd; + goto codepoint16; + } + if (out_left < 4) { errno = E2BIG; goto error; } - c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2); - c[1] = 0x80 | (uc[0] & 0x3f); - in_left -= 2; - out_left -= 2; - uc += 2; - c += 2; + memcpy(c, uc, 4); + in_left -= 4; + out_left -= 4; + uc += 4; + c += 4; continue; } - if ((uc[1] & 0xfc) == 0xdc) { - /* low surrogate not preceded by high surrogate - convert to 0xfffd */ - codepoint = 0xfffd; - goto codepoint16; - } - - if ((uc[1] & 
0xfc) != 0xd8) { - codepoint = uc[0] | (uc[1]<<8); - goto codepoint16; - } - - /* its the first part of a 4 byte sequence */ - if (in_left < 4 || (uc[3] & 0xfc) != 0xdc) { - /* high surrogate not followed by low surrogate - convert to 0xfffd */ + if ((codepoint & 0xfc00) == 0xdc00) { + /* low surrogate not preceded by high + surrogate: convert to 0xfffd */ codepoint = 0xfffd; - goto codepoint16; - } - - codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) | - (uc[0]<<10) | ((uc[1] & 0x3)<<18)); - - if (out_left < 4) { - errno = E2BIG; - goto error; } - c[0] = 0xf0 | (codepoint >> 18); - c[1] = 0x80 | ((codepoint >> 12) & 0x3f); - c[2] = 0x80 | ((codepoint >> 6) & 0x3f); - c[3] = 0x80 | (codepoint & 0x3f); - - in_left -= 4; - out_left -= 4; - uc += 4; - c += 4; - continue; codepoint16: - if (out_left < 3) { - errno = E2BIG; - goto error; - } - c[0] = 0xe0 | (codepoint >> 12); - c[1] = 0x80 | ((codepoint >> 6) & 0x3f); - c[2] = 0x80 | (codepoint & 0x3f); + c[0] = codepoint & 0xFF; + c[1] = (codepoint>>8) & 0xFF; in_left -= 2; - out_left -= 3; + out_left -= 2; uc += 2; - c += 3; + c += 2; continue; } -- cgit