diff options
-rw-r--r-- | lib/util/charset/iconv.c | 132 |
1 files changed, 132 insertions, 0 deletions
diff --git a/lib/util/charset/iconv.c b/lib/util/charset/iconv.c index a01b6a5787..10b3a6488b 100644 --- a/lib/util/charset/iconv.c +++ b/lib/util/charset/iconv.c @@ -51,6 +51,7 @@ static size_t ascii_pull (void *,const char **, size_t *, char **, size_t *); static size_t ascii_push (void *,const char **, size_t *, char **, size_t *); static size_t utf8_pull (void *,const char **, size_t *, char **, size_t *); static size_t utf8_push (void *,const char **, size_t *, char **, size_t *); +static size_t utf8_munged_push(void *,const char **, size_t *, char **, size_t *); static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *); static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *); static size_t iconv_copy (void *,const char **, size_t *, char **, size_t *); @@ -66,6 +67,10 @@ static const struct charset_functions builtin_functions[] = { /* we include the UTF-8 alias to cope with differing locale settings */ {"UTF8", utf8_pull, utf8_push}, {"UTF-8", utf8_pull, utf8_push}, + + /* this handles the munging needed for String2Key */ + {"UTF8_MUNGED", utf8_pull, utf8_munged_push}, + {"ASCII", ascii_pull, ascii_push}, {"UCS2-HEX", ucs2hex_pull, ucs2hex_push} }; @@ -707,4 +712,131 @@ error: } +/* + this takes a UTF16 sequence, munges it according to the string2key + rules, and produces a UTF8 sequence + +The rules are: + + 1) convert any instance of 0xD800 - 0xDBFF (high surrogate) + without an immediately following 0xDC00 - 0x0xDFFF (low surrogate) to + U+FFFD (OBJECT REPLACEMENT CHARACTER). + + 2) the same for any low surrogate that was not preceded by a high surrogate. + */ +static size_t utf8_munged_push(void *cd, const char **inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft) +{ + size_t in_left=*inbytesleft, out_left=*outbytesleft; + uint8_t *c = (uint8_t *)*outbuf; + const uint8_t *uc = (const uint8_t *)*inbuf; + + while (in_left >= 2 && out_left >= 1) { + unsigned int codepoint; + + if (uc[1] == 0 && !(uc[0] & 0x80)) { + /* simplest case */ + c[0] = uc[0]; + in_left -= 2; + out_left -= 1; + uc += 2; + c += 1; + continue; + } + + if ((uc[1]&0xf8) == 0) { + /* next simplest case */ + if (out_left < 2) { + errno = E2BIG; + goto error; + } + c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2); + c[1] = 0x80 | (uc[0] & 0x3f); + in_left -= 2; + out_left -= 2; + uc += 2; + c += 2; + continue; + } + + if ((uc[1] & 0xfc) == 0xdc) { + /* low surrogate not preceded by high surrogate + convert to 0xfffd */ + codepoint = 0xfffd; + goto codepoint16; + } + + if ((uc[1] & 0xfc) != 0xd8) { + codepoint = uc[0] | (uc[1]<<8); + goto codepoint16; + } + + /* its the first part of a 4 byte sequence */ + if (in_left < 4 || (uc[3] & 0xfc) != 0xdc) { + /* high surrogate not followed by low surrogate + convert to 0xfffd */ + codepoint = 0xfffd; + goto codepoint16; + } + + codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) | + (uc[0]<<10) | ((uc[1] & 0x3)<<18)); + + if (out_left < 4) { + errno = E2BIG; + goto error; + } + c[0] = 0xf0 | (codepoint >> 18); + c[1] = 0x80 | ((codepoint >> 12) & 0x3f); + c[2] = 0x80 | ((codepoint >> 6) & 0x3f); + c[3] = 0x80 | (codepoint & 0x3f); + + in_left -= 4; + out_left -= 4; + uc += 4; + c += 4; + continue; + + codepoint16: + if (out_left < 3) { + errno = E2BIG; + goto error; + } + c[0] = 0xe0 | (codepoint >> 12); + c[1] = 0x80 | ((codepoint >> 6) & 0x3f); + c[2] = 0x80 | (codepoint & 0x3f); + + in_left -= 2; + out_left -= 3; + uc += 2; + c += 3; + continue; + } + + if (in_left == 1) { + errno = EINVAL; + goto error; + } + + if (in_left > 1) { + errno = E2BIG; + goto error; + } + + *inbytesleft = in_left; + *outbytesleft = out_left; + *inbuf = (const char *)uc; + *outbuf = (char *)c; + + return 0; + +error: + *inbytesleft = in_left; + *outbytesleft = out_left; + *inbuf = (const char *)uc; + *outbuf = (char *)c; + return -1; +} + + |