From 31c1c7846f6b6e5848bc39a28a65118bfa98e35d Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Wed, 1 Sep 2004 04:39:06 +0000 Subject: r2159: converted samba4 over to UTF-16. I had previously thought this was unnecessary, as Windows doesn't use standards-compliant UTF-16, and for filesystem operations treats bytes as UCS-2, but Bjoern Jacke has pointed out to me that this means we don't correctly store extended UTF-16 characters as UTF-8 on disk. This can be seen with (for example) the Gothic characters with codepoints above 64k. This commit also adds a LOCAL-ICONV torture test that tests the first 1 million codepoints against the system iconv library, and tests 5 million random UTF-16LE buffers for identical error handling to the system iconv library. The lib/iconv.c changes need backporting to samba3. (This used to be commit 756f28ac95feaa84b42402723d5f7286865c78db) --- source4/lib/charcnv.c | 34 +++--- source4/lib/iconv.c | 262 ++++++++++++++++++++++++++++++++++------------ source4/lib/util_unistr.c | 4 +- 3 files changed, 216 insertions(+), 84 deletions(-) (limited to 'source4/lib') diff --git a/source4/lib/charcnv.c b/source4/lib/charcnv.c index 8204a3bcdd..2109e957d9 100644 --- a/source4/lib/charcnv.c +++ b/source4/lib/charcnv.c @@ -47,12 +47,12 @@ static const char *charset_name(charset_t ch) { const char *ret = NULL; - if (ch == CH_UCS2) ret = "UTF-16LE"; + if (ch == CH_UTF16) ret = "UTF-16LE"; else if (ch == CH_UNIX) ret = lp_unix_charset(); else if (ch == CH_DOS) ret = lp_dos_charset(); else if (ch == CH_DISPLAY) ret = lp_display_charset(); else if (ch == CH_UTF8) ret = "UTF8"; - else if (ch == CH_UCS2BE) ret = "UCS-2BE"; + else if (ch == CH_UTF16BE) ret = "UTF-16BE"; if (!ret || !*ret) ret = "ASCII"; return ret; @@ -81,13 +81,13 @@ void init_iconv(void) /* so that charset_name() works we need to get the UNIX<->UCS2 going first */ - if (!conv_handles[CH_UNIX][CH_UCS2]) - conv_handles[CH_UNIX][CH_UCS2] = smb_iconv_open(charset_name(CH_UCS2), + if
(!conv_handles[CH_UNIX][CH_UTF16]) + conv_handles[CH_UNIX][CH_UTF16] = smb_iconv_open(charset_name(CH_UTF16), "ASCII"); - if (!conv_handles[CH_UCS2][CH_UNIX]) - conv_handles[CH_UCS2][CH_UNIX] = smb_iconv_open("ASCII", - charset_name(CH_UCS2)); + if (!conv_handles[CH_UTF16][CH_UNIX]) + conv_handles[CH_UTF16][CH_UNIX] = smb_iconv_open("ASCII", + charset_name(CH_UTF16)); for (c1=0;c1direct = to->push; return ret; } - if (strcasecmp(tocode, "UCS-2LE") == 0 && from) { + if (strcasecmp(tocode, "UTF-16LE") == 0 && from) { ret->direct = from->pull; return ret; } #ifdef HAVE_NATIVE_ICONV - if (strcasecmp(fromcode, "UCS-2LE") == 0) { + if (strcasecmp(fromcode, "UTF-16LE") == 0) { ret->direct = sys_iconv; ret->cd_direct = ret->cd_push; ret->cd_push = NULL; return ret; } - if (strcasecmp(tocode, "UCS-2LE") == 0) { + if (strcasecmp(tocode, "UTF-16LE") == 0) { ret->direct = sys_iconv; ret->cd_direct = ret->cd_pull; ret->cd_pull = NULL; @@ -460,100 +461,231 @@ static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft, static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { - while (*inbytesleft >= 1 && *outbytesleft >= 2) { - const uint8_t *c = (const uint8_t *)*inbuf; - uint8_t *uc = (uint8_t *)*outbuf; - int len = 1; + size_t in_left=*inbytesleft, out_left=*outbytesleft; + const uint8_t *c = (const uint8_t *)*inbuf; + uint8_t *uc = (uint8_t *)*outbuf; + while (in_left >= 1 && out_left >= 2) { if ((c[0] & 0x80) == 0) { uc[0] = c[0]; uc[1] = 0; - } else if ((c[0] & 0xf0) == 0xe0) { - if (*inbytesleft < 3) { - DEBUG(0,("short utf8 char\n")); - goto badseq; + c += 1; + in_left -= 1; + out_left -= 2; + uc += 2; + continue; + } + + if ((c[0] & 0xe0) == 0xc0) { + if (in_left < 2 || + (c[1] & 0xc0) != 0x80) { + errno = EILSEQ; + goto error; + } + uc[1] = (c[0]>>2) & 0x7; + uc[0] = (c[0]<<6) | (c[1]&0x3f); + c += 2; + in_left -= 2; + out_left -= 2; + uc += 2; + continue; + } + + if ((c[0] & 0xf0) == 0xe0) { + if 
(in_left < 3 || + (c[1] & 0xc0) != 0x80 || + (c[2] & 0xc0) != 0x80) { + errno = EILSEQ; + goto error; } uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF); uc[0] = (c[1]<<6) | (c[2]&0x3f); - len = 3; - } else if ((c[0] & 0xe0) == 0xc0) { - if (*inbytesleft < 2) { - DEBUG(0,("short utf8 char\n")); - goto badseq; + c += 3; + in_left -= 3; + out_left -= 2; + uc += 2; + continue; + } + + if ((c[0] & 0xf8) == 0xf0) { + unsigned int codepoint; + if (in_left < 4 || + (c[1] & 0xc0) != 0x80 || + (c[2] & 0xc0) != 0x80 || + (c[3] & 0xc0) != 0x80) { + errno = EILSEQ; + goto error; } - uc[1] = (c[0]>>2) & 0x7; - uc[0] = (c[0]<<6) | (c[1]&0x3f); - len = 2; + codepoint = + (c[3]&0x3f) | + ((c[2]&0x3f)<<6) | + ((c[1]&0x3f)<<12) | + ((c[0]&0x7)<<18); + if (codepoint < 0x10000) { + /* accept UTF-8 characters that are not + minimally packed, but pack the result */ + uc[0] = (codepoint & 0xFF); + uc[1] = (codepoint >> 8); + c += 4; + in_left -= 4; + out_left -= 2; + uc += 2; + continue; + } + + codepoint -= 0x10000; + + if (out_left < 4) { + errno = E2BIG; + goto error; + } + + uc[0] = (codepoint>>10) & 0xFF; + uc[1] = (codepoint>>18) | 0xd8; + uc[2] = codepoint & 0xFF; + uc[3] = ((codepoint>>8) & 0x3) | 0xdc; + c += 4; + in_left -= 4; + out_left -= 4; + uc += 4; + continue; } - (*inbuf) += len; - (*inbytesleft) -= len; - (*outbytesleft) -= 2; - (*outbuf) += 2; + /* we don't handle 5 byte sequences */ + errno = EINVAL; + goto error; } - if (*inbytesleft > 0) { + if (in_left > 0) { errno = E2BIG; - return -1; + goto error; } - + + *inbytesleft = in_left; + *outbytesleft = out_left; + *inbuf = c; + *outbuf = uc; return 0; -badseq: - errno = EINVAL; +error: + *inbytesleft = in_left; + *outbytesleft = out_left; + *inbuf = c; + *outbuf = uc; return -1; } static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) + char **outbuf, size_t *outbytesleft) { - while (*inbytesleft >= 2 && *outbytesleft >= 1) { - uint8_t *c = (uint8_t *)*outbuf; - 
const uint8_t *uc = (const uint8_t *)*inbuf; - int len=1; - - if (uc[1] & 0xf8) { - if (*outbytesleft < 3) { - DEBUG(0,("short utf8 write\n")); - goto toobig; + size_t in_left=*inbytesleft, out_left=*outbytesleft; + uint8_t *c = (uint8_t *)*outbuf; + const uint8_t *uc = (const uint8_t *)*inbuf; + + while (in_left >= 2 && out_left >= 1) { + unsigned int codepoint; + + if (uc[1] == 0 && !(uc[0] & 0x80)) { + /* simplest case */ + c[0] = uc[0]; + in_left -= 2; + out_left -= 1; + uc += 2; + c += 1; + continue; + } + + if ((uc[1]&0xf8) == 0) { + /* next simplest case */ + if (out_left < 2) { + errno = E2BIG; + goto error; } - c[0] = 0xe0 | (uc[1]>>4); - c[1] = 0x80 | ((uc[1]&0xF)<<2) | (uc[0]>>6); - c[2] = 0x80 | (uc[0]&0x3f); - len = 3; - } else if (uc[1] | (uc[0] & 0x80)) { - if (*outbytesleft < 2) { - DEBUG(0,("short utf8 write\n")); - goto toobig; + c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2); + c[1] = 0x80 | (uc[0] & 0x3f); + in_left -= 2; + out_left -= 2; + uc += 2; + c += 2; + continue; + } + + if ((uc[1] & 0xfc) == 0xdc) { + /* its the second part of a 4 byte sequence. 
Illegal */ + if (in_left < 4) { + errno = EINVAL; + } else { + errno = EILSEQ; } - c[0] = 0xc0 | (uc[1]<<2) | (uc[0]>>6); - c[1] = 0x80 | (uc[0]&0x3f); - len = 2; - } else { - c[0] = uc[0]; + goto error; } + if ((uc[1] & 0xfc) != 0xd8) { + codepoint = uc[0] | (uc[1]<<8); + if (out_left < 3) { + errno = E2BIG; + goto error; + } + c[0] = 0xe0 | (codepoint >> 12); + c[1] = 0x80 | ((codepoint >> 6) & 0x3f); + c[2] = 0x80 | (codepoint & 0x3f); + + in_left -= 2; + out_left -= 3; + uc += 2; + c += 3; + continue; + } - (*inbytesleft) -= 2; - (*outbytesleft) -= len; - (*inbuf) += 2; - (*outbuf) += len; + /* its the first part of a 4 byte sequence */ + if (in_left < 4) { + errno = EINVAL; + goto error; + } + if ((uc[3] & 0xfc) != 0xdc) { + errno = EILSEQ; + goto error; + } + codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) | + (uc[0]<<10) | ((uc[1] & 0x3)<<18)); + + if (out_left < 4) { + errno = E2BIG; + goto error; + } + c[0] = 0xf0 | (codepoint >> 18); + c[1] = 0x80 | ((codepoint >> 12) & 0x3f); + c[2] = 0x80 | ((codepoint >> 6) & 0x3f); + c[3] = 0x80 | (codepoint & 0x3f); + + in_left -= 4; + out_left -= 4; + uc += 4; + c += 4; } - if (*inbytesleft == 1) { + if (in_left == 1) { errno = EINVAL; - return -1; + goto error; } - if (*inbytesleft > 1) { + if (in_left > 1) { errno = E2BIG; - return -1; + goto error; } + + *inbytesleft = in_left; + *outbytesleft = out_left; + *inbuf = uc; + *outbuf = c; return 0; -toobig: - errno = E2BIG; +error: + *inbytesleft = in_left; + *outbytesleft = out_left; + *inbuf = uc; + *outbuf = c; return -1; } diff --git a/source4/lib/util_unistr.c b/source4/lib/util_unistr.c index 2bd990836e..63d68fa12e 100644 --- a/source4/lib/util_unistr.c +++ b/source4/lib/util_unistr.c @@ -96,9 +96,9 @@ static int check_dos_char(smb_ucs2_t c) char buf[10]; smb_ucs2_t c2 = 0; int len1, len2; - len1 = convert_string(CH_UCS2, CH_DOS, &c, 2, buf, sizeof(buf)); + len1 = convert_string(CH_UTF16, CH_DOS, &c, 2, buf, sizeof(buf)); if (len1 == 0) return 0; - len2 = 
convert_string(CH_DOS, CH_UCS2, buf, len1, &c2, 2); + len2 = convert_string(CH_DOS, CH_UTF16, buf, len1, &c2, 2); if (len2 != 2) return 0; return (c == c2); } -- cgit