From 55d55d9d9b881b2ec09fa76515cdd1cf6f0e2442 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Fri, 31 Oct 2008 15:41:34 +1100 Subject: finished adding UTF16_MUNGED charset Changed the approach for the charset to go via utf16, which makes a bit more sense to read. Added a testsuite for UTF16_MUNGED as part of LOCAL-ICONV --- lib/util/charset/charcnv.c | 1 + lib/util/charset/charset.h | 4 +- lib/util/charset/iconv.c | 111 +++++++++++++++-------------------------- lib/util/charset/tests/iconv.c | 58 +++++++++++++++++++++ 4 files changed, 102 insertions(+), 72 deletions(-) (limited to 'lib') diff --git a/lib/util/charset/charcnv.c b/lib/util/charset/charcnv.c index 2ae16c3250..9dd68f05ea 100644 --- a/lib/util/charset/charcnv.c +++ b/lib/util/charset/charcnv.c @@ -57,6 +57,7 @@ static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch) case CH_DOS: return ic->dos_charset; case CH_UTF8: return "UTF8"; case CH_UTF16BE: return "UTF-16BE"; + case CH_UTF16MUNGED: return "UTF16_MUNGED"; default: return "ASCII"; } diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h index 21fc20b8c3..cace79f949 100644 --- a/lib/util/charset/charset.h +++ b/lib/util/charset/charset.h @@ -28,9 +28,9 @@ #include /* this defines the charset types used in samba */ -typedef enum {CH_UTF16=0, CH_UNIX, CH_DOS, CH_UTF8, CH_UTF16BE} charset_t; +typedef enum {CH_UTF16=0, CH_UNIX, CH_DOS, CH_UTF8, CH_UTF16BE, CH_UTF16MUNGED} charset_t; -#define NUM_CHARSETS 5 +#define NUM_CHARSETS 6 /* * for each charset we have a function that pulls from that charset to diff --git a/lib/util/charset/iconv.c b/lib/util/charset/iconv.c index 10b3a6488b..b6842a49aa 100644 --- a/lib/util/charset/iconv.c +++ b/lib/util/charset/iconv.c @@ -51,7 +51,7 @@ static size_t ascii_pull (void *,const char **, size_t *, char **, size_t *); static size_t ascii_push (void *,const char **, size_t *, char **, size_t *); static size_t utf8_pull (void *,const char **, size_t *, char **, size_t *); 
static size_t utf8_push (void *,const char **, size_t *, char **, size_t *); -static size_t utf8_munged_push(void *,const char **, size_t *, char **, size_t *); +static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *); static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *); static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *); static size_t iconv_copy (void *,const char **, size_t *, char **, size_t *); @@ -69,7 +69,7 @@ static const struct charset_functions builtin_functions[] = { {"UTF-8", utf8_pull, utf8_push}, /* this handles the munging needed for String2Key */ - {"UTF8_MUNGED", utf8_pull, utf8_munged_push}, + {"UTF16_MUNGED", utf16_munged_pull, iconv_copy}, {"ASCII", ascii_pull, ascii_push}, {"UCS2-HEX", ucs2hex_pull, ucs2hex_push} @@ -713,103 +713,74 @@ error: /* - this takes a UTF16 sequence, munges it according to the string2key - rules, and produces a UTF8 sequence + this takes a UTF16 munged sequence, modifies it according to the + string2key rules, and produces a UTF16 sequence The rules are: - 1) convert any instance of 0xD800 - 0xDBFF (high surrogate) + 1) any 0x0000 characters are mapped to 0x0001 + + 2) convert any instance of 0xD800 - 0xDBFF (high surrogate) without an immediately following 0xDC00 - 0x0xDFFF (low surrogate) to U+FFFD (OBJECT REPLACEMENT CHARACTER). - 2) the same for any low surrogate that was not preceded by a high surrogate. + 3) the same for any low surrogate that was not preceded by a high surrogate. 
+ */ -static size_t utf8_munged_push(void *cd, const char **inbuf, size_t *inbytesleft, +static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { size_t in_left=*inbytesleft, out_left=*outbytesleft; uint8_t *c = (uint8_t *)*outbuf; const uint8_t *uc = (const uint8_t *)*inbuf; - while (in_left >= 2 && out_left >= 1) { - unsigned int codepoint; + while (in_left >= 2 && out_left >= 2) { + unsigned int codepoint = uc[0] | (uc[1]<<8); - if (uc[1] == 0 && !(uc[0] & 0x80)) { - /* simplest case */ - c[0] = uc[0]; - in_left -= 2; - out_left -= 1; - uc += 2; - c += 1; - continue; + if (codepoint == 0) { + codepoint = 1; } - if ((uc[1]&0xf8) == 0) { - /* next simplest case */ - if (out_left < 2) { + if ((codepoint & 0xfc00) == 0xd800) { + /* a high surrogate */ + unsigned int codepoint2; + if (in_left < 4) { + codepoint = 0xfffd; + goto codepoint16; + } + codepoint2 = uc[2] | (uc[3]<<8); + if ((codepoint2 & 0xfc00) != 0xdc00) { + /* high surrogate not followed by low + surrogate: convert to 0xfffd */ + codepoint = 0xfffd; + goto codepoint16; + } + if (out_left < 4) { errno = E2BIG; goto error; } - c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2); - c[1] = 0x80 | (uc[0] & 0x3f); - in_left -= 2; - out_left -= 2; - uc += 2; - c += 2; + memcpy(c, uc, 4); + in_left -= 4; + out_left -= 4; + uc += 4; + c += 4; continue; } - if ((uc[1] & 0xfc) == 0xdc) { - /* low surrogate not preceded by high surrogate - convert to 0xfffd */ - codepoint = 0xfffd; - goto codepoint16; - } - - if ((uc[1] & 0xfc) != 0xd8) { - codepoint = uc[0] | (uc[1]<<8); - goto codepoint16; - } - - /* its the first part of a 4 byte sequence */ - if (in_left < 4 || (uc[3] & 0xfc) != 0xdc) { - /* high surrogate not followed by low surrogate - convert to 0xfffd */ + if ((codepoint & 0xfc00) == 0xdc00) { + /* low surrogate not preceded by high + surrogate: convert to 0xfffd */ codepoint = 0xfffd; - goto codepoint16; - } - - codepoint = 0x10000 + (uc[2] | ((uc[3] & 
0x3)<<8) | - (uc[0]<<10) | ((uc[1] & 0x3)<<18)); - - if (out_left < 4) { - errno = E2BIG; - goto error; } - c[0] = 0xf0 | (codepoint >> 18); - c[1] = 0x80 | ((codepoint >> 12) & 0x3f); - c[2] = 0x80 | ((codepoint >> 6) & 0x3f); - c[3] = 0x80 | (codepoint & 0x3f); - - in_left -= 4; - out_left -= 4; - uc += 4; - c += 4; - continue; codepoint16: - if (out_left < 3) { - errno = E2BIG; - goto error; - } - c[0] = 0xe0 | (codepoint >> 12); - c[1] = 0x80 | ((codepoint >> 6) & 0x3f); - c[2] = 0x80 | (codepoint & 0x3f); + c[0] = codepoint & 0xFF; + c[1] = (codepoint>>8) & 0xFF; in_left -= 2; - out_left -= 3; + out_left -= 2; uc += 2; - c += 3; + c += 2; continue; } diff --git a/lib/util/charset/tests/iconv.c b/lib/util/charset/tests/iconv.c index 40e223b28f..1facea6136 100644 --- a/lib/util/charset/tests/iconv.c +++ b/lib/util/charset/tests/iconv.c @@ -398,10 +398,65 @@ static bool test_random_5m(struct torture_context *tctx) return true; } + +static bool test_string2key(struct torture_context *tctx) +{ + uint16_t *buf; + char *dest = NULL; + TALLOC_CTX *mem_ctx = talloc_new(tctx); + ssize_t ret; + size_t len = (random()%1000)+1; + const uint16_t in1[10] = { 'a', 0xd805, 'b', 0xdcf0, 'c', 0, 'd', 'e', 'f', 'g' }; + uint8_t le1[20]; + uint8_t *munged1; + uint8_t *out1; + int i; + const char *correct = "a\357\277\275b\357\277\275c\001defg"; + + buf = talloc_size(mem_ctx, len*2); + generate_random_buffer((uint8_t *)buf, len*2); + + torture_comment(tctx, "converting random buffer\n"); + + ret = convert_string_talloc(mem_ctx, CH_UTF16MUNGED, CH_UTF8, (void *)buf, len*2, (void**)&dest); + if (ret == -1) { + torture_fail(tctx, "Failed to convert random buffer\n"); + } + + for (i=0;i<10;i++) { + SSVAL(&le1[2*i], 0, in1[i]); + } + + torture_comment(tctx, "converting fixed buffer to UTF16\n"); + + ret = convert_string_talloc(mem_ctx, CH_UTF16MUNGED, CH_UTF16, (void *)le1, 20, (void**)&munged1); + if (ret == -1) { + torture_fail(tctx, "Failed to convert fixed buffer to 
UTF16_MUNGED\n"); + } + + torture_assert(tctx, ret == 20, "conversion should give 20 bytes\n"); + + torture_comment(tctx, "converting fixed buffer to UTF8\n"); + + ret = convert_string_talloc(mem_ctx, CH_UTF16MUNGED, CH_UTF8, (void *)le1, 20, (void**)&out1); + if (ret == -1) { + torture_fail(tctx, "Failed to convert fixed buffer to UTF8\n"); + } + + torture_assert(tctx, strcmp(correct, out1) == 0, "conversion gave incorrect result\n"); + + talloc_free(mem_ctx); + + return true; +} + struct torture_suite *torture_local_iconv(TALLOC_CTX *mem_ctx) { struct torture_suite *suite = torture_suite_create(mem_ctx, "ICONV"); + torture_suite_add_simple_test(suite, "string2key", + test_string2key); + torture_suite_add_simple_test(suite, "next_codepoint()", test_next_codepoint); @@ -410,6 +465,9 @@ struct torture_suite *torture_local_iconv(TALLOC_CTX *mem_ctx) torture_suite_add_simple_test(suite, "5M random UTF-16LE sequences", test_random_5m); + + torture_suite_add_simple_test(suite, "string2key", + test_string2key); return suite; } -- cgit