summaryrefslogtreecommitdiff
path: root/lib/util/charset
diff options
context:
space:
mode:
author: Andrew Tridgell <tridge@samba.org>, 2008-10-31 15:41:34 +1100
committer: Andrew Tridgell <tridge@samba.org>, 2008-10-31 15:41:34 +1100
commit: 55d55d9d9b881b2ec09fa76515cdd1cf6f0e2442 (patch)
tree: b676358946045bc2271c1c75254ecaac21eb72b9 /lib/util/charset
parent: 5ecccac1c34f58019b195f6838f57366faa3575d (diff)
downloadsamba-55d55d9d9b881b2ec09fa76515cdd1cf6f0e2442.tar.gz
samba-55d55d9d9b881b2ec09fa76515cdd1cf6f0e2442.tar.bz2
samba-55d55d9d9b881b2ec09fa76515cdd1cf6f0e2442.zip
finished adding UTF16_MUNGED charset
Changed the approach for the charset to go via utf16, which makes a bit more sense to read. Added a testsuite for UTF16_MUNGED as part of LOCAL-ICONV.
Diffstat (limited to 'lib/util/charset')
-rw-r--r-- lib/util/charset/charcnv.c | 1
-rw-r--r-- lib/util/charset/charset.h | 4
-rw-r--r-- lib/util/charset/iconv.c | 111
-rw-r--r-- lib/util/charset/tests/iconv.c | 58
4 files changed, 102 insertions, 72 deletions
diff --git a/lib/util/charset/charcnv.c b/lib/util/charset/charcnv.c
index 2ae16c3250..9dd68f05ea 100644
--- a/lib/util/charset/charcnv.c
+++ b/lib/util/charset/charcnv.c
@@ -57,6 +57,7 @@ static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
case CH_DOS: return ic->dos_charset;
case CH_UTF8: return "UTF8";
case CH_UTF16BE: return "UTF-16BE";
+ case CH_UTF16MUNGED: return "UTF16_MUNGED";
default:
return "ASCII";
}
diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h
index 21fc20b8c3..cace79f949 100644
--- a/lib/util/charset/charset.h
+++ b/lib/util/charset/charset.h
@@ -28,9 +28,9 @@
#include <talloc.h>
/* this defines the charset types used in samba */
-typedef enum {CH_UTF16=0, CH_UNIX, CH_DOS, CH_UTF8, CH_UTF16BE} charset_t;
+typedef enum {CH_UTF16=0, CH_UNIX, CH_DOS, CH_UTF8, CH_UTF16BE, CH_UTF16MUNGED} charset_t;
-#define NUM_CHARSETS 5
+#define NUM_CHARSETS 6
/*
* for each charset we have a function that pulls from that charset to
diff --git a/lib/util/charset/iconv.c b/lib/util/charset/iconv.c
index 10b3a6488b..b6842a49aa 100644
--- a/lib/util/charset/iconv.c
+++ b/lib/util/charset/iconv.c
@@ -51,7 +51,7 @@ static size_t ascii_pull (void *,const char **, size_t *, char **, size_t *);
static size_t ascii_push (void *,const char **, size_t *, char **, size_t *);
static size_t utf8_pull (void *,const char **, size_t *, char **, size_t *);
static size_t utf8_push (void *,const char **, size_t *, char **, size_t *);
-static size_t utf8_munged_push(void *,const char **, size_t *, char **, size_t *);
+static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
static size_t iconv_copy (void *,const char **, size_t *, char **, size_t *);
@@ -69,7 +69,7 @@ static const struct charset_functions builtin_functions[] = {
{"UTF-8", utf8_pull, utf8_push},
/* this handles the munging needed for String2Key */
- {"UTF8_MUNGED", utf8_pull, utf8_munged_push},
+ {"UTF16_MUNGED", utf16_munged_pull, iconv_copy},
{"ASCII", ascii_pull, ascii_push},
{"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
@@ -713,103 +713,74 @@ error:
/*
- this takes a UTF16 sequence, munges it according to the string2key
- rules, and produces a UTF8 sequence
+ this takes a UTF16 munged sequence, modifies it according to the
+ string2key rules, and produces a UTF16 sequence
The rules are:
- 1) convert any instance of 0xD800 - 0xDBFF (high surrogate)
+ 1) any 0x0000 characters are mapped to 0x0001
+
+ 2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
U+FFFD (REPLACEMENT CHARACTER).
- 2) the same for any low surrogate that was not preceded by a high surrogate.
+ 3) the same for any low surrogate that was not preceded by a high surrogate.
+
*/
-static size_t utf8_munged_push(void *cd, const char **inbuf, size_t *inbytesleft,
+static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
char **outbuf, size_t *outbytesleft)
{
size_t in_left=*inbytesleft, out_left=*outbytesleft;
uint8_t *c = (uint8_t *)*outbuf;
const uint8_t *uc = (const uint8_t *)*inbuf;
- while (in_left >= 2 && out_left >= 1) {
- unsigned int codepoint;
+ while (in_left >= 2 && out_left >= 2) {
+ unsigned int codepoint = uc[0] | (uc[1]<<8);
- if (uc[1] == 0 && !(uc[0] & 0x80)) {
- /* simplest case */
- c[0] = uc[0];
- in_left -= 2;
- out_left -= 1;
- uc += 2;
- c += 1;
- continue;
+ if (codepoint == 0) {
+ codepoint = 1;
}
- if ((uc[1]&0xf8) == 0) {
- /* next simplest case */
- if (out_left < 2) {
+ if ((codepoint & 0xfc00) == 0xd800) {
+ /* a high surrogate */
+ unsigned int codepoint2;
+ if (in_left < 4) {
+ codepoint = 0xfffd;
+ goto codepoint16;
+ }
+ codepoint2 = uc[2] | (uc[3]<<8);
+ if ((codepoint2 & 0xfc00) != 0xdc00) {
+ /* high surrogate not followed by low
+ surrogate: convert to 0xfffd */
+ codepoint = 0xfffd;
+ goto codepoint16;
+ }
+ if (out_left < 4) {
errno = E2BIG;
goto error;
}
- c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
- c[1] = 0x80 | (uc[0] & 0x3f);
- in_left -= 2;
- out_left -= 2;
- uc += 2;
- c += 2;
+ memcpy(c, uc, 4);
+ in_left -= 4;
+ out_left -= 4;
+ uc += 4;
+ c += 4;
continue;
}
- if ((uc[1] & 0xfc) == 0xdc) {
- /* low surrogate not preceded by high surrogate
- convert to 0xfffd */
- codepoint = 0xfffd;
- goto codepoint16;
- }
-
- if ((uc[1] & 0xfc) != 0xd8) {
- codepoint = uc[0] | (uc[1]<<8);
- goto codepoint16;
- }
-
- /* its the first part of a 4 byte sequence */
- if (in_left < 4 || (uc[3] & 0xfc) != 0xdc) {
- /* high surrogate not followed by low surrogate
- convert to 0xfffd */
+ if ((codepoint & 0xfc00) == 0xdc00) {
+ /* low surrogate not preceded by high
+ surrogate: convert to 0xfffd */
codepoint = 0xfffd;
- goto codepoint16;
- }
-
- codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
- (uc[0]<<10) | ((uc[1] & 0x3)<<18));
-
- if (out_left < 4) {
- errno = E2BIG;
- goto error;
}
- c[0] = 0xf0 | (codepoint >> 18);
- c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
- c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
- c[3] = 0x80 | (codepoint & 0x3f);
-
- in_left -= 4;
- out_left -= 4;
- uc += 4;
- c += 4;
- continue;
codepoint16:
- if (out_left < 3) {
- errno = E2BIG;
- goto error;
- }
- c[0] = 0xe0 | (codepoint >> 12);
- c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
- c[2] = 0x80 | (codepoint & 0x3f);
+ c[0] = codepoint & 0xFF;
+ c[1] = (codepoint>>8) & 0xFF;
in_left -= 2;
- out_left -= 3;
+ out_left -= 2;
uc += 2;
- c += 3;
+ c += 2;
continue;
}
diff --git a/lib/util/charset/tests/iconv.c b/lib/util/charset/tests/iconv.c
index 40e223b28f..1facea6136 100644
--- a/lib/util/charset/tests/iconv.c
+++ b/lib/util/charset/tests/iconv.c
@@ -398,10 +398,65 @@ static bool test_random_5m(struct torture_context *tctx)
return true;
}
+
+static bool test_string2key(struct torture_context *tctx)
+{
+ uint16_t *buf;
+ char *dest = NULL;
+ TALLOC_CTX *mem_ctx = talloc_new(tctx);
+ ssize_t ret;
+ size_t len = (random()%1000)+1;
+ const uint16_t in1[10] = { 'a', 0xd805, 'b', 0xdcf0, 'c', 0, 'd', 'e', 'f', 'g' };
+ uint8_t le1[20];
+ uint8_t *munged1;
+ uint8_t *out1;
+ int i;
+ const char *correct = "a\357\277\275b\357\277\275c\001defg";
+
+ buf = talloc_size(mem_ctx, len*2);
+ generate_random_buffer((uint8_t *)buf, len*2);
+
+ torture_comment(tctx, "converting random buffer\n");
+
+ ret = convert_string_talloc(mem_ctx, CH_UTF16MUNGED, CH_UTF8, (void *)buf, len*2, (void**)&dest);
+ if (ret == -1) {
+ torture_fail(tctx, "Failed to convert random buffer\n");
+ }
+
+ for (i=0;i<10;i++) {
+ SSVAL(&le1[2*i], 0, in1[i]);
+ }
+
+ torture_comment(tctx, "converting fixed buffer to UTF16\n");
+
+ ret = convert_string_talloc(mem_ctx, CH_UTF16MUNGED, CH_UTF16, (void *)le1, 20, (void**)&munged1);
+ if (ret == -1) {
+ torture_fail(tctx, "Failed to convert fixed buffer to UTF16_MUNGED\n");
+ }
+
+ torture_assert(tctx, ret == 20, "conversion should give 20 bytes\n");
+
+ torture_comment(tctx, "converting fixed buffer to UTF8\n");
+
+ ret = convert_string_talloc(mem_ctx, CH_UTF16MUNGED, CH_UTF8, (void *)le1, 20, (void**)&out1);
+ if (ret == -1) {
+ torture_fail(tctx, "Failed to convert fixed buffer to UTF8\n");
+ }
+
+ torture_assert(tctx, strcmp(correct, out1) == 0, "conversion gave incorrect result\n");
+
+ talloc_free(mem_ctx);
+
+ return true;
+}
+
struct torture_suite *torture_local_iconv(TALLOC_CTX *mem_ctx)
{
struct torture_suite *suite = torture_suite_create(mem_ctx, "ICONV");
+ torture_suite_add_simple_test(suite, "string2key",
+ test_string2key);
+
torture_suite_add_simple_test(suite, "next_codepoint()",
test_next_codepoint);
@@ -410,6 +465,9 @@ struct torture_suite *torture_local_iconv(TALLOC_CTX *mem_ctx)
torture_suite_add_simple_test(suite, "5M random UTF-16LE sequences",
test_random_5m);
+
+ torture_suite_add_simple_test(suite, "string2key",
+ test_string2key);
return suite;
}