diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/util/charset/charcnv.c | 289 | ||||
-rw-r--r-- | lib/util/charset/charset.h | 5 | ||||
-rw-r--r-- | lib/util/charset/codepoints.c | 362 | ||||
-rw-r--r-- | lib/util/charset/util_unistr.c | 26 | ||||
-rw-r--r-- | lib/util/charset/wscript_build | 4 |
5 files changed, 366 insertions, 320 deletions
diff --git a/lib/util/charset/charcnv.c b/lib/util/charset/charcnv.c index 59b36e3062..dd2c725125 100644 --- a/lib/util/charset/charcnv.c +++ b/lib/util/charset/charcnv.c @@ -38,137 +38,6 @@ * @sa lib/iconv.c */ -struct smb_iconv_convenience { - TALLOC_CTX *child_ctx; - const char *unix_charset; - const char *dos_charset; - bool native_iconv; - smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS]; -}; - - -/** - * Return the name of a charset to give to iconv(). - **/ -static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch) -{ - switch (ch) { - case CH_UTF16: return "UTF-16LE"; - case CH_UNIX: return ic->unix_charset; - case CH_DOS: return ic->dos_charset; - case CH_UTF8: return "UTF8"; - case CH_UTF16BE: return "UTF-16BE"; - case CH_UTF16MUNGED: return "UTF16_MUNGED"; - default: - return "ASCII"; - } -} - -/** - re-initialize iconv conversion descriptors -**/ -static int close_iconv_convenience(struct smb_iconv_convenience *data) -{ - unsigned c1, c2; - for (c1=0;c1<NUM_CHARSETS;c1++) { - for (c2=0;c2<NUM_CHARSETS;c2++) { - if (data->conv_handles[c1][c2] != NULL) { - if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) { - smb_iconv_close(data->conv_handles[c1][c2]); - } - data->conv_handles[c1][c2] = NULL; - } - } - } - - return 0; -} - -/* - the old_ic is passed in here as the smb_iconv_convenience structure - is used as a global pointer in some places (eg. python modules). We - don't want to invalidate those global pointers, but we do want to - update them with the right charset information when loadparm - runs. To do that we need to re-use the structure pointer, but - re-fill the elements in the structure with the updated values - */ -_PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_reinit(TALLOC_CTX *mem_ctx, - const char *dos_charset, - const char *unix_charset, - bool native_iconv, - struct smb_iconv_convenience *old_ic) -{ - struct smb_iconv_convenience *ret; - - if (old_ic != NULL) { - ret = old_ic; - close_iconv_convenience(ret); - talloc_free(ret->child_ctx); - ZERO_STRUCTP(ret); - } else { - ret = talloc_zero(mem_ctx, struct smb_iconv_convenience); - } - if (ret == NULL) { - return NULL; - } - - /* we use a child context to allow us to free all ptrs without - freeing the structure itself */ - ret->child_ctx = talloc_new(ret); - if (ret->child_ctx == NULL) { - return NULL; - } - - talloc_set_destructor(ret, close_iconv_convenience); - - ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset); - ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset); - ret->native_iconv = native_iconv; - - return ret; -} - -/* - on-demand initialisation of conversion handles -*/ -static smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic, - charset_t from, charset_t to) -{ - const char *n1, *n2; - static bool initialised; - - if (initialised == false) { - initialised = true; - } - - if (ic->conv_handles[from][to]) { - return ic->conv_handles[from][to]; - } - - n1 = charset_name(ic, from); - n2 = charset_name(ic, to); - - ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1, - ic->native_iconv); - - if (ic->conv_handles[from][to] == (smb_iconv_t)-1) { - if ((from == CH_DOS || to == CH_DOS) && - strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) { - DEBUG(0,("dos charset '%s' unavailable - using ASCII\n", - charset_name(ic, CH_DOS))); - ic->dos_charset = "ASCII"; - - n1 = charset_name(ic, from); - n2 = charset_name(ic, to); - - ic->conv_handles[from][to] = - smb_iconv_open_ex(ic, n2, n1, ic->native_iconv); - } - } - - return ic->conv_handles[from][to]; -} - /** * Convert string from one encoding to another, making error checking etc * @@ -363,161 +232,3 @@ _PUBLIC_ bool convert_string_talloc_convenience(TALLOC_CTX *ctx, return true; } - -/** - * Return the unicode codepoint for the next character in the input - * string in the given src_charset. - * The unicode codepoint (codepoint_t) is an unsinged 32 bit value. - * - * Also return the number of bytes consumed (which tells the caller - * how many bytes to skip to get to the next src_charset-character). - * - * This is implemented (in the non-ascii-case) by first converting the - * next character in the input string to UTF16_LE and then calculating - * the unicode codepoint from that. - * - * Return INVALID_CODEPOINT if the next character cannot be converted. - */ -_PUBLIC_ codepoint_t next_codepoint_convenience_ext( - struct smb_iconv_convenience *ic, - const char *str, charset_t src_charset, - size_t *bytes_consumed) -{ - /* it cannot occupy more than 4 bytes in UTF16 format */ - uint8_t buf[4]; - smb_iconv_t descriptor; - size_t ilen_orig; - size_t ilen; - size_t olen; - char *outbuf; - - if ((str[0] & 0x80) == 0) { - *bytes_consumed = 1; - return (codepoint_t)str[0]; - } - - /* - * we assume that no multi-byte character can take more than 5 bytes. - * This is OK as we only support codepoints up to 1M (U+100000) - */ - ilen_orig = strnlen(str, 5); - ilen = ilen_orig; - - descriptor = get_conv_handle(ic, src_charset, CH_UTF16); - if (descriptor == (smb_iconv_t)-1) { - *bytes_consumed = 1; - return INVALID_CODEPOINT; - } - - /* - * this looks a little strange, but it is needed to cope with - * codepoints above 64k (U+1000) which are encoded as per RFC2781. - */ - olen = 2; - outbuf = (char *)buf; - smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); - if (olen == 2) { - olen = 4; - outbuf = (char *)buf; - smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); - if (olen == 4) { - /* we didn't convert any bytes */ - *bytes_consumed = 1; - return INVALID_CODEPOINT; - } - olen = 4 - olen; - } else { - olen = 2 - olen; - } - - *bytes_consumed = ilen_orig - ilen; - - if (olen == 2) { - return (codepoint_t)SVAL(buf, 0); - } - if (olen == 4) { - /* decode a 4 byte UTF16 character manually */ - return (codepoint_t)0x10000 + - (buf[2] | ((buf[3] & 0x3)<<8) | - (buf[0]<<10) | ((buf[1] & 0x3)<<18)); - } - - /* no other length is valid */ - return INVALID_CODEPOINT; -} - -/* - return the unicode codepoint for the next multi-byte CH_UNIX character - in the string - - also return the number of bytes consumed (which tells the caller - how many bytes to skip to get to the next CH_UNIX character) - - return INVALID_CODEPOINT if the next character cannot be converted -*/ -_PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic, - const char *str, size_t *size) -{ - return next_codepoint_convenience_ext(ic, str, CH_UNIX, size); -} - -/* - push a single codepoint into a CH_UNIX string the target string must - be able to hold the full character, which is guaranteed if it is at - least 5 bytes in size. The caller may pass less than 5 bytes if they - are sure the character will fit (for example, you can assume that - uppercase/lowercase of a character will not add more than 1 byte) - - return the number of bytes occupied by the CH_UNIX character, or - -1 on failure -*/ -_PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic, - char *str, codepoint_t c) -{ - smb_iconv_t descriptor; - uint8_t buf[4]; - size_t ilen, olen; - const char *inbuf; - - if (c < 128) { - *str = c; - return 1; - } - - descriptor = get_conv_handle(ic, - CH_UTF16, CH_UNIX); - if (descriptor == (smb_iconv_t)-1) { - return -1; - } - - if (c < 0x10000) { - ilen = 2; - olen = 5; - inbuf = (char *)buf; - SSVAL(buf, 0, c); - smb_iconv(descriptor, &inbuf, &ilen, &str, &olen); - if (ilen != 0) { - return -1; - } - return 5 - olen; - } - - c -= 0x10000; - - buf[0] = (c>>10) & 0xFF; - buf[1] = (c>>18) | 0xd8; - buf[2] = c & 0xFF; - buf[3] = ((c>>8) & 0x3) | 0xdc; - - ilen = 4; - olen = 5; - inbuf = (char *)buf; - - smb_iconv(descriptor, &inbuf, &ilen, &str, &olen); - if (ilen != 0) { - return -1; - } - return 5 - olen; -} - - diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h index 28d762578b..b4a5a55461 100644 --- a/lib/util/charset/charset.h +++ b/lib/util/charset/charset.h @@ -170,6 +170,10 @@ ssize_t iconv_talloc(TALLOC_CTX *mem_ctx, void *dest); extern struct smb_iconv_convenience *global_iconv_convenience; +struct smb_iconv_convenience *get_iconv_convenience(void); +smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic, + charset_t from, charset_t to); +const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch); codepoint_t next_codepoint_ext(const char *str, charset_t src_charset, size_t *size); @@ -195,6 +199,7 @@ int codepoint_cmpi(codepoint_t c1, codepoint_t c2); struct smb_iconv_convenience *smb_iconv_convenience_reinit(TALLOC_CTX *mem_ctx, const char *dos_charset, const char *unix_charset, + const char *display_charset, bool native_iconv, struct smb_iconv_convenience *old_ic); diff --git a/lib/util/charset/codepoints.c b/lib/util/charset/codepoints.c index 53febb8b5e..01183e4ad4 100644 --- a/lib/util/charset/codepoints.c +++ b/lib/util/charset/codepoints.c @@ -1,8 +1,10 @@ /* Unix SMB/CIFS implementation. - Samba utility functions - Copyright (C) Andrew Tridgell 1992-2001 + Character set conversion Extensions + Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001 + Copyright (C) Andrew Tridgell 2001 Copyright (C) Simo Sorce 2001 + Copyright (C) Jelmer Vernooij 2007 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,12 +18,17 @@ You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ +*/ #include "includes.h" +#include "lib/util/charset/charset.h" #include "system/locale.h" #include "dynconfig.h" +#ifdef strcasecmp +#undef strcasecmp +#endif + /** * @file * @brief Unicode string manipulation @@ -126,3 +133,352 @@ _PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2) } +struct smb_iconv_convenience { + TALLOC_CTX *child_ctx; + const char *unix_charset; + const char *dos_charset; + const char *display_charset; + bool native_iconv; + smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS]; +}; + +struct smb_iconv_convenience *global_iconv_convenience = NULL; + +struct smb_iconv_convenience *get_iconv_convenience(void) +{ + if (global_iconv_convenience == NULL) + global_iconv_convenience = smb_iconv_convenience_reinit(talloc_autofree_context(), + "ASCII", "UTF-8", "ASCII", true, NULL); + return global_iconv_convenience; +} + +/** + * Return the name of a charset to give to iconv(). + **/ +const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch) +{ + switch (ch) { + case CH_UTF16: return "UTF-16LE"; + case CH_UNIX: return ic->unix_charset; + case CH_DOS: return ic->dos_charset; + case CH_DISPLAY: return ic->display_charset; + case CH_UTF8: return "UTF8"; + case CH_UTF16BE: return "UTF-16BE"; + case CH_UTF16MUNGED: return "UTF16_MUNGED"; + default: + return "ASCII"; + } +} + +/** + re-initialize iconv conversion descriptors +**/ +static int close_iconv_convenience(struct smb_iconv_convenience *data) +{ + unsigned c1, c2; + for (c1=0;c1<NUM_CHARSETS;c1++) { + for (c2=0;c2<NUM_CHARSETS;c2++) { + if (data->conv_handles[c1][c2] != NULL) { + if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) { + smb_iconv_close(data->conv_handles[c1][c2]); + } + data->conv_handles[c1][c2] = NULL; + } + } + } + + return 0; +} + +static const char *map_locale(const char *charset) +{ + if (strcmp(charset, "LOCALE") != 0) { + return charset; + } +#if defined(HAVE_NL_LANGINFO) && defined(CODESET) + { + const char *ln; + smb_iconv_t handle; + + ln = nl_langinfo(CODESET); + if (ln == NULL) { + DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n")); + return "ASCII"; + } + /* Check whether the charset name is supported + by iconv */ + handle = smb_iconv_open(ln, "UCS-2LE"); + if (handle == (smb_iconv_t) -1) { + DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln)); + return "ASCII"; + } else { + DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln)); + smb_iconv_close(handle); + } + return ln; + } +#endif + return "ASCII"; +} + +/* + the old_ic is passed in here as the smb_iconv_convenience structure + is used as a global pointer in some places (eg. python modules). We + don't want to invalidate those global pointers, but we do want to + update them with the right charset information when loadparm + runs. To do that we need to re-use the structure pointer, but + re-fill the elements in the structure with the updated values + */ +_PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_reinit(TALLOC_CTX *mem_ctx, + const char *dos_charset, + const char *unix_charset, + const char *display_charset, + bool native_iconv, + struct smb_iconv_convenience *old_ic) +{ + struct smb_iconv_convenience *ret; + + display_charset = map_locale(display_charset); + + if (old_ic != NULL) { + ret = old_ic; + close_iconv_convenience(ret); + talloc_free(ret->child_ctx); + ZERO_STRUCTP(ret); + } else { + ret = talloc_zero(mem_ctx, struct smb_iconv_convenience); + } + if (ret == NULL) { + return NULL; + } + + /* we use a child context to allow us to free all ptrs without + freeing the structure itself */ + ret->child_ctx = talloc_new(ret); + if (ret->child_ctx == NULL) { + return NULL; + } + + talloc_set_destructor(ret, close_iconv_convenience); + + ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset); + ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset); + ret->display_charset = talloc_strdup(ret->child_ctx, display_charset); + ret->native_iconv = native_iconv; + + return ret; +} + +/* + on-demand initialisation of conversion handles +*/ +smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic, + charset_t from, charset_t to) +{ + const char *n1, *n2; + static bool initialised; + + if (initialised == false) { + initialised = true; + } + + if (ic->conv_handles[from][to]) { + return ic->conv_handles[from][to]; + } + + n1 = charset_name(ic, from); + n2 = charset_name(ic, to); + + ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1, + ic->native_iconv); + + if (ic->conv_handles[from][to] == (smb_iconv_t)-1) { + if ((from == CH_DOS || to == CH_DOS) && + strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) { + DEBUG(0,("dos charset '%s' unavailable - using ASCII\n", + charset_name(ic, CH_DOS))); + ic->dos_charset = "ASCII"; + + n1 = charset_name(ic, from); + n2 = charset_name(ic, to); + + ic->conv_handles[from][to] = + smb_iconv_open_ex(ic, n2, n1, ic->native_iconv); + } + } + + return ic->conv_handles[from][to]; +} + +/** + * Return the unicode codepoint for the next character in the input + * string in the given src_charset. + * The unicode codepoint (codepoint_t) is an unsinged 32 bit value. + * + * Also return the number of bytes consumed (which tells the caller + * how many bytes to skip to get to the next src_charset-character). + * + * This is implemented (in the non-ascii-case) by first converting the + * next character in the input string to UTF16_LE and then calculating + * the unicode codepoint from that. + * + * Return INVALID_CODEPOINT if the next character cannot be converted. + */ +_PUBLIC_ codepoint_t next_codepoint_convenience_ext( + struct smb_iconv_convenience *ic, + const char *str, charset_t src_charset, + size_t *bytes_consumed) +{ + /* it cannot occupy more than 4 bytes in UTF16 format */ + uint8_t buf[4]; + smb_iconv_t descriptor; + size_t ilen_orig; + size_t ilen; + size_t olen; + char *outbuf; + + if ((str[0] & 0x80) == 0) { + *bytes_consumed = 1; + return (codepoint_t)str[0]; + } + + /* + * we assume that no multi-byte character can take more than 5 bytes. + * This is OK as we only support codepoints up to 1M (U+100000) + */ + ilen_orig = strnlen(str, 5); + ilen = ilen_orig; + + descriptor = get_conv_handle(ic, src_charset, CH_UTF16); + if (descriptor == (smb_iconv_t)-1) { + *bytes_consumed = 1; + return INVALID_CODEPOINT; + } + + /* + * this looks a little strange, but it is needed to cope with + * codepoints above 64k (U+1000) which are encoded as per RFC2781. + */ + olen = 2; + outbuf = (char *)buf; + smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); + if (olen == 2) { + olen = 4; + outbuf = (char *)buf; + smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); + if (olen == 4) { + /* we didn't convert any bytes */ + *bytes_consumed = 1; + return INVALID_CODEPOINT; + } + olen = 4 - olen; + } else { + olen = 2 - olen; + } + + *bytes_consumed = ilen_orig - ilen; + + if (olen == 2) { + return (codepoint_t)SVAL(buf, 0); + } + if (olen == 4) { + /* decode a 4 byte UTF16 character manually */ + return (codepoint_t)0x10000 + + (buf[2] | ((buf[3] & 0x3)<<8) | + (buf[0]<<10) | ((buf[1] & 0x3)<<18)); + } + + /* no other length is valid */ + return INVALID_CODEPOINT; +} + +/* + return the unicode codepoint for the next multi-byte CH_UNIX character + in the string + + also return the number of bytes consumed (which tells the caller + how many bytes to skip to get to the next CH_UNIX character) + + return INVALID_CODEPOINT if the next character cannot be converted +*/ +_PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic, + const char *str, size_t *size) +{ + return next_codepoint_convenience_ext(ic, str, CH_UNIX, size); +} + +/* + push a single codepoint into a CH_UNIX string the target string must + be able to hold the full character, which is guaranteed if it is at + least 5 bytes in size. The caller may pass less than 5 bytes if they + are sure the character will fit (for example, you can assume that + uppercase/lowercase of a character will not add more than 1 byte) + + return the number of bytes occupied by the CH_UNIX character, or + -1 on failure +*/ +_PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic, + char *str, codepoint_t c) +{ + smb_iconv_t descriptor; + uint8_t buf[4]; + size_t ilen, olen; + const char *inbuf; + + if (c < 128) { + *str = c; + return 1; + } + + descriptor = get_conv_handle(ic, + CH_UTF16, CH_UNIX); + if (descriptor == (smb_iconv_t)-1) { + return -1; + } + + if (c < 0x10000) { + ilen = 2; + olen = 5; + inbuf = (char *)buf; + SSVAL(buf, 0, c); + smb_iconv(descriptor, &inbuf, &ilen, &str, &olen); + if (ilen != 0) { + return -1; + } + return 5 - olen; + } + + c -= 0x10000; + + buf[0] = (c>>10) & 0xFF; + buf[1] = (c>>18) | 0xd8; + buf[2] = c & 0xFF; + buf[3] = ((c>>8) & 0x3) | 0xdc; + + ilen = 4; + olen = 5; + inbuf = (char *)buf; + + smb_iconv(descriptor, &inbuf, &ilen, &str, &olen); + if (ilen != 0) { + return -1; + } + return 5 - olen; +} + +_PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset, + size_t *size) +{ + return next_codepoint_convenience_ext(get_iconv_convenience(), str, + src_charset, size); +} + +_PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size) +{ + return next_codepoint_convenience(get_iconv_convenience(), str, size); +} + +_PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c) +{ + return push_codepoint_convenience(get_iconv_convenience(), str, c); +} diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c index 410547400d..b6bfb29e7d 100644 --- a/lib/util/charset/util_unistr.c +++ b/lib/util/charset/util_unistr.c @@ -21,16 +21,6 @@ #include "includes.h" #include "system/locale.h" -struct smb_iconv_convenience *global_iconv_convenience = NULL; - -static inline struct smb_iconv_convenience *get_iconv_convenience(void) -{ - if (global_iconv_convenience == NULL) - global_iconv_convenience = smb_iconv_convenience_reinit(talloc_autofree_context(), - "ASCII", "UTF-8", true, NULL); - return global_iconv_convenience; -} - /** Case insensitive string compararison **/ @@ -1043,19 +1033,3 @@ _PUBLIC_ bool convert_string_talloc(TALLOC_CTX *ctx, allow_badcharcnv); } -_PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset, - size_t *size) -{ - return next_codepoint_convenience_ext(get_iconv_convenience(), str, - src_charset, size); -} - -_PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size) -{ - return next_codepoint_convenience(get_iconv_convenience(), str, size); -} - -_PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c) -{ - return push_codepoint_convenience(get_iconv_convenience(), str, c); -} diff --git a/lib/util/charset/wscript_build b/lib/util/charset/wscript_build index 18479a9978..7dcd189036 100644 --- a/lib/util/charset/wscript_build +++ b/lib/util/charset/wscript_build @@ -4,7 +4,7 @@ if bld.env._SAMBA_BUILD_ == 4: bld.SAMBA_SUBSYSTEM('CHARSET', source='charcnv.c util_unistr.c', - public_deps='ICONV_WRAPPER CODEPOINTS', + public_deps='CODEPOINTS', public_headers='charset.h', ) @@ -14,5 +14,5 @@ bld.SAMBA_SUBSYSTEM('ICONV_WRAPPER', bld.SAMBA_SUBSYSTEM('CODEPOINTS', source='codepoints.c', - deps='DYNCONFIG' + deps='DYNCONFIG ICONV_WRAPPER' ) |