diff options
Diffstat (limited to 'source4/lib/charset')
-rw-r--r-- | source4/lib/charset/charcnv.c | 768 | ||||
-rw-r--r-- | source4/lib/charset/charset.h | 154 | ||||
-rw-r--r-- | source4/lib/charset/config.m4 | 86 | ||||
-rw-r--r-- | source4/lib/charset/config.mk | 13 | ||||
-rw-r--r-- | source4/lib/charset/iconv.c | 711 | ||||
-rw-r--r-- | source4/lib/charset/tests/charset.c | 272 | ||||
-rw-r--r-- | source4/lib/charset/tests/iconv.c | 424 | ||||
-rw-r--r-- | source4/lib/charset/util_unistr.c | 684 |
8 files changed, 0 insertions, 3112 deletions
diff --git a/source4/lib/charset/charcnv.c b/source4/lib/charset/charcnv.c deleted file mode 100644 index 3e384304cf..0000000000 --- a/source4/lib/charset/charcnv.c +++ /dev/null @@ -1,768 +0,0 @@ -/* - Unix SMB/CIFS implementation. - Character set conversion Extensions - Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001 - Copyright (C) Andrew Tridgell 2001 - Copyright (C) Simo Sorce 2001 - Copyright (C) Jelmer Vernooij 2007 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - -*/ -#include "includes.h" -#include "system/iconv.h" -#include "param/param.h" - -/** - * @file - * - * @brief Character-set conversion routines built on our iconv. - * - * @note Samba's internal character set (at least in the 3.0 series) - * is always the same as the one for the Unix filesystem. It is - * <b>not</b> necessarily UTF-8 and may be different on machines that - * need i18n filenames to be compatible with Unix software. It does - * have to be a superset of ASCII. All multibyte sequences must start - * with a byte with the high bit set. - * - * @sa lib/iconv.c - */ - -struct smb_iconv_convenience { - const char *unix_charset; - const char *dos_charset; - bool native_iconv; - smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS]; -}; - - -/** - * Return the name of a charset to give to iconv(). - **/ -static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch) -{ - switch (ch) { - case CH_UTF16: return "UTF-16LE"; - case CH_UNIX: return ic->unix_charset; - case CH_DOS: return ic->dos_charset; - case CH_UTF8: return "UTF8"; - case CH_UTF16BE: return "UTF-16BE"; - default: - return "ASCII"; - } -} - -/** - re-initialize iconv conversion descriptors -**/ -static int close_iconv(struct smb_iconv_convenience *data) -{ - unsigned c1, c2; - for (c1=0;c1<NUM_CHARSETS;c1++) { - for (c2=0;c2<NUM_CHARSETS;c2++) { - if (data->conv_handles[c1][c2] != NULL) { - if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) { - smb_iconv_close(data->conv_handles[c1][c2]); - } - data->conv_handles[c1][c2] = NULL; - } - } - } - - return 0; -} - -_PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx, - const char *dos_charset, - const char *unix_charset, - bool native_iconv) -{ - struct smb_iconv_convenience *ret = talloc_zero(mem_ctx, - struct smb_iconv_convenience); - - if (ret == NULL) { - return NULL; - } - - talloc_set_destructor(ret, close_iconv); - - ret->dos_charset = talloc_strdup(ret, dos_charset); - ret->unix_charset = talloc_strdup(ret, unix_charset); - ret->native_iconv = native_iconv; - - return ret; -} - -/* - on-demand initialisation of conversion handles -*/ -static smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic, - charset_t from, charset_t to) -{ - const char *n1, *n2; - static bool initialised; - - if (initialised == false) { - initialised = true; - -#ifdef LC_ALL - /* we set back the locale to C to get ASCII-compatible - toupper/lower functions. For now we do not need - any other POSIX localisations anyway. When we - should really need localized string functions one - day we need to write our own ascii_tolower etc. - */ - setlocale(LC_ALL, "C"); -#endif - } - - if (ic->conv_handles[from][to]) { - return ic->conv_handles[from][to]; - } - - n1 = charset_name(ic, from); - n2 = charset_name(ic, to); - - ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1, - ic->native_iconv); - - if (ic->conv_handles[from][to] == (smb_iconv_t)-1) { - if ((from == CH_DOS || to == CH_DOS) && - strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) { - DEBUG(0,("dos charset '%s' unavailable - using ASCII\n", - charset_name(ic, CH_DOS))); - ic->dos_charset = "ASCII"; - - n1 = charset_name(ic, from); - n2 = charset_name(ic, to); - - ic->conv_handles[from][to] = - smb_iconv_open_ex(ic, n2, n1, ic->native_iconv); - } - } - - return ic->conv_handles[from][to]; -} - - -/** - * Convert string from one encoding to another, making error checking etc - * - * @param src pointer to source string (multibyte or singlebyte) - * @param srclen length of the source string in bytes - * @param dest pointer to destination string (multibyte or singlebyte) - * @param destlen maximal length allowed for string - * @returns the number of bytes occupied in the destination - **/ -_PUBLIC_ ssize_t convert_string(struct smb_iconv_convenience *ic, - charset_t from, charset_t to, - void const *src, size_t srclen, - void *dest, size_t destlen) -{ - size_t i_len, o_len; - size_t retval; - const char* inbuf = (const char*)src; - char* outbuf = (char*)dest; - smb_iconv_t descriptor; - - if (srclen == (size_t)-1) - srclen = strlen(inbuf)+1; - - descriptor = get_conv_handle(ic, from, to); - - if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) { - /* conversion not supported, use as is */ - size_t len = MIN(srclen,destlen); - memcpy(dest,src,len); - return len; - } - - i_len=srclen; - o_len=destlen; - retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len); - if(retval==(size_t)-1) { - const char *reason; - switch(errno) { - case EINVAL: - reason="Incomplete multibyte sequence"; - return -1; - case E2BIG: - reason="No more room"; - if (from == CH_UNIX) { - DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n", - charset_name(ic, from), charset_name(ic, to), - (int)srclen, (int)destlen, - (const char *)src)); - } else { - DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n", - charset_name(ic, from), charset_name(ic, to), - (int)srclen, (int)destlen)); - } - return -1; - case EILSEQ: - reason="Illegal multibyte sequence"; - return -1; - } - /* smb_panic(reason); */ - } - return destlen-o_len; -} - -_PUBLIC_ ssize_t convert_string_talloc_descriptor(TALLOC_CTX *ctx, smb_iconv_t descriptor, void const *src, size_t srclen, void **dest) -{ - size_t i_len, o_len, destlen; - size_t retval; - const char *inbuf = (const char *)src; - char *outbuf, *ob; - - *dest = NULL; - - /* it is _very_ rare that a conversion increases the size by - more than 3x */ - destlen = srclen; - outbuf = NULL; -convert: - destlen = 2 + (destlen*3); - ob = talloc_realloc(ctx, outbuf, char, destlen); - if (!ob) { - DEBUG(0, ("convert_string_talloc: realloc failed!\n")); - talloc_free(outbuf); - return (size_t)-1; - } else { - outbuf = ob; - } - - /* we give iconv 2 less bytes to allow us to terminate at the - end */ - i_len = srclen; - o_len = destlen-2; - retval = smb_iconv(descriptor, - &inbuf, &i_len, - &outbuf, &o_len); - if(retval == (size_t)-1) { - const char *reason="unknown error"; - switch(errno) { - case EINVAL: - reason="Incomplete multibyte sequence"; - break; - case E2BIG: - goto convert; - case EILSEQ: - reason="Illegal multibyte sequence"; - break; - } - DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf)); - talloc_free(ob); - return (size_t)-1; - } - - destlen = (destlen-2) - o_len; - - /* guarantee null termination in all charsets */ - SSVAL(ob, destlen, 0); - - *dest = ob; - - return destlen; -} - -/** - * Convert between character sets, allocating a new buffer using talloc for the result. - * - * @param srclen length of source buffer. - * @param dest always set at least to NULL - * @note -1 is not accepted for srclen. - * - * @returns Size in bytes of the converted string; or -1 in case of error. - **/ - -_PUBLIC_ ssize_t convert_string_talloc(TALLOC_CTX *ctx, - struct smb_iconv_convenience *ic, - charset_t from, charset_t to, - void const *src, size_t srclen, - void **dest) -{ - smb_iconv_t descriptor; - - *dest = NULL; - - if (src == NULL || srclen == (size_t)-1 || srclen == 0) - return (size_t)-1; - - descriptor = get_conv_handle(ic, from, to); - - if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) { - /* conversion not supported, return -1*/ - DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n", - charset_name(ic, from), - charset_name(ic, to))); - return -1; - } - - return convert_string_talloc_descriptor(ctx, descriptor, src, srclen, dest); -} - -/** - * Copy a string from a char* unix src to a dos codepage string destination. - * - * @return the number of bytes occupied by the string in the destination. - * - * @param flags can include - * <dl> - * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd> - * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd> - * </dl> - * - * @param dest_len the maximum length in bytes allowed in the - * destination. If @p dest_len is -1 then no maximum is used. - **/ -static ssize_t push_ascii(struct smb_iconv_convenience *ic, - void *dest, const char *src, size_t dest_len, int flags) -{ - size_t src_len; - ssize_t ret; - - if (flags & STR_UPPER) { - char *tmpbuf = strupper_talloc(NULL, src); - if (tmpbuf == NULL) { - return -1; - } - ret = push_ascii(ic, dest, tmpbuf, dest_len, flags & ~STR_UPPER); - talloc_free(tmpbuf); - return ret; - } - - src_len = strlen(src); - - if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) - src_len++; - - return convert_string(ic, CH_UNIX, CH_DOS, src, src_len, dest, dest_len); -} - -/** - * Copy a string from a unix char* src to an ASCII destination, - * allocating a buffer using talloc(). - * - * @param dest always set at least to NULL - * - * @returns The number of bytes occupied by the string in the destination - * or -1 in case of error. - **/ -_PUBLIC_ ssize_t push_ascii_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src) -{ - size_t src_len = strlen(src)+1; - *dest = NULL; - return convert_string_talloc(ctx, ic, CH_UNIX, CH_DOS, src, src_len, (void **)dest); -} - - -/** - * Copy a string from a dos codepage source to a unix char* destination. - * - * The resulting string in "dest" is always null terminated. - * - * @param flags can have: - * <dl> - * <dt>STR_TERMINATE</dt> - * <dd>STR_TERMINATE means the string in @p src - * is null terminated, and src_len is ignored.</dd> - * </dl> - * - * @param src_len is the length of the source area in bytes. - * @returns the number of bytes occupied by the string in @p src. - **/ -static ssize_t pull_ascii(struct smb_iconv_convenience *ic, char *dest, const void *src, size_t dest_len, size_t src_len, int flags) -{ - size_t ret; - - if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) { - if (src_len == (size_t)-1) { - src_len = strlen((const char *)src) + 1; - } else { - size_t len = strnlen((const char *)src, src_len); - if (len < src_len) - len++; - src_len = len; - } - } - - ret = convert_string(ic, CH_DOS, CH_UNIX, src, src_len, dest, dest_len); - - if (dest_len) - dest[MIN(ret, dest_len-1)] = 0; - - return src_len; -} - -/** - * Copy a string from a char* src to a unicode destination. - * - * @returns the number of bytes occupied by the string in the destination. - * - * @param flags can have: - * - * <dl> - * <dt>STR_TERMINATE <dd>means include the null termination. - * <dt>STR_UPPER <dd>means uppercase in the destination. - * <dt>STR_NOALIGN <dd>means don't do alignment. - * </dl> - * - * @param dest_len is the maximum length allowed in the - * destination. If dest_len is -1 then no maxiumum is used. - **/ -static ssize_t push_ucs2(struct smb_iconv_convenience *ic, - void *dest, const char *src, size_t dest_len, int flags) -{ - size_t len=0; - size_t src_len = strlen(src); - size_t ret; - - if (flags & STR_UPPER) { - char *tmpbuf = strupper_talloc(NULL, src); - if (tmpbuf == NULL) { - return -1; - } - ret = push_ucs2(ic, dest, tmpbuf, dest_len, flags & ~STR_UPPER); - talloc_free(tmpbuf); - return ret; - } - - if (flags & STR_TERMINATE) - src_len++; - - if (ucs2_align(NULL, dest, flags)) { - *(char *)dest = 0; - dest = (void *)((char *)dest + 1); - if (dest_len) dest_len--; - len++; - } - - /* ucs2 is always a multiple of 2 bytes */ - dest_len &= ~1; - - ret = convert_string(ic, CH_UNIX, CH_UTF16, src, src_len, dest, dest_len); - if (ret == (size_t)-1) { - return 0; - } - - len += ret; - - return len; -} - - -/** - * Copy a string from a unix char* src to a UCS2 destination, - * allocating a buffer using talloc(). - * - * @param dest always set at least to NULL - * - * @returns The number of bytes occupied by the string in the destination - * or -1 in case of error. - **/ -_PUBLIC_ ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, void **dest, const char *src) -{ - size_t src_len = strlen(src)+1; - *dest = NULL; - return convert_string_talloc(ctx, ic, CH_UNIX, CH_UTF16, src, src_len, dest); -} - - -/** - * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc - * - * @param dest always set at least to NULL - * - * @returns The number of bytes occupied by the string in the destination - **/ - -_PUBLIC_ ssize_t push_utf8_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src) -{ - size_t src_len = strlen(src)+1; - *dest = NULL; - return convert_string_talloc(ctx, ic, CH_UNIX, CH_UTF8, src, src_len, (void **)dest); -} - -/** - Copy a string from a ucs2 source to a unix char* destination. - Flags can have: - STR_TERMINATE means the string in src is null terminated. - STR_NOALIGN means don't try to align. - if STR_TERMINATE is set then src_len is ignored if it is -1. - src_len is the length of the source area in bytes - Return the number of bytes occupied by the string in src. - The resulting string in "dest" is always null terminated. -**/ - -static size_t pull_ucs2(struct smb_iconv_convenience *ic, char *dest, const void *src, size_t dest_len, size_t src_len, int flags) -{ - size_t ret; - - if (ucs2_align(NULL, src, flags)) { - src = (const void *)((const char *)src + 1); - if (src_len > 0) - src_len--; - } - - if (flags & STR_TERMINATE) { - if (src_len == (size_t)-1) { - src_len = utf16_len(src); - } else { - src_len = utf16_len_n(src, src_len); - } - } - - /* ucs2 is always a multiple of 2 bytes */ - if (src_len != (size_t)-1) - src_len &= ~1; - - ret = convert_string(ic, CH_UTF16, CH_UNIX, src, src_len, dest, dest_len); - if (dest_len) - dest[MIN(ret, dest_len-1)] = 0; - - return src_len; -} - -/** - * Copy a string from a ASCII src to a unix char * destination, allocating a buffer using talloc - * - * @param dest always set at least to NULL - * - * @returns The number of bytes occupied by the string in the destination - **/ - -_PUBLIC_ ssize_t pull_ascii_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src) -{ - size_t src_len = strlen(src)+1; - *dest = NULL; - return convert_string_talloc(ctx, ic, CH_DOS, CH_UNIX, src, src_len, (void **)dest); -} - -/** - * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc - * - * @param dest always set at least to NULL - * - * @returns The number of bytes occupied by the string in the destination - **/ - -_PUBLIC_ ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const void *src) -{ - size_t src_len = utf16_len(src); - *dest = NULL; - return convert_string_talloc(ctx, ic, CH_UTF16, CH_UNIX, src, src_len, (void **)dest); -} - -/** - * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc - * - * @param dest always set at least to NULL - * - * @returns The number of bytes occupied by the string in the destination - **/ - -_PUBLIC_ ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src) -{ - size_t src_len = strlen(src)+1; - *dest = NULL; - return convert_string_talloc(ctx, ic, CH_UTF8, CH_UNIX, src, src_len, (void **)dest); -} - -/** - Copy a string from a char* src to a unicode or ascii - dos codepage destination choosing unicode or ascii based on the - flags in the SMB buffer starting at base_ptr. - Return the number of bytes occupied by the string in the destination. - flags can have: - STR_TERMINATE means include the null termination. - STR_UPPER means uppercase in the destination. - STR_ASCII use ascii even with unicode packet. - STR_NOALIGN means don't do alignment. - dest_len is the maximum length allowed in the destination. If dest_len - is -1 then no maxiumum is used. -**/ - -_PUBLIC_ ssize_t push_string(struct smb_iconv_convenience *ic, - void *dest, const char *src, size_t dest_len, int flags) -{ - if (flags & STR_ASCII) { - return push_ascii(ic, dest, src, dest_len, flags); - } else if (flags & STR_UNICODE) { - return push_ucs2(ic, dest, src, dest_len, flags); - } else { - smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set"); - return -1; - } -} - - -/** - Copy a string from a unicode or ascii source (depending on - the packet flags) to a char* destination. - Flags can have: - STR_TERMINATE means the string in src is null terminated. - STR_UNICODE means to force as unicode. - STR_ASCII use ascii even with unicode packet. - STR_NOALIGN means don't do alignment. - if STR_TERMINATE is set then src_len is ignored is it is -1 - src_len is the length of the source area in bytes. - Return the number of bytes occupied by the string in src. - The resulting string in "dest" is always null terminated. -**/ - -_PUBLIC_ ssize_t pull_string(struct smb_iconv_convenience *ic, - char *dest, const void *src, size_t dest_len, size_t src_len, int flags) -{ - if (flags & STR_ASCII) { - return pull_ascii(ic, dest, src, dest_len, src_len, flags); - } else if (flags & STR_UNICODE) { - return pull_ucs2(ic, dest, src, dest_len, src_len, flags); - } else { - smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set"); - return -1; - } -} - - -/* - return the unicode codepoint for the next multi-byte CH_UNIX character - in the string - - also return the number of bytes consumed (which tells the caller - how many bytes to skip to get to the next CH_UNIX character) - - return INVALID_CODEPOINT if the next character cannot be converted -*/ -_PUBLIC_ codepoint_t next_codepoint(struct smb_iconv_convenience *ic, - const char *str, size_t *size) -{ - /* it cannot occupy more than 4 bytes in UTF16 format */ - uint8_t buf[4]; - smb_iconv_t descriptor; - size_t ilen_orig; - size_t ilen; - size_t olen; - char *outbuf; - - if ((str[0] & 0x80) == 0) { - *size = 1; - return (codepoint_t)str[0]; - } - - /* we assume that no multi-byte character can take - more than 5 bytes. This is OK as we only - support codepoints up to 1M */ - ilen_orig = strnlen(str, 5); - ilen = ilen_orig; - - descriptor = get_conv_handle(ic, CH_UNIX, CH_UTF16); - if (descriptor == (smb_iconv_t)-1) { - *size = 1; - return INVALID_CODEPOINT; - } - - /* this looks a little strange, but it is needed to cope - with codepoints above 64k */ - olen = 2; - outbuf = (char *)buf; - smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); - if (olen == 2) { - olen = 4; - outbuf = (char *)buf; - smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); - if (olen == 4) { - /* we didn't convert any bytes */ - *size = 1; - return INVALID_CODEPOINT; - } - olen = 4 - olen; - } else { - olen = 2 - olen; - } - - *size = ilen_orig - ilen; - - if (olen == 2) { - return (codepoint_t)SVAL(buf, 0); - } - if (olen == 4) { - /* decode a 4 byte UTF16 character manually */ - return (codepoint_t)0x10000 + - (buf[2] | ((buf[3] & 0x3)<<8) | - (buf[0]<<10) | ((buf[1] & 0x3)<<18)); - } - - /* no other length is valid */ - return INVALID_CODEPOINT; -} - -/* - push a single codepoint into a CH_UNIX string the target string must - be able to hold the full character, which is guaranteed if it is at - least 5 bytes in size. The caller may pass less than 5 bytes if they - are sure the character will fit (for example, you can assume that - uppercase/lowercase of a character will not add more than 1 byte) - - return the number of bytes occupied by the CH_UNIX character, or - -1 on failure -*/ -_PUBLIC_ ssize_t push_codepoint(struct smb_iconv_convenience *ic, - char *str, codepoint_t c) -{ - smb_iconv_t descriptor; - uint8_t buf[4]; - size_t ilen, olen; - const char *inbuf; - - if (c < 128) { - *str = c; - return 1; - } - - descriptor = get_conv_handle(ic, - CH_UTF16, CH_UNIX); - if (descriptor == (smb_iconv_t)-1) { - return -1; - } - - if (c < 0x10000) { - ilen = 2; - olen = 5; - inbuf = (char *)buf; - SSVAL(buf, 0, c); - smb_iconv(descriptor, &inbuf, &ilen, &str, &olen); - if (ilen != 0) { - return -1; - } - return 5 - olen; - } - - c -= 0x10000; - - buf[0] = (c>>10) & 0xFF; - buf[1] = (c>>18) | 0xd8; - buf[2] = c & 0xFF; - buf[3] = ((c>>8) & 0x3) | 0xdc; - - ilen = 4; - olen = 5; - inbuf = (char *)buf; - - smb_iconv(descriptor, &inbuf, &ilen, &str, &olen); - if (ilen != 0) { - return -1; - } - return 5 - olen; -} diff --git a/source4/lib/charset/charset.h b/source4/lib/charset/charset.h deleted file mode 100644 index 041eaeace7..0000000000 --- a/source4/lib/charset/charset.h +++ /dev/null @@ -1,154 +0,0 @@ -/* - Unix SMB/CIFS implementation. - charset defines - Copyright (C) Andrew Tridgell 2001 - Copyright (C) Jelmer Vernooij 2002 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/* This is a public header file that is installed as part of Samba. - * If you remove any functions or change their signature, update - * the so version number. */ - -#ifndef __CHARSET_H__ -#define __CHARSET_H__ - -#include <talloc.h> - -/* this defines the charset types used in samba */ -typedef enum {CH_UTF16=0, CH_UNIX, CH_DOS, CH_UTF8, CH_UTF16BE} charset_t; - -#define NUM_CHARSETS 5 - -/* - * for each charset we have a function that pulls from that charset to - * a ucs2 buffer, and a function that pushes to a ucs2 buffer - * */ - -struct charset_functions { - const char *name; - size_t (*pull)(void *, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft); - size_t (*push)(void *, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft); - struct charset_functions *prev, *next; -}; - -/* this type is used for manipulating unicode codepoints */ -typedef uint32_t codepoint_t; - -#define INVALID_CODEPOINT ((codepoint_t)-1) - - -/* generic iconv conversion structure */ -typedef struct smb_iconv_s { - size_t (*direct)(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft); - size_t (*pull)(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft); - size_t (*push)(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft); - void *cd_direct, *cd_pull, *cd_push; -} *smb_iconv_t; - -/* string manipulation flags */ -#define STR_TERMINATE 1 -#define STR_UPPER 2 -#define STR_ASCII 4 -#define STR_UNICODE 8 -#define STR_NOALIGN 16 -#define STR_NO_RANGE_CHECK 32 -#define STR_LEN8BIT 64 -#define STR_TERMINATE_ASCII 128 /* only terminate if ascii */ -#define STR_LEN_NOTERM 256 /* the length field is the unterminated length */ - -struct loadparm_context; -struct smb_iconv_convenience; -extern struct smb_iconv_convenience *global_smb_iconv_convenience; - -/* replace some string functions with multi-byte - versions */ -#define strlower(s) strlower_m(s) -#define strupper(s) strupper_m(s) - -char *strchr_m(const char *s, char c); -size_t strlen_m_term(const char *s); -size_t strlen_m(const char *s); -char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength); -void string_replace_w(char *s, char oldc, char newc); -bool strcsequal_w(const char *s1,const char *s2); -bool strequal_w(const char *s1, const char *s2); -int strncasecmp_m(const char *s1, const char *s2, size_t n); -bool next_token(const char **ptr,char *buff, const char *sep, size_t bufsize); -int strcasecmp_m(const char *s1, const char *s2); -size_t count_chars_w(const char *s, char c); -void strupper_m(char *s); -void strlower_m(char *s); -char *strupper_talloc(TALLOC_CTX *ctx, const char *src); -char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src); -char *strupper_talloc_n(TALLOC_CTX *ctx, const char *src, size_t n); -char *strlower_talloc(TALLOC_CTX *ctx, const char *src); -bool strhasupper(const char *string); -bool strhaslower(const char *string); -char *strrchr_m(const char *s, char c); -char *strchr_m(const char *s, char c); - -/* codepoints */ -codepoint_t next_codepoint(struct smb_iconv_convenience *ic, - const char *str, size_t *size); -ssize_t push_codepoint(struct smb_iconv_convenience *ic, - char *str, codepoint_t c); -codepoint_t toupper_w(codepoint_t val); -codepoint_t tolower_w(codepoint_t val); -int codepoint_cmpi(codepoint_t c1, codepoint_t c2); -ssize_t push_string(struct smb_iconv_convenience *ic, void *dest, const char *src, size_t dest_len, int flags); -ssize_t pull_string(struct smb_iconv_convenience *ic, - char *dest, const void *src, size_t dest_len, size_t src_len, int flags); -ssize_t convert_string(struct smb_iconv_convenience *ic, - charset_t from, charset_t to, - void const *src, size_t srclen, - void *dest, size_t destlen); -ssize_t convert_string_talloc_descriptor(TALLOC_CTX *ctx, smb_iconv_t descriptor, void const *src, size_t srclen, void **dest); -ssize_t convert_string_talloc(TALLOC_CTX *ctx, - struct smb_iconv_convenience *ic, - charset_t from, charset_t to, - void const *src, size_t srclen, - void **dest); -ssize_t push_ascii_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src); -ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, void **dest, const char *src); -ssize_t push_utf8_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src); -ssize_t pull_ascii_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src); -ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const void *src); -ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src); - -/* iconv */ -smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode); -int smb_iconv_close(smb_iconv_t cd); -size_t smb_iconv(smb_iconv_t cd, - const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft); -smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode, - const char *fromcode, bool native_iconv); - -/* iconv convenience */ -struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx, - const char *dos_charset, - const char *unix_charset, - bool native_iconv); - -void load_case_tables(void); -bool charset_register_backend(const void *_funcs); - -#endif /* __CHARSET_H__ */ diff --git a/source4/lib/charset/config.m4 b/source4/lib/charset/config.m4 deleted file mode 100644 index 453de9fe26..0000000000 --- a/source4/lib/charset/config.m4 +++ /dev/null @@ -1,86 +0,0 @@ -dnl SMB_CHECK_ICONV(hdr, msg, action-if-found,action-if-not-found) -AC_DEFUN(SMB_CHECK_ICONV,[ - AC_MSG_CHECKING($2) - AC_TRY_RUN([#include <stdlib.h> -#include <$1> - -int main() -{ - iconv_t cd = iconv_open("ASCII","UCS-2LE"); - if (cd == 0 || cd == (iconv_t)-1) return -1; - return 0; -} - ], - [AC_MSG_RESULT(yes); $3], - [AC_MSG_RESULT(no); $4], - [AC_MSG_RESULT(cross); $4]) -]) - -dnl SMB_CHECK_ICONV_DIR(dir,action-if-found,action-if-not-found) -AC_DEFUN(SMB_CHECK_ICONV_DIR, -[ - save_CPPFLAGS="$CPPFLAGS" - save_LDFLAGS="$LDFLAGS" - save_LIBS="$LIBS" - CPPFLAGS="-I$1/include" - LDFLAGS="-L$1/lib" - LIBS=-liconv - - SMB_CHECK_ICONV(iconv.h,Whether iconv.h is present,[ AC_DEFINE(HAVE_ICONV_H,1,[Whether iconv.h is present]) $2 ], [ - LIBS=-lgiconv - SMB_CHECK_ICONV(giconv.h,Whether giconv.h is present, [AC_DEFINE(HAVE_GICONV_H,1,[Whether giconv.h is present]) $2],[$3]) - ]) - - CPPFLAGS="$save_CPPFLAGS" - LDFLAGS="$save_LDFLAGS" - LIBS="$save_LIBS" -]) - -ICONV_FOUND=no -LOOK_DIRS="/usr /usr/local /sw" -AC_ARG_WITH(libiconv, -[ --with-libiconv=BASEDIR Use libiconv in BASEDIR/lib and BASEDIR/include (default=auto) ], -[ - if test "$withval" = "no" ; then - AC_MSG_ERROR(I won't take no for an answer) - else - if test "$withval" != "yes" ; then - SMB_CHECK_ICONV_DIR($withval, [ - ICONV_FOUND=yes; - ICONV_CPPFLAGS="$CPPFLAGS" - ICONV_LIBS="$LIBS" - ICONV_LDFLAGS="$LDFLAGS" - ], [AC_MSG_ERROR([No iconv library found in $withval])]) - fi - fi -]) - -if test x$ICONV_FOUND = xno; then - SMB_CHECK_ICONV(iconv.h, - [Whether iconv.h is present], - [AC_DEFINE(HAVE_ICONV_H,1,[Whether iconv.h is present]) ICONV_FOUND=yes]) -fi - -for i in $LOOK_DIRS ; do - if test x$ICONV_FOUND = xyes; then - break - fi - - SMB_CHECK_ICONV_DIR($i, [ - ICONV_FOUND=yes - ICONV_CPPFLAGS="$CPPFLAGS" - ICONV_LIBS="$LIBS" - ICONV_LDFLAGS="$LDFLAGS" - ], []) -done - -if test x"$ICONV_FOUND" = x"no"; then - AC_MSG_WARN([Sufficient support for iconv function was not found. - Install libiconv from http://www.gnu.org/software/libiconv/ for better charset compatibility!]) - SMB_ENABLE(ICONV,NO) -else - AC_DEFINE(HAVE_NATIVE_ICONV,1,[Whether external iconv is available]) - SMB_ENABLE(ICONV,YES) -fi - -SMB_EXT_LIB(ICONV,[${ICONV_LIBS}],[${ICONV_CFLAGS}],[${ICONV_CPPFLAGS}],[${ICONV_LDFLAGS}]) diff --git a/source4/lib/charset/config.mk b/source4/lib/charset/config.mk deleted file mode 100644 index 12c2f5f321..0000000000 --- a/source4/lib/charset/config.mk +++ /dev/null @@ -1,13 +0,0 @@ -################################################ -# Start SUBSYSTEM CHARSET -[SUBSYSTEM::CHARSET] -PUBLIC_DEPENDENCIES = ICONV -PRIVATE_DEPENDENCIES = DYNCONFIG -# End SUBSYSTEM CHARSET -################################################ - -CHARSET_OBJ_FILES = $(addprefix $(libcharsetsrcdir)/, iconv.o charcnv.o util_unistr.o) - -PUBLIC_HEADERS += $(libcharsetsrcdir)/charset.h - -$(eval $(call proto_header_template,$(libcharsetsrcdir)/charset_proto.h,$(CHARSET_OBJ_FILES:.o=.c))) diff --git a/source4/lib/charset/iconv.c b/source4/lib/charset/iconv.c deleted file mode 100644 index 150383e7f9..0000000000 --- a/source4/lib/charset/iconv.c +++ /dev/null @@ -1,711 +0,0 @@ -/* - Unix SMB/CIFS implementation. - minimal iconv implementation - Copyright (C) Andrew Tridgell 2001 - Copyright (C) Jelmer Vernooij 2002 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include "includes.h" -#include "../lib/util/dlinklist.h" -#include "system/iconv.h" -#include "system/filesys.h" -#include "param/param.h" - - -/** - * @file - * - * @brief Samba wrapper/stub for iconv character set conversion. - * - * iconv is the XPG2 interface for converting between character - * encodings. This file provides a Samba wrapper around it, and also - * a simple reimplementation that is used if the system does not - * implement iconv. - * - * Samba only works with encodings that are supersets of ASCII: ascii - * characters like whitespace can be tested for directly, multibyte - * sequences start with a byte with the high bit set, and strings are - * terminated by a nul byte. - * - * Note that the only function provided by iconv is conversion between - * characters. It doesn't directly support operations like - * uppercasing or comparison. We have to convert to UTF-16LE and - * compare there. - * - * @sa Samba Developers Guide - **/ - -static size_t ascii_pull (void *,const char **, size_t *, char **, size_t *); -static size_t ascii_push (void *,const char **, size_t *, char **, size_t *); -static size_t utf8_pull (void *,const char **, size_t *, char **, size_t *); -static size_t utf8_push (void *,const char **, size_t *, char **, size_t *); -static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *); -static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *); -static size_t iconv_copy (void *,const char **, size_t *, char **, size_t *); -static size_t iconv_swab (void *,const char **, size_t *, char **, size_t *); - -static const struct charset_functions builtin_functions[] = { - /* windows is closest to UTF-16 */ - {"UCS-2LE", iconv_copy, iconv_copy}, - {"UTF-16LE", iconv_copy, iconv_copy}, - {"UCS-2BE", iconv_swab, iconv_swab}, - {"UTF-16BE", iconv_swab, iconv_swab}, - - /* we include the UTF-8 alias to cope with differing locale settings */ - {"UTF8", utf8_pull, utf8_push}, - {"UTF-8", utf8_pull, utf8_push}, - {"ASCII", ascii_pull, ascii_push}, - {"UCS2-HEX", ucs2hex_pull, ucs2hex_push} -}; - -static struct charset_functions *charsets = NULL; - -bool charset_register_backend(const void *_funcs) -{ - struct charset_functions *funcs = (struct charset_functions *)memdup(_funcs,sizeof(struct charset_functions)); - struct charset_functions *c; - - /* Check whether we already have this charset... */ - for (c = charsets; c != NULL; c = c->next) { - if(!strcasecmp(c->name, funcs->name)) { - DEBUG(2, ("Duplicate charset %s, not registering\n", funcs->name)); - return false; - } - } - - funcs->next = funcs->prev = NULL; - DLIST_ADD(charsets, funcs); - return true; -} - -#ifdef HAVE_NATIVE_ICONV -/* if there was an error then reset the internal state, - this ensures that we don't have a shift state remaining for - character sets like SJIS */ -static size_t sys_iconv(void *cd, - const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - size_t ret = iconv((iconv_t)cd, - discard_const_p(char *, inbuf), inbytesleft, - outbuf, outbytesleft); - if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL); - return ret; -} -#endif - -/** - * This is a simple portable iconv() implementaion. - * - * It only knows about a very small number of character sets - just - * enough that Samba works on systems that don't have iconv. - **/ -_PUBLIC_ size_t smb_iconv(smb_iconv_t cd, - const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - char cvtbuf[2048]; - size_t bufsize; - - /* in many cases we can go direct */ - if (cd->direct) { - return cd->direct(cd->cd_direct, - inbuf, inbytesleft, outbuf, outbytesleft); - } - - - /* otherwise we have to do it chunks at a time */ - while (*inbytesleft > 0) { - char *bufp1 = cvtbuf; - const char *bufp2 = cvtbuf; - - bufsize = sizeof(cvtbuf); - - if (cd->pull(cd->cd_pull, - inbuf, inbytesleft, &bufp1, &bufsize) == -1 - && errno != E2BIG) return -1; - - bufsize = sizeof(cvtbuf) - bufsize; - - if (cd->push(cd->cd_push, - &bufp2, &bufsize, - outbuf, outbytesleft) == -1) return -1; - } - - return 0; -} - -static bool is_utf16(const char *name) -{ - return strcasecmp(name, "UCS-2LE") == 0 || - strcasecmp(name, "UTF-16LE") == 0; -} - - - -_PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode, - const char *fromcode, bool native_iconv) -{ - smb_iconv_t ret; - const struct charset_functions *from=NULL, *to=NULL; - int i; - - ret = (smb_iconv_t)talloc_named(mem_ctx, - sizeof(*ret), - "iconv(%s,%s)", tocode, fromcode); - if (!ret) { - errno = ENOMEM; - return (smb_iconv_t)-1; - } - memset(ret, 0, sizeof(*ret)); - - /* check for the simplest null conversion */ - if (strcmp(fromcode, tocode) == 0) { - ret->direct = iconv_copy; - return ret; - } - - for (i=0;i<ARRAY_SIZE(builtin_functions);i++) { - if (strcasecmp(fromcode, builtin_functions[i].name) == 0) { - from = &builtin_functions[i]; - } - if (strcasecmp(tocode, builtin_functions[i].name) == 0) { - to = &builtin_functions[i]; - } - } - - if (from == NULL) { - for (from=charsets; from; from=from->next) { - if (strcasecmp(from->name, fromcode) == 0) break; - } - } - - if (to == NULL) { - for (to=charsets; to; to=to->next) { - if (strcasecmp(to->name, tocode) == 0) break; - } - } - -#ifdef HAVE_NATIVE_ICONV - if ((!from || !to) && !native_iconv) { - goto failed; - } - if (!from) { - ret->pull = sys_iconv; - ret->cd_pull = iconv_open("UTF-16LE", fromcode); - if (ret->cd_pull == (iconv_t)-1) - ret->cd_pull = iconv_open("UCS-2LE", fromcode); - if (ret->cd_pull == (iconv_t)-1) goto failed; - } - - if (!to) { - ret->push = sys_iconv; - ret->cd_push = iconv_open(tocode, "UTF-16LE"); - if (ret->cd_push == (iconv_t)-1) - ret->cd_push = iconv_open(tocode, "UCS-2LE"); - if (ret->cd_push == (iconv_t)-1) goto failed; - } -#else - if (!from || !to) { - goto failed; - } -#endif - - /* check for conversion to/from ucs2 */ - if (is_utf16(fromcode) && to) { - ret->direct = to->push; - return ret; - } - if (is_utf16(tocode) && from) { - ret->direct = from->pull; - return ret; - } - -#ifdef HAVE_NATIVE_ICONV - if (is_utf16(fromcode)) { - ret->direct = sys_iconv; - ret->cd_direct = ret->cd_push; - ret->cd_push = NULL; - return ret; - } - if (is_utf16(tocode)) { - ret->direct = sys_iconv; - ret->cd_direct = ret->cd_pull; - ret->cd_pull = NULL; - return ret; - } -#endif - - /* the general case has to go via a buffer */ - if (!ret->pull) ret->pull = from->pull; - if (!ret->push) ret->push = to->push; - return ret; - -failed: - talloc_free(ret); - errno = EINVAL; - return (smb_iconv_t)-1; -} - -/* - simple iconv_open() wrapper - */ -_PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode) -{ - return smb_iconv_open_ex(NULL, tocode, fromcode, true); -} - -/* - simple iconv_close() wrapper -*/ -_PUBLIC_ int smb_iconv_close(smb_iconv_t cd) -{ -#ifdef HAVE_NATIVE_ICONV - if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct); - if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull); - if (cd->cd_push) iconv_close((iconv_t)cd->cd_push); -#endif - - talloc_free(cd); - return 0; -} - - -/********************************************************************** - the following functions implement the builtin character sets in Samba - and also the "test" character sets that are designed to test - multi-byte character set support for english users -***********************************************************************/ -static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - while (*inbytesleft >= 1 && *outbytesleft >= 2) { - (*outbuf)[0] = (*inbuf)[0]; - (*outbuf)[1] = 0; - (*inbytesleft) -= 1; - (*outbytesleft) -= 2; - (*inbuf) += 1; - (*outbuf) += 2; - } - - if (*inbytesleft > 0) { - errno = E2BIG; - return -1; - } - - return 0; -} - -static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - int ir_count=0; - - while (*inbytesleft >= 2 && *outbytesleft >= 1) { - (*outbuf)[0] = (*inbuf)[0] & 0x7F; - if ((*inbuf)[1]) ir_count++; - (*inbytesleft) -= 2; - (*outbytesleft) -= 1; - (*inbuf) += 2; - (*outbuf) += 1; - } - - if (*inbytesleft == 1) { - errno = EINVAL; - return -1; - } - - if (*inbytesleft > 1) { - errno = E2BIG; - return -1; - } - - return ir_count; -} - - -static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - while (*inbytesleft >= 1 && *outbytesleft >= 2) { - uint_t v; - - if ((*inbuf)[0] != '@') { - /* seven bit ascii case */ - (*outbuf)[0] = (*inbuf)[0]; - (*outbuf)[1] = 0; - (*inbytesleft) -= 1; - (*outbytesleft) -= 2; - (*inbuf) += 1; - (*outbuf) += 2; - continue; - } - /* it's a hex character */ - if (*inbytesleft < 5) { - errno = EINVAL; - return -1; - } - - if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) { - errno = EILSEQ; - return -1; - } - - (*outbuf)[0] = v&0xff; - (*outbuf)[1] = v>>8; - (*inbytesleft) -= 5; - (*outbytesleft) -= 2; - (*inbuf) += 5; - (*outbuf) += 2; - } - - if (*inbytesleft > 0) { - errno = E2BIG; - return -1; - } - - return 0; -} - -static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - while (*inbytesleft >= 2 && *outbytesleft >= 1) { - char buf[6]; - - if ((*inbuf)[1] == 0 && - ((*inbuf)[0] & 0x80) == 0 && - (*inbuf)[0] != '@') { - (*outbuf)[0] = (*inbuf)[0]; - (*inbytesleft) -= 2; - (*outbytesleft) -= 1; - (*inbuf) += 2; - (*outbuf) += 1; - continue; - } - if (*outbytesleft < 5) { - errno = E2BIG; - return -1; - } - snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0)); - memcpy(*outbuf, buf, 5); - (*inbytesleft) -= 2; - (*outbytesleft) -= 5; - (*inbuf) += 2; - (*outbuf) += 5; - } - - if (*inbytesleft == 1) { - errno = EINVAL; - return -1; - } - - if (*inbytesleft > 1) { - errno = E2BIG; - return -1; - } - - return 0; -} - -static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - int n; - - n = MIN(*inbytesleft, *outbytesleft); - - swab(*inbuf, *outbuf, (n&~1)); - if (n&1) { - (*outbuf)[n-1] = 0; - } - - (*inbytesleft) -= n; - (*outbytesleft) -= n; - (*inbuf) += n; - (*outbuf) += n; - - if (*inbytesleft > 0) { - errno = E2BIG; - return -1; - } - - return 0; -} - - -static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - int n; - - n = MIN(*inbytesleft, *outbytesleft); - - memmove(*outbuf, *inbuf, n); - - (*inbytesleft) -= n; - (*outbytesleft) -= n; - (*inbuf) += n; - (*outbuf) += n; - - if (*inbytesleft > 0) { - errno = E2BIG; - return -1; - } - - return 0; -} - -/* - this takes a UTF8 sequence and produces a UTF16 sequence - */ -static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - size_t in_left=*inbytesleft, out_left=*outbytesleft; - const uint8_t *c = (const uint8_t *)*inbuf; - uint8_t *uc = (uint8_t *)*outbuf; - - while (in_left >= 1 && out_left >= 2) { - if ((c[0] & 0x80) == 0) { - uc[0] = c[0]; - uc[1] = 0; - c += 1; - in_left -= 1; - out_left -= 2; - uc += 2; - continue; - } - - if ((c[0] & 0xe0) == 0xc0) { - if (in_left < 2 || - (c[1] & 0xc0) != 0x80) { - errno = EILSEQ; - goto error; - } - uc[1] = (c[0]>>2) & 0x7; - uc[0] = (c[0]<<6) | (c[1]&0x3f); - c += 2; - in_left -= 2; - out_left -= 2; - uc += 2; - continue; - } - - if ((c[0] & 0xf0) == 0xe0) { - if (in_left < 3 || - (c[1] & 0xc0) != 0x80 || - (c[2] & 0xc0) != 0x80) { - errno = EILSEQ; - goto error; - } - uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF); - uc[0] = (c[1]<<6) | (c[2]&0x3f); - c += 3; - in_left -= 3; - out_left -= 2; - uc += 2; - continue; - } - - if ((c[0] & 0xf8) == 0xf0) { - unsigned int codepoint; - if (in_left < 4 || - (c[1] & 0xc0) != 0x80 || - (c[2] & 0xc0) != 0x80 || - (c[3] & 0xc0) != 0x80) { - errno = EILSEQ; - goto error; - } - codepoint = - (c[3]&0x3f) | - ((c[2]&0x3f)<<6) | - ((c[1]&0x3f)<<12) | - ((c[0]&0x7)<<18); - if (codepoint < 0x10000) { - /* accept UTF-8 characters that are not - minimally packed, but pack the result */ - uc[0] = (codepoint & 0xFF); - uc[1] = (codepoint >> 8); - c += 4; - in_left -= 4; - out_left -= 2; - uc += 2; - continue; - } - - codepoint -= 0x10000; - - if (out_left < 4) { - errno = E2BIG; - goto error; - } - - uc[0] = (codepoint>>10) & 0xFF; - uc[1] = (codepoint>>18) | 0xd8; - uc[2] = codepoint & 0xFF; - uc[3] = ((codepoint>>8) & 0x3) | 0xdc; - c += 4; - in_left -= 4; - out_left -= 4; - uc += 4; - continue; - } - - /* we don't handle 5 byte sequences */ - errno = EINVAL; - goto error; - } - - if (in_left > 0) { - errno = E2BIG; - goto error; - } - - *inbytesleft = in_left; - *outbytesleft = out_left; - *inbuf = (const char *)c; - *outbuf = (char *)uc; - return 0; - -error: - *inbytesleft = in_left; - *outbytesleft = out_left; - *inbuf = (const char *)c; - *outbuf = (char *)uc; - return -1; -} - - -/* - this takes a UTF16 sequence and produces a UTF8 sequence - */ -static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) -{ - size_t in_left=*inbytesleft, out_left=*outbytesleft; - uint8_t *c = (uint8_t *)*outbuf; - const uint8_t *uc = (const uint8_t *)*inbuf; - - while (in_left >= 2 && out_left >= 1) { - unsigned int codepoint; - - if (uc[1] == 0 && !(uc[0] & 0x80)) { - /* simplest case */ - c[0] = uc[0]; - in_left -= 2; - out_left -= 1; - uc += 2; - c += 1; - continue; - } - - if ((uc[1]&0xf8) == 0) { - /* next simplest case */ - if (out_left < 2) { - errno = E2BIG; - goto error; - } - c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2); - c[1] = 0x80 | (uc[0] & 0x3f); - in_left -= 2; - out_left -= 2; - uc += 2; - c += 2; - continue; - } - - if ((uc[1] & 0xfc) == 0xdc) { - /* its the second part of a 4 byte sequence. Illegal */ - if (in_left < 4) { - errno = EINVAL; - } else { - errno = EILSEQ; - } - goto error; - } - - if ((uc[1] & 0xfc) != 0xd8) { - codepoint = uc[0] | (uc[1]<<8); - if (out_left < 3) { - errno = E2BIG; - goto error; - } - c[0] = 0xe0 | (codepoint >> 12); - c[1] = 0x80 | ((codepoint >> 6) & 0x3f); - c[2] = 0x80 | (codepoint & 0x3f); - - in_left -= 2; - out_left -= 3; - uc += 2; - c += 3; - continue; - } - - /* its the first part of a 4 byte sequence */ - if (in_left < 4) { - errno = EINVAL; - goto error; - } - if ((uc[3] & 0xfc) != 0xdc) { - errno = EILSEQ; - goto error; - } - codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) | - (uc[0]<<10) | ((uc[1] & 0x3)<<18)); - - if (out_left < 4) { - errno = E2BIG; - goto error; - } - c[0] = 0xf0 | (codepoint >> 18); - c[1] = 0x80 | ((codepoint >> 12) & 0x3f); - c[2] = 0x80 | ((codepoint >> 6) & 0x3f); - c[3] = 0x80 | (codepoint & 0x3f); - - in_left -= 4; - out_left -= 4; - uc += 4; - c += 4; - } - - if (in_left == 1) { - errno = EINVAL; - goto error; - } - - if (in_left > 1) { - errno = E2BIG; - goto error; - } - - *inbytesleft = in_left; - *outbytesleft = out_left; - *inbuf = (const char *)uc; - *outbuf = (char *)c; - - return 0; - -error: - *inbytesleft = in_left; - *outbytesleft = out_left; - *inbuf = (const char *)uc; - *outbuf = (char *)c; - return -1; -} - - - diff --git a/source4/lib/charset/tests/charset.c b/source4/lib/charset/tests/charset.c deleted file mode 100644 index 5e42ca2932..0000000000 --- a/source4/lib/charset/tests/charset.c +++ /dev/null @@ -1,272 +0,0 @@ -/* - Unix SMB/CIFS implementation. - test suite for the charcnv functions - - Copyright (C) Jelmer Vernooij 2007 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include "includes.h" -#include "torture/torture.h" - -static bool test_toupper_w(struct torture_context *tctx) -{ - torture_assert_int_equal(tctx, toupper_w('c'), 'C', "c"); - torture_assert_int_equal(tctx, toupper_w('Z'), 'Z', "z"); - torture_assert_int_equal(tctx, toupper_w(0xFFFF4565), 0xFFFF4565, "0xFFFF4565"); - return true; -} - -static bool test_tolower_w(struct torture_context *tctx) -{ - torture_assert_int_equal(tctx, tolower_w('C'), 'c', "c"); - torture_assert_int_equal(tctx, tolower_w('z'), 'z', "z"); - torture_assert_int_equal(tctx, tolower_w(0xFFFF4565), 0xFFFF4565, "0xFFFF4565"); - return true; -} - -static bool test_codepoint_cmpi(struct torture_context *tctx) -{ - torture_assert_int_equal(tctx, codepoint_cmpi('a', 'a'), 0, "same char"); - torture_assert_int_equal(tctx, codepoint_cmpi('A', 'a'), 0, "upcase version"); - torture_assert_int_equal(tctx, codepoint_cmpi('b', 'a'), 1, "right diff"); - torture_assert_int_equal(tctx, codepoint_cmpi('a', 'b'), -1, "right diff"); - return true; -} - -static bool test_strcasecmp_m(struct torture_context *tctx) -{ - torture_assert(tctx, strcasecmp_m("foo", "bar") != 0, "different strings"); - torture_assert(tctx, strcasecmp_m("foo", "foo") == 0, "same case strings"); - torture_assert(tctx, strcasecmp_m("foo", "Foo") == 0, "different case strings"); - torture_assert(tctx, strcasecmp_m(NULL, "Foo") != 0, "one NULL"); - torture_assert(tctx, strcasecmp_m("foo", NULL) != 0, "other NULL"); - torture_assert(tctx, strcasecmp_m(NULL, NULL) == 0, "both NULL"); - return true; -} - - -static bool test_strequal_w(struct torture_context *tctx) -{ - torture_assert(tctx, !strequal_w("foo", "bar"), "different strings"); - torture_assert(tctx, strequal_w("foo", "foo"), "same case strings"); - torture_assert(tctx, strequal_w("foo", "Foo"), "different case strings"); - torture_assert(tctx, !strequal_w(NULL, "Foo"), "one NULL"); - torture_assert(tctx, !strequal_w("foo", NULL), "other NULL"); - torture_assert(tctx, strequal_w(NULL, NULL), "both NULL"); - return true; -} - -static bool test_strcsequal_w(struct torture_context *tctx) -{ - torture_assert(tctx, !strcsequal_w("foo", "bar"), "different strings"); - torture_assert(tctx, strcsequal_w("foo", "foo"), "same case strings"); - torture_assert(tctx, !strcsequal_w("foo", "Foo"), "different case strings"); - torture_assert(tctx, !strcsequal_w(NULL, "Foo"), "one NULL"); - torture_assert(tctx, !strcsequal_w("foo", NULL), "other NULL"); - torture_assert(tctx, strcsequal_w(NULL, NULL), "both NULL"); - return true; -} - -static bool test_string_replace_w(struct torture_context *tctx) -{ - char data[6] = "bla"; - string_replace_w(data, 'b', 'c'); - torture_assert_str_equal(tctx, data, "cla", "first char replaced"); - memcpy(data, "bab", 4); - string_replace_w(data, 'b', 'c'); - torture_assert_str_equal(tctx, data, "cac", "other chars replaced"); - memcpy(data, "bba", 4); - string_replace_w(data, 'b', 'c'); - torture_assert_str_equal(tctx, data, "cca", "other chars replaced"); - memcpy(data, "blala", 6); - string_replace_w(data, 'o', 'c'); - torture_assert_str_equal(tctx, data, "blala", "no chars replaced"); - string_replace_w(NULL, 'b', 'c'); - return true; -} - -static bool test_strncasecmp_m(struct torture_context *tctx) -{ - torture_assert(tctx, strncasecmp_m("foo", "bar", 3) != 0, "different strings"); - torture_assert(tctx, strncasecmp_m("foo", "foo", 3) == 0, "same case strings"); - torture_assert(tctx, strncasecmp_m("foo", "Foo", 3) == 0, "different case strings"); - torture_assert(tctx, strncasecmp_m("fool", "Foo", 3) == 0, "different case strings"); - torture_assert(tctx, strncasecmp_m("fool", "Fool", 40) == 0, "over size"); - torture_assert(tctx, strncasecmp_m("BLA", "Fool", 0) == 0, "empty"); - torture_assert(tctx, strncasecmp_m(NULL, "Foo", 3) != 0, "one NULL"); - torture_assert(tctx, strncasecmp_m("foo", NULL, 3) != 0, "other NULL"); - torture_assert(tctx, strncasecmp_m(NULL, NULL, 3) == 0, "both NULL"); - return true; -} - -static bool test_next_token_null(struct torture_context *tctx) -{ - char buf[20]; - torture_assert(tctx, !next_token(NULL, buf, " ", 20), "null ptr works"); - return true; -} - -static bool test_next_token(struct torture_context *tctx) -{ - const char *teststr = "foo bar bla"; - char buf[20]; - torture_assert(tctx, next_token(&teststr, buf, " ", 20), "finding token works"); - torture_assert_str_equal(tctx, buf, "foo", "token matches"); - torture_assert_str_equal(tctx, teststr, "bar bla", "ptr modified correctly"); - - torture_assert(tctx, next_token(&teststr, buf, " ", 20), "finding token works"); - torture_assert_str_equal(tctx, buf, "bar", "token matches"); - torture_assert_str_equal(tctx, teststr, "bla", "ptr modified correctly"); - - torture_assert(tctx, next_token(&teststr, buf, " ", 20), "finding token works"); - torture_assert_str_equal(tctx, buf, "bla", "token matches"); - torture_assert_str_equal(tctx, teststr, "", "ptr modified correctly"); - - torture_assert(tctx, !next_token(&teststr, buf, " ", 20), "finding token doesn't work"); - return true; -} - -static bool test_next_token_implicit_sep(struct torture_context *tctx) -{ - const char *teststr = "foo\tbar\n bla"; - char buf[20]; - torture_assert(tctx, next_token(&teststr, buf, NULL, 20), "finding token works"); - torture_assert_str_equal(tctx, buf, "foo", "token matches"); - torture_assert_str_equal(tctx, teststr, "bar\n bla", "ptr modified correctly"); - - torture_assert(tctx, next_token(&teststr, buf, NULL, 20), "finding token works"); - torture_assert_str_equal(tctx, buf, "bar", "token matches"); - torture_assert_str_equal(tctx, teststr, " bla", "ptr modified correctly"); - - torture_assert(tctx, next_token(&teststr, buf, NULL, 20), "finding token works"); - torture_assert_str_equal(tctx, buf, "bla", "token matches"); - torture_assert_str_equal(tctx, teststr, "", "ptr modified correctly"); - - torture_assert(tctx, !next_token(&teststr, buf, NULL, 20), "finding token doesn't work"); - return true; -} - -static bool test_next_token_seps(struct torture_context *tctx) -{ - const char *teststr = ",foo bla"; - char buf[20]; - torture_assert(tctx, next_token(&teststr, buf, ",", 20), "finding token works"); - torture_assert_str_equal(tctx, buf, "foo bla", "token matches"); - torture_assert_str_equal(tctx, teststr, "", "ptr modified correctly"); - - torture_assert(tctx, !next_token(&teststr, buf, ",", 20), "finding token doesn't work"); - return true; -} - -static bool test_next_token_quotes(struct torture_context *tctx) -{ - const char *teststr = "\"foo bar\" bla"; - char buf[20]; - torture_assert(tctx, next_token(&teststr, buf, " ", 20), "finding token works"); - torture_assert_str_equal(tctx, buf, "foo bar", "token matches"); - torture_assert_str_equal(tctx, teststr, "bla", "ptr modified correctly"); - - torture_assert(tctx, next_token(&teststr, buf, " ", 20), "finding token works"); - torture_assert_str_equal(tctx, buf, "bla", "token matches"); - torture_assert_str_equal(tctx, teststr, "", "ptr modified correctly"); - - torture_assert(tctx, !next_token(&teststr, buf, " ", 20), "finding token doesn't work"); - return true; -} - -static bool test_next_token_quote_wrong(struct torture_context *tctx) -{ - const char *teststr = "\"foo bar bla"; - char buf[20]; - torture_assert(tctx, next_token(&teststr, buf, " ", 20), "finding token works"); - torture_assert_str_equal(tctx, buf, "foo bar bla", "token matches"); - torture_assert_str_equal(tctx, teststr, "", "ptr modified correctly"); - - torture_assert(tctx, !next_token(&teststr, buf, " ", 20), "finding token doesn't work"); - return true; -} - -static bool test_strlen_m(struct torture_context *tctx) -{ - torture_assert_int_equal(tctx, strlen_m("foo"), 3, "simple len"); - torture_assert_int_equal(tctx, strlen_m("foo\x83l"), 6, "extended len"); - torture_assert_int_equal(tctx, strlen_m(NULL), 0, "NULL"); - return true; -} - -static bool test_strlen_m_term(struct torture_context *tctx) -{ - torture_assert_int_equal(tctx, strlen_m_term("foo"), 4, "simple len"); - torture_assert_int_equal(tctx, strlen_m_term("foo\x83l"), 7, "extended len"); - torture_assert_int_equal(tctx, strlen_m(NULL), 0, "NULL"); - return true; -} - -static bool test_strhaslower(struct torture_context *tctx) -{ - torture_assert(tctx, strhaslower("a"), "one low char"); - torture_assert(tctx, strhaslower("aB"), "one low, one up char"); - torture_assert(tctx, !strhaslower("B"), "one up char"); - torture_assert(tctx, !strhaslower(""), "empty string"); - torture_assert(tctx, !strhaslower("3"), "one digit"); - return true; -} - -static bool test_strhasupper(struct torture_context *tctx) -{ - torture_assert(tctx, strhasupper("B"), "one up char"); - torture_assert(tctx, strhasupper("aB"), "one low, one up char"); - torture_assert(tctx, !strhasupper("a"), "one low char"); - torture_assert(tctx, !strhasupper(""), "empty string"); - torture_assert(tctx, !strhasupper("3"), "one digit"); - return true; -} - -static bool test_count_chars_w(struct torture_context *tctx) -{ - torture_assert_int_equal(tctx, count_chars_w("foo", 'o'), 2, "simple"); - torture_assert_int_equal(tctx, count_chars_w("", 'o'), 0, "empty"); - torture_assert_int_equal(tctx, count_chars_w("bla", 'o'), 0, "none"); - torture_assert_int_equal(tctx, count_chars_w("bla", '\0'), 0, "null"); - return true; -} - -struct torture_suite *torture_local_charset(TALLOC_CTX *mem_ctx) -{ - struct torture_suite *suite = torture_suite_create(mem_ctx, "CHARSET"); - - torture_suite_add_simple_test(suite, "toupper_w", test_toupper_w); - torture_suite_add_simple_test(suite, "tolower_w", test_tolower_w); - torture_suite_add_simple_test(suite, "codepoint_cmpi", test_codepoint_cmpi); - torture_suite_add_simple_test(suite, "strcasecmp_m", test_strcasecmp_m); - torture_suite_add_simple_test(suite, "strequal_w", test_strequal_w); - torture_suite_add_simple_test(suite, "strcsequal_w", test_strcsequal_w); - torture_suite_add_simple_test(suite, "string_replace_w", test_string_replace_w); - torture_suite_add_simple_test(suite, "strncasecmp_m", test_strncasecmp_m); - torture_suite_add_simple_test(suite, "next_token", test_next_token); - torture_suite_add_simple_test(suite, "next_token_null", test_next_token_null); - torture_suite_add_simple_test(suite, "next_token_implicit_sep", test_next_token_implicit_sep); - torture_suite_add_simple_test(suite, "next_token_quotes", test_next_token_quotes); - torture_suite_add_simple_test(suite, "next_token_seps", test_next_token_seps); - torture_suite_add_simple_test(suite, "next_token_quote_wrong", test_next_token_quote_wrong); - torture_suite_add_simple_test(suite, "strlen_m", test_strlen_m); - torture_suite_add_simple_test(suite, "strlen_m_term", test_strlen_m_term); - torture_suite_add_simple_test(suite, "strhaslower", test_strhaslower); - torture_suite_add_simple_test(suite, "strhasupper", test_strhasupper); - torture_suite_add_simple_test(suite, "count_chars_w", test_count_chars_w); - - return suite; -} diff --git a/source4/lib/charset/tests/iconv.c b/source4/lib/charset/tests/iconv.c deleted file mode 100644 index aeb42c2fa1..0000000000 --- a/source4/lib/charset/tests/iconv.c +++ /dev/null @@ -1,424 +0,0 @@ -/* - Unix SMB/CIFS implementation. - - local testing of iconv routines. This tests the system iconv code against - the built-in iconv code - - Copyright (C) Andrew Tridgell 2004 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include "includes.h" -#include "torture/torture.h" -#include "system/iconv.h" -#include "system/time.h" -#include "libcli/raw/libcliraw.h" -#include "param/param.h" -#include "torture/util.h" - -#if HAVE_NATIVE_ICONV - -static bool iconv_untestable(struct torture_context *tctx) -{ - iconv_t cd; - - if (!lp_parm_bool(tctx->lp_ctx, NULL, "iconv", "native", true)) - torture_skip(tctx, "system iconv disabled - skipping test"); - - cd = iconv_open("UTF-16LE", "UCS-4LE"); - if (cd == (iconv_t)-1) - torture_skip(tctx, "unable to test - system iconv library does not support UTF-16LE -> UCS-4LE"); - iconv_close(cd); - - cd = iconv_open("UTF-16LE", "CP850"); - if (cd == (iconv_t)-1) - torture_skip(tctx, "unable to test - system iconv library does not support UTF-16LE -> CP850\n"); - iconv_close(cd); - - return false; -} - -/* - generate a UTF-16LE buffer for a given unicode codepoint -*/ -static int gen_codepoint_utf16(unsigned int codepoint, - char *buf, size_t *size) -{ - static iconv_t cd; - uint8_t in[4]; - char *ptr_in; - size_t size_in, size_out, ret; - if (!cd) { - cd = iconv_open("UTF-16LE", "UCS-4LE"); - if (cd == (iconv_t)-1) { - cd = NULL; - return -1; - } - } - - in[0] = codepoint & 0xFF; - in[1] = (codepoint>>8) & 0xFF; - in[2] = (codepoint>>16) & 0xFF; - in[3] = (codepoint>>24) & 0xFF; - - ptr_in = (char *)in; - size_in = 4; - size_out = 8; - - ret = iconv(cd, &ptr_in, &size_in, &buf, &size_out); - - *size = 8 - size_out; - - return ret; -} - - -/* - work out the unicode codepoint of the first UTF-8 character in the buffer -*/ -static unsigned int get_codepoint(char *buf, size_t size, const char *charset) -{ - iconv_t cd; - uint8_t out[4]; - char *ptr_out; - size_t size_out, size_in, ret; - - cd = iconv_open("UCS-4LE", charset); - - size_in = size; - ptr_out = (char *)out; - size_out = sizeof(out); - memset(out, 0, sizeof(out)); - - ret = iconv(cd, &buf, &size_in, &ptr_out, &size_out); - - iconv_close(cd); - - return out[0] | (out[1]<<8) | (out[2]<<16) | (out[3]<<24); -} - -/* - display a buffer with name prefix -*/ -static void show_buf(const char *name, uint8_t *buf, size_t size) -{ - int i; - printf("%s ", name); - for (i=0;i<size;i++) { - printf("%02x ", buf[i]); - } - printf("\n"); -} - -/* - given a UTF-16LE buffer, test the system and built-in iconv code to - make sure they do exactly the same thing in converting the buffer to - "charset", then convert it back again and ensure we get the same - buffer back -*/ -static bool test_buffer(struct torture_context *test, - uint8_t *inbuf, size_t size, const char *charset) -{ - uint8_t buf1[1000], buf2[1000], buf3[1000]; - size_t outsize1, outsize2, outsize3; - const char *ptr_in; - char *ptr_out; - size_t size_in1, size_in2, size_in3; - size_t ret1, ret2, ret3, len1, len2; - int errno1, errno2; - static iconv_t cd; - static smb_iconv_t cd2, cd3; - static const char *last_charset; - - if (cd && last_charset) { - iconv_close(cd); - smb_iconv_close(cd2); - smb_iconv_close(cd3); - cd = NULL; - } - - if (!cd) { - cd = iconv_open(charset, "UTF-16LE"); - if (cd == (iconv_t)-1) { - torture_fail(test, - talloc_asprintf(test, - "failed to open %s to UTF-16LE", - charset)); - } - cd2 = smb_iconv_open_ex(test, charset, "UTF-16LE", lp_parm_bool(test->lp_ctx, NULL, "iconv", "native", true)); - cd3 = smb_iconv_open_ex(test, "UTF-16LE", charset, lp_parm_bool(test->lp_ctx, NULL, "iconv", "native", true)); - last_charset = charset; - } - - /* internal convert to charset - placing result in buf1 */ - ptr_in = (const char *)inbuf; - ptr_out = (char *)buf1; - size_in1 = size; - outsize1 = sizeof(buf1); - - memset(ptr_out, 0, outsize1); - errno = 0; - ret1 = smb_iconv(cd2, &ptr_in, &size_in1, &ptr_out, &outsize1); - errno1 = errno; - - /* system convert to charset - placing result in buf2 */ - ptr_in = (const char *)inbuf; - ptr_out = (char *)buf2; - size_in2 = size; - outsize2 = sizeof(buf2); - - memset(ptr_out, 0, outsize2); - errno = 0; - ret2 = iconv(cd, discard_const_p(char *, &ptr_in), &size_in2, &ptr_out, &outsize2); - errno2 = errno; - - len1 = sizeof(buf1) - outsize1; - len2 = sizeof(buf2) - outsize2; - - /* codepoints above 1M are not interesting for now */ - if (len2 > len1 && - memcmp(buf1, buf2, len1) == 0 && - get_codepoint((char *)(buf2+len1), len2-len1, charset) >= (1<<20)) { - return true; - } - if (len1 > len2 && - memcmp(buf1, buf2, len2) == 0 && - get_codepoint((char *)(buf1+len2), len1-len2, charset) >= (1<<20)) { - return true; - } - - torture_assert_int_equal(test, ret1, ret2, "ret mismatch"); - - if (errno1 != errno2) { - show_buf(" rem1:", inbuf+(size-size_in1), size_in1); - show_buf(" rem2:", inbuf+(size-size_in2), size_in2); - torture_fail(test, talloc_asprintf(test, - "e1=%d/%s e2=%d/%s", - errno1, strerror(errno1), - errno2, strerror(errno2))); - } - - torture_assert_int_equal(test, outsize1, outsize2, "outsize mismatch"); - - torture_assert_int_equal(test, size_in1, size_in2, "size_in mismatch"); - - if (len1 != len2 || - memcmp(buf1, buf2, len1) != 0) { - torture_comment(test, "size=%d ret1=%d ret2=%d", (int)size, (int)ret1, (int)ret2); - show_buf(" IN1:", inbuf, size-size_in1); - show_buf(" IN2:", inbuf, size-size_in2); - show_buf("OUT1:", buf1, len1); - show_buf("OUT2:", buf2, len2); - if (len2 > len1 && memcmp(buf1, buf2, len1) == 0) { - torture_comment(test, "next codepoint is %u", - get_codepoint((char *)(buf2+len1), len2-len1, charset)); - } - if (len1 > len2 && memcmp(buf1, buf2, len2) == 0) { - torture_comment(test, "next codepoint is %u", - get_codepoint((char *)(buf1+len2),len1-len2, charset)); - } - - torture_fail(test, "failed"); - } - - /* convert back to UTF-16, putting result in buf3 */ - size = size - size_in1; - ptr_in = (const char *)buf1; - ptr_out = (char *)buf3; - size_in3 = len1; - outsize3 = sizeof(buf3); - - memset(ptr_out, 0, outsize3); - ret3 = smb_iconv(cd3, &ptr_in, &size_in3, &ptr_out, &outsize3); - - /* we only internally support the first 1M codepoints */ - if (outsize3 != sizeof(buf3) - size && - get_codepoint((char *)(inbuf+sizeof(buf3) - outsize3), - size - (sizeof(buf3) - outsize3), - "UTF-16LE") >= (1<<20)) { - return true; - } - - torture_assert_int_equal(test, ret3, 0, talloc_asprintf(test, - "pull failed - %s", strerror(errno))); - - if (strncmp(charset, "UTF", 3) != 0) { - /* don't expect perfect mappings for non UTF charsets */ - return true; - } - - - torture_assert_int_equal(test, outsize3, sizeof(buf3) - size, - "wrong outsize3"); - - if (memcmp(buf3, inbuf, size) != 0) { - torture_comment(test, "pull bytes mismatch:"); - show_buf("inbuf", inbuf, size); - show_buf(" buf3", buf3, sizeof(buf3) - outsize3); - torture_comment(test, "next codepoint is %u\n", - get_codepoint((char *)(inbuf+sizeof(buf3) - outsize3), - size - (sizeof(buf3) - outsize3), - "UTF-16LE")); - torture_fail(test, ""); - } - - return true; -} - - -/* - test the push_codepoint() and next_codepoint() functions for a given - codepoint -*/ -static bool test_codepoint(struct torture_context *tctx, unsigned int codepoint) -{ - uint8_t buf[10]; - size_t size, size2; - codepoint_t c; - - size = push_codepoint(lp_iconv_convenience(tctx->lp_ctx), (char *)buf, codepoint); - torture_assert(tctx, size != -1 || (codepoint >= 0xd800 && codepoint <= 0x10000), - "Invalid Codepoint range"); - - if (size == -1) return true; - - buf[size] = random(); - buf[size+1] = random(); - buf[size+2] = random(); - buf[size+3] = random(); - - c = next_codepoint(lp_iconv_convenience(tctx->lp_ctx), (char *)buf, &size2); - - torture_assert(tctx, c == codepoint, - talloc_asprintf(tctx, - "next_codepoint(%u) failed - gave %u", codepoint, c)); - - torture_assert(tctx, size2 == size, - talloc_asprintf(tctx, "next_codepoint(%u) gave wrong size %d (should be %d)\n", - codepoint, (int)size2, (int)size)); - - return true; -} - -static bool test_next_codepoint(struct torture_context *tctx) -{ - unsigned int codepoint; - if (iconv_untestable(tctx)) - return true; - - for (codepoint=0;codepoint<(1<<20);codepoint++) { - if (!test_codepoint(tctx, codepoint)) - return false; - } - return true; -} - -static bool test_first_1m(struct torture_context *tctx) -{ - unsigned int codepoint; - size_t size; - unsigned char inbuf[1000]; - - if (iconv_untestable(tctx)) - return true; - - for (codepoint=0;codepoint<(1<<20);codepoint++) { - if (gen_codepoint_utf16(codepoint, (char *)inbuf, &size) != 0) { - continue; - } - - if (codepoint % 1000 == 0) { - if (torture_setting_bool(tctx, "progress", true)) { - torture_comment(tctx, "codepoint=%u \r", codepoint); - fflush(stdout); - } - } - - if (!test_buffer(tctx, inbuf, size, "UTF-8")) - return false; - } - return true; -} - -static bool test_random_5m(struct torture_context *tctx) -{ - unsigned char inbuf[1000]; - unsigned int i; - - if (iconv_untestable(tctx)) - return true; - - for (i=0;i<500000;i++) { - size_t size; - unsigned int c; - - if (i % 1000 == 0) { - if (torture_setting_bool(tctx, "progress", true)) { - torture_comment(tctx, "i=%u \r", i); - fflush(stdout); - } - } - - size = random() % 100; - for (c=0;c<size;c++) { - if (random() % 100 < 80) { - inbuf[c] = random() % 128; - } else { - inbuf[c] = random(); - } - if (random() % 10 == 0) { - inbuf[c] |= 0xd8; - } - if (random() % 10 == 0) { - inbuf[c] |= 0xdc; - } - } - if (!test_buffer(tctx, inbuf, size, "UTF-8")) { - printf("i=%d failed UTF-8\n", i); - return false; - } - - if (!test_buffer(tctx, inbuf, size, "CP850")) { - printf("i=%d failed CP850\n", i); - return false; - } - } - return true; -} - -struct torture_suite *torture_local_iconv(TALLOC_CTX *mem_ctx) -{ - struct torture_suite *suite = torture_suite_create(mem_ctx, "ICONV"); - - torture_suite_add_simple_test(suite, "next_codepoint()", - test_next_codepoint); - - torture_suite_add_simple_test(suite, "first 1M codepoints", - test_first_1m); - - torture_suite_add_simple_test(suite, "5M random UTF-16LE sequences", - test_random_5m); - return suite; -} - -#else - -struct torture_suite *torture_local_iconv(TALLOC_CTX *mem_ctx) -{ - printf("No native iconv library - can't run iconv test\n"); - return NULL; -} - -#endif diff --git a/source4/lib/charset/util_unistr.c b/source4/lib/charset/util_unistr.c deleted file mode 100644 index e4f4bb551a..0000000000 --- a/source4/lib/charset/util_unistr.c +++ /dev/null @@ -1,684 +0,0 @@ -/* - Unix SMB/CIFS implementation. - Samba utility functions - Copyright (C) Andrew Tridgell 1992-2001 - Copyright (C) Simo Sorce 2001 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include "includes.h" -#include "system/locale.h" -#include "dynconfig/dynconfig.h" -#include "param/param.h" - -/** - * @file - * @brief Unicode string manipulation - */ - -/* these 2 tables define the unicode case handling. They are loaded - at startup either via mmap() or read() from the lib directory */ -static void *upcase_table; -static void *lowcase_table; - - -/******************************************************************* -load the case handling tables -********************************************************************/ -void load_case_tables(void) -{ - TALLOC_CTX *mem_ctx; - - mem_ctx = talloc_init("load_case_tables"); - if (!mem_ctx) { - smb_panic("No memory for case_tables"); - } - upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", dyn_DATADIR), 0x20000); - lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", dyn_DATADIR), 0x20000); - talloc_free(mem_ctx); - if (upcase_table == NULL) { - /* try also under codepages for testing purposes */ - upcase_table = map_file("codepages/upcase.dat", 0x20000); - if (upcase_table == NULL) { - upcase_table = (void *)-1; - } - } - if (lowcase_table == NULL) { - /* try also under codepages for testing purposes */ - lowcase_table = map_file("codepages/lowcase.dat", 0x20000); - if (lowcase_table == NULL) { - lowcase_table = (void *)-1; - } - } -} - -/** - Convert a codepoint_t to upper case. -**/ -_PUBLIC_ codepoint_t toupper_w(codepoint_t val) -{ - if (val < 128) { - return toupper(val); - } - if (upcase_table == NULL) { - load_case_tables(); - } - if (upcase_table == (void *)-1) { - return val; - } - if (val & 0xFFFF0000) { - return val; - } - return SVAL(upcase_table, val*2); -} - -/** - Convert a codepoint_t to lower case. -**/ -_PUBLIC_ codepoint_t tolower_w(codepoint_t val) -{ - if (val < 128) { - return tolower(val); - } - if (lowcase_table == NULL) { - load_case_tables(); - } - if (lowcase_table == (void *)-1) { - return val; - } - if (val & 0xFFFF0000) { - return val; - } - return SVAL(lowcase_table, val*2); -} - -/** - compare two codepoints case insensitively -*/ -_PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2) -{ - if (c1 == c2 || - toupper_w(c1) == toupper_w(c2)) { - return 0; - } - return c1 - c2; -} - -/** - Case insensitive string compararison -**/ -_PUBLIC_ int strcasecmp_m(const char *s1, const char *s2) -{ - codepoint_t c1=0, c2=0; - size_t size1, size2; - struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm); - - /* handle null ptr comparisons to simplify the use in qsort */ - if (s1 == s2) return 0; - if (s1 == NULL) return -1; - if (s2 == NULL) return 1; - - while (*s1 && *s2) { - c1 = next_codepoint(iconv_convenience, s1, &size1); - c2 = next_codepoint(iconv_convenience, s2, &size2); - - s1 += size1; - s2 += size2; - - if (c1 == c2) { - continue; - } - - if (c1 == INVALID_CODEPOINT || - c2 == INVALID_CODEPOINT) { - /* what else can we do?? */ - return strcasecmp(s1, s2); - } - - if (toupper_w(c1) != toupper_w(c2)) { - return c1 - c2; - } - } - - return *s1 - *s2; -} - -/** - * Get the next token from a string, return False if none found. - * Handles double-quotes. - * - * Based on a routine by GJC@VILLAGE.COM. - * Extensively modified by Andrew.Tridgell@anu.edu.au - **/ -_PUBLIC_ bool next_token(const char **ptr,char *buff, const char *sep, size_t bufsize) -{ - const char *s; - bool quoted; - size_t len=1; - - if (!ptr) - return false; - - s = *ptr; - - /* default to simple separators */ - if (!sep) - sep = " \t\n\r"; - - /* find the first non sep char */ - while (*s && strchr_m(sep,*s)) - s++; - - /* nothing left? */ - if (!*s) - return false; - - /* copy over the token */ - for (quoted = false; len < bufsize && *s && (quoted || !strchr_m(sep,*s)); s++) { - if (*s == '\"') { - quoted = !quoted; - } else { - len++; - *buff++ = *s; - } - } - - *ptr = (*s) ? s+1 : s; - *buff = 0; - - return true; -} - -/** - Case insensitive string compararison, length limited -**/ -_PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n) -{ - codepoint_t c1=0, c2=0; - size_t size1, size2; - struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm); - - /* handle null ptr comparisons to simplify the use in qsort */ - if (s1 == s2) return 0; - if (s1 == NULL) return -1; - if (s2 == NULL) return 1; - - while (*s1 && *s2 && n) { - n--; - - c1 = next_codepoint(iconv_convenience, s1, &size1); - c2 = next_codepoint(iconv_convenience, s2, &size2); - - s1 += size1; - s2 += size2; - - if (c1 == c2) { - continue; - } - - if (c1 == INVALID_CODEPOINT || - c2 == INVALID_CODEPOINT) { - /* what else can we do?? */ - return strcasecmp(s1, s2); - } - - if (toupper_w(c1) != toupper_w(c2)) { - return c1 - c2; - } - } - - if (n == 0) { - return 0; - } - - return *s1 - *s2; -} - -/** - * Compare 2 strings. - * - * @note The comparison is case-insensitive. - **/ -_PUBLIC_ bool strequal_w(const char *s1, const char *s2) -{ - return strcasecmp_m(s1,s2) == 0; -} - -/** - Compare 2 strings (case sensitive). -**/ -_PUBLIC_ bool strcsequal_w(const char *s1,const char *s2) -{ - if (s1 == s2) - return true; - if (!s1 || !s2) - return false; - - return strcmp(s1,s2) == 0; -} - - -/** - String replace. - NOTE: oldc and newc must be 7 bit characters -**/ -_PUBLIC_ void string_replace_w(char *s, char oldc, char newc) -{ - while (s && *s) { - size_t size; - codepoint_t c = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size); - if (c == oldc) { - *s = newc; - } - s += size; - } -} - -/** - Paranoid strcpy into a buffer of given length (includes terminating - zero. Strips out all but 'a-Z0-9' and the character in other_safe_chars - and replaces with '_'. Deliberately does *NOT* check for multibyte - characters. Don't change it ! -**/ - -_PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength) -{ - size_t len, i; - - if (maxlength == 0) { - /* can't fit any bytes at all! */ - return NULL; - } - - if (!dest) { - DEBUG(0,("ERROR: NULL dest in alpha_strcpy\n")); - return NULL; - } - - if (!src) { - *dest = 0; - return dest; - } - - len = strlen(src); - if (len >= maxlength) - len = maxlength - 1; - - if (!other_safe_chars) - other_safe_chars = ""; - - for(i = 0; i < len; i++) { - int val = (src[i] & 0xff); - if (isupper(val) || islower(val) || isdigit(val) || strchr_m(other_safe_chars, val)) - dest[i] = src[i]; - else - dest[i] = '_'; - } - - dest[i] = '\0'; - - return dest; -} - -/** - Count the number of UCS2 characters in a string. Normally this will - be the same as the number of bytes in a string for single byte strings, - but will be different for multibyte. -**/ -_PUBLIC_ size_t strlen_m(const char *s) -{ - size_t count = 0; - - if (!s) { - return 0; - } - - while (*s && !(((uint8_t)*s) & 0x80)) { - s++; - count++; - } - - if (!*s) { - return count; - } - - while (*s) { - size_t c_size; - codepoint_t c = next_codepoint(lp_iconv_convenience(global_loadparm), s, &c_size); - if (c < 0x10000) { - count += 1; - } else { - count += 2; - } - s += c_size; - } - - return count; -} - -/** - Work out the number of multibyte chars in a string, including the NULL - terminator. -**/ -_PUBLIC_ size_t strlen_m_term(const char *s) -{ - if (!s) { - return 0; - } - - return strlen_m(s) + 1; -} - -/** - Strchr and strrchr_m are a bit complex on general multi-byte strings. -**/ -_PUBLIC_ char *strchr_m(const char *s, char c) -{ - if (s == NULL) { - return NULL; - } - /* characters below 0x3F are guaranteed to not appear in - non-initial position in multi-byte charsets */ - if ((c & 0xC0) == 0) { - return strchr(s, c); - } - - while (*s) { - size_t size; - codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size); - if (c2 == c) { - return discard_const_p(char, s); - } - s += size; - } - - return NULL; -} - -/** - * Multibyte-character version of strrchr - */ -_PUBLIC_ char *strrchr_m(const char *s, char c) -{ - char *ret = NULL; - - if (s == NULL) { - return NULL; - } - - /* characters below 0x3F are guaranteed to not appear in - non-initial position in multi-byte charsets */ - if ((c & 0xC0) == 0) { - return strrchr(s, c); - } - - while (*s) { - size_t size; - codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size); - if (c2 == c) { - ret = discard_const_p(char, s); - } - s += size; - } - - return ret; -} - -/** - return True if any (multi-byte) character is lower case -*/ -_PUBLIC_ bool strhaslower(const char *string) -{ - while (*string) { - size_t c_size; - codepoint_t s; - codepoint_t t; - - s = next_codepoint(lp_iconv_convenience(global_loadparm), string, &c_size); - string += c_size; - - t = toupper_w(s); - - if (s != t) { - return true; /* that means it has lower case chars */ - } - } - - return false; -} - -/** - return True if any (multi-byte) character is upper case -*/ -_PUBLIC_ bool strhasupper(const char *string) -{ - while (*string) { - size_t c_size; - codepoint_t s; - codepoint_t t; - - s = next_codepoint(lp_iconv_convenience(global_loadparm), string, &c_size); - string += c_size; - - t = tolower_w(s); - - if (s != t) { - return true; /* that means it has upper case chars */ - } - } - - return false; -} - -/** - Convert a string to lower case, allocated with talloc -**/ -_PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src) -{ - size_t size=0; - char *dest; - struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm); - - /* this takes advantage of the fact that upper/lower can't - change the length of a character by more than 1 byte */ - dest = talloc_array(ctx, char, 2*(strlen(src))+1); - if (dest == NULL) { - return NULL; - } - - while (*src) { - size_t c_size; - codepoint_t c = next_codepoint(iconv_convenience, src, &c_size); - src += c_size; - - c = tolower_w(c); - - c_size = push_codepoint(iconv_convenience, dest+size, c); - if (c_size == -1) { - talloc_free(dest); - return NULL; - } - size += c_size; - } - - dest[size] = 0; - - /* trim it so talloc_append_string() works */ - dest = talloc_realloc(ctx, dest, char, size+1); - - talloc_set_name_const(dest, dest); - - return dest; -} - -/** - Convert a string to UPPER case, allocated with talloc - source length limited to n bytes -**/ -_PUBLIC_ char *strupper_talloc_n(TALLOC_CTX *ctx, const char *src, size_t n) -{ - size_t size=0; - char *dest; - struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm); - - if (!src) { - return NULL; - } - - /* this takes advantage of the fact that upper/lower can't - change the length of a character by more than 1 byte */ - dest = talloc_array(ctx, char, 2*(n+1)); - if (dest == NULL) { - return NULL; - } - - while (*src && n--) { - size_t c_size; - codepoint_t c = next_codepoint(iconv_convenience, src, &c_size); - src += c_size; - - c = toupper_w(c); - - c_size = push_codepoint(iconv_convenience, dest+size, c); - if (c_size == -1) { - talloc_free(dest); - return NULL; - } - size += c_size; - } - - dest[size] = 0; - - /* trim it so talloc_append_string() works */ - dest = talloc_realloc(ctx, dest, char, size+1); - - talloc_set_name_const(dest, dest); - - return dest; -} - -/** - Convert a string to UPPER case, allocated with talloc -**/ -_PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src) -{ - return strupper_talloc_n(ctx, src, src?strlen(src):0); -} - -/** - talloc_strdup() a unix string to upper case. -**/ -_PUBLIC_ char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src) -{ - return strupper_talloc(ctx, src); -} - -/** - Convert a string to lower case. -**/ -_PUBLIC_ void strlower_m(char *s) -{ - char *d; - struct smb_iconv_convenience *iconv_convenience; - - /* this is quite a common operation, so we want it to be - fast. We optimise for the ascii case, knowing that all our - supported multi-byte character sets are ascii-compatible - (ie. they match for the first 128 chars) */ - while (*s && !(((uint8_t)*s) & 0x80)) { - *s = tolower((uint8_t)*s); - s++; - } - - if (!*s) - return; - - iconv_convenience = lp_iconv_convenience(global_loadparm); - - d = s; - - while (*s) { - size_t c_size, c_size2; - codepoint_t c = next_codepoint(iconv_convenience, s, &c_size); - c_size2 = push_codepoint(iconv_convenience, d, tolower_w(c)); - if (c_size2 > c_size) { - DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strlower_m\n", - c, tolower_w(c), (int)c_size, (int)c_size2)); - smb_panic("codepoint expansion in strlower_m\n"); - } - s += c_size; - d += c_size2; - } - *d = 0; -} - -/** - Convert a string to UPPER case. -**/ -_PUBLIC_ void strupper_m(char *s) -{ - char *d; - struct smb_iconv_convenience *iconv_convenience; - - /* this is quite a common operation, so we want it to be - fast. We optimise for the ascii case, knowing that all our - supported multi-byte character sets are ascii-compatible - (ie. they match for the first 128 chars) */ - while (*s && !(((uint8_t)*s) & 0x80)) { - *s = toupper((uint8_t)*s); - s++; - } - - if (!*s) - return; - - iconv_convenience = lp_iconv_convenience(global_loadparm); - - d = s; - - while (*s) { - size_t c_size, c_size2; - codepoint_t c = next_codepoint(iconv_convenience, s, &c_size); - c_size2 = push_codepoint(iconv_convenience, d, toupper_w(c)); - if (c_size2 > c_size) { - DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strupper_m\n", - c, toupper_w(c), (int)c_size, (int)c_size2)); - smb_panic("codepoint expansion in strupper_m\n"); - } - s += c_size; - d += c_size2; - } - *d = 0; -} - - -/** - Find the number of 'c' chars in a string -**/ -_PUBLIC_ size_t count_chars_w(const char *s, char c) -{ - size_t count = 0; - - while (*s) { - size_t size; - codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size); - if (c2 == c) count++; - s += size; - } - - return count; -} - - |