/* Unix SMB/CIFS implementation. Character set conversion Extensions Copyright (C) Igor Vergeichik 2001 Copyright (C) Andrew Tridgell 2001 Copyright (C) Simo Sorce 2001 Copyright (C) Martin Pool 2003 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include "includes.h" /** * @file * * @brief Character-set conversion routines built on our iconv. * * @note Samba's internal character set (at least in the 3.0 series) * is always the same as the one for the Unix filesystem. It is * not necessarily UTF-8 and may be different on machines that * need i18n filenames to be compatible with Unix software. It does * have to be a superset of ASCII. All multibyte sequences must start * with a byte with the high bit set. * * @sa lib/iconv.c */ /** * Convert string from one encoding to another, making error checking etc * Slow path version - uses (slow) iconv. * * @param src pointer to source string (multibyte or singlebyte) * @param srclen length of the source string in bytes * @param dest pointer to destination string (multibyte or singlebyte) * @param destlen maximal length allowed for string * @param converted size is the number of bytes occupied in the destination * * @returns false and sets errno on fail, true on success. * * Ensure the srclen contains the terminating zero. * **/ static bool convert_string_internal(charset_t from, charset_t to, void const *src, size_t srclen, void *dest, size_t destlen, size_t *converted_size) { size_t i_len, o_len; size_t retval; const char* inbuf = (const char*)src; char* outbuf = (char*)dest; smb_iconv_t descriptor; struct smb_iconv_handle *ic; lazy_initialize_conv(); ic = get_iconv_handle(); descriptor = get_conv_handle(ic, from, to); if (srclen == (size_t)-1) { if (from == CH_UTF16LE || from == CH_UTF16BE) { srclen = (strlen_w((const smb_ucs2_t *)src)+1) * 2; } else { srclen = strlen((const char *)src)+1; } } if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) { errno = EINVAL; return false; } i_len=srclen; o_len=destlen; retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len); if (retval == (size_t)-1) { return false; } *converted_size = destlen-o_len; return true; } /** * Convert string from one encoding to another, making error checking etc * Fast path version - handles ASCII first. * * @param src pointer to source string (multibyte or singlebyte) * @param srclen length of the source string in bytes, or -1 for nul terminated. * @param dest pointer to destination string (multibyte or singlebyte) * @param destlen maximal length allowed for string - *NEVER* -1. * @param converted size is the number of bytes occupied in the destination * * @returns false and sets errno on fail, true on success. * * Ensure the srclen contains the terminating zero. * * This function has been hand-tuned to provide a fast path. * Don't change unless you really know what you are doing. JRA. **/ bool convert_string_error(charset_t from, charset_t to, void const *src, size_t srclen, void *dest, size_t destlen, size_t *converted_size) { /* * NB. We deliberately don't do a strlen here if srclen == -1. * This is very expensive over millions of calls and is taken * care of in the slow path in convert_string_internal. JRA. */ #ifdef DEVELOPER SMB_ASSERT(destlen != (size_t)-1); #endif if (srclen == 0) { *converted_size = 0; return true; } if (from != CH_UTF16LE && from != CH_UTF16BE && to != CH_UTF16LE && to != CH_UTF16BE) { const unsigned char *p = (const unsigned char *)src; unsigned char *q = (unsigned char *)dest; size_t slen = srclen; size_t dlen = destlen; unsigned char lastp = '\0'; size_t retval = 0; /* If all characters are ascii, fast path here. */ while (slen && dlen) { if ((lastp = *p) <= 0x7f) { *q++ = *p++; if (slen != (size_t)-1) { slen--; } dlen--; retval++; if (!lastp) break; } else { #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS goto general_case; #else bool ret = convert_string_internal(from, to, p, slen, q, dlen, converted_size); *converted_size += retval; return ret; #endif } } *converted_size = retval; if (!dlen) { /* Even if we fast path we should note if we ran out of room. */ if (((slen != (size_t)-1) && slen) || ((slen == (size_t)-1) && lastp)) { errno = E2BIG; return false; } } return true; } else if (from == CH_UTF16LE && to != CH_UTF16LE) { const unsigned char *p = (const unsigned char *)src; unsigned char *q = (unsigned char *)dest; size_t retval = 0; size_t slen = srclen; size_t dlen = destlen; unsigned char lastp = '\0'; /* If all characters are ascii, fast path here. */ while (((slen == (size_t)-1) || (slen >= 2)) && dlen) { if (((lastp = *p) <= 0x7f) && (p[1] == 0)) { *q++ = *p; if (slen != (size_t)-1) { slen -= 2; } p += 2; dlen--; retval++; if (!lastp) break; } else { #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS goto general_case; #else bool ret = convert_string_internal(from, to, p, slen, q, dlen, converted_size); *converted_size += retval; return ret; #endif } } *converted_size = retval; if (!dlen) { /* Even if we fast path we should note if we ran out of room. */ if (((slen != (size_t)-1) && slen) || ((slen == (size_t)-1) && lastp)) { errno = E2BIG; return false; } } return true; } else if (from != CH_UTF16LE && from != CH_UTF16BE && to == CH_UTF16LE) { const unsigned char *p = (const unsigned char *)src; unsigned char *q = (unsigned char *)dest; size_t retval = 0; size_t slen = srclen; size_t dlen = destlen; unsigned char lastp = '\0'; /* If all characters are ascii, fast path here. */ while (slen && (dlen >= 2)) { if ((lastp = *p) <= 0x7F) { *q++ = *p++; *q++ = '\0'; if (slen != (size_t)-1) { slen--; } dlen -= 2; retval += 2; if (!lastp) break; } else { #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS goto general_case; #else bool ret = convert_string_internal(from, to, p, slen, q, dlen, converted_size); *converted_size += retval; return ret; #endif } } *converted_size = retval; if (!dlen) { /* Even if we fast path we should note if we ran out of room. */ if (((slen != (size_t)-1) && slen) || ((slen == (size_t)-1) && lastp)) { errno = E2BIG; return false; } } return true; } #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS general_case: #endif return convert_string_internal(from, to, src, srclen, dest, destlen, converted_size); } bool convert_string(charset_t from, charset_t to, void const *src, size_t srclen, void *dest, size_t destlen, size_t *converted_size) { bool ret = convert_string_error(from, to, src, srclen, dest, destlen, converted_size); if(ret==false) { const char *reason="unknown error"; switch(errno) { case EINVAL: reason="Incomplete multibyte sequence"; DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n", reason, (const char *)src)); break; case E2BIG: { struct smb_iconv_handle *ic; lazy_initialize_conv(); ic = get_iconv_handle(); reason="No more room"; if (from == CH_UNIX) { DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u - '%s'\n", charset_name(ic, from), charset_name(ic, to), (unsigned int)srclen, (unsigned int)destlen, (const char *)src)); } else { DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u\n", charset_name(ic, from), charset_name(ic, to), (unsigned int)srclen, (unsigned int)destlen)); } break; } case EILSEQ: reason="Illegal multibyte sequence"; DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n", reason, (const char *)src)); break; default: DEBUG(0,("convert_string_internal: Conversion error: %s(%s)\n", reason, (const char *)src)); break; } /* smb_panic(reason); */ } return ret; } /** * Convert between character sets, allocating a new buffer using talloc for the result. * * @param srclen length of source buffer. * @param dest always set at least to NULL * @parm converted_size set to the number of bytes occupied by the string in * the destination on success. * @note -1 is not accepted for srclen. * * @return true if new buffer was correctly allocated, and string was * converted. * * Ensure the srclen contains the terminating zero. * * I hate the goto's in this function. It's embarressing..... * There has to be a cleaner way to do this. JRA. */ bool convert_string_talloc(TALLOC_CTX *ctx, charset_t from, charset_t to, void const *src, size_t srclen, void *dst, size_t *converted_size) { size_t i_len, o_len, destlen = (srclen * 3) / 2; size_t retval; const char *inbuf = (const char *)src; char *outbuf = NULL, *ob = NULL; smb_iconv_t descriptor; void **dest = (void **)dst; struct smb_iconv_handle *ic; *dest = NULL; if (src == NULL || srclen == (size_t)-1) { errno = EINVAL; return false; } if (srclen == 0) { /* We really should treat this as an error, but there are too many callers that need this to return a NULL terminated string in the correct character set. */ if (to == CH_UTF16LE|| to == CH_UTF16BE || to == CH_UTF16MUNGED) { destlen = 2; } else { destlen = 1; } ob = talloc_zero_array(ctx, char, destlen); if (ob == NULL) { errno = ENOMEM; return false; } *converted_size = destlen; *dest = ob; return true; } lazy_initialize_conv(); ic = get_iconv_handle(); descriptor = get_conv_handle(ic, from, to); if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) { DEBUG(0,("convert_string_talloc: Conversion not supported.\n")); errno = EOPNOTSUPP; return false; } convert: /* +2 is for ucs2 null termination. */ if ((destlen*2)+2 < destlen) { /* wrapped ! abort. */ DEBUG(0, ("convert_string_talloc: destlen wrapped !\n")); TALLOC_FREE(outbuf); errno = EOPNOTSUPP; return false; } else { destlen = destlen * 2; } /* +2 is for ucs2 null termination. */ ob = (char *)TALLOC_REALLOC(ctx, ob, destlen + 2); if (!ob) { DEBUG(0, ("convert_string_talloc: realloc failed!\n")); errno = ENOMEM; return false; } outbuf = ob; i_len = srclen; o_len = destlen; retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len); if(retval == (size_t)-1) { const char *reason="unknown error"; switch(errno) { case EINVAL: reason="Incomplete multibyte sequence"; DEBUG(3,("convert_string_talloc: Conversion error: %s(%s)\n",reason,inbuf)); break; case E2BIG: goto convert; case EILSEQ: reason="Illegal multibyte sequence"; DEBUG(3,("convert_string_talloc: Conversion error: %s(%s)\n",reason,inbuf)); break; } DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf)); /* smb_panic(reason); */ TALLOC_FREE(ob); return false; } destlen = destlen - o_len; /* Don't shrink unless we're reclaiming a lot of * space. This is in the hot codepath and these * reallocs *cost*. JRA. */ if (o_len > 1024) { /* We're shrinking here so we know the +2 is safe from wrap. */ ob = (char *)TALLOC_REALLOC(ctx,ob,destlen + 2); } if (destlen && !ob) { DEBUG(0, ("convert_string_talloc: out of memory!\n")); errno = ENOMEM; return false; } *dest = ob; /* Must ucs2 null terminate in the extra space we allocated. */ ob[destlen] = '\0'; ob[destlen+1] = '\0'; /* Ensure we can never return a *converted_size of zero. */ if (destlen == 0) { /* As we're now returning false on a bad smb_iconv call, this should never happen. But be safe anyway. */ if (to == CH_UTF16LE|| to == CH_UTF16BE || to == CH_UTF16MUNGED) { destlen = 2; } else { destlen = 1; } } *converted_size = destlen; return true; }