From 8177fc778b02d9f61ef482fc60d32f353be77ba4 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Tue, 21 Dec 1999 23:14:01 +0000 Subject: Added new unicode functions - not used yet, but are the basis for the internal unicode conversion of Samba. Jeremy. (This used to be commit 302412df64aa4b6572b13ef61dfd68c3f8ebbb8b) --- source3/lib/util_unistr.c | 210 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 209 insertions(+), 1 deletion(-) (limited to 'source3/lib/util_unistr.c') diff --git a/source3/lib/util_unistr.c b/source3/lib/util_unistr.c index 185e7c3547..fca9d8bfda 100644 --- a/source3/lib/util_unistr.c +++ b/source3/lib/util_unistr.c @@ -345,8 +345,9 @@ int unistrcpy(char *dst, char *src) /******************************************************************* - free any existing maps + Free any existing maps. ********************************************************************/ + static void free_maps(smb_ucs2_t **pp_cp_to_ucs2, uint16 **pp_ucs2_to_cp) { /* this handles identity mappings where we share the pointer */ @@ -556,3 +557,210 @@ BOOL load_unix_unicode_map(const char *unix_char_set) strupper(upper_unix_char_set); return load_unicode_map(upper_unix_char_set, &unixcp_to_ucs2, &ucs2_to_unixcp); } + +/******************************************************************* + The following functions reproduce many of the non-UNICODE standard + string functions in Samba. +********************************************************************/ + +/******************************************************************* + Convert a UNICODE string to multibyte format. Note that the 'src' is in + native byte order, not little endian. Always zero terminates. + dst_len is in bytes. 
+********************************************************************/
+
+/*
+ * dst        - output buffer of dst_len bytes. It is zero terminated
+ *              whenever dst_len is non-zero; with dst_len == 0 nothing
+ *              is written.
+ * src        - zero terminated smb_ucs2_t string to convert.
+ * ucs2_to_cp - table indexed by each source character. An entry below
+ *              256 is emitted as one byte, any other entry as two
+ *              bytes, high byte first.
+ *
+ * Conversion stops early, rather than splitting a character, when the
+ * next character does not fit in front of the terminator.
+ * Returns dst.
+ */
+
+static char *unicode_to_multibyte(char *dst, const smb_ucs2_t *src,
+				  size_t dst_len, const uint16 *ucs2_to_cp)
+{
+	size_t dst_pos = 0;
+
+	/* No room even for the terminator. */
+	if (dst_len == 0)
+		return dst;
+
+	/*
+	 * The source pointer and the destination index advance
+	 * independently: one source character yields one or two bytes.
+	 */
+	while (*src && (dst_pos + 1 < dst_len)) {
+		smb_ucs2_t val = ucs2_to_cp[*src++];
+
+		if (val < 256) {
+			dst[dst_pos++] = (char)val;
+		} else if (dst_pos + 2 < dst_len) {
+
+			/*
+			 * A 2 byte value is always written as
+			 * high/low into the buffer stream.
+			 */
+
+			dst[dst_pos++] = (char)((val >> 8) & 0xff);
+			dst[dst_pos++] = (char)(val & 0xff);
+		} else {
+			/* Both bytes plus the terminator do not fit: stop. */
+			break;
+		}
+	}
+
+	dst[dst_pos] = '\0';
+
+	return dst;
+}
+
+/*******************************************************************
+ Convert a multibyte string to UNICODE format. Note that the 'dst' is in
+ native byte order, not little endian. Always zero terminates, unless
+ dst_len is too small to hold even one smb_ucs2_t, in which case
+ nothing is written.
+ dst_len is in bytes.
+
+ cp_to_ucs2 is indexed by the code page value of each character: the
+ single byte, or (first byte << 8) | second byte when
+ skip_multibyte_char() reports a length of 2 for the first byte.
+ Returns dst.
+********************************************************************/
+
+smb_ucs2_t *multibyte_to_unicode(smb_ucs2_t *dst, const char *src,
+				 size_t dst_len, smb_ucs2_t *cp_to_ucs2)
+{
+	size_t i = 0;
+
+	dst_len /= sizeof(smb_ucs2_t); /* Convert to smb_ucs2_t units. */
+
+	/* No room even for the terminator. */
+	if (dst_len == 0)
+		return dst;
+
+	/* src itself is advanced, so the end test is on *src, not src[i]. */
+	while ((i + 1 < dst_len) && *src) {
+		size_t skip = skip_multibyte_char(*src);
+		smb_ucs2_t val = (*src & 0xff);
+
+		/*
+		 * If this is a multibyte character
+		 * then work out the index value for the unicode conversion.
+		 */
+
+		if (skip == 2) {
+			/*
+			 * A lead byte directly followed by the terminator is
+			 * a truncated character: drop it instead of stepping
+			 * src past the end of the string.
+			 */
+			if (src[1] == '\0')
+				break;
+			val = ((val << 8) | (src[1] & 0xff));
+		}
+
+		dst[i++] = cp_to_ucs2[val];
+
+		/* A skip of 0 still consumes the one byte just converted. */
+		if (skip)
+			src += skip;
+		else
+			src++;
+	}
+
+	dst[i] = 0;
+
+	return dst;
+}
+
+/*******************************************************************
+ Convert a UNICODE string to multibyte format. Note that the 'src' is in
+ native byte order, not little endian. Always zero terminates.
+ This function may be replaced if the MB codepage format is an
+ encoded one (ie. utf8, hex). See the code in lib/kanji.c
+ for details. dst_len is in bytes.
+********************************************************************/ + +char *unicode_to_unix(char *dst, const smb_ucs2_t *src, size_t dst_len) +{ + return unicode_to_multibyte(dst, src, dst_len, ucs2_to_unixcp); +} + +/******************************************************************* + Convert a UNIX string to UNICODE format. Note that the 'dst' is in + native byte order, not little endian. Always zero terminates. + This function may be replaced if the UNIX codepage format is a + multi-byte one (ie. JIS, SJIS or utf8). See the code in lib/kanji.c + for details. dst_len is in bytes, not ucs2 units. +********************************************************************/ + +smb_ucs2_t *unix_to_unicode(smb_ucs2_t *dst, const char *src, size_t dst_len) +{ + return multibyte_to_unicode(dst, src, dst_len, unixcp_to_ucs2); +} + +/******************************************************************* + Convert a UNICODE string to DOS format. Note that the 'src' is in + native byte order, not little endian. Always zero terminates. + dst_len is in bytes. +********************************************************************/ + +char *unicode_to_dos(char *dst, const smb_ucs2_t *src, size_t dst_len) +{ + return unicode_to_multibyte(dst, src, dst_len, ucs2_to_doscp); +} + +/******************************************************************* + Convert a DOS string to UNICODE format. Note that the 'dst' is in + native byte order, not little endian. Always zero terminates. + This function may be replaced if the DOS codepage format is a + multi-byte one (ie. JIS, SJIS or utf8). See the code in lib/kanji.c + for details. dst_len is in bytes, not ucs2 units. 
+********************************************************************/ + +smb_ucs2_t *dos_to_unicode(smb_ucs2_t *dst, const char *src, size_t dst_len) +{ + return multibyte_to_unicode(dst, src, dst_len, doscp_to_ucs2); +} + +/******************************************************************* + Count the number of characters in a smb_ucs2_t string. +********************************************************************/ + +size_t wstrlen(const smb_ucs2_t *src) +{ + size_t len; + + for(len = 0; *src; len++) + ; + + return len; +} + +/******************************************************************* + Safe wstring copy into a known length string. maxlength includes + the terminating zero. maxlength is in bytes. +********************************************************************/ + +smb_ucs2_t *safe_wstrcpy(smb_ucs2_t *dest,const smb_ucs2_t *src, size_t maxlength) +{ + size_t ucs2_len; + + if (!dest) { + DEBUG(0,("ERROR: NULL dest in safe_wstrcpy\n")); + return NULL; + } + + if (!src) { + *dest = 0; + return dest; + } + + ucs2_len = wstrlen(src); + + if (ucs2_len >= (maxlength/sizeof(smb_ucs2_t))) { + fstring out; + DEBUG(0,("ERROR: string overflow by %u bytes in safe_wstrcpy [%.50s]\n", + (unsigned int)((ucs2_len*sizeof(smb_ucs2_t))-maxlength), + unicode_to_unix(out,src,sizeof(out))) ); + ucs2_len = (maxlength/sizeof(smb_ucs2_t)) - 1; + } + + memcpy(dest, src, ucs2_len*sizeof(smb_ucs2_t)); + dest[ucs2_len] = 0; + return dest; +} + +/******************************************************************* + Safe string cat into a string. maxlength includes the terminating zero. + maxlength is in bytes. 
+********************************************************************/
+
+/*
+ * Returns NULL (after logging) if dest is NULL, otherwise dest.
+ * A NULL src leaves dest unchanged. When the combined string does not
+ * fit, the overflow is logged and only as much of src as fits in front
+ * of the terminator is appended. dest must already be zero terminated.
+ */
+
+smb_ucs2_t *safe_wstrcat(smb_ucs2_t *dest, const smb_ucs2_t *src, size_t maxlength)
+{
+	/* Number of smb_ucs2_t units dest can hold, terminator included. */
+	size_t capacity = maxlength/sizeof(smb_ucs2_t);
+	size_t ucs2_src_len, ucs2_dest_len;
+
+	if (!dest) {
+		DEBUG(0,("ERROR: NULL dest in safe_wstrcat\n"));
+		return NULL;
+	}
+
+	if (!src) {
+		return dest;
+	}
+
+	ucs2_src_len = wstrlen(src);
+	ucs2_dest_len = wstrlen(dest);
+
+	if (ucs2_src_len + ucs2_dest_len >= capacity) {
+		fstring out;
+		/*
+		 * Bytes needed (terminator included) minus bytes available.
+		 * This is always positive here, so it cannot wrap.
+		 */
+		DEBUG(0,("ERROR: string overflow by %u bytes in safe_wstrcat [%.50s]\n",
+			(unsigned int)((sizeof(smb_ucs2_t)*(ucs2_src_len + ucs2_dest_len + 1)) - maxlength),
+			unicode_to_unix(out,src,sizeof(out))) );
+
+		/*
+		 * dest already fills (or exceeds) the buffer, so nothing can
+		 * be appended; its existing terminator stays where it is.
+		 */
+		if (ucs2_dest_len + 1 >= capacity)
+			return dest;
+
+		ucs2_src_len = capacity - ucs2_dest_len - 1;
+	}
+
+	memcpy(&dest[ucs2_dest_len], src, ucs2_src_len*sizeof(smb_ucs2_t));
+	dest[ucs2_dest_len + ucs2_src_len] = 0;
+	return dest;
+}
-- cgit