diff options
Diffstat (limited to 'source4/lib/charset/util_unistr.c')
-rw-r--r-- | source4/lib/charset/util_unistr.c | 615 |
1 files changed, 615 insertions, 0 deletions
diff --git a/source4/lib/charset/util_unistr.c b/source4/lib/charset/util_unistr.c new file mode 100644 index 0000000000..faa1398eac --- /dev/null +++ b/source4/lib/charset/util_unistr.c @@ -0,0 +1,615 @@ +/* + Unix SMB/CIFS implementation. + Samba utility functions + Copyright (C) Andrew Tridgell 1992-2001 + Copyright (C) Simo Sorce 2001 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include "includes.h" +#include "system/iconv.h" + +/** + * @file + * @brief Unicode string manipulation + */ + +/* these 2 tables define the unicode case handling. They are loaded + at startup either via mmap() or read() from the lib directory */ +static void *upcase_table; +static void *lowcase_table; + + +/******************************************************************* +load the case handling tables +********************************************************************/ +static void load_case_tables(void) +{ + TALLOC_CTX *mem_ctx; + + mem_ctx = talloc_init("load_case_tables"); + if (!mem_ctx) { + smb_panic("No memory for case_tables"); + } + upcase_table = map_file(data_path(mem_ctx, "upcase.dat"), 0x20000); + lowcase_table = map_file(data_path(mem_ctx, "lowcase.dat"), 0x20000); + talloc_free(mem_ctx); + if (upcase_table == NULL) { + /* try also under codepages for testing purposes */ + upcase_table = map_file("codepages/upcase.dat", 0x20000); + if (upcase_table == NULL) { + upcase_table = (void *)-1; + } + } + if (lowcase_table == NULL) { + /* try also under codepages for testing purposes */ + lowcase_table = map_file("codepages/lowcase.dat", 0x20000); + if (lowcase_table == NULL) { + lowcase_table = (void *)-1; + } + } +} + +/** + Convert a codepoint_t to upper case. +**/ +codepoint_t toupper_w(codepoint_t val) +{ + if (val < 128) { + return toupper(val); + } + if (upcase_table == NULL) { + load_case_tables(); + } + if (upcase_table == (void *)-1) { + return val; + } + if (val & 0xFFFF0000) { + return val; + } + return SVAL(upcase_table, val*2); +} + +/** + Convert a codepoint_t to lower case. +**/ +codepoint_t tolower_w(codepoint_t val) +{ + if (val < 128) { + return tolower(val); + } + if (lowcase_table == NULL) { + load_case_tables(); + } + if (lowcase_table == (void *)-1) { + return val; + } + if (val & 0xFFFF0000) { + return val; + } + return SVAL(lowcase_table, val*2); +} + +/** + compare two codepoints case insensitively +*/ +int codepoint_cmpi(codepoint_t c1, codepoint_t c2) +{ + if (c1 == c2 || + toupper_w(c1) == toupper_w(c2)) { + return 0; + } + return c1 - c2; +} + +/** + Case insensitive string compararison +**/ +_PUBLIC_ int strcasecmp_m(const char *s1, const char *s2) +{ + codepoint_t c1=0, c2=0; + size_t size1, size2; + + while (*s1 && *s2) { + c1 = next_codepoint(s1, &size1); + c2 = next_codepoint(s2, &size2); + + s1 += size1; + s2 += size2; + + if (c1 == c2) { + continue; + } + + if (c1 == INVALID_CODEPOINT || + c2 == INVALID_CODEPOINT) { + /* what else can we do?? */ + return strcasecmp(s1, s2); + } + + if (toupper_w(c1) != toupper_w(c2)) { + return c1 - c2; + } + } + + return *s1 - *s2; +} + +/** + * Get the next token from a string, return False if none found. + * Handles double-quotes. + * + * Based on a routine by GJC@VILLAGE.COM. + * Extensively modified by Andrew.Tridgell@anu.edu.au + **/ +_PUBLIC_ BOOL next_token(const char **ptr,char *buff, const char *sep, size_t bufsize) +{ + const char *s; + BOOL quoted; + size_t len=1; + + if (!ptr) + return(False); + + s = *ptr; + + /* default to simple separators */ + if (!sep) + sep = " \t\n\r"; + + /* find the first non sep char */ + while (*s && strchr_m(sep,*s)) + s++; + + /* nothing left? */ + if (! *s) + return(False); + + /* copy over the token */ + for (quoted = False; len < bufsize && *s && (quoted || !strchr_m(sep,*s)); s++) { + if (*s == '\"') { + quoted = !quoted; + } else { + len++; + *buff++ = *s; + } + } + + *ptr = (*s) ? s+1 : s; + *buff = 0; + + return(True); +} + +/** + Case insensitive string compararison, length limited +**/ +_PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n) +{ + codepoint_t c1=0, c2=0; + size_t size1, size2; + + while (*s1 && *s2 && n) { + n--; + + c1 = next_codepoint(s1, &size1); + c2 = next_codepoint(s2, &size2); + + s1 += size1; + s2 += size2; + + if (c1 == c2) { + continue; + } + + if (c1 == INVALID_CODEPOINT || + c2 == INVALID_CODEPOINT) { + /* what else can we do?? */ + return strcasecmp(s1, s2); + } + + if (toupper_w(c1) != toupper_w(c2)) { + return c1 - c2; + } + } + + if (n == 0) { + return 0; + } + + return *s1 - *s2; +} + +/** + * Compare 2 strings. + * + * @note The comparison is case-insensitive. + **/ +_PUBLIC_ BOOL strequal_w(const char *s1, const char *s2) +{ + if (s1 == s2) + return(True); + if (!s1 || !s2) + return(False); + + return strcasecmp_m(s1,s2) == 0; +} + +/** + Compare 2 strings (case sensitive). +**/ +_PUBLIC_ BOOL strcsequal_w(const char *s1,const char *s2) +{ + if (s1 == s2) + return(True); + if (!s1 || !s2) + return(False); + + return strcmp(s1,s2) == 0; +} + + +/** + String replace. + NOTE: oldc and newc must be 7 bit characters +**/ +_PUBLIC_ void string_replace_w(char *s, char oldc, char newc) +{ + while (*s) { + size_t size; + codepoint_t c = next_codepoint(s, &size); + if (c == oldc) { + *s = newc; + } + s += size; + } +} + +/** + Paranoid strcpy into a buffer of given length (includes terminating + zero. Strips out all but 'a-Z0-9' and the character in other_safe_chars + and replaces with '_'. Deliberately does *NOT* check for multibyte + characters. Don't change it ! +**/ + +_PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength) +{ + size_t len, i; + + if (maxlength == 0) { + /* can't fit any bytes at all! */ + return NULL; + } + + if (!dest) { + DEBUG(0,("ERROR: NULL dest in alpha_strcpy\n")); + return NULL; + } + + if (!src) { + *dest = 0; + return dest; + } + + len = strlen(src); + if (len >= maxlength) + len = maxlength - 1; + + if (!other_safe_chars) + other_safe_chars = ""; + + for(i = 0; i < len; i++) { + int val = (src[i] & 0xff); + if (isupper(val) || islower(val) || isdigit(val) || strchr_m(other_safe_chars, val)) + dest[i] = src[i]; + else + dest[i] = '_'; + } + + dest[i] = '\0'; + + return dest; +} + +/** + Count the number of UCS2 characters in a string. Normally this will + be the same as the number of bytes in a string for single byte strings, + but will be different for multibyte. +**/ +_PUBLIC_ size_t strlen_m(const char *s) +{ + size_t count = 0; + + if (!s) { + return 0; + } + + while (*s && !(((uint8_t)*s) & 0x80)) { + s++; + count++; + } + + if (!*s) { + return count; + } + + while (*s) { + size_t c_size; + codepoint_t c = next_codepoint(s, &c_size); + if (c < 0x10000) { + count += 1; + } else { + count += 2; + } + s += c_size; + } + + return count; +} + +/** + Work out the number of multibyte chars in a string, including the NULL + terminator. +**/ +_PUBLIC_ size_t strlen_m_term(const char *s) +{ + if (!s) { + return 0; + } + + return strlen_m(s) + 1; +} + +/** + Strchr and strrchr_m are a bit complex on general multi-byte strings. +**/ +_PUBLIC_ char *strchr_m(const char *s, char c) +{ + /* characters below 0x3F are guaranteed to not appear in + non-initial position in multi-byte charsets */ + if ((c & 0xC0) == 0) { + return strchr(s, c); + } + + while (*s) { + size_t size; + codepoint_t c2 = next_codepoint(s, &size); + if (c2 == c) { + return discard_const(s); + } + s += size; + } + + return NULL; +} + +/** + * Multibyte-character version of strrchr + */ +_PUBLIC_ char *strrchr_m(const char *s, char c) +{ + char *ret = NULL; + + /* characters below 0x3F are guaranteed to not appear in + non-initial position in multi-byte charsets */ + if ((c & 0xC0) == 0) { + return strrchr(s, c); + } + + while (*s) { + size_t size; + codepoint_t c2 = next_codepoint(s, &size); + if (c2 == c) { + ret = discard_const(s); + } + s += size; + } + + return ret; +} + +/** + return True if any (multi-byte) character is lower case +*/ +_PUBLIC_ BOOL strhaslower(const char *string) +{ + while (*string) { + size_t c_size; + codepoint_t s; + codepoint_t t; + + s = next_codepoint(string, &c_size); + string += c_size; + + t = toupper_w(s); + + if (s != t) { + return True; /* that means it has lower case chars */ + } + } + + return False; +} + +/** + return True if any (multi-byte) character is upper case +*/ +_PUBLIC_ BOOL strhasupper(const char *string) +{ + while (*string) { + size_t c_size; + codepoint_t s; + codepoint_t t; + + s = next_codepoint(string, &c_size); + string += c_size; + + t = tolower_w(s); + + if (s != t) { + return True; /* that means it has upper case chars */ + } + } + + return False; +} + +/** + Convert a string to lower case, allocated with talloc +**/ +_PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src) +{ + size_t size=0; + char *dest; + + /* this takes advantage of the fact that upper/lower can't + change the length of a character by more than 1 byte */ + dest = talloc_size(ctx, 2*(strlen(src))+1); + if (dest == NULL) { + return NULL; + } + + while (*src) { + size_t c_size; + codepoint_t c = next_codepoint(src, &c_size); + src += c_size; + + c = tolower_w(c); + + c_size = push_codepoint(dest+size, c); + if (c_size == -1) { + talloc_free(dest); + return NULL; + } + size += c_size; + } + + dest[size] = 0; + + return dest; +} + +/** + Convert a string to UPPER case, allocated with talloc +**/ +_PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src) +{ + size_t size=0; + char *dest; + + if (!src) { + return NULL; + } + + /* this takes advantage of the fact that upper/lower can't + change the length of a character by more than 1 byte */ + dest = talloc_size(ctx, 2*(strlen(src))+1); + if (dest == NULL) { + return NULL; + } + + while (*src) { + size_t c_size; + codepoint_t c = next_codepoint(src, &c_size); + src += c_size; + + c = toupper_w(c); + + c_size = push_codepoint(dest+size, c); + if (c_size == -1) { + talloc_free(dest); + return NULL; + } + size += c_size; + } + + dest[size] = 0; + + return dest; +} + +/** + Convert a string to lower case. +**/ +_PUBLIC_ void strlower_m(char *s) +{ + char *d; + + /* this is quite a common operation, so we want it to be + fast. We optimise for the ascii case, knowing that all our + supported multi-byte character sets are ascii-compatible + (ie. they match for the first 128 chars) */ + while (*s && !(((uint8_t)*s) & 0x80)) { + *s = tolower((uint8_t)*s); + s++; + } + + if (!*s) + return; + + d = s; + + while (*s) { + size_t c_size, c_size2; + codepoint_t c = next_codepoint(s, &c_size); + c_size2 = push_codepoint(d, tolower_w(c)); + if (c_size2 > c_size) { + DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strlower_m\n", + c, tolower_w(c), (int)c_size, (int)c_size2)); + smb_panic("codepoint expansion in strlower_m\n"); + } + s += c_size; + d += c_size2; + } + *d = 0; +} + +/** + Convert a string to UPPER case. +**/ +_PUBLIC_ void strupper_m(char *s) +{ + char *d; + + /* this is quite a common operation, so we want it to be + fast. We optimise for the ascii case, knowing that all our + supported multi-byte character sets are ascii-compatible + (ie. they match for the first 128 chars) */ + while (*s && !(((uint8_t)*s) & 0x80)) { + *s = toupper((uint8_t)*s); + s++; + } + + if (!*s) + return; + + d = s; + + while (*s) { + size_t c_size, c_size2; + codepoint_t c = next_codepoint(s, &c_size); + c_size2 = push_codepoint(d, toupper_w(c)); + if (c_size2 > c_size) { + DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strupper_m\n", + c, toupper_w(c), (int)c_size, (int)c_size2)); + smb_panic("codepoint expansion in strupper_m\n"); + } + s += c_size; + d += c_size2; + } + *d = 0; +} + |