Move charset library to top level.

author: Jelmer Vernooij <jelmer@samba.org> 2008-10-24 00:06:35 +0200
committer: Jelmer Vernooij <jelmer@samba.org> 2008-10-24 00:06:35 +0200
commit: 09a63accb8863a883ef8aba3a44776ac92ae5764 (patch)
tree: 1181f77950c4674f80b61421354d199d30ede6bd /lib/util/charset/util_unistr.c
parent: 8baf1695492300fd8c2ce7e4f106a14f510357e5 (diff)
download: samba-09a63accb8863a883ef8aba3a44776ac92ae5764.tar.gz
samba-09a63accb8863a883ef8aba3a44776ac92ae5764.tar.bz2
samba-09a63accb8863a883ef8aba3a44776ac92ae5764.zip
1 files changed, 684 insertions, 0 deletions
diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c
new file mode 100644
index 0000000000..e4f4bb551a
--- /dev/null
+++ b/lib/util/charset/util_unistr.c
@@ -0,0 +1,684 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Samba utility functions
+   Copyright (C) Andrew Tridgell 1992-2001
+   Copyright (C) Simo Sorce 2001
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/locale.h"
+#include "dynconfig/dynconfig.h"
+#include "param/param.h"
+
+/**
+ * @file
+ * @brief Unicode string manipulation
+ */
+
+/* these 2 tables define the unicode case handling.  They are loaded
+   at startup either via mmap() or read() from the lib directory */
+static void *upcase_table;
+static void *lowcase_table;
+
+
+/*******************************************************************
+load the case handling tables
+********************************************************************/
+void load_case_tables(void)
+{
+	TALLOC_CTX *mem_ctx;
+
+	mem_ctx = talloc_init("load_case_tables");
+	if (!mem_ctx) {
+		smb_panic("No memory for case_tables");
+	}
+	upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", dyn_DATADIR), 0x20000);
+	lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", dyn_DATADIR), 0x20000);
+	talloc_free(mem_ctx);
+	if (upcase_table == NULL) {
+		/* try also under codepages for testing purposes */
+		upcase_table = map_file("codepages/upcase.dat", 0x20000);
+		if (upcase_table == NULL) {
+			upcase_table = (void *)-1;
+		}
+	}
+	if (lowcase_table == NULL) {
+		/* try also under codepages for testing purposes */
+		lowcase_table = map_file("codepages/lowcase.dat", 0x20000);
+		if (lowcase_table == NULL) {
+			lowcase_table = (void *)-1;
+		}
+	}
+}
+
+/**
+ Convert a codepoint_t to upper case.
+**/
+_PUBLIC_ codepoint_t toupper_w(codepoint_t val)
+{
+	if (val < 128) {
+		return toupper(val);
+	}
+	if (upcase_table == NULL) {
+		load_case_tables();
+	}
+	if (upcase_table == (void *)-1) {
+		return val;
+	}
+	if (val & 0xFFFF0000) {
+		return val;
+	}
+	return SVAL(upcase_table, val*2);
+}
+
+/**
+ Convert a codepoint_t to lower case.
+**/
+_PUBLIC_ codepoint_t tolower_w(codepoint_t val)
+{
+	if (val < 128) {
+		return tolower(val);
+	}
+	if (lowcase_table == NULL) {
+		load_case_tables();
+	}
+	if (lowcase_table == (void *)-1) {
+		return val;
+	}
+	if (val & 0xFFFF0000) {
+		return val;
+	}
+	return SVAL(lowcase_table, val*2);
+}
+
+/**
+  compare two codepoints case insensitively
+*/
+_PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
+{
+	if (c1 == c2 ||
+	    toupper_w(c1) == toupper_w(c2)) {
+		return 0;
+	}
+	return c1 - c2;
+}
+
+/**
+ Case insensitive string compararison
+**/
+_PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
+{
+	codepoint_t c1=0, c2=0;
+	size_t size1, size2;
+	struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
+
+	/* handle null ptr comparisons to simplify the use in qsort */
+	if (s1 == s2) return 0;
+	if (s1 == NULL) return -1;
+	if (s2 == NULL) return 1;
+
+	while (*s1 && *s2) {
+		c1 = next_codepoint(iconv_convenience, s1, &size1);
+		c2 = next_codepoint(iconv_convenience, s2, &size2);
+
+		s1 += size1;
+		s2 += size2;
+
+		if (c1 == c2) {
+			continue;
+		}
+
+		if (c1 == INVALID_CODEPOINT ||
+		    c2 == INVALID_CODEPOINT) {
+			/* what else can we do?? */
+			return strcasecmp(s1, s2);
+		}
+
+		if (toupper_w(c1) != toupper_w(c2)) {
+			return c1 - c2;
+		}
+	}
+
+	return *s1 - *s2;
+}
+
+/**
+ * Get the next token from a string, return False if none found.
+ * Handles double-quotes.
+ * 
+ * Based on a routine by GJC@VILLAGE.COM. 
+ * Extensively modified by Andrew.Tridgell@anu.edu.au
+ **/
+_PUBLIC_ bool next_token(const char **ptr,char *buff, const char *sep, size_t bufsize)
+{
+	const char *s;
+	bool quoted;
+	size_t len=1;
+
+	if (!ptr)
+		return false;
+
+	s = *ptr;
+
+	/* default to simple separators */
+	if (!sep)
+		sep = " \t\n\r";
+
+	/* find the first non sep char */
+	while (*s && strchr_m(sep,*s))
+		s++;
+	
+	/* nothing left? */
+	if (!*s)
+		return false;
+	
+	/* copy over the token */
+	for (quoted = false; len < bufsize && *s && (quoted || !strchr_m(sep,*s)); s++) {
+		if (*s == '\"') {
+			quoted = !quoted;
+		} else {
+			len++;
+			*buff++ = *s;
+		}
+	}
+	
+	*ptr = (*s) ? s+1 : s;  
+	*buff = 0;
+	
+	return true;
+}
+
+/**
+ Case insensitive string compararison, length limited
+**/
+_PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
+{
+	codepoint_t c1=0, c2=0;
+	size_t size1, size2;
+	struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
+
+	/* handle null ptr comparisons to simplify the use in qsort */
+	if (s1 == s2) return 0;
+	if (s1 == NULL) return -1;
+	if (s2 == NULL) return 1;
+
+	while (*s1 && *s2 && n) {
+		n--;
+
+		c1 = next_codepoint(iconv_convenience, s1, &size1);
+		c2 = next_codepoint(iconv_convenience, s2, &size2);
+
+		s1 += size1;
+		s2 += size2;
+
+		if (c1 == c2) {
+			continue;
+		}
+
+		if (c1 == INVALID_CODEPOINT ||
+		    c2 == INVALID_CODEPOINT) {
+			/* what else can we do?? */
+			return strcasecmp(s1, s2);
+		}
+
+		if (toupper_w(c1) != toupper_w(c2)) {
+			return c1 - c2;
+		}
+	}
+
+	if (n == 0) {
+		return 0;
+	}
+
+	return *s1 - *s2;
+}
+
+/**
+ * Compare 2 strings.
+ *
+ * @note The comparison is case-insensitive.
+ **/
+_PUBLIC_ bool strequal_w(const char *s1, const char *s2)
+{
+	return strcasecmp_m(s1,s2) == 0;
+}
+
+/**
+ Compare 2 strings (case sensitive).
+**/
+_PUBLIC_ bool strcsequal_w(const char *s1,const char *s2)
+{
+	if (s1 == s2)
+		return true;
+	if (!s1 || !s2)
+		return false;
+	
+	return strcmp(s1,s2) == 0;
+}
+
+
+/**
+ String replace.
+ NOTE: oldc and newc must be 7 bit characters
+**/
+_PUBLIC_ void string_replace_w(char *s, char oldc, char newc)
+{
+	while (s && *s) {
+		size_t size;
+		codepoint_t c = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
+		if (c == oldc) {
+			*s = newc;
+		}
+		s += size;
+	}
+}
+
+/**
+ Paranoid strcpy into a buffer of given length (includes terminating
+ zero. Strips out all but 'a-Z0-9' and the character in other_safe_chars
+ and replaces with '_'. Deliberately does *NOT* check for multibyte
+ characters. Don't change it !
+**/
+
+_PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength)
+{
+	size_t len, i;
+
+	if (maxlength == 0) {
+		/* can't fit any bytes at all! */
+		return NULL;
+	}
+
+	if (!dest) {
+		DEBUG(0,("ERROR: NULL dest in alpha_strcpy\n"));
+		return NULL;
+	}
+
+	if (!src) {
+		*dest = 0;
+		return dest;
+	}  
+
+	len = strlen(src);
+	if (len >= maxlength)
+		len = maxlength - 1;
+
+	if (!other_safe_chars)
+		other_safe_chars = "";
+
+	for(i = 0; i < len; i++) {
+		int val = (src[i] & 0xff);
+		if (isupper(val) || islower(val) || isdigit(val) || strchr_m(other_safe_chars, val))
+			dest[i] = src[i];
+		else
+			dest[i] = '_';
+	}
+
+	dest[i] = '\0';
+
+	return dest;
+}
+
+/**
+ Count the number of UCS2 characters in a string. Normally this will
+ be the same as the number of bytes in a string for single byte strings,
+ but will be different for multibyte.
+**/
+_PUBLIC_ size_t strlen_m(const char *s)
+{
+	size_t count = 0;
+
+	if (!s) {
+		return 0;
+	}
+
+	while (*s && !(((uint8_t)*s) & 0x80)) {
+		s++;
+		count++;
+	}
+
+	if (!*s) {
+		return count;
+	}
+
+	while (*s) {
+		size_t c_size;
+		codepoint_t c = next_codepoint(lp_iconv_convenience(global_loadparm), s, &c_size);
+		if (c < 0x10000) {
+			count += 1;
+		} else {
+			count += 2;
+		}
+		s += c_size;
+	}
+
+	return count;
+}
+
+/**
+   Work out the number of multibyte chars in a string, including the NULL
+   terminator.
+**/
+_PUBLIC_ size_t strlen_m_term(const char *s)
+{
+	if (!s) {
+		return 0;
+	}
+
+	return strlen_m(s) + 1;
+}
+
+/**
+ Strchr and strrchr_m are a bit complex on general multi-byte strings. 
+**/
+_PUBLIC_ char *strchr_m(const char *s, char c)
+{
+	if (s == NULL) {
+		return NULL;
+	}
+	/* characters below 0x3F are guaranteed to not appear in
+	   non-initial position in multi-byte charsets */
+	if ((c & 0xC0) == 0) {
+		return strchr(s, c);
+	}
+
+	while (*s) {
+		size_t size;
+		codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
+		if (c2 == c) {
+			return discard_const_p(char, s);
+		}
+		s += size;
+	}
+
+	return NULL;
+}
+
+/**
+ * Multibyte-character version of strrchr
+ */
+_PUBLIC_ char *strrchr_m(const char *s, char c)
+{
+	char *ret = NULL;
+
+	if (s == NULL) {
+		return NULL;
+	}
+
+	/* characters below 0x3F are guaranteed to not appear in
+	   non-initial position in multi-byte charsets */
+	if ((c & 0xC0) == 0) {
+		return strrchr(s, c);
+	}
+
+	while (*s) {
+		size_t size;
+		codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
+		if (c2 == c) {
+			ret = discard_const_p(char, s);
+		}
+		s += size;
+	}
+
+	return ret;
+}
+
+/**
+  return True if any (multi-byte) character is lower case
+*/
+_PUBLIC_ bool strhaslower(const char *string)
+{
+	while (*string) {
+		size_t c_size;
+		codepoint_t s;
+		codepoint_t t;
+
+		s = next_codepoint(lp_iconv_convenience(global_loadparm), string, &c_size);
+		string += c_size;
+
+		t = toupper_w(s);
+
+		if (s != t) {
+			return true; /* that means it has lower case chars */
+		}
+	}
+
+	return false;
+} 
+
+/**
+  return True if any (multi-byte) character is upper case
+*/
+_PUBLIC_ bool strhasupper(const char *string)
+{
+	while (*string) {
+		size_t c_size;
+		codepoint_t s;
+		codepoint_t t;
+
+		s = next_codepoint(lp_iconv_convenience(global_loadparm), string, &c_size);
+		string += c_size;
+
+		t = tolower_w(s);
+
+		if (s != t) {
+			return true; /* that means it has upper case chars */
+		}
+	}
+
+	return false;
+} 
+
+/**
+ Convert a string to lower case, allocated with talloc
+**/
+_PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src)
+{
+	size_t size=0;
+	char *dest;
+	struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
+
+	/* this takes advantage of the fact that upper/lower can't
+	   change the length of a character by more than 1 byte */
+	dest = talloc_array(ctx, char, 2*(strlen(src))+1);
+	if (dest == NULL) {
+		return NULL;
+	}
+
+	while (*src) {
+		size_t c_size;
+		codepoint_t c = next_codepoint(iconv_convenience, src, &c_size);
+		src += c_size;
+
+		c = tolower_w(c);
+
+		c_size = push_codepoint(iconv_convenience, dest+size, c);
+		if (c_size == -1) {
+			talloc_free(dest);
+			return NULL;
+		}
+		size += c_size;
+	}
+
+	dest[size] = 0;
+
+	/* trim it so talloc_append_string() works */
+	dest = talloc_realloc(ctx, dest, char, size+1);
+
+	talloc_set_name_const(dest, dest);
+
+	return dest;
+}
+
+/**
+ Convert a string to UPPER case, allocated with talloc
+ source length limited to n bytes
+**/
+_PUBLIC_ char *strupper_talloc_n(TALLOC_CTX *ctx, const char *src, size_t n)
+{
+	size_t size=0;
+	char *dest;
+	struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
+	
+	if (!src) {
+		return NULL;
+	}
+
+	/* this takes advantage of the fact that upper/lower can't
+	   change the length of a character by more than 1 byte */
+	dest = talloc_array(ctx, char, 2*(n+1));
+	if (dest == NULL) {
+		return NULL;
+	}
+
+	while (*src && n--) {
+		size_t c_size;
+		codepoint_t c = next_codepoint(iconv_convenience, src, &c_size);
+		src += c_size;
+
+		c = toupper_w(c);
+
+		c_size = push_codepoint(iconv_convenience, dest+size, c);
+		if (c_size == -1) {
+			talloc_free(dest);
+			return NULL;
+		}
+		size += c_size;
+	}
+
+	dest[size] = 0;
+
+	/* trim it so talloc_append_string() works */
+	dest = talloc_realloc(ctx, dest, char, size+1);
+
+	talloc_set_name_const(dest, dest);
+
+	return dest;
+}
+
+/**
+ Convert a string to UPPER case, allocated with talloc
+**/
+_PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src)
+{
+	return strupper_talloc_n(ctx, src, src?strlen(src):0);
+}
+
+/**
+ talloc_strdup() a unix string to upper case.
+**/
+_PUBLIC_ char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src)
+{
+	return strupper_talloc(ctx, src);
+}
+
+/**
+ Convert a string to lower case.
+**/
+_PUBLIC_ void strlower_m(char *s)
+{
+	char *d;
+	struct smb_iconv_convenience *iconv_convenience;
+
+	/* this is quite a common operation, so we want it to be
+	   fast. We optimise for the ascii case, knowing that all our
+	   supported multi-byte character sets are ascii-compatible
+	   (ie. they match for the first 128 chars) */
+	while (*s && !(((uint8_t)*s) & 0x80)) {
+		*s = tolower((uint8_t)*s);
+		s++;
+	}
+
+	if (!*s)
+		return;
+
+	iconv_convenience = lp_iconv_convenience(global_loadparm);
+
+	d = s;
+
+	while (*s) {
+		size_t c_size, c_size2;
+		codepoint_t c = next_codepoint(iconv_convenience, s, &c_size);
+		c_size2 = push_codepoint(iconv_convenience, d, tolower_w(c));
+		if (c_size2 > c_size) {
+			DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strlower_m\n",
+				 c, tolower_w(c), (int)c_size, (int)c_size2));
+			smb_panic("codepoint expansion in strlower_m\n");
+		}
+		s += c_size;
+		d += c_size2;
+	}
+	*d = 0;
+}
+
+/**
+ Convert a string to UPPER case.
+**/
+_PUBLIC_ void strupper_m(char *s)
+{
+	char *d;
+	struct smb_iconv_convenience *iconv_convenience;
+
+	/* this is quite a common operation, so we want it to be
+	   fast. We optimise for the ascii case, knowing that all our
+	   supported multi-byte character sets are ascii-compatible
+	   (ie. they match for the first 128 chars) */
+	while (*s && !(((uint8_t)*s) & 0x80)) {
+		*s = toupper((uint8_t)*s);
+		s++;
+	}
+
+	if (!*s)
+		return;
+
+	iconv_convenience = lp_iconv_convenience(global_loadparm);
+
+	d = s;
+
+	while (*s) {
+		size_t c_size, c_size2;
+		codepoint_t c = next_codepoint(iconv_convenience, s, &c_size);
+		c_size2 = push_codepoint(iconv_convenience, d, toupper_w(c));
+		if (c_size2 > c_size) {
+			DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strupper_m\n",
+				 c, toupper_w(c), (int)c_size, (int)c_size2));
+			smb_panic("codepoint expansion in strupper_m\n");
+		}
+		s += c_size;
+		d += c_size2;
+	}
+	*d = 0;
+}
+
+
+/**
+ Find the number of 'c' chars in a string
+**/
+_PUBLIC_ size_t count_chars_w(const char *s, char c)
+{
+	size_t count = 0;
+
+	while (*s) {
+		size_t size;
+		codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
+		if (c2 == c) count++;
+		s += size;
+	}
+
+	return count;
+}
+
+
author	Jelmer Vernooij <jelmer@samba.org>	2008-10-24 00:06:35 +0200
committer	Jelmer Vernooij <jelmer@samba.org>	2008-10-24 00:06:35 +0200
commit	09a63accb8863a883ef8aba3a44776ac92ae5764 (patch)
tree	1181f77950c4674f80b61421354d199d30ede6bd /lib/util/charset/util_unistr.c
parent	8baf1695492300fd8c2ce7e4f106a14f510357e5 (diff)
download	samba-09a63accb8863a883ef8aba3a44776ac92ae5764.tar.gz samba-09a63accb8863a883ef8aba3a44776ac92ae5764.tar.bz2 samba-09a63accb8863a883ef8aba3a44776ac92ae5764.zip