summaryrefslogtreecommitdiff
path: root/source4/lib/util_unistr.c
diff options
context:
space:
mode:
author: Andrew Tridgell <tridge@samba.org> 2004-10-08 08:13:00 +0000
committer: Gerald (Jerry) Carter <jerry@samba.org> 2007-10-10 12:59:39 -0500
commit: 7d32679e9683c81aca538f0267684332a28a286f (patch)
tree: 445aecfad24e8dab1fe7a200904a712212fa7091 /source4/lib/util_unistr.c
parent: 48f960ab47707ca24898834da4da440d1f7fb0d9 (diff)
downloadsamba-7d32679e9683c81aca538f0267684332a28a286f.tar.gz
samba-7d32679e9683c81aca538f0267684332a28a286f.tar.bz2
samba-7d32679e9683c81aca538f0267684332a28a286f.zip
r2857: this commit gets rid of smb_ucs2_t, wpstring and fpstring, plus lots of associated functions.
The motivation for this change was to avoid having to convert to/from ucs2 strings for so many operations. Doing that was slow, used many static buffers, and was also incorrect as it didn't cope properly with unicode codepoints above 65535 (which could not be represented correctly as smb_ucs2_t chars).

The two core functions that allowed this change are next_codepoint() and push_codepoint(). These functions allow you to correctly walk an arbitrary multi-byte string a character at a time without converting the whole string to ucs2.

While doing this cleanup I also fixed several ucs2 string handling bugs. See the commit for details.

The following code (which counts the number of occurrences of 'c' in a string) shows how to use the new interface:

    size_t count_chars(const char *s, char c)
    {
        size_t count = 0;
        while (*s) {
            size_t size;
            codepoint_t c2 = next_codepoint(s, &size);
            if (c2 == c) count++;
            s += size;
        }
        return count;
    }

(This used to be commit 814881f0e50019196b3aa9fbe4aeadbb98172040)
Diffstat (limited to 'source4/lib/util_unistr.c')
-rw-r--r-- source4/lib/util_unistr.c 225
1 files changed, 56 insertions, 169 deletions
diff --git a/source4/lib/util_unistr.c b/source4/lib/util_unistr.c
index 713f50708b..8b7504986f 100644
--- a/source4/lib/util_unistr.c
+++ b/source4/lib/util_unistr.c
@@ -21,16 +21,16 @@
#include "includes.h"
-/* these 3 tables define the unicode case handling. They are loaded
+/* these 2 tables define the unicode case handling. They are loaded
at startup either via mmap() or read() from the lib directory */
-static smb_ucs2_t *upcase_table;
-static smb_ucs2_t *lowcase_table;
+static void *upcase_table;
+static void *lowcase_table;
/*******************************************************************
load the case handling tables
********************************************************************/
-void load_case_tables(void)
+static void load_case_tables(void)
{
static int initialised;
int i;
@@ -56,14 +56,10 @@ void load_case_tables(void)
smb_panic("No memory for upcase tables");
}
for (i=0;i<0x10000;i++) {
- smb_ucs2_t v;
- SSVAL(&v, 0, i);
- upcase_table[v] = i;
+ SSVAL(upcase_table, i*2, i);
}
for (i=0;i<256;i++) {
- smb_ucs2_t v;
- SSVAL(&v, 0, UCS2_CHAR(i));
- upcase_table[v] = UCS2_CHAR(islower(i)?toupper(i):i);
+ SSVAL(upcase_table, i*2, islower(i)?toupper(i):i);
}
}
@@ -74,199 +70,79 @@ void load_case_tables(void)
smb_panic("No memory for lowcase tables");
}
for (i=0;i<0x10000;i++) {
- smb_ucs2_t v;
- SSVAL(&v, 0, i);
- lowcase_table[v] = i;
+ SSVAL(lowcase_table, i*2, i);
}
for (i=0;i<256;i++) {
- smb_ucs2_t v;
- SSVAL(&v, 0, UCS2_CHAR(i));
- lowcase_table[v] = UCS2_CHAR(isupper(i)?tolower(i):i);
+ SSVAL(lowcase_table, i*2, isupper(i)?tolower(i):i);
}
}
}
/*******************************************************************
- Convert a wchar to upper case.
+ Convert a codepoint_t to upper case.
********************************************************************/
-smb_ucs2_t toupper_w(smb_ucs2_t val)
+codepoint_t toupper_w(codepoint_t val)
{
- return upcase_table[SVAL(&val,0)];
-}
-
-/*******************************************************************
- Convert a wchar to lower case.
-********************************************************************/
-static smb_ucs2_t tolower_w( smb_ucs2_t val )
-{
- return lowcase_table[SVAL(&val,0)];
-
-}
-
-/*******************************************************************
-determine if a character is lowercase
-********************************************************************/
-BOOL islower_w(smb_ucs2_t c)
-{
- return upcase_table[SVAL(&c,0)] != c;
+ if (val & 0xFFFF0000) {
+ return val;
+ }
+ if (val < 128) {
+ return toupper(val);
+ }
+ if (upcase_table == NULL) {
+ load_case_tables();
+ }
+ return SVAL(upcase_table, val*2);
}
/*******************************************************************
-determine if a character is uppercase
+ Convert a codepoint_t to lower case.
********************************************************************/
-BOOL isupper_w(smb_ucs2_t c)
+codepoint_t tolower_w(codepoint_t val)
{
- return lowcase_table[SVAL(&c,0)] != c;
+ if (val & 0xFFFF0000) {
+ return val;
+ }
+ if (val < 128) {
+ return tolower(val);
+ }
+ if (lowcase_table == NULL) {
+ load_case_tables();
+ }
+ return SVAL(lowcase_table, val*2);
}
-
/*******************************************************************
- Count the number of characters in a smb_ucs2_t string.
+return the number of bytes occupied by a buffer in CH_UTF16 format
+the result includes the null termination
********************************************************************/
-size_t strlen_w(const smb_ucs2_t *src)
+size_t utf16_len(const void *buf)
{
size_t len;
- for (len = 0; SVAL(src,0); len++, src++) ;
+ for (len = 0; SVAL(buf,len); len += 2) ;
- return len;
+ return len + 2;
}
/*******************************************************************
- Count up to max number of characters in a smb_ucs2_t string.
+return the number of bytes occupied by a buffer in CH_UTF16 format
+the result includes the null termination
+limited by 'n' bytes
********************************************************************/
-size_t strnlen_w(const smb_ucs2_t *src, size_t max)
+size_t utf16_len_n(const void *src, size_t n)
{
size_t len;
- for (len = 0; (len < max) && SVAL(src, 0); len++, src++) ;
-
- return len;
-}
-
-/*******************************************************************
-wide strchr()
-********************************************************************/
-smb_ucs2_t *strchr_w(const smb_ucs2_t *s, smb_ucs2_t c)
-{
- while (*s != 0) {
- if (c == *s) return discard_const_p(smb_ucs2_t, s);
- s++;
- }
- if (c == *s) return discard_const_p(smb_ucs2_t, s);
-
- return NULL;
-}
-
-smb_ucs2_t *strchr_wa(const smb_ucs2_t *s, char c)
-{
- return strchr_w(s, UCS2_CHAR(c));
-}
-
-smb_ucs2_t *strrchr_w(const smb_ucs2_t *s, smb_ucs2_t c)
-{
- const smb_ucs2_t *p = s;
- int len = strlen_w(s);
- if (len == 0) return NULL;
- p += (len - 1);
- do {
- if (c == *p) return discard_const_p(smb_ucs2_t, p);
- } while (p-- != s);
- return NULL;
-}
-
-/*******************************************************************
- Convert a string to lower case.
- return True if any char is converted
-********************************************************************/
-BOOL strlower_w(smb_ucs2_t *s)
-{
- BOOL ret = False;
- while (*s) {
- smb_ucs2_t v = tolower_w(*s);
- if (v != *s) {
- *s = v;
- ret = True;
- }
- s++;
- }
- return ret;
-}
-
-/*******************************************************************
- Convert a string to upper case.
- return True if any char is converted
-********************************************************************/
-BOOL strupper_w(smb_ucs2_t *s)
-{
- BOOL ret = False;
- while (*s) {
- smb_ucs2_t v = toupper_w(*s);
- if (v != *s) {
- *s = v;
- ret = True;
- }
- s++;
- }
- return ret;
-}
-
-/*******************************************************************
-case insensitive string comparison
-********************************************************************/
-int strcasecmp_w(const smb_ucs2_t *a, const smb_ucs2_t *b)
-{
- while (*b && toupper_w(*a) == toupper_w(*b)) { a++; b++; }
- return (tolower_w(*a) - tolower_w(*b));
-}
-
-/*******************************************************************
-replace any occurence of oldc with newc in unicode string
-********************************************************************/
-
-void string_replace_w(smb_ucs2_t *s, smb_ucs2_t oldc, smb_ucs2_t newc)
-{
- for(;*s;s++) {
- if(*s==oldc) *s=newc;
- }
-}
-
-
-/*
- The *_wa() functions take a combination of 7 bit ascii
- and wide characters They are used so that you can use string
- functions combining C string constants with ucs2 strings
-
- The char* arguments must NOT be multibyte - to be completely sure
- of this only pass string constants */
+ for (len = 0; (len+2 < n) && SVAL(src, len); len += 2) ;
-
-void pstrcpy_wa(smb_ucs2_t *dest, const char *src)
-{
- int i;
- for (i=0;i<PSTRING_LEN;i++) {
- dest[i] = UCS2_CHAR(src[i]);
- if (src[i] == 0) return;
+ if (len+2 <= n) {
+ len += 2;
}
-}
-int strcmp_wa(const smb_ucs2_t *a, const char *b)
-{
- while (*b && *a == UCS2_CHAR(*b)) { a++; b++; }
- return (*a - UCS2_CHAR(*b));
+ return len;
}
-const smb_ucs2_t *strpbrk_wa(const smb_ucs2_t *s, const char *p)
-{
- while (*s != 0) {
- int i;
- for (i=0; p[i] && *s != UCS2_CHAR(p[i]); i++)
- ;
- if (p[i]) return s;
- s++;
- }
- return NULL;
-}
size_t ucs2_align(const void *base_ptr, const void *p, int flags)
{
@@ -275,3 +151,14 @@ size_t ucs2_align(const void *base_ptr, const void *p, int flags)
return PTR_DIFF(p, base_ptr) & 1;
}
+/*
+ compare two codepoints case insensitively
+*/
+int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
+{
+ if (c1 == c2 ||
+ toupper_w(c1) == toupper_w(c2)) {
+ return 0;
+ }
+ return c1 - c2;
+}