From 41051fd3d3ac7450771518aa12b660867ed7e819 Mon Sep 17 00:00:00 2001 From: Andrew Bartlett Date: Fri, 18 Mar 2011 19:10:23 +1100 Subject: lib/util: Merge basic string length and comparison functions These functions now use the codepoints for more accurate string handling and now form common code. Andrew Bartlett Autobuild-User: Andrew Bartlett Autobuild-Date: Wed Mar 23 08:21:54 CET 2011 on sn-devel-104 --- lib/util/charset/charset.h | 2 +- lib/util/charset/tests/charset.c | 16 +- lib/util/charset/util_str.c | 416 +++++++++++++++++++++++++++++++++++++++ lib/util/charset/util_unistr.c | 334 +------------------------------ lib/util/charset/wscript_build | 2 +- 5 files changed, 427 insertions(+), 343 deletions(-) create mode 100644 lib/util/charset/util_str.c (limited to 'lib') diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h index 474d77e54e..943bfa4695 100644 --- a/lib/util/charset/charset.h +++ b/lib/util/charset/charset.h @@ -128,7 +128,7 @@ size_t strlen_m_term_null(const char *s); size_t strlen_m(const char *s); char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength); void string_replace_m(char *s, char oldc, char newc); -bool strcsequal_m(const char *s1,const char *s2); +bool strcsequal(const char *s1,const char *s2); bool strequal_m(const char *s1, const char *s2); int strncasecmp_m(const char *s1, const char *s2, size_t n); bool next_token(const char **ptr,char *buff, const char *sep, size_t bufsize); diff --git a/lib/util/charset/tests/charset.c b/lib/util/charset/tests/charset.c index 72fd11b128..351b91c7b7 100644 --- a/lib/util/charset/tests/charset.c +++ b/lib/util/charset/tests/charset.c @@ -69,14 +69,14 @@ static bool test_strequal_m(struct torture_context *tctx) return true; } -static bool test_strcsequal_m(struct torture_context *tctx) +static bool test_strcsequal(struct torture_context *tctx) { - torture_assert(tctx, !strcsequal_m("foo", "bar"), "different strings"); - torture_assert(tctx, 
strcsequal_m("foo", "foo"), "same case strings"); - torture_assert(tctx, !strcsequal_m("foo", "Foo"), "different case strings"); - torture_assert(tctx, !strcsequal_m(NULL, "Foo"), "one NULL"); - torture_assert(tctx, !strcsequal_m("foo", NULL), "other NULL"); - torture_assert(tctx, strcsequal_m(NULL, NULL), "both NULL"); + torture_assert(tctx, !strcsequal("foo", "bar"), "different strings"); + torture_assert(tctx, strcsequal("foo", "foo"), "same case strings"); + torture_assert(tctx, !strcsequal("foo", "Foo"), "different case strings"); + torture_assert(tctx, !strcsequal(NULL, "Foo"), "one NULL"); + torture_assert(tctx, !strcsequal("foo", NULL), "other NULL"); + torture_assert(tctx, strcsequal(NULL, NULL), "both NULL"); return true; } @@ -253,7 +253,7 @@ struct torture_suite *torture_local_charset(TALLOC_CTX *mem_ctx) torture_suite_add_simple_test(suite, "codepoint_cmpi", test_codepoint_cmpi); torture_suite_add_simple_test(suite, "strcasecmp_m", test_strcasecmp_m); torture_suite_add_simple_test(suite, "strequal_m", test_strequal_m); - torture_suite_add_simple_test(suite, "strcsequal_m", test_strcsequal_m); + torture_suite_add_simple_test(suite, "strcsequal", test_strcsequal); torture_suite_add_simple_test(suite, "string_replace_m", test_string_replace_m); torture_suite_add_simple_test(suite, "strncasecmp_m", test_strncasecmp_m); torture_suite_add_simple_test(suite, "next_token", test_next_token); diff --git a/lib/util/charset/util_str.c b/lib/util/charset/util_str.c new file mode 100644 index 0000000000..597b031675 --- /dev/null +++ b/lib/util/charset/util_str.c @@ -0,0 +1,416 @@ +/* + Unix SMB/CIFS implementation. 
+   Samba utility functions
+   Copyright (C) Andrew Tridgell 1992-2001
+   Copyright (C) Simo Sorce 2001
+   Copyright (C) Andrew Bartlett 2011
+   Copyright (C) Jeremy Allison 1992-2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/locale.h"
+
+#ifdef strcasecmp
+#undef strcasecmp
+#endif
+
+/* strncasecmp_m() below needs the libc strncasecmp() for its fallback path */
+#ifdef strncasecmp
+#undef strncasecmp
+#endif
+
+/**
+ Case insensitive string comparison
+
+ Both strings are decoded one codepoint at a time and compared after
+ folding with toupper_m().
+
+ @param s1 first string, may be NULL
+ @param s2 second string, may be NULL
+ @return 0 if the strings are equal ignoring case, a negative value if
+         s1 sorts before s2 and a positive value otherwise. A NULL
+         pointer sorts before any non-NULL string and equal to NULL.
+**/
+_PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
+{
+	codepoint_t c1=0, c2=0;
+	codepoint_t u1, u2;
+	size_t size1, size2;
+	struct smb_iconv_convenience *iconv_convenience = get_iconv_convenience();
+
+	/* handle null ptr comparisons to simplify the use in qsort */
+	if (s1 == s2) return 0;
+	if (s1 == NULL) return -1;
+	if (s2 == NULL) return 1;
+
+	while (*s1 && *s2) {
+		c1 = next_codepoint_convenience(iconv_convenience, s1, &size1);
+		c2 = next_codepoint_convenience(iconv_convenience, s2, &size2);
+
+		if (c1 == INVALID_CODEPOINT ||
+		    c2 == INVALID_CODEPOINT) {
+			/*
+			 * What else can we do?? Fall back to a byte-wise
+			 * comparison, but start it at the sequence that
+			 * failed to decode: s1/s2 must not be advanced
+			 * first, or the bytes that actually differ are
+			 * skipped and unequal strings compare as equal.
+			 */
+			return strcasecmp(s1, s2);
+		}
+
+		s1 += size1;
+		s2 += size2;
+
+		if (c1 == c2) {
+			continue;
+		}
+
+		u1 = toupper_m(c1);
+		u2 = toupper_m(c2);
+		if (u1 != u2) {
+			/*
+			 * Order by the folded codepoints. Using the raw
+			 * c1/c2 here would make "a" sort after "B" while
+			 * "A" sorts before it, which is not a consistent
+			 * ordering for qsort.
+			 */
+			return (u1 < u2) ? -1 : 1;
+		}
+	}
+
+	/*
+	 * At least one string is exhausted. Compare as unsigned char,
+	 * like strcmp() does, so a trailing high byte does not flip the
+	 * sign on platforms where plain char is signed.
+	 */
+	return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
+}
+
+/**
+ Case insensitive string comparison, length limited
+
+ @param s1 first string, may be NULL
+ @param s2 second string, may be NULL
+ @param n  maximum number of characters (codepoints, not bytes) to
+           compare
+ @return 0 if the first n characters are equal ignoring case, a
+         negative value if s1 sorts before s2 and a positive value
+         otherwise. NULL handling is the same as in strcasecmp_m().
+**/
+_PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
+{
+	codepoint_t c1=0, c2=0;
+	codepoint_t u1, u2;
+	size_t size1, size2;
+	struct smb_iconv_convenience *iconv_convenience = get_iconv_convenience();
+
+	/* handle null ptr comparisons to simplify the use in qsort */
+	if (s1 == s2) return 0;
+	if (s1 == NULL) return -1;
+	if (s2 == NULL) return 1;
+
+	while (*s1 && *s2 && n) {
+		n--;
+
+		c1 = next_codepoint_convenience(iconv_convenience, s1, &size1);
+		c2 = next_codepoint_convenience(iconv_convenience, s2, &size2);
+
+		if (c1 == INVALID_CODEPOINT ||
+		    c2 == INVALID_CODEPOINT) {
+			/*
+			 * What else can we do?? Fall back to a byte-wise
+			 * comparison from the sequence that failed to
+			 * decode. n counts characters, so turn what is
+			 * left into a byte limit: the current sequence
+			 * (size1 bytes) plus at least one byte for each
+			 * remaining character. This never compares
+			 * further than the character limit allows.
+			 */
+			return strncasecmp(s1, s2, n + size1);
+		}
+
+		s1 += size1;
+		s2 += size2;
+
+		if (c1 == c2) {
+			continue;
+		}
+
+		u1 = toupper_m(c1);
+		u2 = toupper_m(c2);
+		if (u1 != u2) {
+			/* order by the folded codepoints, see strcasecmp_m() */
+			return (u1 < u2) ? -1 : 1;
+		}
+	}
+
+	if (n == 0) {
+		return 0;
+	}
+
+	/* compare as unsigned char, like strcmp() does */
+	return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
+}
+
+/**
+ * Compare 2 strings.
+ *
+ * @note The comparison is case-insensitive.
+ *
+ * @return true if strcasecmp_m() reports the strings as equal
+ **/
+_PUBLIC_ bool strequal_m(const char *s1, const char *s2)
+{
+	return strcasecmp_m(s1,s2) == 0;
+}
+
+/**
+ Compare 2 strings (case sensitive).
+
+ @return true if both pointers are identical (including both NULL) or
+         the strings are byte-wise equal; false if exactly one of them
+         is NULL or the strings differ.
+**/
+_PUBLIC_ bool strcsequal(const char *s1,const char *s2)
+{
+	if (s1 == s2)
+		return true;
+	if (!s1 || !s2)
+		return false;
+
+	return strcmp(s1,s2) == 0;
+}
+
+/**
+ * Calculate the number of units (8 or 16-bit, depending on the
+ * destination charset), that would be needed to convert the input
+ * string which is expected to be in src_charset encoding to the
+ * destination charset (which should be a unicode charset).
+ *
+ * A NULL input yields 0. The terminator is not counted.
+ */
+_PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
+{
+	size_t count = 0;
+	struct smb_iconv_convenience *ic = get_iconv_convenience();
+
+	if (!s) {
+		return 0;
+	}
+
+	/*
+	 * Fast path: bytes without the high bit set are counted as one
+	 * unit each, without going through the codepoint decoder.
+	 */
+	while (*s && !(((uint8_t)*s) & 0x80)) {
+		s++;
+		count++;
+	}
+
+	if (!*s) {
+		return count;
+	}
+
+	/* Slow path: decode the rest one codepoint at a time. */
+	while (*s) {
+		size_t c_size;
+		codepoint_t c = next_codepoint_convenience_ext(ic, s, src_charset, &c_size);
+		s += c_size;
+
+		switch (dst_charset) {
+		case CH_UTF16LE:
+		case CH_UTF16BE:
+		case CH_UTF16MUNGED:
+			if (c < 0x10000) {
+				/* Unicode char fits into 16 bits. */
+				count += 1;
+			} else {
+				/* Double-width unicode char - 32 bits. */
+				count += 2;
+			}
+			break;
+		case CH_UTF8:
+			/*
+			 * this only checks ranges, and does not
+			 * check for invalid codepoints
+			 *
+			 * UTF-8 lengths: up to U+007F is 1 byte,
+			 * up to U+07FF is 2, up to U+FFFF is 3,
+			 * everything above is 4.
+			 */
+			if (c < 0x80) {
+				count += 1;
+			} else if (c < 0x800) {
+				count += 2;
+			} else if (c < 0x10000) {
+				count += 3;
+			} else {
+				count += 4;
+			}
+			break;
+		default:
+			/*
+			 * non-unicode encoding:
+			 * assume that each codepoint fits into
+			 * one unit in the destination encoding.
+			 */
+			count += 1;
+		}
+	}
+
+	return count;
+}
+
+/**
+ * Same as strlen_m_ext(), but with one extra unit for the terminator.
+ *
+ * A NULL input yields 0.
+ */
+_PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
+				  const charset_t dst_charset)
+{
+	if (!s) {
+		return 0;
+	}
+	return strlen_m_ext(s, src_charset, dst_charset) + 1;
+}
+
+/**
+ * Calculate the number of 16-bit units that would be needed to convert
+ * the input string which is expected to be in CH_UNIX encoding to UTF16.
+ *
+ * This will be the same as the number of bytes in a string for single
+ * byte strings, but will be different for multibyte.
+ */
+_PUBLIC_ size_t strlen_m(const char *s)
+{
+	return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
+}
+
+/**
+ Work out the number of multibyte chars in a string, including the NULL
+ terminator.
+**/
+_PUBLIC_ size_t strlen_m_term(const char *s)
+{
+	if (!s) {
+		return 0;
+	}
+
+	return strlen_m(s) + 1;
+}
+
+/*
+ * Weird helper routine for the winreg pipe: If nothing is around, return 0,
+ * if a string is there, include the terminator.
+ *
+ * "Nothing" means a NULL pointer or a string for which strlen_m() is 0.
+ */
+
+_PUBLIC_ size_t strlen_m_term_null(const char *s)
+{
+	size_t len;
+	if (!s) {
+		return 0;
+	}
+	len = strlen_m(s);
+	if (len == 0) {
+		return 0;
+	}
+
+	return len+1;
+}
+
+/**
+ Strchr and strrchr_m are a bit complex on general multi-byte strings.
+
+ @param src string to search, may be NULL
+ @param c   character to look for
+ @return pointer to the first occurrence of c in src, or NULL if src is
+         NULL or c was not found
+**/
+_PUBLIC_ char *strchr_m(const char *src, char c)
+{
+	const char *s;
+	struct smb_iconv_convenience *ic = get_iconv_convenience();
+	if (src == NULL) {
+		return NULL;
+	}
+	/* characters up to 0x3F are guaranteed to not appear in
+	   non-initial position in multi-byte charsets */
+	if ((c & 0xC0) == 0) {
+		return strchr(src, c);
+	}
+
+	/* this is quite a common operation, so we want it to be
+	   fast. We optimise for the ascii case, knowing that all our
+	   supported multi-byte character sets are ascii-compatible
+	   (ie. they match for the first 128 chars) */
+
+	for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
+		if (*s == c)
+			return (char *)s;
+	}
+
+	if (!*s)
+		return NULL;
+
+#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
+	/* With compose characters we must restart from the beginning. JRA. */
+	s = src;
+#endif
+
+	/* a byte with the high bit set was seen: decode codepoint by codepoint */
+	while (*s) {
+		size_t size;
+		codepoint_t c2 = next_codepoint_convenience(ic, s, &size);
+		if (c2 == c) {
+			return discard_const_p(char, s);
+		}
+		s += size;
+	}
+
+	return NULL;
+}
+
+/**
+ * Multibyte-character version of strrchr
+ *
+ * @param s string to search, may be NULL
+ * @param c character to look for
+ * @return pointer to the last occurrence of c in s, or NULL if s is
+ *         NULL or c was not found
+ */
+_PUBLIC_ char *strrchr_m(const char *s, char c)
+{
+	struct smb_iconv_convenience *ic = get_iconv_convenience();
+	char *ret = NULL;
+
+	if (s == NULL) {
+		return NULL;
+	}
+
+	/* characters up to 0x3F are guaranteed to not appear in
+	   non-initial position in multi-byte charsets */
+	if ((c & 0xC0) == 0) {
+		return strrchr(s, c);
+	}
+
+	/* this is quite a common operation, so we want it to be
+	   fast. We optimise for the ascii case, knowing that all our
+	   supported multi-byte character sets are ascii-compatible
+	   (ie. they match for the first 128 chars). Also, in Samba
+	   we only search for ascii characters in 'c' and that
+	   in all mb character sets with a compound character
+	   containing c, if 'c' is not a match at position
+	   p, then p[-1] > 0x7f. JRA. */
+
+	{
+		size_t len = strlen(s);
+		const char *cp = s;
+		bool got_mb = false;
+
+		if (len == 0)
+			return NULL;
+		cp += (len - 1);
+		/*
+		 * Walk backwards from the last byte. The start of the
+		 * string is tested before cp is decremented, so no
+		 * pointer before s is ever formed.
+		 */
+		for (;;) {
+			if (c == *cp) {
+				/* Could be a match. Part of a multibyte ? */
+				if ((cp > s) &&
+					(((unsigned char)cp[-1]) & 0x80)) {
+					/* Yep - go slow :-( */
+					got_mb = true;
+					break;
+				}
+				/* No - we have a match ! */
+				return (char *)cp;
+			}
+			if (cp == s) {
+				break;
+			}
+			cp--;
+		}
+		if (!got_mb)
+			return NULL;
+	}
+
+	/* slow path: decode forwards and remember the last match */
+	while (*s) {
+		size_t size;
+		codepoint_t c2 = next_codepoint_convenience(ic, s, &size);
+		if (c2 == c) {
+			ret = discard_const_p(char, s);
+		}
+		s += size;
+	}
+
+	return ret;
+}
+
+/**
+  return True if any (multi-byte) character is lower case
+
+  A character counts as lower case when toupper_m() changes it.
+*/
+_PUBLIC_ bool strhaslower(const char *string)
+{
+	struct smb_iconv_convenience *ic = get_iconv_convenience();
+	while (*string) {
+		size_t c_size;
+		codepoint_t s;
+		codepoint_t t;
+
+		s = next_codepoint_convenience(ic, string, &c_size);
+		string += c_size;
+
+		t = toupper_m(s);
+
+		if (s != t) {
+			return true; /* that means it has lower case chars */
+		}
+	}
+
+	return false;
+}
+
+/**
+  return True if any (multi-byte) character is upper case
+
+  A character counts as upper case when tolower_m() changes it.
+*/
+_PUBLIC_ bool strhasupper(const char *string)
+{
+	struct smb_iconv_convenience *ic = get_iconv_convenience();
+	while (*string) {
+		size_t c_size;
+		codepoint_t s;
+		codepoint_t t;
+
+		s = next_codepoint_convenience(ic, string, &c_size);
+		string += c_size;
+
+		t = tolower_m(s);
+
+		if (s != t) {
+			return true; /* that means it has upper case chars */
+		}
+	}
+
+	return false;
+}
+
diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c
index b6bfb29e7d..ad2ba687f4 100644
--- 
a/lib/util/charset/util_unistr.c +++ b/lib/util/charset/util_unistr.c @@ -21,45 +21,6 @@ #include "includes.h" #include "system/locale.h" -/** - Case insensitive string compararison -**/ -_PUBLIC_ int strcasecmp_m(const char *s1, const char *s2) -{ - codepoint_t c1=0, c2=0; - size_t size1, size2; - struct smb_iconv_convenience *iconv_convenience = get_iconv_convenience(); - - /* handle null ptr comparisons to simplify the use in qsort */ - if (s1 == s2) return 0; - if (s1 == NULL) return -1; - if (s2 == NULL) return 1; - - while (*s1 && *s2) { - c1 = next_codepoint_convenience(iconv_convenience, s1, &size1); - c2 = next_codepoint_convenience(iconv_convenience, s2, &size2); - - s1 += size1; - s2 += size2; - - if (c1 == c2) { - continue; - } - - if (c1 == INVALID_CODEPOINT || - c2 == INVALID_CODEPOINT) { - /* what else can we do?? */ - return strcasecmp(s1, s2); - } - - if (toupper_m(c1) != toupper_m(c2)) { - return c1 - c2; - } - } - - return *s1 - *s2; -} - /** * Get the next token from a string, return False if none found. * Handles double-quotes. @@ -106,74 +67,6 @@ _PUBLIC_ bool next_token(const char **ptr,char *buff, const char *sep, size_t bu return true; } -/** - Case insensitive string compararison, length limited -**/ -_PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n) -{ - codepoint_t c1=0, c2=0; - size_t size1, size2; - struct smb_iconv_convenience *iconv_convenience = get_iconv_convenience(); - - /* handle null ptr comparisons to simplify the use in qsort */ - if (s1 == s2) return 0; - if (s1 == NULL) return -1; - if (s2 == NULL) return 1; - - while (*s1 && *s2 && n) { - n--; - - c1 = next_codepoint_convenience(iconv_convenience, s1, &size1); - c2 = next_codepoint_convenience(iconv_convenience, s2, &size2); - - s1 += size1; - s2 += size2; - - if (c1 == c2) { - continue; - } - - if (c1 == INVALID_CODEPOINT || - c2 == INVALID_CODEPOINT) { - /* what else can we do?? 
*/ - return strcasecmp(s1, s2); - } - - if (toupper_m(c1) != toupper_m(c2)) { - return c1 - c2; - } - } - - if (n == 0) { - return 0; - } - - return *s1 - *s2; -} - -/** - * Compare 2 strings. - * - * @note The comparison is case-insensitive. - **/ -_PUBLIC_ bool strequal_m(const char *s1, const char *s2) -{ - return strcasecmp_m(s1,s2) == 0; -} - -/** - Compare 2 strings (case sensitive). -**/ -_PUBLIC_ bool strcsequal_m(const char *s1,const char *s2) -{ - if (s1 == s2) - return true; - if (!s1 || !s2) - return false; - - return strcmp(s1,s2) == 0; -} - /** String replace. @@ -238,231 +131,6 @@ _PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_ return dest; } -/** - * Calculate the number of units (8 or 16-bit, depending on the - * destination charset), that would be needed to convert the input - * string which is expected to be in in src_charset encoding to the - * destination charset (which should be a unicode charset). - */ -_PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset) -{ - size_t count = 0; - struct smb_iconv_convenience *ic = get_iconv_convenience(); - - if (!s) { - return 0; - } - - while (*s && !(((uint8_t)*s) & 0x80)) { - s++; - count++; - } - - if (!*s) { - return count; - } - - while (*s) { - size_t c_size; - codepoint_t c = next_codepoint_convenience_ext(ic, s, src_charset, &c_size); - s += c_size; - - switch (dst_charset) { - case CH_UTF16LE: - case CH_UTF16BE: - case CH_UTF16MUNGED: - if (c < 0x10000) { - count += 1; - } else { - count += 2; - } - break; - case CH_UTF8: - /* - * this only checks ranges, and does not - * check for invalid codepoints - */ - if (c < 0x80) { - count += 1; - } else if (c < 0x800) { - count += 2; - } else if (c < 0x1000) { - count += 3; - } else { - count += 4; - } - break; - default: - /* - * non-unicode encoding: - * assume that each codepoint fits into - * one unit in the destination encoding. 
- */ - count += 1; - } - } - - return count; -} - -_PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset, - const charset_t dst_charset) -{ - if (!s) { - return 0; - } - return strlen_m_ext(s, src_charset, dst_charset) + 1; -} - -/** - * Calculate the number of 16-bit units that would be needed to convert - * the input string which is expected to be in CH_UNIX encoding to UTF16. - * - * This will be the same as the number of bytes in a string for single - * byte strings, but will be different for multibyte. - */ -_PUBLIC_ size_t strlen_m(const char *s) -{ - return strlen_m_ext(s, CH_UNIX, CH_UTF16LE); -} - -/** - Work out the number of multibyte chars in a string, including the NULL - terminator. -**/ -_PUBLIC_ size_t strlen_m_term(const char *s) -{ - if (!s) { - return 0; - } - - return strlen_m(s) + 1; -} - -/* - * Weird helper routine for the winreg pipe: If nothing is around, return 0, - * if a string is there, include the terminator. - */ - -_PUBLIC_ size_t strlen_m_term_null(const char *s) -{ - size_t len; - if (!s) { - return 0; - } - len = strlen_m(s); - if (len == 0) { - return 0; - } - - return len+1; -} - -/** - Strchr and strrchr_m are a bit complex on general multi-byte strings. 
-**/ -_PUBLIC_ char *strchr_m(const char *s, char c) -{ - struct smb_iconv_convenience *ic = get_iconv_convenience(); - if (s == NULL) { - return NULL; - } - /* characters below 0x3F are guaranteed to not appear in - non-initial position in multi-byte charsets */ - if ((c & 0xC0) == 0) { - return strchr(s, c); - } - - while (*s) { - size_t size; - codepoint_t c2 = next_codepoint_convenience(ic, s, &size); - if (c2 == c) { - return discard_const_p(char, s); - } - s += size; - } - - return NULL; -} - -/** - * Multibyte-character version of strrchr - */ -_PUBLIC_ char *strrchr_m(const char *s, char c) -{ - struct smb_iconv_convenience *ic = get_iconv_convenience(); - char *ret = NULL; - - if (s == NULL) { - return NULL; - } - - /* characters below 0x3F are guaranteed to not appear in - non-initial position in multi-byte charsets */ - if ((c & 0xC0) == 0) { - return strrchr(s, c); - } - - while (*s) { - size_t size; - codepoint_t c2 = next_codepoint_convenience(ic, s, &size); - if (c2 == c) { - ret = discard_const_p(char, s); - } - s += size; - } - - return ret; -} - -/** - return True if any (multi-byte) character is lower case -*/ -_PUBLIC_ bool strhaslower(const char *string) -{ - struct smb_iconv_convenience *ic = get_iconv_convenience(); - while (*string) { - size_t c_size; - codepoint_t s; - codepoint_t t; - - s = next_codepoint_convenience(ic, string, &c_size); - string += c_size; - - t = toupper_m(s); - - if (s != t) { - return true; /* that means it has lower case chars */ - } - } - - return false; -} - -/** - return True if any (multi-byte) character is upper case -*/ -_PUBLIC_ bool strhasupper(const char *string) -{ - struct smb_iconv_convenience *ic = get_iconv_convenience(); - while (*string) { - size_t c_size; - codepoint_t s; - codepoint_t t; - - s = next_codepoint_convenience(ic, string, &c_size); - string += c_size; - - t = tolower_m(s); - - if (s != t) { - return true; /* that means it has upper case chars */ - } - } - - return false; -} - /** Convert 
a string to lower case, allocated with talloc **/ @@ -517,7 +185,7 @@ _PUBLIC_ char *strupper_talloc_n(TALLOC_CTX *ctx, const char *src, size_t n) size_t size=0; char *dest; struct smb_iconv_convenience *iconv_convenience = get_iconv_convenience(); - + if (!src) { return NULL; } diff --git a/lib/util/charset/wscript_build b/lib/util/charset/wscript_build index ab7cfc412d..a245ef1b0c 100644 --- a/lib/util/charset/wscript_build +++ b/lib/util/charset/wscript_build @@ -13,6 +13,6 @@ bld.SAMBA_SUBSYSTEM('ICONV_WRAPPER', public_deps='iconv replace talloc') bld.SAMBA_SUBSYSTEM('CODEPOINTS', - source='codepoints.c', + source='codepoints.c util_str.c', deps='DYNCONFIG ICONV_WRAPPER' ) -- cgit