diff options
author | Michael Adam <obnox@samba.org> | 2010-10-30 02:03:02 +0200 |
---|---|---|
committer | Michael Adam <obnox@samba.org> | 2010-11-03 22:45:19 +0000 |
commit | 288d55b511bcfd5ed3d6b464046808ce82aedaec (patch) | |
tree | 7d1afb502a4470e1b38f7c94d266ba5fa115dda7 | |
parent | 4579d7ea2920bda8de2339c6ecd190e65454fa43 (diff) | |
download | samba-288d55b511bcfd5ed3d6b464046808ce82aedaec.tar.gz samba-288d55b511bcfd5ed3d6b464046808ce82aedaec.tar.bz2 samba-288d55b511bcfd5ed3d6b464046808ce82aedaec.zip |
s3:lib/util_str: add strlen_m_ext() that takes input and output charset
The function calculates the number of units (8 or 16-bit, depending
on the destination charset), that would be needed to convert the
input string which is expected to be in src_charset encoding
to the dst_charset (which should be a unicode charset).
-rw-r--r-- | source3/include/proto.h | 2 | ||||
-rw-r--r-- | source3/lib/util_str.c | 68 |
2 files changed, 57 insertions, 13 deletions
diff --git a/source3/include/proto.h b/source3/include/proto.h index 89aa623314..5e88476e44 100644 --- a/source3/include/proto.h +++ b/source3/include/proto.h @@ -1433,6 +1433,8 @@ char *strnrchr_m(const char *s, char c, unsigned int n); char *strstr_m(const char *src, const char *findstr); void strlower_m(char *s); void strupper_m(char *s); +size_t strlen_m_ext(const char *s, const charset_t src_charset, + const charset_t dst_charset); size_t strlen_m(const char *s); size_t strlen_m_term(const char *s); size_t strlen_m_term_null(const char *s); diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c index 4e16fb3a45..27147365cd 100644 --- a/source3/lib/util_str.c +++ b/source3/lib/util_str.c @@ -1454,14 +1454,14 @@ void strupper_m(char *s) } /** - * Calculate the number of 16-bit units that would be needed to convert - * the input string which is expected to be in CH_UNIX encoding to UTF16. - * - * This will be the same as the number of bytes in a string for single - * byte strings, but will be different for multibyte. + * Calculate the number of units (8 or 16-bit, depending on the + * destination charset), that would be needed to convert the input + * string which is expected to be in in src_charset encoding to the + * destination charset (which should be a unicode charset). */ -size_t strlen_m(const char *s) +size_t strlen_m_ext(const char *s, const charset_t src_charset, + const charset_t dst_charset) { size_t count = 0; @@ -1480,21 +1480,63 @@ size_t strlen_m(const char *s) while (*s) { size_t c_size; - codepoint_t c = next_codepoint(s, &c_size); - if (c < 0x10000) { - /* Unicode char fits into 16 bits. */ + codepoint_t c = next_codepoint_ext(s, src_charset, &c_size); + s += c_size; + + switch (dst_charset) { + case CH_UTF16LE: + case CH_UTF16BE: + case CH_UTF16MUNGED: + if (c < 0x10000) { + /* Unicode char fits into 16 bits. */ + count += 1; + } else { + /* Double-width unicode char - 32 bits. 
*/ + count += 2; + } + break; + case CH_UTF8: + /* + * this only checks ranges, and does not + * check for invalid codepoints + */ + if (c < 0x80) { + count += 1; + } else if (c < 0x800) { + count += 2; + } else if (c < 0x1000) { + count += 3; + } else { + count += 4; + } + break; + default: + /* + * non-unicode encoding: + * assume that each codepoint fits into + * one unit in the destination encoding. + */ count += 1; - } else { - /* Double-width unicode char - 32 bits. */ - count += 2; } - s += c_size; } return count; } /** + * Calculate the number of 16-bit units that would bee needed to convert + * the input string which is expected to be in CH_UNIX encoding to UTF16. + * + * This will be the same as the number of bytes in a string for single + * byte strings, but will be different for multibyte. + */ + +size_t strlen_m(const char *s) +{ + return strlen_m_ext(s, CH_UNIX, CH_UTF16LE); +} + +/** Count the number of UCS2 characters in a string including the null terminator. **/ |