summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorMichael Adam <obnox@samba.org>2010-10-30 02:03:02 +0200
committerMichael Adam <obnox@samba.org>2010-11-03 22:45:20 +0000
commit82c8b31ebce2783e439399f662591b03ab5a1960 (patch)
treeb826ea1e69fcd262fe24310c0607bce550904b3d /lib
parent72f83368653832a7630e9fbda02e516e1aa7faff (diff)
downloadsamba-82c8b31ebce2783e439399f662591b03ab5a1960.tar.gz
samba-82c8b31ebce2783e439399f662591b03ab5a1960.tar.bz2
samba-82c8b31ebce2783e439399f662591b03ab5a1960.zip
lib/util/charset/util_unistr: add strlen_m_ext that takes input and output charset
The function calculates the number of units (8 or 16-bit, depending on the destination charset) that would be needed to convert the input string, which is expected to be in src_charset encoding, to the dst_charset (which should be a unicode charset).
Diffstat (limited to 'lib')
-rw-r--r--lib/util/charset/charset.h1
-rw-r--r--lib/util/charset/util_unistr.c60
2 files changed, 51 insertions, 10 deletions
diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h
index 283212dbff..8222a0586e 100644
--- a/lib/util/charset/charset.h
+++ b/lib/util/charset/charset.h
@@ -120,6 +120,7 @@ struct smb_iconv_convenience;
#define strupper(s) strupper_m(s)
char *strchr_m(const char *s, char c);
+size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset);
size_t strlen_m_term(const char *s);
size_t strlen_m_term_null(const char *s);
size_t strlen_m(const char *s);
diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c
index 79a9ffe3df..93fc24da15 100644
--- a/lib/util/charset/util_unistr.c
+++ b/lib/util/charset/util_unistr.c
@@ -249,11 +249,12 @@ _PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_
}
/**
- Count the number of UCS2 characters in a string. Normally this will
- be the same as the number of bytes in a string for single byte strings,
- but will be different for multibyte.
-**/
-_PUBLIC_ size_t strlen_m(const char *s)
+ * Calculate the number of units (8 or 16-bit, depending on the
+ * destination charset), that would be needed to convert the input
+ * string which is expected to be in src_charset encoding to the
+ * destination charset (which should be a unicode charset).
+ */
+_PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
{
size_t count = 0;
struct smb_iconv_convenience *ic = get_iconv_convenience();
@@ -273,19 +274,58 @@ _PUBLIC_ size_t strlen_m(const char *s)
while (*s) {
size_t c_size;
- codepoint_t c = next_codepoint_convenience(ic, s, &c_size);
- if (c < 0x10000) {
+ codepoint_t c = next_codepoint_convenience_ext(ic, s, src_charset, &c_size);
+ s += c_size;
+
+ switch (dst_charset) {
+ case CH_UTF16LE:
+ case CH_UTF16BE:
+ case CH_UTF16MUNGED:
+ if (c < 0x10000) {
+ count += 1;
+ } else {
+ count += 2;
+ }
+ break;
+ case CH_UTF8:
+ /*
+ * this only checks ranges, and does not
+ * check for invalid codepoints
+ */
+ if (c < 0x80) {
+ count += 1;
+ } else if (c < 0x800) {
+ count += 2;
+ } else if (c < 0x1000) {
+ count += 3;
+ } else {
+ count += 4;
+ }
+ break;
+ default:
+ /*
+ * non-unicode encoding:
+ * assume that each codepoint fits into
+ * one unit in the destination encoding.
+ */
count += 1;
- } else {
- count += 2;
}
- s += c_size;
}
return count;
}
/**
+ Count the UTF-16 code units needed to hold the string s, taken as CH_UNIX
+ text (delegates to strlen_m_ext with CH_UTF16LE as the destination).
+ Codepoints of 0x10000 and above count as two units, all others as one.
+**/
+_PUBLIC_ size_t strlen_m(const char *s)
+{
+ return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
+}
+
+/**
Work out the number of multibyte chars in a string, including the NULL
terminator.
**/