diff options
Diffstat (limited to 'source3/lib')
-rw-r--r-- | source3/lib/kanji.c | 400 | ||||
-rw-r--r-- | source3/lib/util.c | 96 |
2 files changed, 303 insertions, 193 deletions
diff --git a/source3/lib/kanji.c b/source3/lib/kanji.c index 04eecb5437..b85f0c9354 100644 --- a/source3/lib/kanji.c +++ b/source3/lib/kanji.c @@ -88,105 +88,110 @@ static char hex_tag = HEXTAG; /******************************************************************* SHIFT JIS functions ********************************************************************/ + /******************************************************************* search token from S1 separated any char of S2 S1 contains SHIFT JIS chars. ********************************************************************/ + static char *sj_strtok(char *s1, char *s2) { - static char *s = NULL; - char *q; - if (!s1) { - if (!s) { - return NULL; - } - s1 = s; - } - for (q = s1; *s1; ) { - if (is_shift_jis (*s1)) { - s1 += 2; - } else if (is_kana (*s1)) { - s1++; - } else { - char *p = strchr (s2, *s1); - if (p) { - if (s1 != q) { - s = s1 + 1; - *s1 = '\0'; - return q; - } - q = s1 + 1; - } - s1++; - } + static char *s = NULL; + char *q; + if (!s1) { + if (!s) { + return NULL; } - s = NULL; - if (*q) { - return q; + s1 = s; + } + for (q = s1; *s1; ) { + if (is_shift_jis (*s1)) { + s1 += 2; + } else if (is_kana (*s1)) { + s1++; + } else { + char *p = strchr (s2, *s1); + if (p) { + if (s1 != q) { + s = s1 + 1; + *s1 = '\0'; + return q; + } + q = s1 + 1; + } + s1++; } - return NULL; + } + s = NULL; + if (*q) { + return q; + } + return NULL; } /******************************************************************* search string S2 from S1 S1 contains SHIFT JIS chars. ********************************************************************/ + static char *sj_strstr(char *s1, char *s2) { - int len = strlen ((char *) s2); - if (!*s2) - return (char *) s1; - for (;*s1;) { - if (*s1 == *s2) { - if (strncmp (s1, s2, len) == 0) - return (char *) s1; - } - if (is_shift_jis (*s1)) { - s1 += 2; - } else { - s1++; - } + int len = strlen ((char *) s2); + if (!*s2) + return (char *) s1; + for (;*s1;) { + if (*s1 == *s2) { + if (strncmp (s1, s2, len) == 0) + return (char *) s1; + } + if (is_shift_jis (*s1)) { + s1 += 2; + } else { + s1++; } - return 0; + } + return 0; } /******************************************************************* Search char C from beginning of S. S contains SHIFT JIS chars. ********************************************************************/ + static char *sj_strchr (char *s, int c) { - for (; *s; ) { - if (*s == c) - return (char *) s; - if (is_shift_jis (*s)) { - s += 2; - } else { - s++; - } + for (; *s; ) { + if (*s == c) + return (char *) s; + if (is_shift_jis (*s)) { + s += 2; + } else { + s++; } - return 0; + } + return 0; } /******************************************************************* Search char C end of S. S contains SHIFT JIS chars. ********************************************************************/ + static char *sj_strrchr(char *s, int c) { - char *q; + char *q; - for (q = 0; *s; ) { - if (*s == c) { - q = (char *) s; - } - if (is_shift_jis (*s)) { - s += 2; - } else { - s++; - } + for (q = 0; *s; ) { + if (*s == c) { + q = (char *) s; } - return q; + if (is_shift_jis (*s)) { + s += 2; + } else { + s++; + } + } + return q; } /******************************************************************* @@ -259,35 +264,35 @@ static BOOL simpch_is_multibyte_char_1(char c) static char *generic_multibyte_strtok(char *s1, char *s2) { - static char *s = NULL; - char *q; - if (!s1) { - if (!s) { - return NULL; - } - s1 = s; + static char *s = NULL; + char *q; + if (!s1) { + if (!s) { + return NULL; } - for (q = s1; *s1; ) { - if ((*is_multibyte_char_1)(*s1)) { - s1 += 2; - } else { - char *p = strchr (s2, *s1); - if (p) { - if (s1 != q) { - s = s1 + 1; - *s1 = '\0'; - return q; - } - q = s1 + 1; - } - s1++; + s1 = s; + } + for (q = s1; *s1; ) { + if ((*is_multibyte_char_1)(*s1)) { + s1 += 2; + } else { + char *p = strchr (s2, *s1); + if (p) { + if (s1 != q) { + s = s1 + 1; + *s1 = '\0'; + return q; } + q = s1 + 1; + } + s1++; } - s = NULL; - if (*q) { - return q; - } - return NULL; + } + s = NULL; + if (*q) { + return q; + } + return NULL; } /******************************************************************* @@ -297,21 +302,21 @@ static char *generic_multibyte_strtok(char *s1, char *s2) static char *generic_multibyte_strstr(char *s1, char *s2) { - int len = strlen ((char *) s2); - if (!*s2) + int len = strlen ((char *) s2); + if (!*s2) + return (char *) s1; + for (;*s1;) { + if (*s1 == *s2) { + if (strncmp (s1, s2, len) == 0) return (char *) s1; - for (;*s1;) { - if (*s1 == *s2) { - if (strncmp (s1, s2, len) == 0) - return (char *) s1; - } - if ((*is_multibyte_char_1)(*s1)) { - s1 += 2; - } else { - s1++; - } } - return 0; + if ((*is_multibyte_char_1)(*s1)) { + s1 += 2; + } else { + s1++; + } + } + return 0; } /******************************************************************* @@ -321,16 +326,16 @@ static char *generic_multibyte_strstr(char *s1, char *s2) static char *generic_multibyte_strchr(char *s, int c) { - for (; *s; ) { - if (*s == c) - return (char *) s; - if ((*is_multibyte_char_1)(*s)) { - s += 2; - } else { - s++; - } + for (; *s; ) { + if (*s == c) + return (char *) s; + if ((*is_multibyte_char_1)(*s)) { + s += 2; + } else { + s++; } - return 0; + } + return 0; } /******************************************************************* @@ -340,19 +345,19 @@ static char *generic_multibyte_strchr(char *s, int c) static char *generic_multibyte_strrchr(char *s, int c) { - char *q; + char *q; - for (q = 0; *s; ) { - if (*s == c) { - q = (char *) s; - } - if ((*is_multibyte_char_1)(*s)) { - s += 2; - } else { - s++; - } + for (q = 0; *s; ) { + if (*s == c) { + q = (char *) s; } - return q; + if ((*is_multibyte_char_1)(*s)) { + s += 2; + } else { + s++; + } + } + return q; } /******************************************************************* @@ -370,119 +375,125 @@ static int skip_generic_multibyte_char(char c) /******************************************************************* Code conversion ********************************************************************/ + /* convesion buffer */ static char cvtbuf[1024]; /******************************************************************* EUC <-> SJIS ********************************************************************/ + static int euc2sjis (int hi, int lo) { - if (hi & 1) - return ((hi / 2 + (hi < 0xdf ? 0x31 : 0x71)) << 8) | - (lo - (lo >= 0xe0 ? 0x60 : 0x61)); - else - return ((hi / 2 + (hi < 0xdf ? 0x30 : 0x70)) << 8) | (lo - 2); + if (hi & 1) + return ((hi / 2 + (hi < 0xdf ? 0x31 : 0x71)) << 8) | + (lo - (lo >= 0xe0 ? 0x60 : 0x61)); + else + return ((hi / 2 + (hi < 0xdf ? 0x30 : 0x70)) << 8) | (lo - 2); } static int sjis2euc (int hi, int lo) { - if (lo >= 0x9f) - return ((hi * 2 - (hi >= 0xe0 ? 0xe0 : 0x60)) << 8) | (lo + 2); - else - return ((hi * 2 - (hi >= 0xe0 ? 0xe1 : 0x61)) << 8) | - (lo + (lo >= 0x7f ? 0x60 : 0x61)); + if (lo >= 0x9f) + return ((hi * 2 - (hi >= 0xe0 ? 0xe0 : 0x60)) << 8) | (lo + 2); + else + return ((hi * 2 - (hi >= 0xe0 ? 0xe1 : 0x61)) << 8) | + (lo + (lo >= 0x7f ? 0x60 : 0x61)); } /******************************************************************* Convert FROM contain SHIFT JIS codes to EUC codes return converted buffer ********************************************************************/ + static char *sj_to_euc(char *from, BOOL overwrite) { - char *out; - char *save; - - save = (char *) from; - for (out = cvtbuf; *from;) { - if (is_shift_jis (*from)) { - int code = sjis2euc ((int) from[0] & 0xff, (int) from[1] & 0xff); - *out++ = (code >> 8) & 0xff; - *out++ = code; - from += 2; - } else if (is_kana (*from)) { - *out++ = (char)euc_kana; - *out++ = *from++; - } else { - *out++ = *from++; - } - } - *out = 0; - if (overwrite) { - pstrcpy((char *) save, (char *) cvtbuf); - return (char *) save; + char *out; + char *save; + + save = (char *) from; + for (out = cvtbuf; *from;) { + if (is_shift_jis (*from)) { + int code = sjis2euc ((int) from[0] & 0xff, (int) from[1] & 0xff); + *out++ = (code >> 8) & 0xff; + *out++ = code; + from += 2; + } else if (is_kana (*from)) { + *out++ = (char)euc_kana; + *out++ = *from++; } else { - return cvtbuf; + *out++ = *from++; } + } + *out = 0; + if (overwrite) { + pstrcpy((char *) save, (char *) cvtbuf); + return (char *) save; + } else { + return cvtbuf; + } } /******************************************************************* Convert FROM contain EUC codes to SHIFT JIS codes return converted buffer ********************************************************************/ + static char *euc_to_sj(char *from, BOOL overwrite) { - char *out; - char *save; - - save = (char *) from; - for (out = cvtbuf; *from; ) { - if (is_euc (*from)) { - int code = euc2sjis ((int) from[0] & 0xff, (int) from[1] & 0xff); - *out++ = (code >> 8) & 0xff; - *out++ = code; - from += 2; - } else if (is_euc_kana (*from)) { - *out++ = from[1]; - from += 2; - } else { - *out++ = *from++; - } - } - *out = 0; - if (overwrite) { - pstrcpy(save, (char *) cvtbuf); - return save; + char *out; + char *save; + + save = (char *) from; + for (out = cvtbuf; *from; ) { + if (is_euc (*from)) { + int code = euc2sjis ((int) from[0] & 0xff, (int) from[1] & 0xff); + *out++ = (code >> 8) & 0xff; + *out++ = code; + from += 2; + } else if (is_euc_kana (*from)) { + *out++ = from[1]; + from += 2; } else { - return cvtbuf; + *out++ = *from++; } + } + *out = 0; + if (overwrite) { + pstrcpy(save, (char *) cvtbuf); + return save; + } else { + return cvtbuf; + } } /******************************************************************* JIS7,JIS8,JUNET <-> SJIS ********************************************************************/ + static int sjis2jis(int hi, int lo) { - if (lo >= 0x9f) - return ((hi * 2 - (hi >= 0xe0 ? 0x160 : 0xe0)) << 8) | (lo - 0x7e); - else - return ((hi * 2 - (hi >= 0xe0 ? 0x161 : 0xe1)) << 8) | - (lo - (lo >= 0x7f ? 0x20 : 0x1f)); + if (lo >= 0x9f) + return ((hi * 2 - (hi >= 0xe0 ? 0x160 : 0xe0)) << 8) | (lo - 0x7e); + else + return ((hi * 2 - (hi >= 0xe0 ? 0x161 : 0xe1)) << 8) | + (lo - (lo >= 0x7f ? 0x20 : 0x1f)); } static int jis2sjis(int hi, int lo) { - if (hi & 1) - return ((hi / 2 + (hi < 0x5f ? 0x71 : 0xb1)) << 8) | - (lo + (lo >= 0x60 ? 0x20 : 0x1f)); - else - return ((hi / 2 + (hi < 0x5f ? 0x70 : 0xb0)) << 8) | (lo + 0x7e); + if (hi & 1) + return ((hi / 2 + (hi < 0x5f ? 0x71 : 0xb1)) << 8) | + (lo + (lo >= 0x60 ? 0x20 : 0x1f)); + else + return ((hi / 2 + (hi < 0x5f ? 0x70 : 0xb0)) << 8) | (lo + 0x7e); } /******************************************************************* Convert FROM contain JIS codes to SHIFT JIS codes return converted buffer ********************************************************************/ + static char *jis8_to_sj(char *from, BOOL overwrite) { char *out; @@ -533,6 +544,7 @@ static char *jis8_to_sj(char *from, BOOL overwrite) Convert FROM contain SHIFT JIS codes to JIS codes return converted buffer ********************************************************************/ + static char *sj_to_jis8(char *from, BOOL overwrite) { char *out; @@ -1146,6 +1158,17 @@ static BOOL not_multibyte_char_1(char c) } /******************************************************************* + Function to determine if we are in a multibyte code page. +*******************************************************************/ + +static BOOL is_multibyte_codepage_val = False; + +BOOL is_multibyte_codepage(void) +{ + return is_multibyte_codepage_val; +} + +/******************************************************************* Setup the function pointers for the functions that are replaced when multi-byte codepages are used. @@ -1165,6 +1188,7 @@ void initialize_multibyte_vectors( int client_codepage) multibyte_strtok = (char *(*)(char *, char *)) sj_strtok; _skip_multibyte_char = skip_kanji_multibyte_char; is_multibyte_char_1 = is_kanji_multibyte_char_1; + is_multibyte_codepage_val = True; break; case HANGUL_CODEPAGE: multibyte_strchr = (char *(*)(char *, int )) generic_multibyte_strchr; @@ -1173,6 +1197,7 @@ void initialize_multibyte_vectors( int client_codepage) multibyte_strtok = (char *(*)(char *, char *)) generic_multibyte_strtok; _skip_multibyte_char = skip_generic_multibyte_char; is_multibyte_char_1 = hangul_is_multibyte_char_1; + is_multibyte_codepage_val = True; break; case BIG5_CODEPAGE: multibyte_strchr = (char *(*)(char *, int )) generic_multibyte_strchr; @@ -1181,6 +1206,7 @@ void initialize_multibyte_vectors( int client_codepage) multibyte_strtok = (char *(*)(char *, char *)) generic_multibyte_strtok; _skip_multibyte_char = skip_generic_multibyte_char; is_multibyte_char_1 = big5_is_multibyte_char_1; + is_multibyte_codepage_val = True; break; case SIMPLIFIED_CHINESE_CODEPAGE: multibyte_strchr = (char *(*)(char *, int )) generic_multibyte_strchr; @@ -1189,6 +1215,7 @@ void initialize_multibyte_vectors( int client_codepage) multibyte_strtok = (char *(*)(char *, char *)) generic_multibyte_strtok; _skip_multibyte_char = skip_generic_multibyte_char; is_multibyte_char_1 = simpch_is_multibyte_char_1; + is_multibyte_codepage_val = True; break; /* * Single char size code page. @@ -1200,6 +1227,7 @@ void initialize_multibyte_vectors( int client_codepage) multibyte_strtok = (char *(*)(char *, char *)) strtok; _skip_multibyte_char = skip_non_multibyte_char; is_multibyte_char_1 = not_multibyte_char_1; + is_multibyte_codepage_val = False; break; } } diff --git a/source3/lib/util.c b/source3/lib/util.c index e5486e6159..58106acd46 100644 --- a/source3/lib/util.c +++ b/source3/lib/util.c @@ -1117,8 +1117,28 @@ char *skip_string(char *buf,int n) } /******************************************************************* + Count the number of characters in a string. Normally this will + be the same as the number of bytes in a string for single byte strings, + but will be different for multibyte. + 16.oct.98, jdblair@cobaltnet.com. +********************************************************************/ + +size_t str_charnum(char *s) +{ + size_t len = 0; + + while (*s != '\0') { + int skip = skip_multibyte_char(*s); + s += (skip ? skip : 1); + len++; + } + return len; +} + +/******************************************************************* trim the specified elements off the front and back of a string ********************************************************************/ + BOOL trim_string(char *s,char *front,char *back) { BOOL ret = False; @@ -1138,14 +1158,76 @@ BOOL trim_string(char *s,char *front,char *back) } } - s_len = strlen(s); - while (back_len && s_len >= back_len && - (strncmp(s + s_len - back_len, back, back_len)==0)) + /* + * We split out the multibyte code page + * case here for speed purposes. Under a + * multibyte code page we need to walk the + * string forwards only and multiple times. + * Thanks to John Blair for finding this + * one. JRA. + */ + + if(back_len) { - ret = True; - s[s_len - back_len] = 0; - s_len = strlen(s); - } + if(!is_multibyte_codepage()) + { + s_len = strlen(s); + while ((s_len >= back_len) && + (strncmp(s + s_len - back_len, back, back_len)==0)) + { + ret = True; + s[s_len - back_len] = '\0'; + s_len = strlen(s); + } + } + else + { + + /* + * Multibyte code page case. + * Keep going through the string, trying + * to match the 'back' string with the end + * of the string. If we get a match, truncate + * 'back' off the end of the string and + * go through the string again from the + * start. Keep doing this until we have + * gone through the string with no match + * at the string end. + */ + + size_t mb_back_len = str_charnum(back); + size_t mb_s_len = str_charnum(s); + + while(mb_s_len >= mb_back_len) + { + size_t charcount = 0; + char *mbp = s; + + while(charcount < (mb_s_len - mb_back_len)) + { + size_t skip = skip_multibyte_char(*mbp); + mbp += (skip ? skip : 1); + charcount++; + } + + /* + * mbp now points at mb_back_len multibyte + * characters from the end of s. + */ + + if(strcmp(mbp, back) == 0) + { + ret = True; + *mbp = '\0'; + mb_s_len = str_charnum(s); + mbp = s; + } + else + break; + } /* end while mb_s_len... */ + } /* end else .. */ + } /* end if back_len .. */ + return(ret); } |