From 3ad2ee22bb4ecee24d069bb7efadff18c25044d1 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Tue, 3 Oct 2000 02:12:14 +0000 Subject: utf-8 and EUC3 patch from Hiroshi Miura Samba User Group Japan staff. mkdir high bits patch from Robert Dahlem" . jeremy. (This used to be commit b40191d27180ab1e59935086073c4d312552f717) --- source3/lib/doscalls.c | 2 + source3/lib/kanji.c | 473 ++++++++++++++++++++++++++++++++++++++++++++-- source3/lib/util_unistr.c | 14 ++ 3 files changed, 476 insertions(+), 13 deletions(-) (limited to 'source3/lib') diff --git a/source3/lib/doscalls.c b/source3/lib/doscalls.c index 50c446faeb..8d0071dde6 100644 --- a/source3/lib/doscalls.c +++ b/source3/lib/doscalls.c @@ -112,6 +112,7 @@ int dos_lstat(char *fname,SMB_STRUCT_STAT *sbuf) return(sys_lstat(dos_to_unix(fname,False),sbuf)); } +#if 0 /* VFS */ /******************************************************************* Mkdir() that calls dos_to_unix. Cope with UNIXes that don't allow high order mode bits on mkdir. @@ -128,6 +129,7 @@ int dos_mkdir(char *dname,mode_t mode) else return ret; } +#endif /******************************************************************* Rmdir() - call dos_to_unix. diff --git a/source3/lib/kanji.c b/source3/lib/kanji.c index 43b22c19cc..fe65f98b58 100644 --- a/source3/lib/kanji.c +++ b/source3/lib/kanji.c @@ -22,6 +22,8 @@ and extend coding system to EUC/SJIS/JIS/HEX at 1994.10.11 and add all jis codes sequence type at 1995.8.16 Notes: Hexadecimal code by + Adding features about Machine dependent codes and User Defined Codes + by Hiroshi MIURA 2000.3.19 */ #define _KANJI_C_ @@ -387,15 +389,63 @@ static char cvtbuf[2*sizeof(pstring)]; static int euc2sjis (int hi, int lo) { - if (hi & 1) - return ((hi / 2 + (hi < 0xdf ? 0x31 : 0x71)) << 8) | - (lo - (lo >= 0xe0 ? 0x60 : 0x61)); - else - return ((hi / 2 + (hi < 0xdf ? 0x30 : 0x70)) << 8) | (lo - 2); + int w; + int maxidx = SJISREVTBLSIZ; + int minidx = 0; + int i = 2; + + if (hi & 1) { + hi = hi / 2 + (hi < 0xdf ? 0x31 : 0x71); + w = (hi << 8) | (lo - (lo >= 0xe0 ? 0x60 : 0x61)); + } else { + hi = hi / 2 + (hi < 0xdf ? 0x30 : 0x70); + w = (hi << 8) | (lo - 2); + } + if ( (0x87 < hi ) && (hi < 0xed ) ) { + return w; + } + while ( maxidx >= minidx ) { + if ( sjisrev[i].start > w ) { + maxidx = i-1; + } else if ( w > sjisrev[i].end ) { + minidx = i+1; + } else { + w -= sjisrev[i].start; + w += sjisrev[i].rstart; + break; + } + i = (int)( minidx + (maxidx - minidx) % 2 ); + } + return w; } static int sjis2euc (int hi, int lo) { + int minidx = 0; + int maxidx = SJISCONVTBLSIZ; + int i = ( 0 + SJISCONVTBLSIZ ) % 2; + int w = (int)((hi << 8) | lo); + + if ( (sjisconv[0].start < w) && (w < sjisconv[SJISCONVTBLSIZ].end) ) { + while (maxidx >= minidx) { + if ( sjisconv[i].start > w ) { + maxidx = i-1; + } else if (w > sjisconv[i].end) { + minidx = i+1; + } else { + w -= sjisconv[i].start; + w += sjisconv[i].rstart; + break; + } + i = (int)( minidx + (maxidx-minidx)%2 ); + } + hi = (int) ((w >> 8) & 0xff); + lo = (int) (w & 0xff); + } + if (hi >= 0xf0) { + hi = GETAHI; + lo = GETALO; + } if (lo >= 0x9f) return ((hi * 2 - (hi >= 0xe0 ? 0xe0 : 0x60)) << 8) | (lo + 2); else @@ -418,7 +468,7 @@ static char *sj_to_euc(char *from, BOOL overwrite) if (is_shift_jis (*from)) { int code = sjis2euc ((int) from[0] & 0xff, (int) from[1] & 0xff); *out++ = (code >> 8) & 0xff; - *out++ = code; + *out++ = code & 0xff; from += 2; } else if (is_kana (*from)) { *out++ = (char)euc_kana; @@ -451,7 +501,7 @@ static char *euc_to_sj(char *from, BOOL overwrite) if (is_euc (*from)) { int code = euc2sjis ((int) from[0] & 0xff, (int) from[1] & 0xff); *out++ = (code >> 8) & 0xff; - *out++ = code; + *out++ = code & 0xff; from += 2; } else if (is_euc_kana (*from)) { *out++ = from[1]; @@ -461,8 +511,260 @@ static char *euc_to_sj(char *from, BOOL overwrite) } } *out = 0; + if (overwrite) { - pstrcpy(save, (char *) cvtbuf); + pstrcpy(save, (char *) cvtbuf); + return save; + } else { + return cvtbuf; + } +} + +/******************************************************************* + EUC3 <-> SJIS +********************************************************************/ +static int sjis3euc (int hi, int lo, int *len) +{ + int i,w; + int minidx; + int maxidx; + + w = (int)((hi << 8) | lo); + + /* no sjis */ + if ( ( 0x40 >= lo ) && (lo >= 0xfc) && (lo == 0x7f )) { + w = (GETAHI << 8) | GETALO; + + /* IBM Extended Kanji */ + } else if (( w == 0xfa54 )||( w == 0x81ca )) { + *len = 2; + return (0xa2cc); + + } else if (( w == 0xfa5b )||( w == 0x81e6)) { + *len = 2; + return (0xa2e8); + + } else if (( 0xfa <= hi ) && ( hi <= 0xfc ) ) { + i = w - 0xfa40 - ( hi - 0xfa )*( 0xfb40 - 0xfafc) - ((lo < 0x7f)? 0 : 1 ); + if ( i <= EUC3CONVTBLSIZ ){ + *len = 3; + return euc3conv[i]; + } + +/* NEC selected IBM Extend Kanji */ + /* there are 3 code that is not good for conv */ + } else if (( 0x8754 <= w ) && ( w <= 0x878a)) { + minidx = 0; + maxidx = EUC3CONV2TBLSIZ; + i = minidx + (maxidx - minidx) % 2; + while ( maxidx >= minidx ) { + if ( euc3conv2[i].sjis > w ) { + maxidx = i-1; + } else if ( w > euc3conv2[i].sjis ) { + minidx = i+1; + } else { + *len = 3; + return (euc3conv2[i].euc); + } + i = (int)( minidx + (maxidx - minidx) % 2 ); + } + /* else normal EUC */ + + } else if (( w == 0xeef9 ) || ( w == 0x81ca )) { + *len = 2; + return (0xa2cc); + + } else if (( 0xed <= hi ) && ( hi <= 0xef )) { + minidx = 0; + maxidx = SJISREVTBLSIZ; + i = 10; + while ( maxidx >= minidx ) { + if ( sjisrev[i].start > w ) { + maxidx = i-1; + } else if ( w > sjisrev[i].end ) { + minidx = i+1; + } else { + w -= sjisrev[i].start; + w += sjisrev[i].rstart; + break; + } + i = (int)( minidx + (maxidx - minidx) % 2 ); + } + if ( w >= 0xfa40 ) { + i = w - 0xfa40 - ( hi - 0xfa )*( 0xfb40 - 0xfafc) - ((lo < 0x7f)? 0 : 1 ); + if ( i <= EUC3CONVTBLSIZ ){ + *len = 3; + return euc3conv[i]; + } else { + w = (GETAHI << 8) | GETALO; + } + } + /* else normal EUC */ + +/* UDC half low*/ +/* this area maps to the G2 UDC area: 0xf5a1 -- 0xfefe */ + } else if ((0xf0 <= hi) && (hi <= 0xf4)) { + *len = 2; + if (lo >= 0x9f) { + return (((hi * 2 - 0xea) << 8) | (lo + 2)); + } else { + return (((hi * 2 - 0xeb) << 8) | (lo + (lo >=0x7f ? 0x60: 0x61 ))); + } + +/* UDC half high*/ +/* this area maps to the G3 UDC area: 0xf8f5a1 -- 0xf8fefe */ + } else if ((0xf5 <= hi) && (hi <= 0xf9)) { + *len = 3; + if (lo >= 0x9f) { + return (((hi*2 - 0xf4) << 8) | (lo + 2)); + } else { + return (((hi*2 - 0xf5) << 8) | (lo + (lo >= 0x7f ? 0x60: 0x61 ))); + } + /* ....checked all special case */ + } + + /* These Normal 2 byte EUC */ + *len = 2; + hi = (int) ((w >> 8) & 0xff); + lo = (int) (w & 0xff); + + if (hi >= 0xf0) { /* Check range */ + hi = GETAHI; + lo = GETALO; + } + + if (lo >= 0x9f) + return ((hi * 2 - (hi >= 0xe0 ? 0xe0 : 0x60)) << 8) | (lo + 2); + else + return ((hi * 2 - (hi >= 0xe0 ? 0xe1 : 0x61)) << 8) | + (lo + (lo >= 0x7f ? 0x60 : 0x61)); +} + +static int euc3sjis (int hi, int lo, BOOL is_3byte) +{ + int w; + + w = (int)((hi << 8) | lo); + if (is_3byte) { + if (( 0xf5 <= hi) && ( hi <= 0xfe)) { + /* UDC half high*/ + /* this area maps to the G3 UDC area */ + /* 0xf8f5a1 -- 0xf8fefe --> 0xf540 -- 0xf9fc */ + if (hi & 1) { + return (((hi / 2 + 0x7b) << 8) | (lo - (lo >= 0xe0 ? 0x60 : 0x61))); + } else { + return (((hi / 2 + 0x7a) << 8) | (lo - 2)); + } + } else { + /* Using map table */ + int minidx = 0; + int maxidx = EUC3REVTBLSIZ; + int i = minidx + (maxidx - minidx) % 2; + + while ( maxidx >= minidx ) { + if (euc3rev[i].euc > w) { + maxidx = i-1; + } else if (euc3rev[i].euc < w) { + minidx = i+1; + } else { + return (euc3rev[i].sjis); + } + i = (int)( minidx + ( maxidx - minidx ) % 2); + } + return ((GETAHI << 8 ) | GETALO); + } + } else { /* is_2byte */ + if ((0xf5 <= hi) && (hi <= 0xfe)) { + /* UDC half low*/ + /* this area maps to the G2 UDC area */ + /* 0xf5a1 -- 0xfefe --> 0xf040 -- 0xf4fc */ + if (hi & 1) { + return (((hi / 2 + 0x76) << 8) | (lo - (lo >= 0xe0 ? 0x60 : 0x61))); + } else { + return (((hi / 2 + 0x75) << 8) | (lo - 2)); + } + } else { /* Normal EUC */ + if (hi & 1) { + hi = hi / 2 + (hi < 0xdf ? 0x31 : 0x71); + return ((hi << 8) | (lo - (lo >= 0xe0 ? 0x60 : 0x61))); + } else { + hi = hi / 2 + (hi < 0xdf ? 0x30 : 0x70); + return ((hi << 8) | (lo - 2)); + } + } + } + return ((GETAHI << 8) | GETALO); +} + +/******************************************************************* + Convert FROM contain SHIFT JIS codes to EUC codes (with SS2) + return converted buffer +********************************************************************/ + +static char *sj_to_euc3(char *from, BOOL overwrite) +{ + char *out; + char *save; + int len; + + save = (char *) from; + for (out = cvtbuf; *from && (out - cvtbuf < sizeof(cvtbuf)-4);) { + if (is_shift_jis (*from)) { + int code = sjis3euc ((int) from[0] & 0xff, (int) from[1] & 0xff, &len); + if (len == 3) { + *out++ = (char)euc_sup; + } + *out++ = (code >> 8) & 0xff; + *out++ = code & 0xff; + from += 2; + } else if (is_kana (*from)) { + *out++ = (char)euc_kana; + *out++ = *from++; + } else { + *out++ = *from++; + } + } + *out = 0; + if (overwrite) { + pstrcpy((char *) save, (char *) cvtbuf); + return (char *) save; + } else { + return cvtbuf; + } +} + +/******************************************************************* + Convert FROM contain EUC codes (with Sup-Kanji) to SHIFT JIS codes + return converted buffer +********************************************************************/ +static char *euc3_to_sj(char *from, BOOL overwrite) +{ + char *out; + char *save; + + save = (char *) from; + for (out = cvtbuf; *from && (out - cvtbuf < sizeof(cvtbuf)-3); ) { + if (is_euc_sup (*from)) { + int code = euc3sjis((int) from[1] & 0xff, (int) from[2] & 0xff, True); + *out++ = (code >> 8) & 0xff; + *out++ = code & 0xff; + from += 3; + } else if (is_euc (*from)) { + int code = euc3sjis ((int) from[0] & 0xff, (int) from[1] & 0xff,False); + *out++ = (code >> 8) & 0xff; + *out++ = code & 0xff; + from += 2; + } else if (is_euc_kana (*from)) { + *out++ = from[1]; + from += 2; + } else { + *out++ = *from++; + } + } + *out = 0; + + if (overwrite) { + pstrcpy(save, (char *) cvtbuf); return save; } else { return cvtbuf; @@ -475,6 +777,31 @@ static char *euc_to_sj(char *from, BOOL overwrite) static int sjis2jis(int hi, int lo) { + int minidx = 0; + int maxidx = SJISCONVTBLSIZ; + int i = (0 + SJISCONVTBLSIZ) % 2; + int w = (int)((hi << 8) | lo); + + if ((sjisconv[0].start < w) && (w < sjisconv[SJISCONVTBLSIZ].end)) { + while (maxidx >= minidx) { + if (sjisconv[i].start > w) { + maxidx = i-1; + } else if (w > sjisconv[i].end) { + minidx = i+1; + } else { + w -= sjisconv[i].start; + w += sjisconv[i].rstart; + break; + } + i = (int)( minidx + (maxidx-minidx) %2 ); + } + hi = (int) ((w >> 8) & 0xff); + lo = (int) (w & 0xff); + } + if (hi >= 0xf0) { + hi = GETAHI; + lo = GETALO; + } if (lo >= 0x9f) return ((hi * 2 - (hi >= 0xe0 ? 0x160 : 0xe0)) << 8) | (lo - 0x7e); else @@ -484,11 +811,35 @@ static int sjis2jis(int hi, int lo) static int jis2sjis(int hi, int lo) { - if (hi & 1) - return ((hi / 2 + (hi < 0x5f ? 0x71 : 0xb1)) << 8) | - (lo + (lo >= 0x60 ? 0x20 : 0x1f)); - else - return ((hi / 2 + (hi < 0x5f ? 0x70 : 0xb0)) << 8) | (lo + 0x7e); + int w; + int minidx = 0; + int maxidx = SJISREVTBLSIZ; + int i = 2; + + if (hi & 1) { + hi = hi / 2 + (hi < 0x5f ? 0x71 : 0xb1); + w = (hi << 8) | (lo + (lo >= 0x60 ? 0x20 : 0x1f)); + } else { + hi = hi / 2 + (hi < 0x5f ? 0x70 : 0xb0); + w = (hi << 8) | (lo + 0x7e); + } + + if (( 0x87 < hi ) && ( hi < 0xed )) { + return w; + } + while (maxidx >= minidx) { + if (sjisrev[i].start > w) { + maxidx = i-1; + } else if (w > sjisrev[i].end) { + minidx = i+1; + } else { + w -= sjisrev[i].start; + w += sjisrev[i].rstart; + break; + } + i = (int)( minidx + (maxidx-minidx) %2 ); + } + return w; } /******************************************************************* @@ -999,6 +1350,90 @@ static char *sj_to_sj(char *from, BOOL overwrite) } } +/******************************************************************* + cp to utf8 +********************************************************************/ +static char *cp_to_utf8(char *from, BOOL overwrite) +{ + unsigned char *dst; + unsigned char *src; + smb_ucs2_t val; + int w; + size_t len; + + src = (unsigned char *)from; + dst = (unsigned char *)cvtbuf; + while (*src && (((char *)dst - cvtbuf) < sizeof(cvtbuf)-4)) { + len = _skip_multibyte_char(*src); + if ( len == 2 ) { + w = (int)(*src++ & 0xff); + w = (int)((w << 8)|(*src++ & 0xff)); + } else { + w = (int)(*src++ & 0xff); + } + val = doscp2ucs2(w); + + if ( val <= 0x7f ) { + *dst++ = (char)(val & 0xff); + } else if ( val <= 0x7ff ){ + *dst++ = (char)( 0xc0 | ((val >> 6) & 0xff)); + *dst++ = (char)( 0x80 | ( val & 0x3f )); + } else { + *dst++ = (char)( 0xe0 | ((val >> 12) & 0x0f)); + *dst++ = (char)( 0x80 | ((val >> 6) & 0x3f)); + *dst++ = (char)( 0x80 | (val & 0x3f)); + } + + } + *dst++='\0'; + if (overwrite) { + pstrcpy ((char *) from, (char *) cvtbuf); + return (char *) from; + } else { + return cvtbuf; + } +} + +/******************************************************************* + utf8 to cp +********************************************************************/ +static char *utf8_to_cp(char *from, BOOL overwrite) +{ + unsigned char *src; + unsigned char *dst; + smb_ucs2_t val; + int w; + + src = (unsigned char *)from; + dst = (unsigned char *)cvtbuf; + + while (*src && ((char *)dst - cvtbuf < sizeof(cvtbuf)-4)) { + val = (*src++ & 0xff); + if (val < 0x80) { + *dst++ = (char)(val & 0x7f); + } else if ((0xc0 <= val) && (val <= 0xdf) + && (0x80 <= *src) && (*src <= 0xbf)) { + w = ucs2doscp( ((val & 31) << 6) | ((*src++) & 63 )); + *dst++ = (char)((w >> 8) & 0xff); + *dst++ = (char)(w & 0xff); + } else { + val = (val & 0x0f) << 12; + val |= ((*src++ & 0x3f) << 6); + val |= (*src++ & 0x3f); + w = ucs2doscp(val); + *dst++ = (char)((w >> 8) & 0xff); + *dst++ = (char)(w & 0xff); + } + } + *dst++='\0'; + if (overwrite) { + pstrcpy ((char *) from, (char *) cvtbuf); + return (char *) from; + } else { + return cvtbuf; + } +} + /************************************************************************ conversion: _dos_to_unix _unix_to_dos @@ -1046,6 +1481,14 @@ static void setup_string_function(int codes) _dos_to_unix = sj_to_cap; _unix_to_dos = cap_to_sj; break; + case UTF8_CODE: + _dos_to_unix = cp_to_utf8; + _unix_to_dos = utf8_to_cp; + break; + case EUC3_CODE: + _dos_to_unix = sj_to_euc3; + _unix_to_dos = euc3_to_sj; + break; } } @@ -1142,6 +1585,10 @@ void interpret_coding_system(char *str) codes = JUNET_CODE; jis_kso = '@'; jis_ksi = 'H'; + } else if (strequal (str, "utf8")) { + codes = UTF8_CODE; + } else if (strequal (str, "euc3")) { + codes = EUC3_CODE; } setup_string_function (codes); } diff --git a/source3/lib/util_unistr.c b/source3/lib/util_unistr.c index b786d0c98b..93f5490ffc 100644 --- a/source3/lib/util_unistr.c +++ b/source3/lib/util_unistr.c @@ -1977,3 +1977,17 @@ smb_ucs2_t *string_truncate_w(smb_ucs2_t *s, size_t length) return s; } + +/****************************************************************** + functions for UTF8 support (using in kanji.c) + ******************************************************************/ +smb_ucs2_t doscp2ucs2(int w) +{ + return ((smb_ucs2_t)doscp_to_ucs2[w]); +} + +int ucs2doscp(smb_ucs2_t w) +{ + return ((int)ucs2_to_doscp[w]); +} + -- cgit