/* Unix SMB/CIFS implementation. Samba charset module for Mac OS X/Darwin Copyright (C) Benjamin Riefenstahl 2003 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * modules/charset_macosxfs.c * * A Samba charset module to use on Mac OS X/Darwin as the filesystem * and display encoding. * * Actually two implementations are provided here. The default * implementation is based on the official CFString API. The other is * based on internal CFString APIs as defined in the OpenDarwin * source. */ #include "includes.h" /* * Include OS frameworks. These are only needed in this module. */ #include <CoreFoundation/CFString.h> /* * See if autoconf has found us the internal headers in some form. */ #if HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H # include <Corefoundation/CFStringEncodingConverter.h> # include <Corefoundation/CFUnicodePrecomposition.h> # define USE_INTERNAL_API 1 #elif HAVE_CFSTRINGENCODINGCONVERTER_H # include <CFStringEncodingConverter.h> # include <CFUnicodePrecomposition.h> # define USE_INTERNAL_API 1 #endif /* * Compile time configuration: Do we want debug output? */ /* #define DEBUG_STRINGS 1 */ /* * A simple, but efficient memory provider for our buffers. */ static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize) { if (newsize > *size) { *size = newsize + 128; buffer = realloc(buffer, *size); } return buffer; } /* * While there is a version of OpenDarwin for intel, the usual case is * big-endian PPC. So we need byte swapping to handle the * little-endian byte order of the network protocol. We also need an * additional dynamic buffer to do this work for incoming data blocks, * because we have to consider the original data as constant. * * We abstract the differences away by providing a simple facade with * these functions/macros: * * le_to_native(dst,src,len) * native_to_le(cp,len) * set_ucbuffer_with_le(buffer,bufsize,data,size) * set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve) */ #ifdef WORDS_BIGENDIAN static inline void swap_bytes (char * dst, const char * src, size_t len) { const char *srcend = src + len; while (src < srcend) { dst[0] = src[1]; dst[1] = src[0]; dst += 2; src += 2; } } static inline void swap_bytes_inplace (char * cp, size_t len) { char temp; char *end = cp + len; while (cp < end) { temp = cp[1]; cp[1] = cp[0]; cp[0] = temp; cp += 2; } } #define le_to_native(dst,src,len) swap_bytes(dst,src,len) #define native_to_le(cp,len) swap_bytes_inplace(cp,len) #define set_ucbuffer_with_le(buffer,bufsize,data,size) \ set_ucbuffer_with_le_copy(buffer,bufsize,data,size,0) #else /* ! WORDS_BIGENDIAN */ #define le_to_native(dst,src,len) memcpy(dst,src,len) #define native_to_le(cp,len) /* nothing */ #define set_ucbuffer_with_le(buffer,bufsize,data,size) \ (((void)(bufsize)),(UniChar*)(data)) #endif static inline UniChar *set_ucbuffer_with_le_copy ( UniChar *buffer, size_t *bufsize, const void *data, size_t size, size_t reserve) { buffer = resize_buffer(buffer, bufsize, size+reserve); le_to_native((char*)buffer,data,size); return buffer; } /* * A simple hexdump function for debugging error conditions. */ #define debug_out(s) DEBUG(0,(s)) #ifdef DEBUG_STRINGS static void hexdump( const char * label, const char * s, size_t len ) { size_t restlen = len; debug_out("<<<<<<<\n"); debug_out(label); debug_out("\n"); while (restlen > 0) { char line[100]; size_t i, j; char * d = line; #undef sprintf d += sprintf(d, "%04X ", (unsigned)(len-restlen)); *d++ = ' '; for( i = 0; i<restlen && i<8; ++i ) { d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF); } for( j = i; j<8; ++j ) { d += sprintf(d, " "); } *d++ = ' '; for( i = 8; i<restlen && i<16; ++i ) { d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF); } for( j = i; j<16; ++j ) { d += sprintf(d, " "); } *d++ = ' '; for( i = 0; i<restlen && i<16; ++i ) { if(s[i] < ' ' || s[i] >= 0x7F || !isprint(s[i])) *d++ = '.'; else *d++ = s[i]; } *d++ = '\n'; *d = 0; restlen -= i; s += i; debug_out(line); } debug_out(">>>>>>>\n"); } #else /* !DEBUG_STRINGS */ #define hexdump(label,s,len) /* nothing */ #endif #if !USE_INTERNAL_API /* * An implementation based on documented Mac OS X APIs. * * This does a certain amount of memory management, creating and * manipulating CFString objects. We try to minimize the impact by * keeping those objects around and re-using them. We also use * external backing store for the CFStrings where this is possible and * benficial. * * The Unicode normalizations forms available at this level are * generic, not specifically for the file system. So they may not be * perfect fits. */ static size_t macosxfs_encoding_pull( void *cd, /* Encoder handle */ char **inbuf, size_t *inbytesleft, /* Script string */ char **outbuf, size_t *outbytesleft) /* UTF-16-LE string */ { static const int script_code = kCFStringEncodingUTF8; static CFMutableStringRef cfstring = NULL; size_t outsize; CFRange range; (void) cd; /* UNUSED */ if (0 == *inbytesleft) { return 0; } if (NULL == cfstring) { /* * A version with an external backing store as in the * push function should have been more efficient, but * testing shows, that it is actually slower (!). * Maybe kCFAllocatorDefault gets shortcut evaluation * internally, while kCFAllocatorNull doesn't. */ cfstring = CFStringCreateMutable(kCFAllocatorDefault,0); } /* * Three methods of appending to a CFString, choose the most * efficient. */ if (0 == (*inbuf)[*inbytesleft-1]) { CFStringAppendCString(cfstring, *inbuf, script_code); } else if (*inbytesleft <= 255) { Str255 buffer; buffer[0] = *inbytesleft; memcpy(buffer+1, *inbuf, buffer[0]); CFStringAppendPascalString(cfstring, buffer, script_code); } else { /* * We would like to use a fixed buffer and a loop * here, but than we can't garantee that the input is * well-formed UTF-8, as we are supposed to do. */ static char *buffer = NULL; static size_t buflen = 0; buffer = resize_buffer(buffer, &buflen, *inbytesleft+1); memcpy(buffer, *inbuf, *inbytesleft); buffer[*inbytesleft] = 0; CFStringAppendCString(cfstring, *inbuf, script_code); } /* * Compose characters, using the non-canonical composition * form. */ CFStringNormalize(cfstring, kCFStringNormalizationFormC); outsize = CFStringGetLength(cfstring); range = CFRangeMake(0,outsize); if (outsize == 0) { /* * HACK: smbd/mangle_hash2.c:is_legal_name() expects * errors here. That function will always pass 2 * characters. smbd/open.c:check_for_pipe() cuts a * patchname to 10 characters blindly. Suppress the * debug output in those cases. */ if(2 != *inbytesleft && 10 != *inbytesleft) { debug_out("String conversion: " "An unknown error occurred\n"); hexdump("UTF8->UTF16LE (old) input", *inbuf, *inbytesleft); } errno = EILSEQ; /* Not sure, but this is what we have * actually seen. */ return -1; } if (outsize*2 > *outbytesleft) { CFStringDelete(cfstring, range); debug_out("String conversion: " "Output buffer too small\n"); hexdump("UTF8->UTF16LE (old) input", *inbuf, *inbytesleft); errno = E2BIG; return -1; } CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf); CFStringDelete(cfstring, range); native_to_le(*outbuf, outsize*2); /* * Add a converted null byte, if the CFString conversions * prevented that until now. */ if (0 == (*inbuf)[*inbytesleft-1] && (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) { if ((outsize*2+2) > *outbytesleft) { debug_out("String conversion: " "Output buffer too small\n"); hexdump("UTF8->UTF16LE (old) input", *inbuf, *inbytesleft); errno = E2BIG; return -1; } (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0; outsize += 2; } *inbuf += *inbytesleft; *inbytesleft = 0; *outbuf += outsize*2; *outbytesleft -= outsize*2; return 0; } static size_t macosxfs_encoding_push( void *cd, /* Encoder handle */ char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */ char **outbuf, size_t *outbytesleft) /* Script string */ { static const int script_code = kCFStringEncodingUTF8; static CFMutableStringRef cfstring = NULL; static UniChar *buffer = NULL; static size_t buflen = 0; CFIndex outsize, cfsize, charsconverted; (void) cd; /* UNUSED */ if (0 == *inbytesleft) { return 0; } /* * We need a buffer that can hold 4 times the original data, * because that is the theoretical maximum that decomposition * can create currently (in Unicode 4.0). */ buffer = set_ucbuffer_with_le_copy( buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft); if (NULL == cfstring) { cfstring = CFStringCreateMutableWithExternalCharactersNoCopy( kCFAllocatorDefault, buffer, *inbytesleft/2, buflen/2, kCFAllocatorNull); } else { CFStringSetExternalCharactersNoCopy( cfstring, buffer, *inbytesleft/2, buflen/2); } /* * Decompose characters, using the non-canonical decomposition * form. * * NB: This isn't exactly what HFS+ wants (see note on * kCFStringEncodingUseHFSPlusCanonical in * CFStringEncodingConverter.h), but AFAIK it's the best that * the official API can do. */ CFStringNormalize(cfstring, kCFStringNormalizationFormD); cfsize = CFStringGetLength(cfstring); charsconverted = CFStringGetBytes( cfstring, CFRangeMake(0,cfsize), script_code, 0, False, *outbuf, *outbytesleft, &outsize); if (0 == charsconverted) { debug_out("String conversion: " "Buffer too small or not convertable\n"); hexdump("UTF16LE->UTF8 (old) input", *inbuf, *inbytesleft); errno = EILSEQ; /* Probably more likely. */ return -1; } /* * Add a converted null byte, if the CFString conversions * prevented that until now. */ if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] && (0 != (*outbuf)[outsize-1])) { if (((size_t)outsize+1) > *outbytesleft) { debug_out("String conversion: " "Output buffer too small\n"); hexdump("UTF16LE->UTF8 (old) input", *inbuf, *inbytesleft); errno = E2BIG; return -1; } (*outbuf)[outsize] = 0; ++outsize; } *inbuf += *inbytesleft; *inbytesleft = 0; *outbuf += outsize; *outbytesleft -= outsize; return 0; } #else /* USE_INTERNAL_API */ /* * An implementation based on internal code as known from the * OpenDarwin CVS. * * This code doesn't need much memory management because it uses * functions that operate on the raw memory directly. * * The push routine here is faster and more compatible with HFS+ than * the other implementation above. The pull routine is only faster * for some strings, slightly slower for others. The pull routine * looses because it has to iterate over the data twice, once to * decode UTF-8 and than to do the character composition required by * Windows. */ static size_t macosxfs_encoding_pull( void *cd, /* Encoder handle */ char **inbuf, size_t *inbytesleft, /* Script string */ char **outbuf, size_t *outbytesleft) /* UTF-16-LE string */ { static const int script_code = kCFStringEncodingUTF8; UInt32 srcCharsUsed = 0; UInt32 dstCharsUsed = 0; UInt32 result; uint32_t dstDecomposedUsed = 0; uint32_t dstPrecomposedUsed = 0; (void) cd; /* UNUSED */ if (0 == *inbytesleft) { return 0; } result = CFStringEncodingBytesToUnicode( script_code, kCFStringEncodingComposeCombinings, *inbuf, *inbytesleft, &srcCharsUsed, (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed); switch(result) { case kCFStringEncodingConversionSuccess: if (*inbytesleft == srcCharsUsed) break; else ; /*fall through*/ case kCFStringEncodingInsufficientOutputBufferLength: debug_out("String conversion: " "Output buffer too small\n"); hexdump("UTF8->UTF16LE (new) input", *inbuf, *inbytesleft); errno = E2BIG; return -1; case kCFStringEncodingInvalidInputStream: /* * HACK: smbd/mangle_hash2.c:is_legal_name() expects * errors here. That function will always pass 2 * characters. smbd/open.c:check_for_pipe() cuts a * patchname to 10 characters blindly. Suppress the * debug output in those cases. */ if(2 != *inbytesleft && 10 != *inbytesleft) { debug_out("String conversion: " "Invalid input sequence\n"); hexdump("UTF8->UTF16LE (new) input", *inbuf, *inbytesleft); } errno = EILSEQ; return -1; case kCFStringEncodingConverterUnavailable: debug_out("String conversion: " "Unknown encoding\n"); hexdump("UTF8->UTF16LE (new) input", *inbuf, *inbytesleft); errno = EINVAL; return -1; } /* * It doesn't look like CFStringEncodingBytesToUnicode() can * produce precomposed characters (flags=ComposeCombinings * doesn't do it), so we need another pass over the data here. * We can do this in-place, as the string can only get * shorter. * * (Actually in theory there should be an internal * decomposition and reordering before the actual composition * step. But we should be able to rely on that we always get * fully decomposed strings for input, so this can't create * problems in reality.) */ CFUniCharPrecompose( (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed, (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed); native_to_le(*outbuf, dstPrecomposedUsed*2); *inbuf += srcCharsUsed; *inbytesleft -= srcCharsUsed; *outbuf += dstPrecomposedUsed*2; *outbytesleft -= dstPrecomposedUsed*2; return 0; } static size_t macosxfs_encoding_push( void *cd, /* Encoder handle */ char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */ char **outbuf, size_t *outbytesleft) /* Script string */ { static const int script_code = kCFStringEncodingUTF8; static UniChar *buffer = NULL; static size_t buflen = 0; UInt32 srcCharsUsed=0, dstCharsUsed=0, result; (void) cd; /* UNUSED */ if (0 == *inbytesleft) { return 0; } buffer = set_ucbuffer_with_le( buffer, &buflen, *inbuf, *inbytesleft); result = CFStringEncodingUnicodeToBytes( script_code, kCFStringEncodingUseHFSPlusCanonical, buffer, *inbytesleft/2, &srcCharsUsed, *outbuf, *outbytesleft, &dstCharsUsed); switch(result) { case kCFStringEncodingConversionSuccess: if (*inbytesleft/2 == srcCharsUsed) break; else ; /*fall through*/ case kCFStringEncodingInsufficientOutputBufferLength: debug_out("String conversion: " "Output buffer too small\n"); hexdump("UTF16LE->UTF8 (new) input", *inbuf, *inbytesleft); errno = E2BIG; return -1; case kCFStringEncodingInvalidInputStream: /* * HACK: smbd/open.c:check_for_pipe():is_legal_name() * cuts a pathname to 10 characters blindly. Suppress * the debug output in those cases. */ if(10 != *inbytesleft) { debug_out("String conversion: " "Invalid input sequence\n"); hexdump("UTF16LE->UTF8 (new) input", *inbuf, *inbytesleft); } errno = EILSEQ; return -1; case kCFStringEncodingConverterUnavailable: debug_out("String conversion: " "Unknown encoding\n"); hexdump("UTF16LE->UTF8 (new) input", *inbuf, *inbytesleft); errno = EINVAL; return -1; } *inbuf += srcCharsUsed*2; *inbytesleft -= srcCharsUsed*2; *outbuf += dstCharsUsed; *outbytesleft -= dstCharsUsed; return 0; } #endif /* USE_INTERNAL_API */ /* * For initialization, actually install the encoding as "macosxfs". */ static struct charset_functions macosxfs_encoding_functions = { "MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push }; NTSTATUS init_module(void) { return smb_register_charset(&macosxfs_encoding_functions); } /* eof */