CFICUConverters.c   [plain text]


/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 *  CFICUConverters.c
 *  CoreFoundation
 *
 *  Created by Aki Inoue on 07/12/04.
 *  Copyright 2007-2009, Apple Inc. All rights reserved.
 *
 */

#include "CFStringEncodingDatabase.h"
#include "CFStringEncodingConverterPriv.h"
#include "CFICUConverters.h"
#include <CoreFoundation/CFStringEncodingExt.h>
#include <unicode/ucnv.h>
#include <unicode/uversion.h>
#include "CFInternal.h"
#include <stdio.h>

#if DEPLOYMENT_TARGET_WINDOWS
// Windows build: map the POSIX-style names used in this file onto the MSVC CRT functions.
// The fourth (locale) argument of strncasecmp_l is dropped by the mapping.
#define strncasecmp_l(a, b, c, d) _strnicmp(a, b, c)
#define snprintf _snprintf
#endif

// Thread data support
// Per-thread bookkeeping for ICU converters that are kept open between calls of a streaming conversion.
typedef struct {
    uint8_t _numSlots;        // number of entries allocated in _converters
    uint8_t _nextSlot;        // index tried first when a new converter has to be stored
    UConverter **_converters; // slot array; a NULL entry is a free slot. The stream ID handed to callers is (slot index + 1)
} __CFICUThreadData;

/*
 * Tears down a thread's ICU converter bookkeeping.
 *
 * context: the __CFICUThreadData allocated by __CFStringEncodingICUGetThreadData(); NULL is tolerated.
 *
 * Closes every converter still parked in a slot, then releases the slot array and the
 * structure itself with the default allocator (the one they were allocated with).
 */
static void __CFICUThreadDataDestructor(void *context) {
    __CFICUThreadData *data = (__CFICUThreadData *)context;

    if (NULL == data) return;

    if (NULL != data->_converters) { // scan to make sure deallocation
        UConverter **converter = data->_converters;
        UConverter **limit = converter + data->_numSlots;

        while (converter < limit) {
            // Test the slot content, not the cursor: free slots hold NULL
            if (NULL != *converter) ucnv_close(*converter);
            ++converter;
        }
        CFAllocatorDeallocate(NULL, data->_converters);
    }

    CFAllocatorDeallocate(NULL, data);
}

#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
#import <pthread.h>

/*
 * Returns the calling thread's ICU converter bookkeeping, creating a zero-filled
 * instance on first use and registering it under __CFTSDKeyICUConverter.
 * __CFICUThreadDataDestructor is installed as the destructor for that key.
 */
CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
    __CFICUThreadData *threadData;

    pthread_key_init_np(__CFTSDKeyICUConverter, __CFICUThreadDataDestructor);

    threadData = (__CFICUThreadData *)pthread_getspecific(__CFTSDKeyICUConverter);
    if (NULL != threadData) return threadData;

    // First request on this thread: allocate, clear, and publish
    threadData = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
    memset(threadData, 0, sizeof(__CFICUThreadData));
    pthread_setspecific(__CFTSDKeyICUConverter, (const void *)threadData);

    return threadData;
}
#elif DEPLOYMENT_TARGET_WINDOWS
// Externally visible entry point that forwards to the file-local thread data destructor.
__private_extern__ void __CFStringEncodingICUThreadDataCleaner(void *context) {
    __CFICUThreadDataDestructor(context);
}

/*
 * Returns the calling thread's ICU converter bookkeeping, which is stored in the
 * _icuThreadData field of the CF thread-specific data. A zero-filled instance is
 * created on first use.
 */
CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
    __CFThreadSpecificData *tsd = __CFGetThreadSpecificData_inline();
    __CFICUThreadData *icuData = (__CFICUThreadData *)tsd->_icuThreadData;

    if (NULL != icuData) return icuData;

    // First request on this thread: allocate, clear, and publish
    icuData = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
    memset(icuData, 0, sizeof(__CFICUThreadData));
    tsd->_icuThreadData = icuData;

    return icuData;
}
#else
#error Need implementation for thread data
#endif

/*
 * Maps a CFStringEncoding to a converter name known to ICU.
 *
 * Lookup order:
 *   1. kCFStringEncodingUTF7_IMAP maps to the fixed name "IMAP-mailbox-name".
 *   2. For non-Unicode encodings, "windows-<codepage>" is tried as an ICU alias.
 *   3. The canonical name from __CFStringEncodingGetCanonicalName() is tried as an ICU alias.
 *
 * Returns the string handed back by ucnv_getAlias() (or the literal above), or NULL
 * when no name could be resolved.
 */
__private_extern__ const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
#define STACK_BUFFER_SIZE (60)
    char buffer[STACK_BUFFER_SIZE];
    const char *result = NULL;
    UErrorCode errorCode = U_ZERO_ERROR;
    uint32_t codepage = 0;

    if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";

    if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows

    if (0 != codepage) {
        // codepage is unsigned, so format it as such; reject both errors (negative) and truncation
        int length = snprintf(buffer, STACK_BUFFER_SIZE, "windows-%u", (unsigned int)codepage);

        if ((length > 0) && (length < STACK_BUFFER_SIZE)) {
            result = ucnv_getAlias(buffer, 0, &errorCode);
            if (NULL != result) return result;
        }

        // ICU functions return immediately when handed a failure code, so start the next lookup clean
        errorCode = U_ZERO_ERROR;
    }

    if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);

    return result;
#undef STACK_BUFFER_SIZE
}

/*
 * Maps an ICU converter name to a CFStringEncoding.
 *
 * A name of the form "windows-<n>" (case-insensitive prefix, non-zero n) is resolved
 * through the Windows code page table. Otherwise, if ICU knows the name, its WINDOWS,
 * JAVA and MIME standard names are tried in that order.
 *
 * Returns kCFStringEncodingInvalidId when nothing matches.
 */
__private_extern__ CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
    static const char windowsPrefix[] = "windows-";
    static const char * const fallbackStandards[] = { "JAVA", "MIME" };
    const size_t prefixLength = sizeof(windowsPrefix) - 1;
    UErrorCode errorCode = U_ZERO_ERROR;
    const char *standardName;
    CFStringEncoding encoding;
    uint32_t codepage;
    size_t standardIndex;

    if (0 == strncasecmp_l(icuName, windowsPrefix, prefixLength, NULL)) {
        codepage = strtol(icuName + prefixLength, NULL, 10);
        if (0 != codepage) return __CFStringEncodingGetFromWindowsCodePage(codepage);
    }

    if (0 == ucnv_countAliases(icuName, &errorCode)) return kCFStringEncodingInvalidId;

    // Try WINDOWS platform
    standardName = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);

    if (NULL != standardName) {
        if (0 == strncasecmp_l(standardName, windowsPrefix, prefixLength, NULL)) {
            codepage = strtol(standardName + prefixLength, NULL, 10);
            if (0 != codepage) return __CFStringEncodingGetFromWindowsCodePage(codepage);
        }

        // The canonical-name lookup is attempted only when the standard name differs from icuName
        // (presumably so the same name is not resolved again; confirm against __CFStringEncodingGetFromCanonicalName)
        if (0 != strncasecmp_l(icuName, standardName, strlen(standardName), NULL)) {
            encoding = __CFStringEncodingGetFromCanonicalName(standardName);
            if (kCFStringEncodingInvalidId != encoding) return encoding;
        }
    }

    // Try JAVA platform, then MIME platform
    for (standardIndex = 0; standardIndex < (sizeof(fallbackStandards) / sizeof(fallbackStandards[0])); standardIndex++) {
        standardName = ucnv_getStandardName(icuName, fallbackStandards[standardIndex], &errorCode);

        if (NULL == standardName) continue;
        if (0 == strncasecmp_l(icuName, standardName, strlen(standardName), NULL)) continue;

        encoding = __CFStringEncodingGetFromCanonicalName(standardName);
        if (kCFStringEncodingInvalidId != encoding) return encoding;
    }

    return kCFStringEncodingInvalidId;
}

/*
 * Obtains the ICU converter to use for one conversion call.
 *
 * icuName:   converter name passed to ucnv_open().
 * flags:     conversion flags; may carry a stream ID, a lossy byte, and kCFStringEncodingAllowLossyConversion.
 * toUnicode: true when converting bytes to Unicode, false for the other direction.
 *
 * If flags carry a stream ID whose slot in the thread data holds a converter, that
 * converter is returned as is. Otherwise a new converter is opened and configured:
 * with no lossy byte in effect it stops on unconvertible input; with one, the byte
 * is installed as the substitution character.
 *
 * Returns NULL when ucnv_open() fails.
 */
CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
    UErrorCode errorCode = U_ZERO_ERROR;
    uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
    UConverter *converter;
    char lossyByte;

    if (0 != streamID) { // this is a part of streaming previously created
        __CFICUThreadData *threadData = __CFStringEncodingICUGetThreadData();
        uint8_t slot = streamID - 1; // map to array index

        if ((slot < threadData->_numSlots) && (NULL != threadData->_converters[slot])) return threadData->_converters[slot];
    }

    converter = ucnv_open(icuName, &errorCode);
    if (NULL == converter) return NULL;

    lossyByte = CFStringEncodingMaskToLossyByte(flags);

    // Lossy conversion requested without an explicit byte: fall back to '?'
    if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?';

    if (0 != lossyByte) {
        ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode);
    } else if (toUnicode) {
        ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
    } else {
        ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
    }

    return converter;
}

// Number of slots allocated initially, and added each time the per-thread slot array is grown
#define ICU_CONVERTER_SLOT_INCREMENT (10)
// Upper bound on the slot count; the count and the slot indices are kept in uint8_t fields
#define ICU_CONVERTER_MAX_SLOT (255)

/*
 * Finishes one conversion call: either parks the converter in the calling thread's
 * slot array so a later call can continue the stream, or closes it.
 *
 * converter: the converter used for the call.
 * flags:     the flags of the call (stream ID, kCFStringEncodingPartialInput, kCFStringEncodingPartialOutput).
 * status:    the conversion status computed by the caller.
 *
 * The converter is kept when the input was not invalid and either more input is
 * expected (kCFStringEncodingPartialInput) or the output buffer ran out while
 * kCFStringEncodingPartialOutput was requested.
 *
 * Returns the stream ID mask to merge into the caller's status when the converter
 * was kept, or 0 when it was closed.
 */
static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
    uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);

    if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
        // The stream continues. A non-zero stream ID is handed back unchanged; a new stream needs a slot first.
        if (0 == streamID) {
            __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();

            if (NULL == data->_converters) {
                // First stream on this thread: create the initial, zero-filled slot array
                data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
                memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
                data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
                data->_nextSlot = 0;
            } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one
                CFIndex index;

                // Linear scan for the first free slot
                for (index = 0;index < data->_numSlots;index++) {
                    if (NULL == data->_converters[index]) {
                        data->_nextSlot = index;
                        break;
                    }
                }

                if (index >= data->_numSlots) { // we're full
                    UConverter **newConverters;
                    CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;

                    if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
                        // No slot can be assigned: close the converter and return 0, i.e. no stream ID
                        CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
                        ucnv_close(converter);
                        return 0;
                    }

                    // Grow: new zero-filled array, existing entries copied over, first added slot becomes the target
                    newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
                    memset(newConverters, 0, sizeof(UConverter *) * newSize);
                    memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
                    CFAllocatorDeallocate(NULL, data->_converters);
                    data->_converters = newConverters;
                    data->_nextSlot = data->_numSlots;
                    data->_numSlots = newSize;
                }
            }

            // Park the converter; stream IDs are 1-based so that 0 can mean "no stream"
            data->_converters[data->_nextSlot] = converter;
            streamID = data->_nextSlot + 1;

            // now find next slot
            ++data->_nextSlot;

            if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
                data->_nextSlot = 0;

                // Stops at the first free slot, or at _numSlots when every slot is taken
                while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
            }
        }

        return CFStringEncodingStreamIDToMask(streamID);
    }

    // The stream (if any) ends here: vacate its slot before closing the converter
    if (0 != streamID) {
        __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();

        --streamID; // map to array index

        if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
            data->_converters[streamID] = NULL;
            // Prefer the lowest free index for the next stream
            if (data->_nextSlot > streamID) data->_nextSlot = streamID;
        }
    }

    ucnv_close(converter);

    return 0;
}

// Element count of the on-stack scratch buffers used when a conversion is run only to measure its output length
#define MAX_BUFFER_SIZE (1000)

// Emit a build warning when compiled against an ICU newer than 4.0, so the workarounds below get re-checked
#if (U_ICU_VERSION_MAJOR_NUM > 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM > 0))
#warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
#endif
// Enables rewinding the reported source position by the length of the invalid input after a failed conversion
#define HAS_ICU_BUG_6024743 (1)
// Enables dropping a trailing '\0' from the length reported by ucnv_getInvalidChars()
#define HAS_ICU_BUG_6025527 (1)

/*
 * Converts UTF-16 characters to bytes with the named ICU converter.
 *
 * icuName:      ICU converter name.
 * flags:        conversion flags. Input is flushed unless kCFStringEncodingPartialInput is set;
 *               a stream ID in the flags selects a converter kept from an earlier call.
 * characters:   input UTF-16 units.
 * numChars:     number of input units.
 * usedCharLen:  optional; receives the number of input units consumed. On invalid input the
 *               count stops before the offending units.
 * bytes:        output buffer; unused when maxByteLen is 0.
 * maxByteLen:   capacity of bytes. 0 selects length-only mode: the input is converted through a
 *               scratch buffer and only the total output length is reported.
 * usedByteLen:  optional; receives the number of bytes produced (or required, in length-only mode).
 *
 * Returns kCFStringEncodingConverterUnavailable when no converter could be created. Otherwise
 * returns kCFStringEncodingConversionSuccess, kCFStringEncodingInsufficientOutputBufferLength
 * or kCFStringEncodingInvalidInputStream, OR'ed with the stream ID mask when the converter is
 * kept for a follow-up call.
 */
__private_extern__ CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
    UConverter *converter;
    UErrorCode errorCode = U_ZERO_ERROR;
    const UTF16Char *source = characters;
    const UTF16Char *sourceLimit = source + numChars;
    char *destination = (char *)bytes;
    const char *destinationLimit = destination + maxByteLen;
    bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
    CFIndex status;

    if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;

    if (0 == maxByteLen) {
        char buffer[MAX_BUFFER_SIZE];
        CFIndex totalLength = 0;
        bool pendingOutput = false;

        // Keep going while input remains OR the previous round overflowed the scratch buffer.
        // On overflow ICU may already have consumed all input while still holding output
        // internally; stopping on (source == sourceLimit) alone would leave it uncounted.
        while (((source < sourceLimit) || pendingOutput) && (U_ZERO_ERROR == errorCode)) {
            destination = buffer;
            destinationLimit = destination + MAX_BUFFER_SIZE;

            ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);

            totalLength += (destination - buffer);

            pendingOutput = (U_BUFFER_OVERFLOW_ERROR == errorCode);
            if (pendingOutput) errorCode = U_ZERO_ERROR; // scratch buffer filled up; not an error in this mode
        }

        if (NULL != usedByteLen) *usedByteLen = totalLength;
    } else {
        ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);

        if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
    }

    status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));

    if (NULL != usedCharLen) {
#if HAS_ICU_BUG_6024743
        /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatibility issues */
        if (kCFStringEncodingInvalidInputStream == status) {
#define MAX_ERROR_BUFFER_LEN (32)
            UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
            int8_t errorLength = MAX_ERROR_BUFFER_LEN;
#undef MAX_ERROR_BUFFER_LEN

            errorCode = U_ZERO_ERROR;

            ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);

            if (U_ZERO_ERROR == errorCode) {
                // Step back over the invalid units so the reported length ends before them
                source -= errorLength;
            } else {
                // Gah, something is terribly wrong. Reset everything
                source = characters; // 0 length
                if (NULL != usedByteLen) *usedByteLen = 0;
            }
        }
#endif
        *usedCharLen = source - characters;
    }

    // Merge in the stream ID mask (0 when the converter was closed)
    status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);

    return status;
}

/*
 * Converts bytes to UTF-16 characters with the named ICU converter.
 *
 * icuName:      ICU converter name.
 * flags:        conversion flags. Input is flushed unless kCFStringEncodingPartialInput is set;
 *               a stream ID in the flags selects a converter kept from an earlier call.
 * bytes:        input bytes.
 * numBytes:     number of input bytes.
 * usedByteLen:  optional; receives the number of input bytes consumed. On invalid input the
 *               count stops before the offending bytes.
 * characters:   output buffer; unused when maxCharLen is 0.
 * maxCharLen:   capacity of characters in UTF-16 units. 0 selects length-only mode: the input is
 *               converted through a scratch buffer and only the total output length is reported.
 * usedCharLen:  optional; receives the number of UTF-16 units produced (or required, in length-only mode).
 *
 * Returns kCFStringEncodingConverterUnavailable when no converter could be created. Otherwise
 * returns kCFStringEncodingConversionSuccess, kCFStringEncodingInsufficientOutputBufferLength
 * or kCFStringEncodingInvalidInputStream, OR'ed with the stream ID mask when the converter is
 * kept for a follow-up call.
 */
__private_extern__ CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
    UConverter *converter;
    UErrorCode errorCode = U_ZERO_ERROR;
    const char *source = (const char *)bytes;
    const char *sourceLimit = source + numBytes;
    UTF16Char *destination = characters;
    const UTF16Char *destinationLimit = destination + maxCharLen;
    bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
    CFIndex status;

    if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;

    if (0 == maxCharLen) {
        UTF16Char buffer[MAX_BUFFER_SIZE];
        CFIndex totalLength = 0;
        bool pendingOutput = false;

        // Keep going while input remains OR the previous round overflowed the scratch buffer.
        // On overflow ICU may already have consumed all input while still holding output
        // internally; stopping on (source == sourceLimit) alone would leave it uncounted.
        while (((source < sourceLimit) || pendingOutput) && (U_ZERO_ERROR == errorCode)) {
            destination = buffer;
            destinationLimit = destination + MAX_BUFFER_SIZE;

            ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);

            totalLength += (destination - buffer);

            pendingOutput = (U_BUFFER_OVERFLOW_ERROR == errorCode);
            if (pendingOutput) errorCode = U_ZERO_ERROR; // scratch buffer filled up; not an error in this mode
        }

        if (NULL != usedCharLen) *usedCharLen = totalLength;
    } else {
        ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);

        if (NULL != usedCharLen) *usedCharLen = destination - characters;
    }

    status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));

    if (NULL != usedByteLen) {
#if HAS_ICU_BUG_6024743
        /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatibility issues */
        if (kCFStringEncodingInvalidInputStream == status) {
#define MAX_ERROR_BUFFER_LEN (32)
            char errorBuffer[MAX_ERROR_BUFFER_LEN];
            int8_t errorLength = MAX_ERROR_BUFFER_LEN;
#undef MAX_ERROR_BUFFER_LEN

            errorCode = U_ZERO_ERROR;

            ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);

            if (U_ZERO_ERROR == errorCode) {
#if HAS_ICU_BUG_6025527
                // Another ICU oddness here. ucnv_getInvalidChars() writes the '\0' terminator, and errorLength includes the extra byte.
                if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
#endif
                // Step back over the invalid bytes so the reported length ends before them
                source -= errorLength;
            } else {
                // Gah, something is terribly wrong. Reset everything
                source = (const char *)bytes; // 0 length
                if (NULL != usedCharLen) *usedCharLen = 0;
            }
        }
#endif

        *usedByteLen = source - (const char *)bytes;
    }

    // Merge in the stream ID mask (0 when the converter was closed)
    status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);

    return status;
}

/*
 * Returns the number of UTF-16 units that converting the given bytes would produce,
 * measured with a length-only run of __CFStringEncodingICUToUnicode().
 * Returns 0 whenever that run reports anything other than kCFStringEncodingConversionSuccess.
 */
__private_extern__ CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
    CFIndex usedCharLen;
    CFIndex status = __CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen);

    if (kCFStringEncodingConversionSuccess != status) return 0;

    return usedCharLen;
}

/*
 * Returns the number of bytes that converting the given UTF-16 units would produce,
 * measured with a length-only run of __CFStringEncodingICUToBytes().
 * Returns 0 whenever that run reports anything other than kCFStringEncodingConversionSuccess.
 */
__private_extern__ CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
    CFIndex usedByteLen;
    CFIndex status = __CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen);

    if (kCFStringEncodingConversionSuccess != status) return 0;

    return usedByteLen;
}

/*
 * Builds the list of CFStringEncodings that correspond to the converters ICU reports as available.
 *
 * allocator:     allocator used for the returned array; the caller releases the array with the same allocator.
 * numberOfIndex: optional; receives the number of entries in the returned array (0 when NULL is returned).
 *
 * Returns a newly allocated array, or NULL when ICU has no converters, none of them maps to a
 * CFStringEncoding, or the allocation fails.
 */
__private_extern__ CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
    CFIndex count = ucnv_countAvailable();
    CFIndex numEncodings = 0;
    CFStringEncoding *encodings = NULL;
    CFIndex index;

    // Allocate with the caller's allocator so it matches every deallocation of this array
    if (count > 0) encodings = (CFStringEncoding *)CFAllocatorAllocate(allocator, sizeof(CFStringEncoding) * count, 0);

    if (NULL != encodings) {
        for (index = 0;index < count;index++) {
            const char *icuName = ucnv_getAvailableName((int32_t)index);
            CFStringEncoding encoding;

            if (NULL == icuName) continue;

            encoding = __CFStringEncodingGetFromICUName(icuName);

            if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
        }

        if (0 == numEncodings) {
            CFAllocatorDeallocate(allocator, encodings);
            encodings = NULL;
        }
    }

    // Always report the count, including on the paths that return NULL
    if (NULL != numberOfIndex) *numberOfIndex = numEncodings;

    return encodings;
}