/* CFStringUtilities.c */
#include "CFInternal.h"
#include "CFStringEncodingConverterExt.h"
#include "CFUniChar.h"
#include <CoreFoundation/CFStringEncodingExt.h>
#include <CoreFoundation/CFPreferences.h>
#include <limits.h>
#if (DEPLOYMENT_TARGET_MACOSX) || DEPLOYMENT_TARGET_LINUX
#include <stdlib.h>
#elif defined(__WIN32__)
#include <stdlib.h>
#include <tchar.h>
#endif
/*
 * Reports whether theEncoding is usable.
 *
 * A fixed set of encodings is answered with true directly; every other value
 * is forwarded to CFStringEncodingIsValidEncoding().
 */
Boolean CFStringIsEncodingAvailable(CFStringEncoding theEncoding) {
    Boolean alwaysAvailable;

    switch (theEncoding) {
        case kCFStringEncodingASCII:
        case kCFStringEncodingMacRoman:
        case kCFStringEncodingUTF8:
        case kCFStringEncodingNonLossyASCII:
        case kCFStringEncodingWindowsLatin1:
        case kCFStringEncodingNextStepLatin:
        case kCFStringEncodingUTF16:
        case kCFStringEncodingUTF16BE:
        case kCFStringEncodingUTF16LE:
        case kCFStringEncodingUTF32:
        case kCFStringEncodingUTF32BE:
        case kCFStringEncodingUTF32LE:
            alwaysAvailable = true;
            break;

        default:
            alwaysAvailable = false;
            break;
    }

    if (alwaysAvailable) return true;

    /* Not in the fixed list: let the converter layer decide. */
    return CFStringEncodingIsValidEncoding(theEncoding);
}
/*
 * Returns the list produced by CFStringEncodingListOfAvailableEncodings(),
 * typed as an array of CFStringEncoding. The caller in this file walks the
 * list until it reaches kCFStringEncodingInvalidId.
 */
const CFStringEncoding* CFStringGetListOfAvailableEncodings() {
    const CFStringEncoding *availableEncodings =
        (const CFStringEncoding *)CFStringEncodingListOfAvailableEncodings();

    return availableEncodings;
}
/*
 * Returns a descriptive name for theEncoding, or NULL when none is available.
 *
 * Resolved names are cached in a function-static dictionary keyed by the
 * encoding value, so each encoding is resolved at most once. The cache is
 * created lazily and is not guarded by any lock in this function.
 */
CFStringRef CFStringGetNameOfEncoding(CFStringEncoding theEncoding) {
    static CFMutableDictionaryRef mappingTable = NULL;
    const void *cacheKey = (const void*)(uintptr_t)theEncoding;
    CFStringRef theName = NULL;

    /* Fast path: a previous call already resolved this encoding. */
    if (mappingTable) {
        theName = (CFStringRef)CFDictionaryGetValue(mappingTable, cacheKey);
        if (theName) return theName;
    }

    switch (theEncoding) {
        case kCFStringEncodingUTF8:
            theName = CFSTR("Unicode (UTF-8)");
            break;
        case kCFStringEncodingUTF16:
            theName = CFSTR("Unicode (UTF-16)");
            break;
        case kCFStringEncodingUTF16BE:
            theName = CFSTR("Unicode (UTF-16BE)");
            break;
        case kCFStringEncodingUTF16LE:
            theName = CFSTR("Unicode (UTF-16LE)");
            break;
        case kCFStringEncodingUTF32:
            theName = CFSTR("Unicode (UTF-32)");
            break;
        case kCFStringEncodingUTF32BE:
            theName = CFSTR("Unicode (UTF-32BE)");
            break;
        case kCFStringEncodingUTF32LE:
            theName = CFSTR("Unicode (UTF-32LE)");
            break;
        case kCFStringEncodingNonLossyASCII:
            theName = CFSTR("Non-lossy ASCII");
            break;
        default: {
            /* Everything else: ask the converter layer for a C-string name. */
            const char *cName = CFStringEncodingName(theEncoding);
            if (cName) {
                theName = CFStringCreateWithCString(kCFAllocatorSystemDefault, cName, kCFStringEncodingASCII);
            }
            break;
        }
    }

    if (!theName) return NULL;

    if (!mappingTable) {
        /* NULL key callbacks: keys are raw encoding values, not CF objects. */
        mappingTable = CFDictionaryCreateMutable(kCFAllocatorSystemDefault, 0, (const CFDictionaryKeyCallBacks *)NULL, &kCFTypeDictionaryValueCallBacks);
    }
    CFDictionaryAddValue(mappingTable, cacheKey, (const void*)theName);

    /* Drop the local reference; the cache entry is what the caller's pointer relies on. */
    CFRelease(theName);
    return theName;
}
/*
 * Maps an IANA charset name to a CFStringEncoding.
 *
 * "utf-8" and "iso-8859-1" are matched directly (case-insensitively). Any
 * other name is lowercased and looked up in a lazily built, function-static
 * table. Returns kCFStringEncodingInvalidId when the name is not in the table.
 */
CFStringEncoding CFStringConvertIANACharSetNameToEncoding(CFStringRef charsetName) {
    static CFMutableDictionaryRef mappingTable = NULL;
    CFStringEncoding result = kCFStringEncodingInvalidId;
    CFMutableStringRef lowerCharsetName;

    /* Two names are answered without touching the table. */
    if (kCFCompareEqualTo == CFStringCompare(charsetName, CFSTR("utf-8"), kCFCompareCaseInsensitive)) {
        return kCFStringEncodingUTF8;
    }
    if (kCFCompareEqualTo == CFStringCompare(charsetName, CFSTR("iso-8859-1"), kCFCompareCaseInsensitive)) {
        return kCFStringEncodingISOLatin1;
    }

    if (NULL == mappingTable) {
        /* Keys are CFStrings (retained by the table); values are raw encoding numbers. */
        CFMutableDictionaryRef table = CFDictionaryCreateMutable(kCFAllocatorSystemDefault, 0, &kCFTypeDictionaryKeyCallBacks, (const CFDictionaryValueCallBacks *)NULL);
        const CFStringEncoding *cursor;

        for (cursor = CFStringGetListOfAvailableEncodings(); *cursor != kCFStringEncodingInvalidId; cursor++) {
            const char **aliases = CFStringEncodingCanonicalCharsetNames(*cursor);

            if (!aliases) continue;

            for (; *aliases; aliases++) {
                CFStringRef name = CFStringCreateWithCString(kCFAllocatorSystemDefault, *aliases, kCFStringEncodingASCII);

                if (!name) continue;
                CFDictionaryAddValue(table, (const void*)name, (const void*)(uintptr_t)*cursor);
                CFRelease(name);
            }
        }

        /* Unicode names registered here rather than taken from the converter list. */
        CFDictionaryAddValue(table, (const void*)CFSTR("unicode-1-1"), (const void*)(uintptr_t)kCFStringEncodingUTF16);
        CFDictionaryAddValue(table, (const void*)CFSTR("iso-10646-ucs-2"), (const void*)(uintptr_t)kCFStringEncodingUTF16);
        CFDictionaryAddValue(table, (const void*)CFSTR("utf-16"), (const void*)(uintptr_t)kCFStringEncodingUTF16);
        CFDictionaryAddValue(table, (const void*)CFSTR("utf-16be"), (const void*)(uintptr_t)kCFStringEncodingUTF16BE);
        CFDictionaryAddValue(table, (const void*)CFSTR("utf-16le"), (const void*)(uintptr_t)kCFStringEncodingUTF16LE);
        CFDictionaryAddValue(table, (const void*)CFSTR("utf-32"), (const void*)(uintptr_t)kCFStringEncodingUTF32);
        CFDictionaryAddValue(table, (const void*)CFSTR("utf-32be"), (const void*)(uintptr_t)kCFStringEncodingUTF32BE);
        CFDictionaryAddValue(table, (const void*)CFSTR("utf-32le"), (const void*)(uintptr_t)kCFStringEncodingUTF32LE);

        mappingTable = table;
    }

    /* Lookup key is the lowercased name; NOTE(review): this presumes the converter's canonical names are lowercase. */
    lowerCharsetName = CFStringCreateMutableCopy(kCFAllocatorSystemDefault, 0, charsetName);
    CFStringLowercase(lowerCharsetName, NULL);

    /* ContainsKey is checked first because a stored value of 0 is indistinguishable from "missing" in GetValue. */
    if (CFDictionaryContainsKey(mappingTable, (const void*)lowerCharsetName)) {
        result = (CFStringEncoding)(uintptr_t)CFDictionaryGetValue(mappingTable, (const void*)lowerCharsetName);
    }

    CFRelease(lowerCharsetName);
    return result;
}
/*
 * Returns an upper-case charset name for encoding, or NULL when none is known.
 *
 * UTF-16/UTF-32 variants use fixed names; every other encoding takes the first
 * entry from CFStringEncodingCanonicalCharsetNames() and upper-cases it.
 * Results are cached in a lazily created, function-static dictionary keyed by
 * the encoding value.
 */
CFStringRef CFStringConvertEncodingToIANACharSetName(CFStringEncoding encoding) {
    static CFMutableDictionaryRef mappingTable = NULL;
    const void *cacheKey = (const void*)(uintptr_t)encoding;
    CFStringRef theName = NULL;

    /* Fast path: already resolved by an earlier call. */
    if (mappingTable) {
        theName = (CFStringRef)CFDictionaryGetValue(mappingTable, cacheKey);
        if (theName) return theName;
    }

    switch (encoding) {
        case kCFStringEncodingUTF16:
            theName = CFSTR("UTF-16");
            break;
        case kCFStringEncodingUTF16BE:
            theName = CFSTR("UTF-16BE");
            break;
        case kCFStringEncodingUTF16LE:
            theName = CFSTR("UTF-16LE");
            break;
        case kCFStringEncodingUTF32:
            theName = CFSTR("UTF-32");
            break;
        case kCFStringEncodingUTF32BE:
            theName = CFSTR("UTF-32BE");
            break;
        case kCFStringEncodingUTF32LE:
            theName = CFSTR("UTF-32LE");
            break;
        default: {
            const char **canonicalNames = CFStringEncodingCanonicalCharsetNames(encoding);

            if (canonicalNames && *canonicalNames) {
                CFStringRef firstName = CFStringCreateWithCString(kCFAllocatorSystemDefault, *canonicalNames, kCFStringEncodingASCII);

                if (firstName) {
                    /* Swap the created string for an upper-cased mutable copy. */
                    CFMutableStringRef upperCaseName = CFStringCreateMutableCopy(kCFAllocatorSystemDefault, 0, firstName);
                    CFStringUppercase(upperCaseName, 0);
                    CFRelease(firstName);
                    theName = upperCaseName;
                }
            }
            break;
        }
    }

    if (!theName) return NULL;

    if (!mappingTable) {
        /* NULL key callbacks: keys are raw encoding values, not CF objects. */
        mappingTable = CFDictionaryCreateMutable(kCFAllocatorSystemDefault, 0, (const CFDictionaryKeyCallBacks *)NULL, &kCFTypeDictionaryValueCallBacks);
    }
    CFDictionaryAddValue(mappingTable, cacheKey, (const void*)theName);

    /* Drop the local reference; the cache entry is what the caller's pointer relies on. */
    CFRelease(theName);
    return theName;
}
/*
 * File-local copies of the NSStringEncoding numbering used by the two
 * CF <-> NSStringEncoding conversion functions in this file.
 * NOTE(review): presumably these mirror Foundation's NSString.h values; confirm
 * before changing any of them.
 */
enum {
    NSASCIIStringEncoding = 1,
    NSNEXTSTEPStringEncoding = 2,
    NSJapaneseEUCStringEncoding = 3,
    NSUTF8StringEncoding = 4,
    NSISOLatin1StringEncoding = 5,
    NSSymbolStringEncoding = 6,
    NSNonLossyASCIIStringEncoding = 7,
    NSShiftJISStringEncoding = 8,
    NSISOLatin2StringEncoding = 9,
    NSUnicodeStringEncoding = 10,
    NSWindowsCP1251StringEncoding = 11,
    NSWindowsCP1252StringEncoding = 12,
    NSWindowsCP1253StringEncoding = 13,
    NSWindowsCP1254StringEncoding = 14,
    NSWindowsCP1250StringEncoding = 15,
    NSISO2022JPStringEncoding = 21,
    NSMacOSRomanStringEncoding = 30,
    NSProprietaryStringEncoding = 65536
};

/*
 * Bit 31 tags an NSStringEncoding value that carries a raw CFStringEncoding
 * in its low bits (see the default cases of the two conversion functions).
 *
 * The constant is unsigned on purpose: `1 << 31` shifts into the sign bit of
 * a signed int (undefined behaviour) and produces a negative value that
 * sign-extends to 0xFFFFFFFF80000000 when combined with a 64-bit
 * `unsigned long`, so the mask would cover bits 31..63 instead of bit 31 only.
 */
#define NSENCODING_MASK (1U << 31)
/*
 * Converts a CFStringEncoding to the NSStringEncoding numbering.
 *
 * The low 12 bits of theEncoding select the mapping. Encodings without a
 * dedicated NSStringEncoding constant are returned as the raw CF value with
 * NSENCODING_MASK set. Several mappings, including the UTF-8/UTF-16 ones,
 * are compiled only when DEPLOYMENT_TARGET_MACOSX is set.
 */
unsigned long CFStringConvertEncodingToNSStringEncoding(CFStringEncoding theEncoding) {
    const CFStringEncoding baseEncoding = theEncoding & 0xFFF;

    switch (baseEncoding) {
        case kCFStringEncodingASCII:
            return NSASCIIStringEncoding;
        case kCFStringEncodingNextStepLatin:
            return NSNEXTSTEPStringEncoding;
        case kCFStringEncodingISOLatin1:
            return NSISOLatin1StringEncoding;
        case kCFStringEncodingNonLossyASCII:
            return NSNonLossyASCIIStringEncoding;
        case kCFStringEncodingWindowsLatin1:
            return NSWindowsCP1252StringEncoding;
        case kCFStringEncodingMacRoman:
            return NSMacOSRomanStringEncoding;

#if DEPLOYMENT_TARGET_MACOSX
        case kCFStringEncodingEUC_JP:
            return NSJapaneseEUCStringEncoding;
        case kCFStringEncodingMacSymbol:
            return NSSymbolStringEncoding;
        case kCFStringEncodingDOSJapanese:
            return NSShiftJISStringEncoding;
        case kCFStringEncodingISOLatin2:
            return NSISOLatin2StringEncoding;
        case kCFStringEncodingWindowsCyrillic:
            return NSWindowsCP1251StringEncoding;
        case kCFStringEncodingWindowsGreek:
            return NSWindowsCP1253StringEncoding;
        case kCFStringEncodingWindowsLatin5:
            return NSWindowsCP1254StringEncoding;
        case kCFStringEncodingWindowsLatin2:
            return NSWindowsCP1250StringEncoding;
        case kCFStringEncodingISO_2022_JP:
            return NSISO2022JPStringEncoding;

        case kCFStringEncodingUnicode:
            /* Only exact UTF-16 and UTF-8 have constants; other Unicode forms get the tagged value below. */
            if (theEncoding == kCFStringEncodingUTF16) return NSUnicodeStringEncoding;
            if (theEncoding == kCFStringEncodingUTF8) return NSUTF8StringEncoding;
            break;
#endif

        default:
            break;
    }

    /* No dedicated constant: tag the raw CF encoding. */
    return NSENCODING_MASK | theEncoding;
}
/*
 * Converts an NSStringEncoding value to a CFStringEncoding.
 *
 * Known constants map through the switch (some only when
 * DEPLOYMENT_TARGET_MACOSX is set). Any other value is treated as a tagged raw
 * CF encoding when NSENCODING_MASK is set, and as kCFStringEncodingInvalidId
 * otherwise.
 */
CFStringEncoding CFStringConvertNSStringEncodingToEncoding(unsigned long theEncoding) {
    switch (theEncoding) {
        case NSASCIIStringEncoding:
            return kCFStringEncodingASCII;
        case NSNEXTSTEPStringEncoding:
            return kCFStringEncodingNextStepLatin;
        case NSUTF8StringEncoding:
            return kCFStringEncodingUTF8;
        case NSISOLatin1StringEncoding:
            return kCFStringEncodingISOLatin1;
        case NSNonLossyASCIIStringEncoding:
            return kCFStringEncodingNonLossyASCII;
        case NSUnicodeStringEncoding:
            return kCFStringEncodingUTF16;
        case NSWindowsCP1252StringEncoding:
            return kCFStringEncodingWindowsLatin1;
        case NSMacOSRomanStringEncoding:
            return kCFStringEncodingMacRoman;

#if DEPLOYMENT_TARGET_MACOSX
        case NSSymbolStringEncoding:
            return kCFStringEncodingMacSymbol;
        case NSJapaneseEUCStringEncoding:
            return kCFStringEncodingEUC_JP;
        case NSShiftJISStringEncoding:
            return kCFStringEncodingDOSJapanese;
        case NSISO2022JPStringEncoding:
            return kCFStringEncodingISO_2022_JP;
        case NSISOLatin2StringEncoding:
            return kCFStringEncodingISOLatin2;
        case NSWindowsCP1251StringEncoding:
            return kCFStringEncodingWindowsCyrillic;
        case NSWindowsCP1253StringEncoding:
            return kCFStringEncodingWindowsGreek;
        case NSWindowsCP1254StringEncoding:
            return kCFStringEncodingWindowsLatin5;
        case NSWindowsCP1250StringEncoding:
            return kCFStringEncodingWindowsLatin2;
#endif

        default:
            break;
    }

    /* Tagged value: strip the tag to recover the raw CF encoding. */
    if (theEncoding & NSENCODING_MASK) {
        return theEncoding & ~NSENCODING_MASK;
    }
    return kCFStringEncodingInvalidId;
}
/* Windows code page number that corresponds to Mac encoding 0 (code page = base + encoding). */
#define MACCODEPAGE_BASE (10000)
/* Windows code page number for ISO 8859 part 0; part N is base + N. */
#define ISO8859CODEPAGE_BASE (28590)

/*
 * DOS code pages, indexed by (encoding & 0x0FFF) - 0x400.
 * -1 (stored as 0xFFFF) marks a slot with no corresponding code page.
 *
 * Slot 31 previously held `01` (octal 1), a typo for the -1 sentinel used in
 * every other unused slot; it made that slot report code page 1.
 */
static const uint16_t _CFToDOSCodePageList[] = {
    /* 0x400 */ 437, -1, -1, -1, -1, 737, 775, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x410 */ 850, 851, 852, 855, 857, 860, 861, 862, 863, 864, 865, 866, 869, 874, -1, -1,
    /* 0x420 */ 932, 936, 949, 950,
};

/* Windows (ANSI) code pages, indexed by (encoding & 0x0FFF) - 0x500. */
static const uint16_t _CFToWindowsCodePageList[] = {
    1252, 1250, 1251, 1253, 1254, 1255, 1256, 1257, 1258,
};

/* EUC code pages; four entries only, so this must not be indexed by a raw encoding offset. */
static const uint16_t _CFEUCToCodePage[] = {
    51932, 51936, 51950, 51949,
};
/*
 * Converts a CFStringEncoding to a Windows code page number.
 *
 * Bits 8..11 of theEncoding select the encoding family; the low 12 bits
 * (encodingBase) select the member. Only the Unicode family is handled when
 * DEPLOYMENT_TARGET_MACOSX is not set.
 *
 * Returns kCFStringEncodingInvalidId when no code page corresponds to
 * theEncoding.
 *
 * Fixes relative to the previous implementation:
 *  - EUC encodings indexed the 4-entry _CFEUCToCodePage table with
 *    (encodingBase - 0x0900), reading far past its end; each EUC encoding now
 *    selects its table slot explicitly.
 *  - Windows encodings indexed their table with theEncoding instead of
 *    encodingBase, reading out of bounds when bits above 0x0FFF were set.
 *  - Unused DOS slots (stored as 0xFFFF) were returned as code page 65535
 *    instead of kCFStringEncodingInvalidId.
 */
UInt32 CFStringConvertEncodingToWindowsCodepage(CFStringEncoding theEncoding) {
#if DEPLOYMENT_TARGET_MACOSX
    CFStringEncoding encodingBase = theEncoding & 0x0FFF;
#endif

    switch (theEncoding & 0x0F00) {
#if DEPLOYMENT_TARGET_MACOSX
        case 0: /* Mac OS encodings */
            if (encodingBase <= kCFStringEncodingMacCentralEurRoman) {
                return MACCODEPAGE_BASE + encodingBase;
            } else if (encodingBase == kCFStringEncodingMacTurkish) {
                return 10081;
            } else if (encodingBase == kCFStringEncodingMacCroatian) {
                return 10082;
            } else if (encodingBase == kCFStringEncodingMacIcelandic) {
                return 10079;
            }
            break;
#endif

        case 0x100: /* Unicode: compared against the full value, not just the low bits */
            switch (theEncoding) {
                case kCFStringEncodingUTF8: return 65001;
                case kCFStringEncodingUTF16: return 1200;
                case kCFStringEncodingUTF16BE: return 1201;
                case kCFStringEncodingUTF32: return 65005;
                case kCFStringEncodingUTF32BE: return 65006;
            }
            break;

#if (DEPLOYMENT_TARGET_MACOSX)
        case 0x0200: /* ISO 8859 */
            if (encodingBase <= kCFStringEncodingISOLatin10) return ISO8859CODEPAGE_BASE + (encodingBase - 0x200);
            break;

        case 0x0400: /* DOS */
            if (encodingBase <= kCFStringEncodingDOSChineseTrad) {
                UInt32 codePage = _CFToDOSCodePageList[encodingBase - 0x400];
                /* 0xFFFF is the table's "no code page" marker (-1 stored in a uint16_t). */
                if (codePage != 0xFFFF) return codePage;
            }
            break;

        case 0x0500: /* Windows (ANSI) */
            if (encodingBase <= kCFStringEncodingWindowsVietnamese) return _CFToWindowsCodePageList[encodingBase - 0x500];
            else if (encodingBase == kCFStringEncodingWindowsKoreanJohab) return 1361;
            break;

        case 0x600:
            if (encodingBase == kCFStringEncodingASCII) return 20127;
            else if (encodingBase == kCFStringEncodingGB_18030_2000) return 54936;
            break;

        case 0x0800: /* ISO 2022 */
            switch (encodingBase) {
                case kCFStringEncodingISO_2022_JP: return 50220;
                case kCFStringEncodingISO_2022_CN: return 50227;
                case kCFStringEncodingISO_2022_KR: return 50225;
            }
            break;

        case 0x0900: /* EUC: table order is JP, CN, TW, KR (matches the reverse table's 51932/51936/51950/51949) */
            switch (encodingBase) {
                case kCFStringEncodingEUC_JP: return _CFEUCToCodePage[0];
                case kCFStringEncodingEUC_CN: return _CFEUCToCodePage[1];
                case kCFStringEncodingEUC_TW: return _CFEUCToCodePage[2];
                case kCFStringEncodingEUC_KR: return _CFEUCToCodePage[3];
            }
            break;

        case 0x0A00:
            switch (encodingBase) {
                case kCFStringEncodingKOI8_R: return 20866;
                case kCFStringEncodingHZ_GB_2312: return 52936;
                case kCFStringEncodingKOI8_U: return 21866;
            }
            break;

        case 0x0C00: /* EBCDIC */
            if (encodingBase == kCFStringEncodingEBCDIC_CP037) return 37;
            break;
#endif
    }

    return kCFStringEncodingInvalidId;
}
#if DEPLOYMENT_TARGET_MACOSX
/*
 * Windows code page -> CFStringEncoding table used by bsearchEncoding().
 *
 * Entries MUST stay sorted in ascending order of `acp`: the lookup is a
 * binary search. Both fields are 16 bits wide, so only code pages and
 * encodings below 0x10000 can be listed here.
 */
static const struct {
uint16_t acp; /* Windows code page number */
uint16_t encoding; /* corresponding CFStringEncoding value */
} _CFACPToCFTable[] = {
{37, kCFStringEncodingEBCDIC_CP037},
{437, kCFStringEncodingDOSLatinUS},
{737, kCFStringEncodingDOSGreek},
{775, kCFStringEncodingDOSBalticRim},
{850, kCFStringEncodingDOSLatin1},
{851, kCFStringEncodingDOSGreek1},
{852, kCFStringEncodingDOSLatin2},
{855, kCFStringEncodingDOSCyrillic},
{857, kCFStringEncodingDOSTurkish},
{860, kCFStringEncodingDOSPortuguese},
{861, kCFStringEncodingDOSIcelandic},
{862, kCFStringEncodingDOSHebrew},
{863, kCFStringEncodingDOSCanadianFrench},
{864, kCFStringEncodingDOSArabic},
{865, kCFStringEncodingDOSNordic},
{866, kCFStringEncodingDOSRussian},
{869, kCFStringEncodingDOSGreek2},
{874, kCFStringEncodingDOSThai},
{932, kCFStringEncodingDOSJapanese},
{936, kCFStringEncodingDOSChineseSimplif},
{949, kCFStringEncodingDOSKorean},
{950, kCFStringEncodingDOSChineseTrad},
{1250, kCFStringEncodingWindowsLatin2},
{1251, kCFStringEncodingWindowsCyrillic},
{1252, kCFStringEncodingWindowsLatin1},
{1253, kCFStringEncodingWindowsGreek},
{1254, kCFStringEncodingWindowsLatin5},
{1255, kCFStringEncodingWindowsHebrew},
{1256, kCFStringEncodingWindowsArabic},
{1257, kCFStringEncodingWindowsBalticRim},
{1258, kCFStringEncodingWindowsVietnamese},
{1361, kCFStringEncodingWindowsKoreanJohab},
{20127, kCFStringEncodingASCII},
{20866, kCFStringEncodingKOI8_R},
{21866, kCFStringEncodingKOI8_U},
{50220, kCFStringEncodingISO_2022_JP},
{50225, kCFStringEncodingISO_2022_KR},
{50227, kCFStringEncodingISO_2022_CN},
{51932, kCFStringEncodingEUC_JP},
{51936, kCFStringEncodingEUC_CN},
{51949, kCFStringEncodingEUC_KR},
{51950, kCFStringEncodingEUC_TW},
{52936, kCFStringEncodingHZ_GB_2312},
{54936, kCFStringEncodingGB_18030_2000},
};
/*
 * Binary-searches _CFACPToCFTable (sorted ascending by code page) for target.
 *
 * Returns the matching entry's encoding, or kCFStringEncodingInvalidId when
 * target is not listed.
 *
 * Fixes relative to the previous implementation:
 *  - It peeked at the entry after the probe, which read past the end of the
 *    table whenever the probe was the last entry.
 *  - When target fell between two entries it returned the lower entry's
 *    encoding, so unlisted code pages resolved to an unrelated encoding
 *    instead of kCFStringEncodingInvalidId.
 *  - It walked the struct array through `unsigned int *` casts; the table is
 *    now indexed through its own element type.
 */
static SInt32 bsearchEncoding(uint16_t target) {
    unsigned int low = 0;
    unsigned int high = sizeof(_CFACPToCFTable) / sizeof(_CFACPToCFTable[0]); /* one past the last candidate */

    while (low < high) {
        unsigned int mid = low + ((high - low) / 2);
        uint16_t codePage = _CFACPToCFTable[mid].acp;

        if (codePage == target) return _CFACPToCFTable[mid].encoding;

        if (codePage < target) {
            low = mid + 1;
        } else {
            high = mid;
        }
    }

    return (kCFStringEncodingInvalidId);
}
#endif
/*
 * Converts a Windows code page number to a CFStringEncoding.
 *
 * Code pages 0 and 1 resolve to CFStringGetSystemEncoding(). Returns
 * kCFStringEncodingInvalidId when the code page has no known encoding.
 *
 * Fixes relative to the previous implementation:
 *  - A `#if` directive shared a line with code, which is not a valid
 *    preprocessor directive; it now sits on its own line.
 *  - 1200 and 1201 are accepted for UTF-16 and UTF-16BE. Those are the values
 *    CFStringConvertEncodingToWindowsCodepage() emits, so the two functions
 *    now round-trip.
 *  - The table lookup is skipped for values above 0xFFFF, which used to be
 *    silently truncated to 16 bits and could match an unrelated entry.
 */
CFStringEncoding CFStringConvertWindowsCodepageToEncoding(UInt32 theEncoding) {
    if (theEncoding == 0 || theEncoding == 1) {
        return CFStringGetSystemEncoding();
    } else if ((theEncoding >= MACCODEPAGE_BASE) && (theEncoding < 20000)) { /* Mac OS code pages */
        if (theEncoding <= 10029) return theEncoding - MACCODEPAGE_BASE;
#if (DEPLOYMENT_TARGET_MACOSX)
        else if (theEncoding == 10079) return kCFStringEncodingMacIcelandic;
        else if (theEncoding == 10081) return kCFStringEncodingMacTurkish;
        else if (theEncoding == 10082) return kCFStringEncodingMacCroatian;
#endif
    } else if ((theEncoding >= ISO8859CODEPAGE_BASE) && (theEncoding <= 28605)) { /* ISO 8859 */
        return (theEncoding - ISO8859CODEPAGE_BASE) + 0x200;
    } else if (theEncoding == 65001) {
        return kCFStringEncodingUTF8;
    } else if (theEncoding == 1200 || theEncoding == 12000) {
        /* NOTE(review): 12000 is kept only because earlier versions accepted it;
           Microsoft's code page list assigns 12000 to UTF-32, so confirm whether
           any caller still relies on this before removing it. */
        return kCFStringEncodingUTF16;
    } else if (theEncoding == 1201 || theEncoding == 12001) {
        /* NOTE(review): 12001 is legacy for the same reason as 12000 above. */
        return kCFStringEncodingUTF16BE;
    } else if (theEncoding == 65005) {
        return kCFStringEncodingUTF32;
    } else if (theEncoding == 65006) {
        return kCFStringEncodingUTF32BE;
    } else {
#if DEPLOYMENT_TARGET_MACOSX
        /* The lookup takes a 16-bit code page; larger values cannot be in the table. */
        if (theEncoding <= 0xFFFF) return bsearchEncoding((uint16_t)theEncoding);
#endif
    }

    return kCFStringEncodingInvalidId;
}
/*
 * Returns the result of CFStringEncodingGetScriptCodeForEncoding() for
 * encoding, typed as a CFStringEncoding.
 */
CFStringEncoding CFStringGetMostCompatibleMacStringEncoding(CFStringEncoding encoding) {
    return CFStringEncodingGetScriptCodeForEncoding(encoding);
}