/* CFUnicodeDecomposition.c */
#include <string.h>
#if KERNEL
#include <stdlib.h>
#include "CFUnicodeDecomposition.h"
#include "CFUniCharNonBaseData.h"
#include "CFUniCharDecompData.h"
#include "CFUniCharCombiningPriorityData.h"
#else KERNEL
#include <CoreFoundation/CFBase.h>
#include <CoreFoundation/CFCharacterSet.h>
#include "CFUniChar.h"
#include "CFUnicodeDecomposition.h"
#include "CFInternal.h"
#include "CFUniCharPriv.h"
#endif KERNEL
#if KERNEL
/*
 * Kernel build: the tables are referenced directly by name; they are not
 * defined in this file (presumably they come from the CFUniChar*Data.h
 * headers included above -- confirm).
 */
/* Number of (key, value) pairs in the canonical decomposition table: its byte
 * size divided by the size of two uint32_t values. */
static const uint32_t __CFUniCharDecompositionTableLength = (sizeof(__CFUniCharDecompositionTable) / (sizeof(uint32_t) * 2));
/* Kernel variant of the decomposability test; the isHFSPlus argument is discarded. */
#define __CFUniCharIsDecomposableCharacterWithFlag(character,isHFSPlus) (__CFUniCharIsDecomposableCharacter(character))
/* Non-static aliases of the built-in tables, plus their plane counts computed
 * as array element counts. */
uint8_t **CFUniCharCombiningPriorityTable = __CFUniCharCombiningPriorityTable;
uint8_t **CFUniCharCombiningPriorityExtraTable = __CFUniCharCombiningPriorityExtraTable;
uint8_t CFUniCharNumberOfPlanesForCombiningPriority = sizeof(__CFUniCharCombiningPriorityTable) / sizeof(*__CFUniCharCombiningPriorityTable);
uint8_t **CFUniCharNonBaseBitmap = __CFUniCharNonBaseBitmap;
uint8_t CFUniCharNumberOfPlanesForNonBaseBitmap = sizeof(__CFUniCharNonBaseBitmap) / sizeof(*__CFUniCharNonBaseBitmap);
#else KERNEL
/*
 * Non-kernel build: the tables start out empty and are filled in on demand.
 * The first six variables are set by __CFUniCharLoadDecompositionTable().
 */
/* Canonical decomposition table, searched as (key, value) pairs. */
static UTF32Char *__CFUniCharDecompositionTable = NULL;
/* Number of (key, value) pairs in __CFUniCharDecompositionTable. */
static uint32_t __CFUniCharDecompositionTableLength = 0;
/* Start of the data that follows the pair table; indexed for multi-character mappings. */
static UTF32Char *__CFUniCharMultipleDecompositionTable = NULL;
/* Plane-0 bitmaps cached from CFUniCharGetBitmapPtrForPlane(). */
static const uint8_t *__CFUniCharDecomposableBitmapForBMP = NULL;
static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP = NULL;
static const uint8_t *__CFUniCharNonBaseBitmapForBMP = NULL;
/* Held while the decomposition table and the combining-priority table are initialized. */
static CFSpinLock_t __CFUniCharDecompositionTableLock = 0;
/* Per-plane combining-property data and the number of planes it covers;
 * allocated and filled on first use by __CFUniCharPrioritySort(). */
static const uint8_t **__CFUniCharCombiningPriorityTable = NULL;
static uint8_t __CFUniCharCombiningPriorityTableNumPlane = 0;
/*
 * Loads the canonical decomposition mapping data once and caches pointers
 * into it in the file-scope variables.
 *
 * Layout as consumed here: a leading uint32_t holding the byte size of the
 * (key, value) pair table, the pair table itself, and then the data used for
 * multi-character mappings.
 *
 * The work is done under __CFUniCharDecompositionTableLock. If the mapping
 * data is unavailable the globals are left untouched, so a later call tries
 * again.
 */
static void __CFUniCharLoadDecompositionTable(void) {
    __CFSpinLock(&__CFUniCharDecompositionTableLock);

    if (NULL == __CFUniCharDecompositionTable) {
        /* Typed pointer so the header word can be read and skipped without
         * incrementing the result of a cast (not an lvalue in ISO C). */
        const uint32_t *bytes = (const uint32_t *)CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping);

        if (NULL != bytes) {
            uint32_t tableByteSize = *(bytes++);

            __CFUniCharDecompositionTable = (UTF32Char *)bytes;
            /* The multi-character data starts right after the pair table. */
            __CFUniCharMultipleDecompositionTable = (UTF32Char *)((uint8_t *)bytes + tableByteSize);
            /* Convert the byte size into a count of (key, value) pairs. */
            __CFUniCharDecompositionTableLength = tableByteSize / (sizeof(uint32_t) * 2);

            __CFUniCharDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, 0);
            __CFUniCharHFSPlusDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet, 0);
            __CFUniCharNonBaseBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
        }
    }

    __CFSpinUnlock(&__CFUniCharDecompositionTableLock);
}
/* Held while the compatibility decomposition table is initialized. */
static CFSpinLock_t __CFUniCharCompatibilityDecompositionTableLock = 0;
/* Compatibility decomposition table, searched as (key, value) pairs. */
static UTF32Char *__CFUniCharCompatibilityDecompositionTable = NULL;
/* Number of (key, value) pairs in the table above. */
static uint32_t __CFUniCharCompatibilityDecompositionTableLength = 0;
/* Start of the data that follows the pair table; indexed for multi-character mappings. */
static UTF32Char *__CFUniCharCompatibilityMultipleDecompositionTable = NULL;

/*
 * Loads the compatibility decomposition mapping data once and caches
 * pointers into it in the variables above.
 *
 * Layout as consumed here: a leading uint32_t holding the byte size of the
 * (key, value) pair table, the pair table itself, and then the data used for
 * multi-character mappings.
 *
 * The work is done under __CFUniCharCompatibilityDecompositionTableLock. If
 * the mapping data is unavailable the globals are left untouched, so a later
 * call tries again.
 */
static void __CFUniCharLoadCompatibilityDecompositionTable(void) {
    __CFSpinLock(&__CFUniCharCompatibilityDecompositionTableLock);

    if (NULL == __CFUniCharCompatibilityDecompositionTable) {
        /* Typed pointer so the header word can be read and skipped without
         * incrementing the result of a cast (not an lvalue in ISO C). */
        const uint32_t *bytes = (const uint32_t *)CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping);

        if (NULL != bytes) {
            uint32_t tableByteSize = *(bytes++);

            __CFUniCharCompatibilityDecompositionTable = (UTF32Char *)bytes;
            /* The multi-character data starts right after the pair table. */
            __CFUniCharCompatibilityMultipleDecompositionTable = (UTF32Char *)((uint8_t *)bytes + tableByteSize);
            /* Convert the byte size into a count of (key, value) pairs. */
            __CFUniCharCompatibilityDecompositionTableLength = tableByteSize / (sizeof(uint32_t) * 2);
        }
    }

    __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock);
}
/*
 * Tests whether `character` is in the decomposable character set.
 *
 * Below 0x10000 the cached plane-0 bitmap is used: the HFS+ one when
 * isHFSPlus is set, the canonical one otherwise. At 0x10000 and above the
 * canonical set's bitmap for the character's plane is fetched, and isHFSPlus
 * plays no part.
 */
CF_INLINE bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character, bool isHFSPlus) {
    const uint8_t *bitmap;

    if (character >= 0x10000) {
        bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, ((character >> 16) & 0xFF));
    } else if (isHFSPlus) {
        bitmap = __CFUniCharHFSPlusDecomposableBitmapForBMP;
    } else {
        bitmap = __CFUniCharDecomposableBitmapForBMP;
    }

    return CFUniCharIsMemberOfBitmap(character, bitmap);
}
/*
 * Tests whether `character` is in the non-base character set, using the
 * cached plane-0 bitmap below 0x10000 and the bitmap fetched for the
 * character's plane otherwise.
 */
CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) {
    const uint8_t *bitmap = __CFUniCharNonBaseBitmapForBMP;

    if (character >= 0x10000) {
        bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, ((character >> 16) & 0xFF));
    }

    return CFUniCharIsMemberOfBitmap(character, bitmap);
}
#endif KERNEL
/* One entry of a decomposition table: a character code and its packed mapping value. */
typedef struct {
    uint32_t _key;
    uint32_t _value;
} __CFUniCharDecomposeMappings;

/*
 * Binary-searches `theTable` for `character` and returns the matching _value,
 * or 0 when there is no entry.
 *
 * theTable  - entries ordered by ascending _key (required for the search to
 *             be correct); may be NULL.
 * numElem   - number of entries in theTable; may be 0.
 * character - code to look up; typed as uint32_t to match the _key field it
 *             is compared against.
 *
 * A NULL or empty table yields 0 instead of reading theTable[0] and
 * theTable[numElem - 1]; that is the state the table globals are in when the
 * mapping data could not be loaded.
 */
static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings *theTable, uint32_t numElem, uint32_t character) {
    const __CFUniCharDecomposeMappings *p, *q, *divider;

    if ((NULL == theTable) || (0 == numElem)) {
        return 0;
    }

    /* Quick rejection of anything outside the range of keys in the table. */
    if ((character < theTable[0]._key) || (character > theTable[numElem - 1]._key)) {
        return 0;
    }

    p = theTable;
    q = p + (numElem - 1);
    while (p <= q) {
        divider = p + ((q - p) >> 1); /* midpoint of the remaining range */
        if (character < divider->_key) {
            q = divider - 1;
        } else if (character > divider->_key) {
            p = divider + 1;
        } else {
            return divider->_value;
        }
    }

    return 0;
}
/*
 * __CFUniCharGetCombiningPropertyForCharacter(character): yields the
 * combining property value that __CFUniCharPrioritySort() orders by.
 */
#if KERNEL
/* Kernel build: forwards to __CFUniCharGetCombiningPriority, which is not defined in this file. */
#define __CFUniCharGetCombiningPropertyForCharacter(character) __CFUniCharGetCombiningPriority(character)
#else KERNEL
/* Non-kernel build: passes the property data for the character's plane
 * (character >> 16) when that plane index is below the loaded plane count,
 * and NULL otherwise. The argument is expanded more than once, so it must
 * not have side effects. */
#define __CFUniCharGetCombiningPropertyForCharacter(character) CFUniCharGetCombiningPropertyForCharacter(character, (((character) >> 16) < __CFUniCharCombiningPriorityTableNumPlane ? __CFUniCharCombiningPriorityTable[(character) >> 16] : NULL))
#endif KERNEL
/*
 * Sorts `characters` in place into ascending order of combining property
 * value. Only adjacent elements that are strictly out of order are swapped,
 * so characters with equal values keep their relative order.
 *
 * characters - buffer of `length` UTF-32 characters.
 * length     - number of characters; fewer than two is a no-op (apart from
 *              the one-time table set-up below).
 *
 * In the non-kernel build the first call also builds the per-plane
 * combining-property table under __CFUniCharDecompositionTableLock.
 */
static void __CFUniCharPrioritySort(UTF32Char *characters, uint32_t length) {
    uint32_t p1, p2;
    UTF32Char *ch1, *ch2;
    bool changes = true;
    UTF32Char *end = characters + length;

#if !KERNEL
    if (NULL == __CFUniCharCombiningPriorityTable) {
        __CFSpinLock(&__CFUniCharDecompositionTableLock);
        /* Re-check under the lock: another thread may have built the table. */
        if (NULL == __CFUniCharCombiningPriorityTable) {
            uint32_t numPlanes = CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty);
            uint32_t idx;
            const uint8_t **table = (const uint8_t **)CFAllocatorAllocate(NULL, sizeof(uint8_t *) * numPlanes, 0);

            /* If the allocation fails the globals stay as they were: the plane
             * count remains 0, so the lookup macro passes NULL for every
             * plane, and the next call retries the set-up. */
            if (NULL != table) {
                for (idx = 0; idx < numPlanes; idx++) {
                    table[idx] = CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, idx);
                }
                /* Store the pointer only after the entries are filled in. */
                __CFUniCharCombiningPriorityTableNumPlane = numPlanes;
                __CFUniCharCombiningPriorityTable = table;
            }
        }
        __CFSpinUnlock(&__CFUniCharDecompositionTableLock);
    }
#endif /* !KERNEL */

    if (length < 2) return;

    /* Bubble sort: repeat passes until one completes without a swap. */
    do {
        changes = false;
        ch1 = characters; ch2 = characters + 1;
        p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch1);
        while (ch2 < end) {
            p1 = p2; p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch2);
            if (p1 > p2) {
                UTF32Char tmp = *ch1; *ch1 = *ch2; *ch2 = tmp;
                /* *ch2 now holds the character whose value is p1. Keep the
                 * cached value in step, otherwise the next comparison uses
                 * the wrong value and the sort needs extra passes. */
                p2 = p1;
                changes = true;
            }
            ++ch1; ++ch2;
        }
    } while (changes);
}
/*
 * Writes the canonical decomposition of `character` to convertedChars and
 * returns the number of characters written. Returns 0 when the character has
 * no table entry or when maxBufferLength is too small.
 *
 * The table value packs a character count (extracted with
 * CFUniCharConvertFlagToCount), flag bits, and a 24-bit payload. The payload
 * is either the single mapped character or an index into a multi-character
 * table. When kCFUniCharRecursiveDecompositionFlag is set, the first mapped
 * character is itself decomposed before the rest are copied.
 *
 * The kernel build keeps BMP mappings in a UTF-16 table and non-BMP mappings
 * (kCFUniCharNonBmpFlag) in a UTF-32 table; the non-kernel build has a single
 * UTF-32 table.
 */
static uint32_t __CFUniCharRecursivelyDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, uint32_t maxBufferLength) {
    uint32_t entry = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharDecompositionTable, __CFUniCharDecompositionTableLength, character);
    uint32_t remaining = CFUniCharConvertFlagToCount(entry);
    UTF32Char inlineChar = entry & 0xFFFFFF;
    uint32_t written = 0;
    uint32_t i;
#if KERNEL
    const UTF32Char *wideMappings = NULL;
    UTF16Char inlineBMPChar = (UTF16Char)inlineChar;
    const UTF16Char *narrowMappings = NULL;

    if (kCFUniCharNonBmpFlag & entry) {
        wideMappings = (1 == remaining) ? &inlineChar : __CFUniCharNonBMPMultipleDecompositionTable + inlineChar;
    }
    if (!wideMappings) {
        narrowMappings = (1 == remaining) ? &inlineBMPChar : __CFUniCharMultipleDecompositionTable + inlineChar;
    }
#else /* !KERNEL */
    UTF32Char *wideMappings = &inlineChar;

    if (remaining > 1) {
        wideMappings = __CFUniCharMultipleDecompositionTable + inlineChar;
    }
#endif /* KERNEL */

    if (maxBufferLength < remaining) return 0;

#if KERNEL
    if (narrowMappings) {
        if (entry & kCFUniCharRecursiveDecompositionFlag) {
            /* Expand the leading character first, then append the others. */
            written = __CFUniCharRecursivelyDecomposeCharacter((UTF32Char)*narrowMappings, convertedChars, maxBufferLength - remaining);
            --remaining;
            if (!written || written + remaining > maxBufferLength) return 0;
            ++narrowMappings;
            convertedChars += written;
        }

        written += remaining;
        for (i = 0; i < remaining; ++i) {
            convertedChars[i] = narrowMappings[i];
        }
        return written;
    }
#endif /* KERNEL */

    if (entry & kCFUniCharRecursiveDecompositionFlag) {
        /* Expand the leading character first, then append the others. */
        written = __CFUniCharRecursivelyDecomposeCharacter(*wideMappings, convertedChars, maxBufferLength - remaining);
        --remaining;
        if (!written || written + remaining > maxBufferLength) return 0;
        ++wideMappings;
        convertedChars += written;
    }

    written += remaining;
    for (i = 0; i < remaining; ++i) {
        convertedChars[i] = wideMappings[i];
    }
    return written;
}
/* Constants for the arithmetic decomposition of precomposed Hangul syllables. */
#define HANGUL_SBASE 0xAC00 /* first precomposed syllable */
#define HANGUL_LBASE 0x1100 /* first leading consonant */
#define HANGUL_VBASE 0x1161 /* first vowel */
#define HANGUL_TBASE 0x11A7 /* one before the first trailing consonant */
#define HANGUL_SCOUNT 11172 /* number of precomposed syllables */
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)

/*
 * Decomposes a single character into convertedChars.
 *
 * character       - character to decompose.
 * convertedChars  - output buffer of UTF-32 characters.
 * maxBufferLength - capacity of convertedChars, in characters.
 *
 * Returns the number of characters written, or 0 when nothing was produced
 * (the buffer is too small, or the table lookup found no mapping).
 *
 * Precomposed Hangul syllables are decomposed arithmetically into two or
 * three jamo; every other character goes through the decomposition table.
 * In the non-kernel build the table is loaded on first use.
 */
uint32_t CFUniCharDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, uint32_t maxBufferLength) {
#if !KERNEL
    if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable();
#endif /* !KERNEL */

    /* The syllable block is [SBASE, SBASE + SCOUNT). The upper bound is
     * exclusive: SBASE + SCOUNT (U+D7A4) is not a syllable and must not be
     * turned into jamo. */
    if (character >= HANGUL_SBASE && character < (HANGUL_SBASE + HANGUL_SCOUNT)) {
        uint32_t length;

        character -= HANGUL_SBASE;

        /* A zero trailing-consonant index means the syllable has no trailing jamo. */
        length = (character % HANGUL_TCOUNT ? 3 : 2);

        if (maxBufferLength < length) return 0;

        *(convertedChars++) = character / HANGUL_NCOUNT + HANGUL_LBASE;
        *(convertedChars++) = (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
        if (length > 2) *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
        return length;
    } else {
        return __CFUniCharRecursivelyDecomposeCharacter(character, convertedChars, maxBufferLength);
    }
}
#if KERNEL
/* Kernel build: map the CFAllocator calls used below onto malloc/free; the
 * allocator and flag arguments are dropped. */
#define CFAllocatorAllocate(a,size,flag) malloc((size))
#define CFAllocatorDeallocate(a,ptr) free((ptr))
#endif KERNEL
/* Size, in UTF-32 characters, of the on-stack scratch buffer in
 * CFUniCharDecompose() and the step by which that buffer grows. */
#define MAX_BUFFER_LENGTH (32)
bool CFUniCharDecompose(const UTF16Char *src, uint32_t length, uint32_t *consumedLength, void *dst, uint32_t maxLength, uint32_t *filledLength, bool needToReorder, uint32_t dstFormat, bool isHFSPlus) {
uint32_t usedLength = 0;
uint32_t originalLength = length;
UTF32Char buffer[MAX_BUFFER_LENGTH];
UTF32Char *decompBuffer = buffer;
uint32_t decompBufferLen = MAX_BUFFER_LENGTH;
UTF32Char currentChar;
uint32_t idx;
bool isDecomp = false;
bool isNonBase = false;
#if !KERNEL
if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable();
#endif !KERNEL
while (length > 0) {
currentChar = *(src++);
--length;
if (currentChar < 0x80) {
if (maxLength) {
if (usedLength < maxLength) {
switch (dstFormat) {
case kCFUniCharUTF8Format: *(((uint8_t *)dst)++) = currentChar; break;
case kCFUniCharUTF16Format: *(((UTF16Char *)dst)++) = currentChar; break;
case kCFUniCharUTF32Format: *(((UTF32Char *)dst)++) = currentChar; break;
}
} else {
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
if (consumedLength) *consumedLength = originalLength - length - 1;
if (filledLength) *filledLength = usedLength;
return false;
}
}
++usedLength;
continue;
}
if (CFUniCharIsSurrogateHighCharacter(currentChar) && (length > 0) && CFUniCharIsSurrogateLowCharacter(*src)) {
currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(src++));
--length;
}
isDecomp = __CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus);
isNonBase = (needToReorder && __CFUniCharIsNonBaseCharacter(currentChar));
if (!isDecomp || isNonBase) {
if (isNonBase) {
if (isDecomp) {
idx = CFUniCharDecomposeCharacter(currentChar, decompBuffer, MAX_BUFFER_LENGTH);
} else {
idx = 1;
*decompBuffer = currentChar;
}
while (length > 0) {
if (CFUniCharIsSurrogateHighCharacter(*src) && ((length + 1) > 0) && CFUniCharIsSurrogateLowCharacter(*(src + 1))) {
currentChar = CFUniCharGetLongCharacterForSurrogatePair(*src, *(src + 1));
} else {
currentChar = *src;
}
if (__CFUniCharIsNonBaseCharacter(currentChar)) {
if (currentChar > 0xFFFF) { length -= 2;
src += 2;
} else {
--length;
++src;
}
if ((idx + 1) >= decompBufferLen) {
UTF32Char *newBuffer;
decompBufferLen += MAX_BUFFER_LENGTH;
newBuffer = (UTF32Char *)CFAllocatorAllocate(NULL, sizeof(UTF32Char) * decompBufferLen, 0);
memmove(newBuffer, decompBuffer, (decompBufferLen - MAX_BUFFER_LENGTH) * sizeof(UTF32Char));
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
decompBuffer = newBuffer;
}
if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { idx += CFUniCharDecomposeCharacter(currentChar, decompBuffer + idx, MAX_BUFFER_LENGTH - idx);
} else {
decompBuffer[idx++] = currentChar;
}
} else {
break;
}
}
if (idx > 1) { __CFUniCharPrioritySort(decompBuffer, idx);
}
if (!CFUniCharFillDestinationBuffer(decompBuffer, idx, &dst, maxLength, &usedLength, dstFormat)) {
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
if (consumedLength) *consumedLength = originalLength - length;
if (filledLength) *filledLength = usedLength;
return false;
}
} else {
if (dstFormat == kCFUniCharUTF32Format) {
++usedLength;
if (maxLength) {
if (usedLength > maxLength) {
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
if (consumedLength) *consumedLength = originalLength - length;
if (filledLength) *filledLength = usedLength;
return false;
}
*(((UTF32Char *)dst)++) = currentChar;
}
} else {
if (!CFUniCharFillDestinationBuffer(¤tChar, 1, &dst, maxLength, &usedLength, dstFormat)) {
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
if (consumedLength) *consumedLength = originalLength - length;
if (filledLength) *filledLength = usedLength;
return false;
}
}
}
} else {
if (dstFormat == kCFUniCharUTF32Format && maxLength) {
idx = CFUniCharDecomposeCharacter(currentChar, dst, maxLength - usedLength);
if (idx == 0) {
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
if (consumedLength) *consumedLength = originalLength - length;
if (filledLength) *filledLength = usedLength;
return false;
} else if (needToReorder && (idx > 1)) { bool moreCombiningMarks = false;
++((UTF32Char *)dst); --idx; ++usedLength;
while (length > 0) {
if (CFUniCharIsSurrogateHighCharacter(*src) && ((length + 1) > 0) && CFUniCharIsSurrogateLowCharacter(*(src + 1))) {
currentChar = CFUniCharGetLongCharacterForSurrogatePair(*src, *(src + 1));
} else {
currentChar = *src;
}
if (__CFUniCharIsNonBaseCharacter(currentChar)) {
if (currentChar > 0xFFFF) { length -= 2;
src += 2;
} else {
--length;
++src;
}
if ((idx + usedLength + 1) >= maxLength) {
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
if (consumedLength) *consumedLength = originalLength - length;
if (filledLength) *filledLength = usedLength;
return false;
}
((UTF32Char *)dst)[idx++] = currentChar;
moreCombiningMarks = true;
} else {
break;
}
}
if (moreCombiningMarks) __CFUniCharPrioritySort(((UTF32Char *)dst), idx);
}
usedLength += idx;
((UTF32Char *)dst) += idx;
} else {
idx = CFUniCharDecomposeCharacter(currentChar, decompBuffer, decompBufferLen);
if (maxLength && idx + usedLength > maxLength) {
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
if (consumedLength) *consumedLength = originalLength - length;
if (filledLength) *filledLength = usedLength;
return false;
} else if (needToReorder && (idx > 1)) { bool moreCombiningMarks = false;
while (length > 0) {
if (CFUniCharIsSurrogateHighCharacter(*src) && ((length + 1) > 0) && CFUniCharIsSurrogateLowCharacter(*(src + 1))) {
currentChar = CFUniCharGetLongCharacterForSurrogatePair(*src, *(src + 1));
} else {
currentChar = *src;
}
if (__CFUniCharIsNonBaseCharacter(currentChar)) {
if (currentChar > 0xFFFF) { length -= 2;
src += 2;
} else {
--length;
++src;
}
if ((idx + 1) >= decompBufferLen) {
UTF32Char *newBuffer;
decompBufferLen += MAX_BUFFER_LENGTH;
newBuffer = (UTF32Char *)CFAllocatorAllocate(NULL, sizeof(UTF32Char) * decompBufferLen, 0);
memmove(newBuffer, decompBuffer, (decompBufferLen - MAX_BUFFER_LENGTH) * sizeof(UTF32Char));
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
decompBuffer = newBuffer;
}
decompBuffer[idx++] = currentChar;
moreCombiningMarks = true;
} else {
break;
}
}
if (moreCombiningMarks) __CFUniCharPrioritySort(decompBuffer + 1, idx - 1);
}
if (!CFUniCharFillDestinationBuffer(decompBuffer, idx, &dst, maxLength, &usedLength, dstFormat)) {
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
if (consumedLength) *consumedLength = originalLength - length;
if (filledLength) *filledLength = usedLength;
return false;
}
}
}
}
if (decompBuffer != buffer) CFAllocatorDeallocate(NULL, decompBuffer);
if (consumedLength) *consumedLength = originalLength - length;
if (filledLength) *filledLength = usedLength;
return true;
}
#if !KERNEL
/* Size, in UTF-32 characters, of the scratch buffer used for one
 * character's compatibility decomposition. */
#define MAX_COMP_DECOMP_LEN (32)

/*
 * Writes the full compatibility decomposition of `character` to
 * convertedChars and returns the number of characters it accounts for.
 *
 * Each character of the table mapping is expanded in turn: canonically
 * decomposable ones through __CFUniCharRecursivelyDecomposeCharacter,
 * compatibility-decomposable ones through a recursive call, and all others
 * copied unchanged. No capacity is passed in; the caller supplies a buffer of
 * MAX_COMP_DECOMP_LEN characters.
 */
static uint32_t __CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars) {
    uint32_t entry = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharCompatibilityDecompositionTable, __CFUniCharCompatibilityDecompositionTableLength, character);
    uint32_t remaining = CFUniCharConvertFlagToCount(entry);
    UTF32Char inlineChar = entry & 0xFFFFFF;
    const UTF32Char *source = &inlineChar;
    uint32_t total = remaining;
    UTF32Char mapped;
    uint32_t expanded;

    /* A count above one means the payload is an index into the multi-character data. */
    if (remaining > 1) {
        source = __CFUniCharCompatibilityMultipleDecompositionTable + inlineChar;
    }

    while (remaining > 0) {
        --remaining;
        mapped = *(source++);

        if (__CFUniCharIsDecomposableCharacterWithFlag(mapped, false)) {
            expanded = __CFUniCharRecursivelyDecomposeCharacter(mapped, convertedChars, MAX_COMP_DECOMP_LEN - remaining);
        } else if (CFUniCharIsMemberOf(mapped, kCFUniCharCompatibilityDecomposableCharacterSet)) {
            expanded = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(mapped, convertedChars);
        } else {
            *(convertedChars++) = mapped;
            continue;
        }

        /* One mapped character became `expanded` output characters. */
        convertedChars += expanded;
        total += (expanded - 1);
    }

    return total;
}
/*
 * Shifts the `length` characters starting at convertedChars toward higher
 * addresses by `delta` positions. Source and destination may overlap.
 */
CF_INLINE void __CFUniCharMoveBufferFromEnd(UTF32Char *convertedChars, uint32_t length, uint32_t delta) {
    memmove(convertedChars + delta, convertedChars, length * sizeof(UTF32Char));
}
/*
 * Replaces, in place, every compatibility-decomposable character in
 * convertedChars with its full decomposition.
 *
 * convertedChars  - buffer holding `length` UTF-32 characters on entry.
 * length          - number of valid characters in the buffer.
 * maxBufferLength - total capacity of the buffer, in characters.
 *
 * Returns the new number of characters, or 0 when the expanded text would
 * not fit or a decomposition produced nothing. On a 0 return the buffer may
 * already have been partly rewritten.
 */
__private_extern__ uint32_t CFUniCharCompatibilityDecompose(UTF32Char *convertedChars, uint32_t length, uint32_t maxBufferLength) {
    UTF32Char currentChar;
    UTF32Char buffer[MAX_COMP_DECOMP_LEN];
    const UTF32Char *bufferP;
    const UTF32Char *limit = convertedChars + length; /* one past the last valid character */
    uint32_t filledLength;

    if (NULL == __CFUniCharCompatibilityDecompositionTable) __CFUniCharLoadCompatibilityDecompositionTable();

    while (convertedChars < limit) {
        currentChar = *convertedChars;

        if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) {
            filledLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, buffer);

            /* A character reported as decomposable but yielding no output
             * cannot be replaced in place; report failure. */
            if (0 == filledLength) return 0;

            if (filledLength + length - 1 > maxBufferLength) return 0;

            if (filledLength > 1) {
                /* Open a gap for the extra characters by shifting the
                 * unprocessed tail to the right. */
                __CFUniCharMoveBufferFromEnd(convertedChars + 1, limit - convertedChars - 1, filledLength - 1);
                /* The tail now ends further right. Without moving the limit
                 * with it, the loop would stop before the shifted characters
                 * and the next shift would move too few of them. */
                limit += (filledLength - 1);
            }

            bufferP = buffer;
            length += (filledLength - 1);
            /* The replacement is already fully decomposed, so the cursor
             * moves past it. */
            while (filledLength-- > 0) *(convertedChars++) = *(bufferP++);
        } else {
            ++convertedChars;
        }
    }

    return length;
}
/*
 * Exported entry point for the combining-priority sort: forwards both
 * arguments unchanged to the file-local __CFUniCharPrioritySort(), which
 * sorts `characters` (`length` UTF-32 characters) in place.
 */
CF_EXPORT void CFUniCharPrioritySort(UTF32Char *characters, uint32_t length) {
__CFUniCharPrioritySort(characters, length);
}
#endif !KERNEL