CFXMLInputStream.c [plain text]
#include "CFXMLInputStream.h"
#include <CoreFoundation/CFCharacterSet.h>
#include <string.h>
#include "CFStringEncodingConverter.h"
#include "CFUniChar.h"
static Boolean determineEncoding(_CFXMLInputStream *stream) {
const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data);
UInt32 length = CFDataGetLength(stream->data);
const uint8_t *idx = 0L, *end = 0L;
const uint8_t *base = 0L;
char quote = ' ';
Boolean useUTF8 = false;
if (length > 2) {
if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) {
#if __BIG_ENDIAN__
stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
#else
stream->flags |= ENCODING_IS_UNICODE_NATURAL;
#endif
if (*bytes == 0xFF) {
stream->currentByte = bytes + 2;
}
stream->encoding = kCFStringEncodingUnicode;
return true;
} else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) {
#if __BIG_ENDIAN__
stream->flags |= ENCODING_IS_UNICODE_NATURAL;
#else
stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
#endif
if (*bytes == 0xFE) {
stream->currentByte = bytes + 2;
}
stream->encoding = kCFStringEncodingUnicode;
return true;
} else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) {
if(*bytes == 0xEF) {
stream->currentByte = bytes + 3;
}
stream->encoding = kCFStringEncodingUTF8;
stream->flags |= ENCODING_MATCHES_ASCII;
return true;
}
}
if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) {
useUTF8 = true;
}
if (!useUTF8) {
idx = bytes + 5;
end = bytes + length;
while (idx < end) {
uint8_t ch = *idx;
const uint8_t *scan;
if ( ch == '?' || ch == '>') {
useUTF8 = true;
break;
}
idx ++;
scan = idx;
if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') {
idx = scan;
break;
}
}
if (!useUTF8 && idx >= end) {
useUTF8 = true;
}
}
if (!useUTF8) {
quote = *idx;
if (quote != '\'' && quote != '\"') {
useUTF8 = true;
}
}
if (!useUTF8) {
base = idx + 1; idx ++;
while (idx < end && *idx != quote) idx ++;
if (idx >= end) {
useUTF8 = true;
}
}
if (!useUTF8) {
UInt32 len = idx - base;
if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) {
useUTF8 = true;
} else {
CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false);
stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName);
CFRelease(encodingName);
}
}
if (useUTF8) {
stream->encoding = kCFStringEncodingUTF8;
stream->flags |= ENCODING_MATCHES_ASCII;
return true;
} else if (stream->encoding == kCFStringEncodingInvalidId) {
return false;
} else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) {
stream->flags |= ENCODING_MATCHES_ASCII;
}
return true;
}
CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) {
CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string)));
if (numChars) {
CFStringAppendCharacters(string, characters, numChars);
}
}
__private_extern__ Boolean _openInputStream(_CFXMLInputStream *stream) {
if (NULL == stream->data) {
return false;
} else {
stream->currentByte = CFDataGetBytePtr(stream->data);
if (determineEncoding(stream)) {
stream->flags |= STREAM_OPEN;
return true;
} else {
return false;
}
}
}
__private_extern__ void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) {
stream->data = xmlData ? (CFDataRef)CFRetain(xmlData) : NULL;
stream->url = dataSource ? (CFURLRef)CFRetain(dataSource) : NULL;
stream->encoding = kCFStringEncodingInvalidId;
stream->currentByte = NULL;
stream->allocator = (CFAllocatorRef)CFRetain(alloc);
stream->charBuffer = NULL;
stream->currentChar = NULL;
stream->mark = NULL;
stream->parserMark = NULL;
stream->bufferLength = 0;
stream->bufferCapacity = 0;
stream->charIndex = 1;
stream->lineNum = 1;
stream->flags = 0;
stream->nameSet = NULL;
stream->tempString = NULL;
}
__private_extern__ void _freeInputStream(_CFXMLInputStream *stream) {
if (stream->data) CFRelease(stream->data);
if (stream->url) CFRelease(stream->url);
if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer);
if (stream->nameSet) CFRelease(stream->nameSet);
if (stream->tempString) CFRelease(stream->tempString);
CFRelease(stream->allocator);
}
__private_extern__ CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) {
return stream->encoding;
}
__private_extern__ CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) {
return stream->charIndex;
}
__private_extern__ CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) {
return stream->lineNum;
}
__private_extern__ Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) {
if (!(stream->flags & STREAM_OPEN)) return false;
if (stream->currentChar) return false;
if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false;
return true;
}
__private_extern__ Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) {
return stream->flags & ENCODING_COMPOSITION_ERROR;
}
#define INITIAL_BUFFER_SIZE 64
static void growCharacterBuffer(_CFXMLInputStream *stream) {
if (!stream->charBuffer) {
stream->charBuffer = (UniChar *)CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0);
stream->bufferCapacity = INITIAL_BUFFER_SIZE;
} else {
CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1;
CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1;
CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1;
UniChar *newBuffer = (UniChar *)CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0);
stream->bufferCapacity *= 2;
if (newBuffer != stream->charBuffer) {
stream->charBuffer = newBuffer;
if (currCharDelta != -1) {
stream->currentChar = newBuffer + currCharDelta;
}
if (markDelta != -1) {
stream->mark = newBuffer + markDelta;
}
if (parserMarkDelta != -1) {
stream->parserMark = newBuffer + parserMarkDelta;
}
}
}
}
static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) {
const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data);
if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) {
CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar);
if (charsToTranslate > maxLength) {
charsToTranslate = maxLength;
}
if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar));
stream->currentByte += (charsToTranslate * sizeof(UniChar));
} else {
CFIndex i;
uint8_t *baseBytePtr = (uint8_t *)base;
for (i = 0; i < charsToTranslate; i ++) {
*(baseBytePtr + 1) = *stream->currentByte;
*baseBytePtr = *(stream->currentByte + 1);
baseBytePtr += 2;
stream->currentByte += 2;
}
}
return charsToTranslate;
} else {
CFIndex lengthConsumed = 0;
CFIndex usedByteLength, usedCharLength;
UInt32 conversionResult;
if (stream->flags & ENCODING_MATCHES_ASCII) {
while (stream->currentByte < dataEnd && lengthConsumed < maxLength) {
if (*stream->currentByte > 0x7f) break;
*base = *stream->currentByte;
base ++;
stream->currentByte ++;
lengthConsumed ++;
}
if (stream->currentByte == dataEnd || lengthConsumed == maxLength) {
return lengthConsumed;
}
}
conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength);
if(kCFStringEncodingConversionSuccess != conversionResult) {
switch(conversionResult) {
case kCFStringEncodingConverterUnavailable:
case kCFStringEncodingInvalidInputStream:
stream->flags |= ENCODING_COMPOSITION_ERROR;
break;
case kCFStringEncodingInsufficientOutputBufferLength:
default:
break;
}
}
if (usedByteLength > 0) {
stream->currentByte += usedByteLength;
lengthConsumed += usedCharLength;
}
return lengthConsumed;
}
}
CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) {
CFIndex numFilled;
if (stream->bufferLength >= stream->bufferCapacity) return 0;
numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
if (numFilled != 0) {
stream->currentChar = stream->charBuffer + stream->bufferLength;
stream->bufferLength += numFilled;
}
return numFilled;
}
static void fillCharacterBuffer(_CFXMLInputStream *stream) {
if (!stream->charBuffer) {
growCharacterBuffer(stream);
}
if (!stream->mark && !stream->parserMark) {
CFIndex fillLength = stream->bufferCapacity-5; stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream);
CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption");
stream->currentChar = stream->charBuffer;
} else {
Boolean done;
done = (fillToCapacity(stream) != 0);
if (!done) {
const UniChar *leftMostMark;
if (stream->mark && !stream->parserMark) {
leftMostMark = stream->mark;
} else if (stream->parserMark && !stream->mark) {
leftMostMark = stream->parserMark;
} else if (stream->parserMark < stream->mark) {
leftMostMark = stream->parserMark;
} else {
leftMostMark = stream->mark;
}
if (leftMostMark > stream->charBuffer) {
CFIndex delta = leftMostMark - stream->charBuffer;
memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar));
stream->bufferLength -= delta;
if (stream->mark) {
stream->mark -= delta;
}
if (stream->parserMark) {
stream->parserMark -= delta;
}
done = (fillToCapacity(stream) != 0);
delta = loadCharacters(stream->charBuffer + stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
}
}
if (!done) {
growCharacterBuffer(stream);
fillToCapacity(stream); }
}
}
static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) {
return false; } else if (!((stream->mark || stream->parserMark) && advanceStream) &&
(((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) ||
(stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) {
if (stream->flags & ENCODING_MATCHES_ASCII) {
*ch = (UniChar)*(stream->currentByte);
if (advanceStream) {
stream->currentByte ++;
}
} else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
*ch = *(UniChar *)(stream->currentByte);
if (advanceStream) {
stream->currentByte += 2;
}
} else {
*ch = CFSwapInt16(*(UniChar *)(stream->currentByte));
if (advanceStream) {
stream->currentByte += 2;
}
}
} else {
fillCharacterBuffer(stream); if (!stream->charBuffer || !stream->currentChar) {
return false;
} else {
*ch = *(stream->currentChar);
if (advanceStream) {
stream->currentChar ++;
if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
stream->currentChar = NULL;
}
}
}
}
return true;
}
CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
if (!(stream->flags & STREAM_OPEN)) {
return false;
} else if (stream->currentChar) {
*ch = *stream->currentChar;
if (advanceStream) {
stream->currentChar ++;
if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
stream->currentChar = NULL;
}
}
} else {
if (!getCharacterGuts(stream, ch, advanceStream)) return false;
}
if (advanceStream) {
UniChar nextChar;
stream->charIndex ++;
if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++;
}
return true;
}
__private_extern__ Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) {
return getCharacter(stream, ch, false);
}
__private_extern__ Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) {
return getCharacter(stream, ch, true);
}
__private_extern__ Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) {
Boolean decrementLineNum = false;
if (ch == '\n') {
decrementLineNum = true;
} else if (ch == '\r') {
UniChar nextChar;
if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') {
decrementLineNum = true;
}
}
if (!(stream->flags & STREAM_OPEN)) {
return false;
} else if (stream->currentChar) {
if (stream->currentChar != stream->charBuffer) {
stream->currentChar --;
} else {
if (stream->bufferLength >= stream->bufferCapacity) {
growCharacterBuffer(stream);
}
memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar));
*stream->charBuffer = ch;
stream->bufferLength ++;
if (stream->mark) {
stream->mark ++;
}
if (stream->parserMark) {
stream->parserMark ++;
}
}
} else if ((stream->mark || stream->parserMark) && stream->bufferLength) {
stream->currentChar = stream->charBuffer + stream->bufferLength - 1;
} else if (stream->charBuffer) {
*stream->charBuffer = ch;
stream->currentChar = stream->charBuffer;
stream->bufferLength = 1;
if (stream->mark) {
stream->mark ++;
}
if (stream->parserMark) {
stream->parserMark ++;
}
} else if (stream->currentByte > CFDataGetBytePtr(stream->data)) {
if (stream->flags & ENCODING_MATCHES_ASCII) {
stream->currentByte --;
} else { stream->currentByte -= 2;
}
} else {
return false;
}
stream->charIndex --;
if (decrementLineNum) {
stream->lineNum --;
}
return true;
}
static UniChar *dropMark(_CFXMLInputStream *stream) {
if (stream->currentChar) {
return stream->currentChar;
} else if (stream->mark || stream->parserMark) {
return stream->charBuffer + stream->bufferLength;
} else {
if (!stream->charBuffer) {
growCharacterBuffer(stream);
}
stream->bufferLength = 0; return stream->charBuffer;
}
}
__private_extern__ void _inputStreamSetMark(_CFXMLInputStream *stream) {
CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
stream->mark = dropMark(stream);
}
__private_extern__ void _inputStreamClearMark(_CFXMLInputStream *stream) {
CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
stream->mark = NULL;
}
__private_extern__ void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) {
UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
CFIndex numChars = end - stream->mark;
CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream");
_fillStringWithCharacters(string, stream->mark, numChars);
}
static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) {
UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
if (end > mark) {
CFIndex numChars = end - mark;
stream->charIndex -= numChars;
stream->currentChar = mark;
if (*(end - 1) == '\r') {
UniChar nextChar;
if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') {
end --;
}
}
while (end != mark) {
end --;
if (*end == '\r') {
stream->lineNum --;
} else if (*end == '\n') {
stream->lineNum --;
if (end != mark && *(end - 1) == '\r') {
end --;
}
}
}
}
}
__private_extern__ void _inputStreamBackUpToMark(_CFXMLInputStream *stream) {
CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream");
restoreToMark(stream, stream->mark);
}
CF_INLINE Boolean isWhitespaceChar(UniChar ch) {
return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t');
}
__private_extern__ CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) {
UniChar ch;
CFIndex len = 0;
if (str) {
stream->parserMark = dropMark(stream);
}
while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) {
len ++;
}
if (!isWhitespaceChar(ch)) {
_inputStreamReturnCharacter(stream, ch);
}
if (str) {
_fillStringWithCharacters(str, stream->parserMark, len);
stream->parserMark = NULL;
}
return len;
}
__private_extern__ Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) {
Boolean done = false;
CFIndex firstRepeatIndex = -1;
CFIndex len = 0;
stream->parserMark = dropMark(stream);
do {
UniChar ch;
while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) {
len ++;
}
if (ch != scanChars[0]) {
restoreToMark(stream, stream->parserMark);
stream->parserMark = NULL;
return false;
} else {
CFIndex i;
for (i = 1; i < numChars; i ++) {
if (!_inputStreamGetCharacter(stream, &ch)) break;
if (ch != scanChars[i]) break;
}
if (i == numChars) {
done = true;
} else {
if (firstRepeatIndex == -1) {
CFIndex j;
for (j = 1; j < numChars; j ++) {
if (scanChars[0] == scanChars[j]) {
break;
}
}
firstRepeatIndex = j;
}
_inputStreamReturnCharacter(stream, ch);
while (i > firstRepeatIndex) {
i --;
_inputStreamReturnCharacter(stream, scanChars[i]);
}
len += i;
}
}
} while (!done);
if (str) {
_fillStringWithCharacters(str, stream->parserMark, len);
}
stream->parserMark = NULL;
return true;
}
__private_extern__ Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) {
const UniChar *end = stringToMatch+length;
const UniChar *sPtr=stringToMatch;
stream->parserMark = dropMark(stream);
while (sPtr < end) {
UniChar ch;
if (!_inputStreamGetCharacter(stream, &ch)) break;
if (ch != *sPtr) break;
sPtr ++;
}
if (sPtr != end) {
restoreToMark(stream, stream->parserMark);
stream->parserMark = NULL;
return false;
} else {
stream->parserMark = NULL;
return true;
}
}
__private_extern__ Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) {
UniChar ch;
if (!_inputStreamPeekCharacter(stream, &ch)) return false;
if (ch != '\'' && ch != '\"') return false;
_inputStreamGetCharacter(stream, &ch);
if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) {
return false;
}
return true;
}
__private_extern__ Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
UniChar ch;
Boolean success = true;
stream->parserMark = dropMark(stream);
if (!isNMToken) {
if (!getCharacter(stream, &ch, false)) {
success = false;
} else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
success = false;
} else {
getCharacter(stream, &ch, true);
}
}
if (success) {
while (getCharacter(stream, &ch, true)) {
if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet) && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
_inputStreamReturnCharacter(stream, ch);
break;
}
}
if (NULL == stream->currentChar || stream->currentChar == stream->parserMark) {
success = false; }
}
if (success) {
if (str) {
if (!stream->nameSet) {
stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
}
CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
*str = (CFStringRef)CFStringCreateCopy(stream->allocator, stream->tempString);
CFSetAddValue(stream->nameSet, *str);
CFRelease(*str);
}
}
} else {
restoreToMark(stream, stream->parserMark);
}
stream->parserMark = NULL;
return success;
}