// HTMLEntityParser.cpp
#include "config.h"
#include "HTMLEntityParser.h"
#include "CharacterReferenceParserInlines.h"
#include "HTMLEntitySearch.h"
#include "HTMLEntityTable.h"
#include <wtf/text/StringBuilder.h>
using namespace WTF;
namespace WebCore {
// Replacement code units for values in the range 0x80-0x9F, indexed by
// (value - 0x80). Consulted by HTMLEntityParser::adjustEntity().
static const UChar windowsLatin1ExtensionArray[32] = {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 0x80-0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 0x88-0x8F
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 0x90-0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 0x98-0x9F
};
// Returns true when cc is an ASCII digit ('0'-'9') or an ASCII letter
// ('a'-'z' or 'A'-'Z'), and false for every other code unit.
static inline bool isAlphaNumeric(UChar cc)
{
    if (cc >= '0' && cc <= '9')
        return true;
    if (cc >= 'a' && cc <= 'z')
        return true;
    return cc >= 'A' && cc <= 'Z';
}
class HTMLEntityParser {
public:
inline static UChar adjustEntity(UChar32 value)
{
if ((value & ~0x1F) != 0x0080)
return value;
return windowsLatin1ExtensionArray[value - 0x80];
}
inline static UChar32 legalEntityFor(UChar32 value)
{
if (!value || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
return 0xFFFD;
if (U_IS_BMP(value))
return adjustEntity(value);
return value;
}
inline static bool acceptMalformed() { return true; }
inline static bool consumeNamedEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
{
StringBuilder consumedCharacters;
HTMLEntitySearch entitySearch;
while (!source.isEmpty()) {
cc = source.currentChar();
entitySearch.advance(cc);
if (!entitySearch.isEntityPrefix())
break;
consumedCharacters.append(cc);
source.advanceAndASSERT(cc);
}
notEnoughCharacters = source.isEmpty();
if (notEnoughCharacters) {
unconsumeCharacters(source, consumedCharacters);
return false;
}
if (!entitySearch.mostRecentMatch()) {
unconsumeCharacters(source, consumedCharacters);
return false;
}
if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
unconsumeCharacters(source, consumedCharacters);
consumedCharacters.clear();
const int length = entitySearch.mostRecentMatch()->length;
const LChar* reference = entitySearch.mostRecentMatch()->entity;
for (int i = 0; i < length; ++i) {
cc = source.currentChar();
ASSERT_UNUSED(reference, cc == *reference++);
consumedCharacters.append(cc);
source.advanceAndASSERT(cc);
ASSERT(!source.isEmpty());
}
cc = source.currentChar();
}
if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
|| !additionalAllowedCharacter
|| !(isAlphaNumeric(cc) || cc == '=')) {
decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
if (entitySearch.mostRecentMatch()->secondValue)
decodedEntity.append(entitySearch.mostRecentMatch()->secondValue);
return true;
}
unconsumeCharacters(source, consumedCharacters);
return false;
}
};
// Entry point for consuming an HTML character reference from source.
// Forwards all arguments unchanged to consumeCharacterReference<>,
// instantiated with HTMLEntityParser, and returns its result.
bool consumeHTMLEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
{
    const bool consumed = consumeCharacterReference<HTMLEntityParser>(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter);
    return consumed;
}
// Writes value into result as one or two UChar elements and returns how many
// elements were written. Values for which U_IS_BMP is false are stored as
// U16_LEAD(value) followed by U16_TRAIL(value); the caller must provide room
// for two elements in that case.
static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
{
    if (!U_IS_BMP(value)) {
        result[0] = U16_LEAD(value);
        result[1] = U16_TRAIL(value);
        return 2;
    }

    const UChar character = static_cast<UChar>(value);
    ASSERT(character == value); // The narrowing cast must not lose bits.
    result[0] = character;
    return 1;
}
// Looks up the entity whose name is the NUL-terminated string `name` followed
// by ';' and writes its value(s) into result.
//
// Returns the number of UChar elements written to result, or 0 when `name`
// (with ';' appended) is not a prefix of any entity known to HTMLEntitySearch.
// An entity contributes its firstValue and, when nonzero, its secondValue;
// each takes one or two elements, so at most four are written.
size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
{
    HTMLEntitySearch search;
    for (const char* cursor = name; *cursor; ++cursor) {
        search.advance(*cursor);
        if (!search.isEntityPrefix())
            return 0;
    }

    // The name is given without its terminator, so supply the ';' here.
    search.advance(';');
    if (!search.isEntityPrefix())
        return 0;

    size_t written = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
    if (search.mostRecentMatch()->secondValue)
        written += appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + written);
    return written;
}
}