#include "config.h"
#include "CSSTokenizer.h"
#include "CSSParserIdioms.h"
#include "CSSParserObserverWrapper.h"
#include "CSSParserTokenRange.h"
#include "CSSTokenizerInputStream.h"
#include "HTMLParserIdioms.h"
#include <wtf/text/StringBuilder.h>
#include <wtf/unicode/CharacterNames.h>
namespace WebCore {
// Tokenizes |string| eagerly. Every token except comments is stored in
// m_tokens; tokenization stops at the first EOF token.
CSSTokenizer::CSSTokenizer(const String& string)
    : m_input(string)
{
    if (string.isEmpty())
        return;

    // Reserve one token slot per three input characters up front.
    m_tokens.reserveInitialCapacity(string.length() / 3);

    for (;;) {
        CSSParserToken next = nextToken();
        const auto tokenType = next.type();
        if (tokenType == EOFToken)
            break;
        if (tokenType != CommentToken)
            m_tokens.append(next);
    }
}
// Tokenizes |string| eagerly while reporting positions to |wrapper|: each
// stored token's start offset goes to addToken(), each comment's start/end
// offsets and the current token count go to addComment(). Comments are not
// stored in m_tokens.
CSSTokenizer::CSSTokenizer(const String& string, CSSParserObserverWrapper& wrapper)
    : m_input(string)
{
    if (string.isEmpty())
        return;

    unsigned tokenStart = 0;
    for (;;) {
        CSSParserToken current = nextToken();
        const auto tokenType = current.type();
        if (tokenType == EOFToken)
            break;

        if (tokenType != CommentToken) {
            m_tokens.append(current);
            wrapper.addToken(tokenStart);
        } else
            wrapper.addComment(tokenStart, m_input.offset(), m_tokens.size());

        tokenStart = m_input.offset();
    }

    // One trailing offset is recorded after the last token.
    wrapper.addToken(tokenStart);
    wrapper.finalizeConstruction(m_tokens.begin());
}
// Returns a CSSParserTokenRange built from the stored token vector m_tokens.
CSSParserTokenRange CSSTokenizer::tokenRange() const
{
return m_tokens;
}
// Returns the number of tokens stored in m_tokens.
unsigned CSSTokenizer::tokenCount()
{
return m_tokens.size();
}
// Returns true when |cc| is a carriage return, line feed or form feed.
static bool isNewLine(UChar cc)
{
    switch (cc) {
    case '\r':
    case '\n':
    case '\f':
        return true;
    default:
        return false;
    }
}
// Returns true when |first| is a backslash and |second| is not a newline.
static bool twoCharsAreValidEscape(UChar first, UChar second)
{
    if (first != '\\')
        return false;
    return !isNewLine(second);
}
// Pushes |c| back onto the input stream.
void CSSTokenizer::reconsume(UChar c)
{
m_input.pushBack(c);
}
// Reads the next input character, advances the stream past it and returns it.
UChar CSSTokenizer::consume()
{
    const UChar consumed = m_input.nextInputChar();
    m_input.advance();
    return consumed;
}
// Handler for whitespace code points: advances the input via
// advanceUntilNonWhitespace() and returns one WhitespaceToken.
CSSParserToken CSSTokenizer::whiteSpace(UChar )
{
m_input.advanceUntilNonWhitespace();
return CSSParserToken(WhitespaceToken);
}
// Pushes |type| onto the open-block stack and returns a token of that type
// flagged as BlockStart.
CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType type)
{
m_blockStack.append(type);
return CSSParserToken(type, CSSParserToken::BlockStart);
}
// Pushes |blockType| onto the open-block stack and returns a BlockStart token
// of |type| carrying |name| (used for function tokens, whose block type is
// the left parenthesis).
CSSParserToken CSSTokenizer::blockStart(CSSParserTokenType blockType, CSSParserTokenType type, StringView name)
{
m_blockStack.append(blockType);
return CSSParserToken(type, name, CSSParserToken::BlockStart);
}
// Returns a token of |type|. When the innermost open block was started by
// |startType|, that block is popped and the token is flagged as BlockEnd;
// otherwise the stack is untouched and a plain token is returned.
CSSParserToken CSSTokenizer::blockEnd(CSSParserTokenType type, CSSParserTokenType startType)
{
    const bool closesInnermostBlock = !m_blockStack.isEmpty() && m_blockStack.last() == startType;
    if (!closesInnermostBlock)
        return CSSParserToken(type);

    m_blockStack.removeLast();
    return CSSParserToken(type, CSSParserToken::BlockEnd);
}
// Handler for '(': opens a LeftParenthesisToken block.
CSSParserToken CSSTokenizer::leftParenthesis(UChar )
{
return blockStart(LeftParenthesisToken);
}
// Handler for ')': closes the innermost block if it was opened by
// LeftParenthesisToken (see blockEnd()).
CSSParserToken CSSTokenizer::rightParenthesis(UChar )
{
return blockEnd(RightParenthesisToken, LeftParenthesisToken);
}
// Handler for '[': opens a LeftBracketToken block.
CSSParserToken CSSTokenizer::leftBracket(UChar )
{
return blockStart(LeftBracketToken);
}
// Handler for ']': closes the innermost block if it was opened by
// LeftBracketToken (see blockEnd()).
CSSParserToken CSSTokenizer::rightBracket(UChar )
{
return blockEnd(RightBracketToken, LeftBracketToken);
}
// Handler for '{': opens a LeftBraceToken block.
CSSParserToken CSSTokenizer::leftBrace(UChar )
{
return blockStart(LeftBraceToken);
}
// Handler for '}': closes the innermost block if it was opened by
// LeftBraceToken (see blockEnd()).
CSSParserToken CSSTokenizer::rightBrace(UChar )
{
return blockEnd(RightBraceToken, LeftBraceToken);
}
// Handler for '+' and '.': yields a numeric token when |cc| begins a number,
// otherwise a delimiter token holding |cc|.
CSSParserToken CSSTokenizer::plusOrFullStop(UChar cc)
{
    if (!nextCharsAreNumber(cc))
        return CSSParserToken(DelimiterToken, cc);

    // Put the character back so the number consumer sees the whole number.
    reconsume(cc);
    return consumeNumericToken();
}
// Handler for '*': "*=" yields SubstringMatchToken, otherwise a '*' delimiter.
CSSParserToken CSSTokenizer::asterisk(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '*');
    if (!consumeIfNext('='))
        return CSSParserToken(DelimiterToken, '*');
    return CSSParserToken(SubstringMatchToken);
}
// Handler for '<': "<!--" yields CDOToken, otherwise a '<' delimiter.
CSSParserToken CSSTokenizer::lessThan(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '<');

    const bool startsCDO = m_input.peekWithoutReplacement(0) == '!'
        && m_input.peekWithoutReplacement(1) == '-'
        && m_input.peekWithoutReplacement(2) == '-';
    if (!startsCDO)
        return CSSParserToken(DelimiterToken, '<');

    // Skip the "!--" that follows the already-consumed '<'.
    m_input.advance(3);
    return CSSParserToken(CDOToken);
}
// Handler for ',': returns a CommaToken.
CSSParserToken CSSTokenizer::comma(UChar )
{
return CSSParserToken(CommaToken);
}
// Handler for '-'. Tried in order: a number, the "-->" CDC marker, an
// identifier-like token; anything else yields a delimiter holding |cc|.
CSSParserToken CSSTokenizer::hyphenMinus(UChar cc)
{
    if (nextCharsAreNumber(cc)) {
        reconsume(cc);
        return consumeNumericToken();
    }

    const bool startsCDC = m_input.peekWithoutReplacement(0) == '-'
        && m_input.peekWithoutReplacement(1) == '>';
    if (startsCDC) {
        // Skip the "->" that follows the already-consumed '-'.
        m_input.advance(2);
        return CSSParserToken(CDCToken);
    }

    if (!nextCharsAreIdentifier(cc))
        return CSSParserToken(DelimiterToken, cc);

    reconsume(cc);
    return consumeIdentLikeToken();
}
// Handler for '/': "/*" consumes through the end of the comment and yields
// CommentToken, otherwise a delimiter holding |cc|.
CSSParserToken CSSTokenizer::solidus(UChar cc)
{
    if (!consumeIfNext('*'))
        return CSSParserToken(DelimiterToken, cc);

    consumeUntilCommentEndFound();
    return CSSParserToken(CommentToken);
}
// Handler for ':': returns a ColonToken.
CSSParserToken CSSTokenizer::colon(UChar )
{
return CSSParserToken(ColonToken);
}
// Handler for ';': returns a SemicolonToken.
CSSParserToken CSSTokenizer::semiColon(UChar )
{
return CSSParserToken(SemicolonToken);
}
// Handler for '#': when a name code point or a valid escape follows, yields a
// hash token (HashTokenId if the following characters form an identifier,
// HashTokenUnrestricted otherwise); anything else yields a delimiter.
CSSParserToken CSSTokenizer::hash(UChar cc)
{
    const UChar following = m_input.peekWithoutReplacement(0);
    const bool startsName = isNameCodePoint(following)
        || twoCharsAreValidEscape(following, m_input.peekWithoutReplacement(1));
    if (!startsName)
        return CSSParserToken(DelimiterToken, cc);

    // The type must be decided before consumeName() advances the input.
    HashTokenType hashType = nextCharsAreIdentifier() ? HashTokenId : HashTokenUnrestricted;
    return CSSParserToken(hashType, consumeName());
}
// Handler for '^': "^=" yields PrefixMatchToken, otherwise a '^' delimiter.
CSSParserToken CSSTokenizer::circumflexAccent(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '^');
    if (!consumeIfNext('='))
        return CSSParserToken(DelimiterToken, '^');
    return CSSParserToken(PrefixMatchToken);
}
// Handler for '$': "$=" yields SuffixMatchToken, otherwise a '$' delimiter.
CSSParserToken CSSTokenizer::dollarSign(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '$');
    if (!consumeIfNext('='))
        return CSSParserToken(DelimiterToken, '$');
    return CSSParserToken(SuffixMatchToken);
}
// Handler for '|': "|=" yields DashMatchToken, "||" yields ColumnToken,
// otherwise a '|' delimiter. '=' is checked before '|'.
CSSParserToken CSSTokenizer::verticalLine(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '|');
    if (consumeIfNext('='))
        return CSSParserToken(DashMatchToken);
    if (!consumeIfNext('|'))
        return CSSParserToken(DelimiterToken, '|');
    return CSSParserToken(ColumnToken);
}
// Handler for '~': "~=" yields IncludeMatchToken, otherwise a '~' delimiter.
CSSParserToken CSSTokenizer::tilde(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '~');
    if (!consumeIfNext('='))
        return CSSParserToken(DelimiterToken, '~');
    return CSSParserToken(IncludeMatchToken);
}
// Handler for '@': when an identifier follows, yields AtKeywordToken carrying
// the consumed name; otherwise a '@' delimiter.
CSSParserToken CSSTokenizer::commercialAt(UChar cc)
{
    ASSERT_UNUSED(cc, cc == '@');
    if (!nextCharsAreIdentifier())
        return CSSParserToken(DelimiterToken, '@');
    return CSSParserToken(AtKeywordToken, consumeName());
}
// Handler for a backslash: when it begins a valid escape the character is put
// back and an identifier-like token is consumed; otherwise a delimiter.
CSSParserToken CSSTokenizer::reverseSolidus(UChar cc)
{
    const bool startsEscape = twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0));
    if (!startsEscape)
        return CSSParserToken(DelimiterToken, cc);

    reconsume(cc);
    return consumeIdentLikeToken();
}
// Handler for '0'-'9': puts the digit back and consumes a numeric token.
CSSParserToken CSSTokenizer::asciiDigit(UChar cc)
{
reconsume(cc);
return consumeNumericToken();
}
// Handler for 'U'/'u': a following '+' plus a hex digit or '?' starts a
// unicode-range token; anything else is treated as an identifier-like token.
CSSParserToken CSSTokenizer::letterU(UChar cc)
{
    if (m_input.peekWithoutReplacement(0) == '+') {
        const UChar afterPlus = m_input.peekWithoutReplacement(1);
        if (isASCIIHexDigit(afterPlus) || afterPlus == '?') {
            // Skip the '+' so the range consumer starts at the first digit/'?'.
            m_input.advance();
            return consumeUnicodeRange();
        }
    }

    reconsume(cc);
    return consumeIdentLikeToken();
}
// Handler for name-start characters (also used by nextToken() for every
// non-ASCII character): puts |cc| back and consumes an identifier-like token.
CSSParserToken CSSTokenizer::nameStart(UChar cc)
{
reconsume(cc);
return consumeIdentLikeToken();
}
// Handler for '"' and '\'': consumes a string token terminated by the same
// quote character |cc|.
CSSParserToken CSSTokenizer::stringStart(UChar cc)
{
return consumeStringTokenUntil(cc);
}
// Handler for code point 0: returns an EOFToken.
CSSParserToken CSSTokenizer::endOfFile(UChar )
{
return CSSParserToken(EOFToken);
}
// Dispatch table indexed by ASCII code point (0-127). nextToken() looks up the
// handler for the consumed character here; a 0 entry means "no handler", in
// which case nextToken() emits a DelimiterToken for the character.
const CSSTokenizer::CodePoint CSSTokenizer::codePoints[128] = {
// 0x00: NUL
&CSSTokenizer::endOfFile,
// 0x01 - 0x08
0,
0,
0,
0,
0,
0,
0,
0,
// 0x09 (tab), 0x0A (line feed)
&CSSTokenizer::whiteSpace,
&CSSTokenizer::whiteSpace,
// 0x0B (vertical tab)
0,
// 0x0C (form feed), 0x0D (carriage return)
&CSSTokenizer::whiteSpace,
&CSSTokenizer::whiteSpace,
// 0x0E - 0x1F
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
// 0x20 - 0x2F: space ! " # $ % & ' ( ) * + , - . /
&CSSTokenizer::whiteSpace,
0,
&CSSTokenizer::stringStart,
&CSSTokenizer::hash,
&CSSTokenizer::dollarSign,
0,
0,
&CSSTokenizer::stringStart,
&CSSTokenizer::leftParenthesis,
&CSSTokenizer::rightParenthesis,
&CSSTokenizer::asterisk,
&CSSTokenizer::plusOrFullStop,
&CSSTokenizer::comma,
&CSSTokenizer::hyphenMinus,
&CSSTokenizer::plusOrFullStop,
&CSSTokenizer::solidus,
// 0x30 - 0x39: digits 0-9
&CSSTokenizer::asciiDigit,
&CSSTokenizer::asciiDigit,
&CSSTokenizer::asciiDigit,
&CSSTokenizer::asciiDigit,
&CSSTokenizer::asciiDigit,
&CSSTokenizer::asciiDigit,
&CSSTokenizer::asciiDigit,
&CSSTokenizer::asciiDigit,
&CSSTokenizer::asciiDigit,
&CSSTokenizer::asciiDigit,
// 0x3A - 0x40: : ; < = > ? @
&CSSTokenizer::colon,
&CSSTokenizer::semiColon,
&CSSTokenizer::lessThan,
0,
0,
0,
&CSSTokenizer::commercialAt,
// 0x41 - 0x5A: A-Z ('U' at 0x55 has its own handler)
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::letterU,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
// 0x5B - 0x60: left bracket, backslash, right bracket, ^, _, backtick
&CSSTokenizer::leftBracket,
&CSSTokenizer::reverseSolidus,
&CSSTokenizer::rightBracket,
&CSSTokenizer::circumflexAccent,
&CSSTokenizer::nameStart,
0,
// 0x61 - 0x7A: a-z ('u' at 0x75 has its own handler)
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::letterU,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
&CSSTokenizer::nameStart,
// 0x7B - 0x7F: { | } ~ DEL
&CSSTokenizer::leftBrace,
&CSSTokenizer::verticalLine,
&CSSTokenizer::rightBrace,
&CSSTokenizer::tilde,
0,
};
// Size of codePoints; only referenced by the security assertion in
// nextToken(), so it is compiled out together with that assertion.
#if !ASSERT_WITH_SECURITY_IMPLICATION_DISABLED
const unsigned codePointsNumber = 128;
#endif
// Consumes one character and dispatches on it: non-ASCII characters go to
// nameStart(); ASCII characters go to their codePoints handler, or become a
// DelimiterToken when the table has no handler for them.
CSSParserToken CSSTokenizer::nextToken()
{
    UChar cc = consume();
    if (!isASCII(cc))
        return nameStart(cc);

    ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber);
    CodePoint handler = codePoints[cc];
    if (!handler)
        return CSSParserToken(DelimiterToken, cc);
    return (this->*handler)(cc);
}
// Consumes a number: optional sign, digits, optional ".digits" fraction and
// optional exponent ("e"/"E", optional sign, digits). The extent is measured
// by peeking first; the input is advanced only once the full length is known.
// The token is IntegerValueType unless a fraction or exponent was present.
CSSParserToken CSSTokenizer::consumeNumber()
{
    ASSERT(nextCharsAreNumber());

    NumericValueType valueType = IntegerValueType;
    NumericSign sign = NoSign;
    unsigned length = 0;

    // Optional leading sign.
    switch (m_input.peekWithoutReplacement(0)) {
    case '+':
        sign = PlusSign;
        length = 1;
        break;
    case '-':
        sign = MinusSign;
        length = 1;
        break;
    default:
        break;
    }

    // Integer part.
    length = m_input.skipWhilePredicate<isASCIIDigit>(length);
    UChar lookahead = m_input.peekWithoutReplacement(length);

    // Fraction: a '.' only counts when a digit follows it.
    if (lookahead == '.' && isASCIIDigit(m_input.peekWithoutReplacement(length + 1))) {
        valueType = NumberValueType;
        length = m_input.skipWhilePredicate<isASCIIDigit>(length + 2);
        lookahead = m_input.peekWithoutReplacement(length);
    }

    // Exponent: 'e'/'E' only counts when followed by a digit or sign + digit.
    if (lookahead == 'E' || lookahead == 'e') {
        lookahead = m_input.peekWithoutReplacement(length + 1);
        if (isASCIIDigit(lookahead)) {
            valueType = NumberValueType;
            length = m_input.skipWhilePredicate<isASCIIDigit>(length + 1);
        } else if ((lookahead == '+' || lookahead == '-') && isASCIIDigit(m_input.peekWithoutReplacement(length + 2))) {
            valueType = NumberValueType;
            length = m_input.skipWhilePredicate<isASCIIDigit>(length + 3);
        }
    }

    double parsedValue = m_input.getDouble(0, length);
    m_input.advance(length);
    return CSSParserToken(NumberToken, parsedValue, valueType, sign);
}
// Consumes a number and then upgrades it: a following identifier turns it
// into a dimension with that unit, a following '%' into a percentage.
CSSParserToken CSSTokenizer::consumeNumericToken()
{
    CSSParserToken numeric = consumeNumber();
    if (nextCharsAreIdentifier()) {
        numeric.convertToDimensionWithUnit(consumeName());
        return numeric;
    }
    if (consumeIfNext('%'))
        numeric.convertToPercentage();
    return numeric;
}
// Consumes a name and classifies it: without a following '(' it is an
// IdentToken; "url(" followed (after whitespace) by something other than a
// quote is consumed as a URL token; any other "name(" opens a function block.
CSSParserToken CSSTokenizer::consumeIdentLikeToken()
{
    StringView identifier = consumeName();
    if (!consumeIfNext('('))
        return CSSParserToken(IdentToken, identifier);

    if (equalIgnoringASCIICase(identifier, "url")) {
        m_input.advanceUntilNonWhitespace();
        UChar afterWhitespace = m_input.peekWithoutReplacement(0);
        bool startsQuotedString = afterWhitespace == '"' || afterWhitespace == '\'';
        if (!startsQuotedString)
            return consumeUrlToken();
    }

    return blockStart(LeftParenthesisToken, FunctionToken, identifier);
}
// Consumes a string terminated by |endingCodePoint|.
// Fast path: a string with no NUL and no backslash is returned as a range of
// the input stream. Otherwise the contents are rebuilt in a StringBuilder with
// escapes resolved. An unescaped newline yields BadStringToken and is left in
// the input.
CSSParserToken CSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
{
    unsigned length = 0;
    while (true) {
        UChar peeked = m_input.peekWithoutReplacement(length);
        if (peeked == endingCodePoint) {
            unsigned startOffset = m_input.offset();
            // Also skip the closing quote.
            m_input.advance(length + 1);
            return CSSParserToken(StringToken, m_input.rangeAt(startOffset, length));
        }
        if (isNewLine(peeked)) {
            m_input.advance(length);
            return CSSParserToken(BadStringToken);
        }
        if (peeked == '\0' || peeked == '\\')
            break;
        ++length;
    }

    StringBuilder contents;
    for (;;) {
        UChar cc = consume();
        if (cc == endingCodePoint || cc == kEndOfFileMarker)
            return CSSParserToken(StringToken, registerString(contents.toString()));

        if (isNewLine(cc)) {
            reconsume(cc);
            return CSSParserToken(BadStringToken);
        }

        if (cc != '\\') {
            contents.append(cc);
            continue;
        }

        // Backslash: ignored right before EOF, swallows an escaped newline,
        // otherwise introduces an escaped code point.
        if (m_input.nextInputChar() == kEndOfFileMarker)
            continue;
        if (isNewLine(m_input.peekWithoutReplacement(0)))
            consumeSingleWhitespaceIfNext();
        else
            contents.append(consumeEscape());
    }
}
// Consumes the part of a unicode-range after "U+": up to six hex digits,
// optionally padded with '?' wildcards (each '?' contributes 0 to the start
// and F to the end), or followed by "-" and up to six hex digits for an
// explicit end.
CSSParserToken CSSTokenizer::consumeUnicodeRange()
{
    ASSERT(isASCIIHexDigit(m_input.peekWithoutReplacement(0)) || m_input.peekWithoutReplacement(0) == '?');

    int digitsLeft = 6;
    UChar32 rangeStart = 0;
    for (; digitsLeft && isASCIIHexDigit(m_input.peekWithoutReplacement(0)); --digitsLeft)
        rangeStart = rangeStart * 16 + toASCIIHexValue(consume());

    UChar32 rangeEnd = rangeStart;
    if (digitsLeft && consumeIfNext('?')) {
        do {
            rangeStart *= 16;
            rangeEnd = rangeEnd * 16 + 0xF;
            --digitsLeft;
        } while (digitsLeft && consumeIfNext('?'));
    } else if (m_input.peekWithoutReplacement(0) == '-' && isASCIIHexDigit(m_input.peekWithoutReplacement(1))) {
        // Skip the '-'; the next character is known to be a hex digit, so the
        // loop below always runs at least once.
        m_input.advance();
        rangeEnd = 0;
        for (digitsLeft = 6; digitsLeft && isASCIIHexDigit(m_input.peekWithoutReplacement(0)); --digitsLeft)
            rangeEnd = rangeEnd * 16 + toASCIIHexValue(consume());
    }

    return CSSParserToken(UnicodeRangeToken, rangeStart, rangeEnd);
}
// Returns true for U+0000-U+0008, U+000B, U+000E-U+001F and U+007F.
static bool isNonPrintableCodePoint(UChar cc)
{
    if (cc <= '\x8' || cc == '\xb')
        return true;
    if (cc >= '\xe' && cc <= '\x1f')
        return true;
    return cc == '\x7f';
}
// Consumes the body of an unquoted url( ... ).
// Fast path: a URL running up to ')' without whitespace, control characters,
// backslash, quotes, '(' or DEL is returned as a range of the input stream.
// Otherwise the URL is rebuilt with escapes resolved; a malformed URL has its
// remnants consumed and yields BadUrlToken.
CSSParserToken CSSTokenizer::consumeUrlToken()
{
    m_input.advanceUntilNonWhitespace();

    for (unsigned length = 0; ; ++length) {
        UChar peeked = m_input.peekWithoutReplacement(length);
        if (peeked == ')') {
            unsigned startOffset = m_input.offset();
            // Also skip the closing parenthesis.
            m_input.advance(length + 1);
            return CSSParserToken(UrlToken, m_input.rangeAt(startOffset, length));
        }
        bool needsSlowPath = peeked <= ' ' || peeked == '\\' || peeked == '"' || peeked == '\'' || peeked == '(' || peeked == '\x7f';
        if (needsSlowPath)
            break;
    }

    StringBuilder url;
    for (;;) {
        UChar cc = consume();
        if (cc == ')' || cc == kEndOfFileMarker)
            return CSSParserToken(UrlToken, registerString(url.toString()));

        if (isHTMLSpace(cc)) {
            // Trailing whitespace is only valid directly before ')' or EOF.
            m_input.advanceUntilNonWhitespace();
            if (consumeIfNext(')') || m_input.nextInputChar() == kEndOfFileMarker)
                return CSSParserToken(UrlToken, registerString(url.toString()));
            break;
        }

        if (cc == '"' || cc == '\'' || cc == '(' || isNonPrintableCodePoint(cc))
            break;

        if (cc != '\\') {
            url.append(cc);
            continue;
        }

        if (!twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0)))
            break;
        url.append(consumeEscape());
    }

    consumeBadUrlRemnants();
    return CSSParserToken(BadUrlToken);
}
void CSSTokenizer::consumeBadUrlRemnants()
{
while (true) {
UChar cc = consume();
if (cc == ')' || cc == kEndOfFileMarker)
return;
if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0)))
consumeEscape();
}
}
void CSSTokenizer::consumeSingleWhitespaceIfNext()
{
UChar next = m_input.peekWithoutReplacement(0);
if (next == '\r' && m_input.peekWithoutReplacement(1) == '\n')
m_input.advance(2);
else if (isHTMLSpace(next))
m_input.advance();
}
// Consumes input through the closing "*/" of a comment, or through the EOF
// marker when the comment is unterminated.
void CSSTokenizer::consumeUntilCommentEndFound()
{
    bool previousWasAsterisk = false;
    for (;;) {
        UChar c = consume();
        if (c == kEndOfFileMarker)
            return;
        if (previousWasAsterisk && c == '/')
            return;
        previousWasAsterisk = (c == '*');
    }
}
// Consumes the next input character only if it equals |character| (which must
// be non-zero). Returns whether a character was consumed.
bool CSSTokenizer::consumeIfNext(UChar character)
{
    ASSERT(character);
    if (m_input.peekWithoutReplacement(0) != character)
        return false;

    m_input.advance();
    return true;
}
// Consumes a name and returns a view of it.
// Fast path: a run of name code points that is not stopped by a backslash or
// by a NUL inside the input is returned as a range of the input stream. A NUL
// peeked at or beyond the input length still takes the fast path.
// Slow path: the name is rebuilt with escapes resolved and registered in the
// string pool.
StringView CSSTokenizer::consumeName()
{
    unsigned length = 0;
    while (isNameCodePoint(m_input.peekWithoutReplacement(length)))
        ++length;

    UChar stopper = m_input.peekWithoutReplacement(length);
    bool embeddedNull = stopper == '\0' && m_input.offset() + length < m_input.length();
    if (!embeddedNull && stopper != '\\') {
        unsigned startOffset = m_input.offset();
        m_input.advance(length);
        return m_input.rangeAt(startOffset, length);
    }

    StringBuilder name;
    for (;;) {
        UChar cc = consume();
        if (isNameCodePoint(cc))
            name.append(cc);
        else if (twoCharsAreValidEscape(cc, m_input.peekWithoutReplacement(0)))
            name.append(consumeEscape());
        else {
            // First character that is not part of the name: leave it in the input.
            reconsume(cc);
            return registerString(name.toString());
        }
    }
}
// Consumes the code point following a backslash (the backslash itself has
// already been consumed) and returns it.
//
// - One to six hex digits, optionally followed by one whitespace unit, are
//   decoded as a code point; zero, surrogates (U+D800-U+DFFF) and values above
//   U+10FFFF map to the replacement character.
// - The EOF marker maps to the replacement character.
// - Any other character is returned as is.
//
// The caller must ensure the next character is not a newline.
UChar32 CSSTokenizer::consumeEscape()
{
    UChar cc = consume();
    ASSERT(!isNewLine(cc));
    if (isASCIIHexDigit(cc)) {
        // Accumulate the value directly instead of building a temporary string
        // and re-parsing it. At most six hex digits are read, so the value
        // fits in 24 bits and cannot overflow UChar32.
        UChar32 codePoint = toASCIIHexValue(cc);
        for (unsigned consumedHexDigits = 1; consumedHexDigits < 6 && isASCIIHexDigit(m_input.peekWithoutReplacement(0)); ++consumedHexDigits)
            codePoint = codePoint * 16 + toASCIIHexValue(consume());

        consumeSingleWhitespaceIfNext();

        if (!codePoint || (0xD800 <= codePoint && codePoint <= 0xDFFF) || codePoint > 0x10FFFF)
            return replacementCharacter;
        return codePoint;
    }

    if (cc == kEndOfFileMarker)
        return replacementCharacter;
    return cc;
}
bool CSSTokenizer::nextTwoCharsAreValidEscape()
{
return twoCharsAreValidEscape(m_input.peekWithoutReplacement(0), m_input.peekWithoutReplacement(1));
}
// Returns true when |first| (already consumed) together with the upcoming
// input starts a number: a digit; a sign followed by a digit or by ".digit";
// or a '.' followed by a digit.
bool CSSTokenizer::nextCharsAreNumber(UChar first)
{
    UChar second = m_input.peekWithoutReplacement(0);
    if (isASCIIDigit(first))
        return true;

    switch (first) {
    case '+':
    case '-':
        if (isASCIIDigit(second))
            return true;
        return second == '.' && isASCIIDigit(m_input.peekWithoutReplacement(1));
    case '.':
        return isASCIIDigit(second);
    default:
        return false;
    }
}
bool CSSTokenizer::nextCharsAreNumber()
{
UChar first = consume();
bool areNumber = nextCharsAreNumber(first);
reconsume(first);
return areNumber;
}
// Returns true when |first| (already consumed) together with the upcoming
// input starts an identifier: a name-start code point; a valid escape; or a
// '-' followed by a name-start code point, another '-', or a valid escape.
bool CSSTokenizer::nextCharsAreIdentifier(UChar first)
{
    UChar second = m_input.peekWithoutReplacement(0);
    if (isNameStartCodePoint(first))
        return true;
    if (twoCharsAreValidEscape(first, second))
        return true;
    if (first != '-')
        return false;
    return isNameStartCodePoint(second) || second == '-' || nextTwoCharsAreValidEscape();
}
bool CSSTokenizer::nextCharsAreIdentifier()
{
UChar first = consume();
bool areIdentifier = nextCharsAreIdentifier(first);
reconsume(first);
return areIdentifier;
}
// Appends |string| to m_stringPool and returns a StringView of it.
// NOTE(review): presumably the pool entry is what keeps the returned view's
// characters alive after the caller's temporary String is gone - confirm
// against String/StringView ownership semantics.
StringView CSSTokenizer::registerString(const String& string)
{
m_stringPool.append(string);
return string;
}
}