#ifndef HTMLTokenizer_h
#define HTMLTokenizer_h
#include "SegmentedString.h"
#include <wtf/Noncopyable.h>
#include <wtf/PassOwnPtr.h>
#include <wtf/Vector.h>
#include <wtf/text/AtomicString.h>
namespace WebCore {
class Element;
class Frame;
class HTMLToken;
// Tokenizer for HTML input. Callers hand it a SegmentedString and an HTMLToken
// through nextToken(); the current position in the state machine is readable
// and writable through state()/setState(). Instances are heap-allocated via
// create() and cannot be copied.
class HTMLTokenizer {
    WTF_MAKE_NONCOPYABLE(HTMLTokenizer); WTF_MAKE_FAST_ALLOCATED;
public:
    // States of the tokenizer's state machine. Enumerators are implicitly
    // numbered from 0 in declaration order, so reordering or inserting entries
    // changes their numeric values.
    // NOTE(review): the names appear to mirror the tokenization states of the
    // HTML parsing specification; ContinueBogusCommentState has no obviously
    // matching spec state and looks like an internal helper state - confirm.
    enum State {
        DataState,
        CharacterReferenceInDataState,
        RCDATAState,
        CharacterReferenceInRCDATAState,
        RAWTEXTState,
        ScriptDataState,
        PLAINTEXTState,
        TagOpenState,
        EndTagOpenState,
        TagNameState,
        RCDATALessThanSignState,
        RCDATAEndTagOpenState,
        RCDATAEndTagNameState,
        RAWTEXTLessThanSignState,
        RAWTEXTEndTagOpenState,
        RAWTEXTEndTagNameState,
        ScriptDataLessThanSignState,
        ScriptDataEndTagOpenState,
        ScriptDataEndTagNameState,
        ScriptDataEscapeStartState,
        ScriptDataEscapeStartDashState,
        ScriptDataEscapedState,
        ScriptDataEscapedDashState,
        ScriptDataEscapedDashDashState,
        ScriptDataEscapedLessThanSignState,
        ScriptDataEscapedEndTagOpenState,
        ScriptDataEscapedEndTagNameState,
        ScriptDataDoubleEscapeStartState,
        ScriptDataDoubleEscapedState,
        ScriptDataDoubleEscapedDashState,
        ScriptDataDoubleEscapedDashDashState,
        ScriptDataDoubleEscapedLessThanSignState,
        ScriptDataDoubleEscapeEndState,
        BeforeAttributeNameState,
        AttributeNameState,
        AfterAttributeNameState,
        BeforeAttributeValueState,
        AttributeValueDoubleQuotedState,
        AttributeValueSingleQuotedState,
        AttributeValueUnquotedState,
        CharacterReferenceInAttributeValueState,
        AfterAttributeValueQuotedState,
        SelfClosingStartTagState,
        BogusCommentState,
        ContinueBogusCommentState,
        MarkupDeclarationOpenState,
        CommentStartState,
        CommentStartDashState,
        CommentState,
        CommentEndDashState,
        CommentEndState,
        CommentEndBangState,
        DOCTYPEState,
        BeforeDOCTYPENameState,
        DOCTYPENameState,
        AfterDOCTYPENameState,
        AfterDOCTYPEPublicKeywordState,
        BeforeDOCTYPEPublicIdentifierState,
        DOCTYPEPublicIdentifierDoubleQuotedState,
        DOCTYPEPublicIdentifierSingleQuotedState,
        AfterDOCTYPEPublicIdentifierState,
        BetweenDOCTYPEPublicAndSystemIdentifiersState,
        AfterDOCTYPESystemKeywordState,
        BeforeDOCTYPESystemIdentifierState,
        DOCTYPESystemIdentifierDoubleQuotedState,
        DOCTYPESystemIdentifierSingleQuotedState,
        AfterDOCTYPESystemIdentifierState,
        BogusDOCTYPEState,
        CDATASectionState,
        CDATASectionRightSquareBracketState,
        CDATASectionDoubleRightSquareBracketState,
    };

    // Factory: allocates a tokenizer and transfers ownership to the caller
    // through PassOwnPtr. The constructor is private, so this is the way to
    // obtain an instance.
    static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); }
    ~HTMLTokenizer();

    void reset();

    // Tokenizes from the given source into the given token.
    // NOTE(review): the meaning of the bool result is defined out of line;
    // presumably it reports whether a complete token was produced - confirm.
    bool nextToken(SegmentedString&, HTMLToken&);

    int lineNumber() const { return m_lineNumber; }
    // Always reports 1; this class does not track a column position.
    int columnNumber() const { return 1; }

    State state() const { return m_state; }
    void setState(State state) { m_state = state; }

    // Adjusts the tokenizer state for the given tag name; defined out of line.
    void updateStateFor(const AtomicString& tagName, Frame*);

    void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }

    bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
    void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }

    bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
    void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }

    // True when U+0000 in the input should be dropped rather than replaced:
    // null replacement must not be forced, and the tokenizer must currently be
    // in one of the Data, RCDATA, RAWTEXT or PLAINTEXT states. Consulted by
    // InputStreamPreprocessor::peek().
    bool shouldSkipNullCharacters() const
    {
        return !m_forceNullCharacterReplacement
            && (m_state == DataState
                || m_state == RCDATAState
                || m_state == RAWTEXTState
                || m_state == PLAINTEXTState);
    }

private:
    // Filters the raw character stream before the state machine consumes it:
    //  - a '\r' is reported as '\n', and a '\n' immediately following it is
    //    skipped, so CR and CR LF both surface as a single '\n';
    //  - a U+0000 that is not the end-of-file marker is either skipped (when
    //    the owning tokenizer's shouldSkipNullCharacters() is true) or
    //    reported as U+FFFD.
    // The filtered character is available through nextInputCharacter() after a
    // successful peek()/advance().
    class InputStreamPreprocessor {
        WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
    public:
        // tokenizer: back-pointer used to query shouldSkipNullCharacters().
        // explicit so an HTMLTokenizer* never converts implicitly to a
        // preprocessor.
        explicit InputStreamPreprocessor(HTMLTokenizer* tokenizer)
            : m_tokenizer(tokenizer)
            , m_nextInputCharacter('\0')
            , m_skipNextNewLine(false)
        {
        }

        // The character produced by the most recent peek()/advance().
        UChar nextInputCharacter() const { return m_nextInputCharacter; }

        // Reads the character at the front of |source|, applies the filtering
        // described on the class, and stores the result for
        // nextInputCharacter(). May consume skipped characters from |source|
        // (a '\n' following '\r', or skippable nulls); the reported character
        // itself is not consumed.
        // Returns false if |source| became empty while skipping, true
        // otherwise.
        // NOTE(review): dereferences |source| unconditionally, so it presumably
        // requires a non-empty source on entry - confirm against callers.
        ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber)
        {
        PeekAgain:
            m_nextInputCharacter = *source;

            // Fast path. '\n' | '\r' | '\0' is 0x0A | 0x0D | 0x00 == 0x0F, so
            // any character with a bit set outside the low four bits cannot be
            // one of the three special characters and needs no further work.
            // Other characters in 0x00-0x0F simply fall through the checks
            // below and come out unchanged.
            static const UChar specialCharacterMask = '\n' | '\r' | '\0';
            if (m_nextInputCharacter & ~specialCharacterMask) {
                m_skipNextNewLine = false;
                return true;
            }

            // Second half of a CR LF pair: the CR was already reported as
            // '\n', so swallow this '\n' and look at what follows it.
            if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
                m_skipNextNewLine = false;
                source.advancePastNewline(lineNumber);
                if (source.isEmpty())
                    return false;
                m_nextInputCharacter = *source;
            }
            if (m_nextInputCharacter == '\r') {
                // Report CR as '\n' and remember to drop a directly following
                // '\n'.
                m_nextInputCharacter = '\n';
                m_skipNextNewLine = true;
            } else {
                m_skipNextNewLine = false;
                if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
                    if (m_tokenizer->shouldSkipNullCharacters()) {
                        // Drop the null and restart on the next character.
                        source.advancePastNonNewline();
                        if (source.isEmpty())
                            return false;
                        goto PeekAgain;
                    }
                    // Otherwise substitute U+FFFD for the null.
                    m_nextInputCharacter = 0xFFFD;
                }
            }
            return true;
        }

        // Consumes the current character from |source| (passing |lineNumber|
        // through to SegmentedString::advance) and then peeks at the next one.
        // Returns false if |source| is empty after advancing; otherwise
        // returns the result of peek().
        bool advance(SegmentedString& source, int& lineNumber)
        {
            source.advance(lineNumber);
            if (source.isEmpty())
                return false;
            return peek(source, lineNumber);
        }

        // Declared here; the value is defined out of line.
        static const UChar endOfFileMarker;

    private:
        // A null is taken to be the end-of-file marker only when |source| is
        // closed and its length is exactly 1.
        bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
        {
            return source.isClosed() && source.length() == 1;
        }

        HTMLTokenizer* m_tokenizer;
        UChar m_nextInputCharacter;
        bool m_skipNextNewLine;
    };

    // Private: instances are obtained through create(). explicit so a bool
    // never converts implicitly to a tokenizer.
    explicit HTMLTokenizer(bool usePreHTML5ParserQuirks);

    // Internal helpers of the state machine. They are only declared here;
    // their definitions are not part of this header.
    inline bool processEntity(SegmentedString&);

    inline void parseError();

    inline void bufferCharacter(UChar);
    inline void bufferCodePoint(unsigned);

    inline bool emitAndResumeIn(SegmentedString&, State);
    inline bool emitAndReconsumeIn(SegmentedString&, State);
    inline bool emitEndOfFile(SegmentedString&);
    inline bool flushEmitAndResumeIn(SegmentedString&, State);

    inline bool flushBufferedEndTag(SegmentedString&);
    inline bool temporaryBufferIs(const String&);

    inline void addToPossibleEndTag(UChar cc);
    inline void saveEndTagNameIfNeeded();
    inline bool isAppropriateEndTag();

    inline bool haveBufferedCharacterToken();

    // Current state of the state machine; see state()/setState().
    State m_state;

    Vector<UChar, 32> m_appropriateEndTagName;

    // NOTE(review): presumably the token being filled in by nextToken() -
    // confirm in the implementation file.
    HTMLToken* m_token;

    // Returned by lineNumber().
    int m_lineNumber;

    bool m_skipLeadingNewLineForListing;
    // When true, shouldSkipNullCharacters() is always false, so nulls are
    // replaced by the preprocessor instead of skipped.
    bool m_forceNullCharacterReplacement;
    bool m_shouldAllowCDATA;

    Vector<UChar, 32> m_temporaryBuffer;
    Vector<UChar, 32> m_bufferedEndTagName;

    UChar m_additionalAllowedCharacter;

    InputStreamPreprocessor m_inputStreamPreprocessor;

    bool m_usePreHTML5ParserQuirks;
};
}
#endif