#pragma once
#include "HTMLParserOptions.h"
#include "HTMLToken.h"
#include "InputStreamPreprocessor.h"
namespace WebCore {
class SegmentedString;
// State-machine tokenizer for HTML input. nextToken() runs processToken() over a SegmentedString and,
// when that reports true, hands back the tokenizer's single internal HTMLToken wrapped in a TokenPtr.
class HTMLTokenizer {
public:
    explicit HTMLTokenizer(const HTMLParserOptions& = HTMLParserOptions());

    // Handle type returned by nextToken(); its definition follows this class.
    class TokenPtr;

    // Yields a null TokenPtr when processToken() returns false.
    TokenPtr nextToken(SegmentedString& source);

    // Forwards the offset to the internal token's setAttributeBaseOffset().
    void setTokenAttributeBaseOffset(unsigned offset);

    String bufferedCharacters() const;

    // 0 when the temporary buffer is empty, otherwise its size plus 2.
    size_t numberOfBufferedCharacters() const;

    void updateStateFor(const AtomString& tagName);

    void setForceNullCharacterReplacement(bool value);

    bool shouldAllowCDATA() const;
    void setShouldAllowCDATA(bool value);

    bool isInDataState() const;

    // Each of these overwrites m_state with the state named in the function.
    void setDataState();
    void setPLAINTEXTState();
    void setRAWTEXTState();
    void setRCDATAState();
    void setScriptDataState();

    // Reports the flag stored by setForceNullCharacterReplacement().
    bool neverSkipNullCharacters() const;

private:
    // Tokenizer states. This is an unscoped enum with implicit values, so the numeric value of each
    // enumerator is determined by its position in this list.
    // NOTE(review): the names appear to mirror the tokenization states of the HTML Standard - confirm
    // before adding or reordering entries.
    enum State {
        DataState,
        CharacterReferenceInDataState,
        RCDATAState,
        CharacterReferenceInRCDATAState,
        RAWTEXTState,
        ScriptDataState,
        PLAINTEXTState,

        TagOpenState,
        EndTagOpenState,
        TagNameState,

        RCDATALessThanSignState,
        RCDATAEndTagOpenState,
        RCDATAEndTagNameState,

        RAWTEXTLessThanSignState,
        RAWTEXTEndTagOpenState,
        RAWTEXTEndTagNameState,

        ScriptDataLessThanSignState,
        ScriptDataEndTagOpenState,
        ScriptDataEndTagNameState,
        ScriptDataEscapeStartState,
        ScriptDataEscapeStartDashState,
        ScriptDataEscapedState,
        ScriptDataEscapedDashState,
        ScriptDataEscapedDashDashState,
        ScriptDataEscapedLessThanSignState,
        ScriptDataEscapedEndTagOpenState,
        ScriptDataEscapedEndTagNameState,
        ScriptDataDoubleEscapeStartState,
        ScriptDataDoubleEscapedState,
        ScriptDataDoubleEscapedDashState,
        ScriptDataDoubleEscapedDashDashState,
        ScriptDataDoubleEscapedLessThanSignState,
        ScriptDataDoubleEscapeEndState,

        BeforeAttributeNameState,
        AttributeNameState,
        AfterAttributeNameState,
        BeforeAttributeValueState,
        AttributeValueDoubleQuotedState,
        AttributeValueSingleQuotedState,
        AttributeValueUnquotedState,
        CharacterReferenceInAttributeValueState,
        AfterAttributeValueQuotedState,
        SelfClosingStartTagState,

        BogusCommentState,
        ContinueBogusCommentState,
        MarkupDeclarationOpenState,
        CommentStartState,
        CommentStartDashState,
        CommentState,
        CommentEndDashState,
        CommentEndState,
        CommentEndBangState,

        DOCTYPEState,
        BeforeDOCTYPENameState,
        DOCTYPENameState,
        AfterDOCTYPENameState,
        AfterDOCTYPEPublicKeywordState,
        BeforeDOCTYPEPublicIdentifierState,
        DOCTYPEPublicIdentifierDoubleQuotedState,
        DOCTYPEPublicIdentifierSingleQuotedState,
        AfterDOCTYPEPublicIdentifierState,
        BetweenDOCTYPEPublicAndSystemIdentifiersState,
        AfterDOCTYPESystemKeywordState,
        BeforeDOCTYPESystemIdentifierState,
        DOCTYPESystemIdentifierDoubleQuotedState,
        DOCTYPESystemIdentifierSingleQuotedState,
        AfterDOCTYPESystemIdentifierState,
        BogusDOCTYPEState,

        CDATASectionState,
        CDATASectionRightSquareBracketState,
        CDATASectionDoubleRightSquareBracketState,
    };

    // Its boolean result decides whether nextToken() returns a token or a null TokenPtr.
    bool processToken(SegmentedString&);
    bool processEntity(SegmentedString&);

    void parseError();

    void bufferASCIICharacter(UChar);
    void bufferCharacter(UChar);

    bool emitAndResumeInDataState(SegmentedString&);
    bool emitAndReconsumeInDataState();
    bool emitEndOfFile(SegmentedString&);

    void flushBufferedEndTag();
    bool commitToPartialEndTag(SegmentedString&, UChar, State);
    bool commitToCompleteEndTag(SegmentedString&);

    void appendToTemporaryBuffer(UChar);
    bool temporaryBufferIs(const char*);

    bool inEndTagBufferingState() const;
    void appendToPossibleEndTag(UChar);
    void saveEndTagNameIfNeeded();
    bool isAppropriateEndTag() const;

    bool haveBufferedCharacterToken() const;

    // True only for DataState, RCDATAState and RAWTEXTState.
    static bool isNullCharacterSkippingState(State state);

    State m_state { DataState };
    bool m_forceNullCharacterReplacement { false };
    bool m_shouldAllowCDATA { false };

    // The one token instance that nextToken() exposes through TokenPtr.
    mutable HTMLToken m_token;

    UChar m_additionalAllowedCharacter { 0 };

    InputStreamPreprocessor<HTMLTokenizer> m_preprocessor;

    Vector<UChar, 32> m_appropriateEndTagName;

    // Its size is what numberOfBufferedCharacters() reports (plus 2 when non-empty).
    Vector<LChar, 32> m_temporaryBuffer;

    Vector<LChar, 32> m_bufferedEndTagName;

    const HTMLParserOptions m_options;
};
// Move-only handle to an HTMLToken. A null handle converts to false. Destroying a non-null handle, or
// calling clear() on it, invokes HTMLToken::clear() on the referenced token; the token itself is not
// deleted by this class.
class HTMLTokenizer::TokenPtr {
public:
    TokenPtr();
    ~TokenPtr();

    // Move construction transfers the token pointer; move assignment is disallowed.
    TokenPtr(TokenPtr&& other);
    TokenPtr& operator=(TokenPtr&&) = delete;

    void clear();

    operator bool() const;

    // Both accessors assert that the handle is non-null.
    HTMLToken& operator*() const;
    HTMLToken* operator->() const;

private:
    // Only HTMLTokenizer may create a handle that refers to a token.
    friend class HTMLTokenizer;
    explicit TokenPtr(HTMLToken* token);

    HTMLToken* m_token { nullptr };
};
// Creates a null handle; m_token takes its in-class initializer (nullptr).
inline HTMLTokenizer::TokenPtr::TokenPtr() = default;
inline HTMLTokenizer::TokenPtr::TokenPtr(HTMLToken* token)
: m_token(token)
{
}
// Clears the referenced token, if any, when the handle goes away.
inline HTMLTokenizer::TokenPtr::~TokenPtr()
{
    if (!m_token)
        return;

    m_token->clear();
}
inline HTMLTokenizer::TokenPtr::TokenPtr(TokenPtr&& other)
: m_token(other.m_token)
{
other.m_token = nullptr;
}
// Clears the referenced token and turns this handle into a null handle. No-op when already null.
inline void HTMLTokenizer::TokenPtr::clear()
{
    if (!m_token)
        return;

    m_token->clear();
    m_token = nullptr;
}
// True exactly when the handle refers to a token.
inline HTMLTokenizer::TokenPtr::operator bool() const
{
    return m_token != nullptr;
}
// Dereferences the handle; asserts that it is non-null.
inline HTMLToken& HTMLTokenizer::TokenPtr::operator*() const
{
    ASSERT(m_token);
    HTMLToken& referencedToken = *m_token;
    return referencedToken;
}
// Member access through the handle; asserts that it is non-null.
inline HTMLToken* HTMLTokenizer::TokenPtr::operator->() const
{
    HTMLToken* const referencedToken = m_token;
    ASSERT(referencedToken);
    return referencedToken;
}
// Runs processToken() on |source|. When it returns true the result refers to the tokenizer's internal
// m_token; otherwise the result is a null TokenPtr.
inline HTMLTokenizer::TokenPtr HTMLTokenizer::nextToken(SegmentedString& source)
{
    HTMLToken* const producedToken = processToken(source) ? &m_token : nullptr;
    return TokenPtr(producedToken);
}
inline void HTMLTokenizer::setTokenAttributeBaseOffset(unsigned offset)
{
m_token.setAttributeBaseOffset(offset);
}
// Returns 0 when the temporary buffer is empty, otherwise the buffer's size plus 2.
// NOTE(review): the extra 2 presumably covers characters that were consumed by the state machine but
// are not stored in m_temporaryBuffer - confirm against bufferedCharacters().
inline size_t HTMLTokenizer::numberOfBufferedCharacters() const
{
    const auto bufferedLength = m_temporaryBuffer.size();
    if (!bufferedLength)
        return 0;

    return bufferedLength + 2;
}
inline void HTMLTokenizer::setForceNullCharacterReplacement(bool value)
{
m_forceNullCharacterReplacement = value;
}
inline bool HTMLTokenizer::shouldAllowCDATA() const
{
return m_shouldAllowCDATA;
}
inline void HTMLTokenizer::setShouldAllowCDATA(bool value)
{
m_shouldAllowCDATA = value;
}
inline bool HTMLTokenizer::isInDataState() const
{
return m_state == DataState;
}
inline void HTMLTokenizer::setDataState()
{
m_state = DataState;
}
inline void HTMLTokenizer::setPLAINTEXTState()
{
m_state = PLAINTEXTState;
}
inline void HTMLTokenizer::setRAWTEXTState()
{
m_state = RAWTEXTState;
}
inline void HTMLTokenizer::setRCDATAState()
{
m_state = RCDATAState;
}
inline void HTMLTokenizer::setScriptDataState()
{
m_state = ScriptDataState;
}
// True for exactly three states: DataState, RCDATAState and RAWTEXTState.
inline bool HTMLTokenizer::isNullCharacterSkippingState(State state)
{
    if (state == DataState)
        return true;
    if (state == RCDATAState)
        return true;
    return state == RAWTEXTState;
}
inline bool HTMLTokenizer::neverSkipNullCharacters() const
{
return m_forceNullCharacterReplacement;
}
}