// HTMLTokenizer.h


/*
    Copyright (C) 1997 Martin Jones (mjones@kde.org)
              (C) 1997 Torben Weis (weis@kde.org)
              (C) 1998 Waldo Bastian (bastian@kde.org)
              (C) 2001 Dirk Mueller (mueller@kde.org)
    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/

#ifndef HTMLTokenizer_h
#define HTMLTokenizer_h

#include "CachedResourceClient.h"
#include "CachedResourceHandle.h"
#include "NamedMappedAttrMap.h"
#include "MappedAttributeEntry.h"
#include "SegmentedString.h"
#include "Timer.h"
#include "Tokenizer.h"
#include <wtf/Deque.h>
#include <wtf/OwnPtr.h>
#include <wtf/Vector.h>

namespace WebCore {

class CachedScript;
class DocumentFragment;
class Document;
class HTMLDocument;
class HTMLScriptElement;
class HTMLViewSourceDocument;
class FrameView;
class HTMLParser;
class Node;
class PreloadScanner;
class ScriptSourceCode;

// Declared here, defined in another translation unit.
// NOTE(review): presumably the default used for HTMLTokenizer::m_tokenizerTimeDelay;
// the unit and the actual value are not visible in this header - confirm in the .cpp.
extern const double defaultTokenizerTimeDelay;

/**
 * @internal
 * One unit of tokenizer output. A token carries a tag name, a flag telling
 * whether it is a begin tag or an end tag, and an optional attribute map.
 * The `text` member is available for tokens that carry character data.
 * When view-source characters are recorded (addViewSourceChar), they are
 * accumulated in a lazily allocated buffer, m_sourceInfo.
 */
struct Token {
    // A fresh token is a begin tag with no attributes, no text and no
    // view-source buffer.
    Token()
        : beginTag(true)
        , selfClosingTag(false)
        , brokenXMLStyle(false)
        , m_sourceInfo(0)
    {
    }

    ~Token() { }

    // Defined out of line.
    void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode);

    // True when this is a begin tag whose name equals fullName's local name.
    bool isOpenTag(const QualifiedName& fullName) const
    {
        if (!beginTag)
            return false;
        return fullName.localName() == tagName;
    }

    // True when this is an end tag whose name equals fullName's local name.
    bool isCloseTag(const QualifiedName& fullName) const
    {
        if (beginTag)
            return false;
        return fullName.localName() == tagName;
    }

    // Puts the token back into its freshly constructed state. The view-source
    // buffer, if one was ever allocated, is emptied but kept for reuse.
    void reset()
    {
        attrs = 0;
        text = 0;
        tagName = nullAtom;

        beginTag = true;
        selfClosingTag = false;
        brokenXMLStyle = false;

        if (m_sourceInfo)
            m_sourceInfo->clear();
    }

    // Appends one character to the view-source buffer, allocating the buffer
    // on first use.
    void addViewSourceChar(UChar c)
    {
        if (!m_sourceInfo.get())
            m_sourceInfo.set(new Vector<UChar>);
        m_sourceInfo->append(c);
    }

    RefPtr<NamedMappedAttrMap> attrs;   // attribute map; null after reset()
    RefPtr<StringImpl> text;            // character data; null after reset()
    AtomicString tagName;               // nullAtom after reset()
    bool beginTag;                      // true for a begin tag, false for an end tag
    bool selfClosingTag;
    bool brokenXMLStyle;
    OwnPtr<Vector<UChar> > m_sourceInfo; // lazily created by addViewSourceChar()
};

// Progress marker stored in DoctypeToken::m_state. DoctypeToken::reset()
// sets it to DoctypeBegin.
// NOTE(review): the enumerator names suggest the order
// name -> public ID -> system ID, with DoctypeBogus as the error state;
// the actual transitions are presumably implemented in
// HTMLTokenizer::parseDoctype - confirm there.
enum DoctypeState {
    DoctypeBegin,
    DoctypeBeforeName,
    DoctypeName,
    DoctypeAfterName,
    DoctypeBeforePublicID,
    DoctypePublicID,
    DoctypeAfterPublicID,
    DoctypeBeforeSystemID,
    DoctypeSystemID,
    DoctypeAfterSystemID,
    DoctypeBogus
};

class DoctypeToken {
public:
    DoctypeToken() {}
    
    void reset()
    {
        m_name.clear();
        m_publicID.clear();
        m_systemID.clear();
        m_state = DoctypeBegin;
        m_source.clear();
    }

    DoctypeState state() { return m_state; }
    void setState(DoctypeState s) { m_state = s; }

    Vector<UChar> m_name;
    Vector<UChar> m_publicID;
    Vector<UChar> m_systemID;
    DoctypeState m_state;
    
    Vector<UChar> m_source;
};

//-----------------------------------------------------------------------------

// Tokenizer for HTML input. It derives from Tokenizer (write/finish/... are
// the virtual entry points declared below) and from CachedResourceClient
// (see notifyFinished() in the private section). All member functions
// without a body here are defined in the implementation file.
class HTMLTokenizer : public Tokenizer, public CachedResourceClient {
public:
    // One constructor per kind of target: a regular HTML document, a
    // view-source document, or a document fragment.
    HTMLTokenizer(HTMLDocument*, bool reportErrors);
    HTMLTokenizer(HTMLViewSourceDocument*);
    HTMLTokenizer(DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);
    virtual ~HTMLTokenizer();

    virtual void write(const SegmentedString&, bool appendData);
    virtual void finish();
    // Reads the ForceSynchronous bit of the packed tokenizer state.
    virtual bool forceSynchronous() const { return m_state.forceSynchronous(); }
    virtual void setForceSynchronous(bool force);
    virtual bool isWaitingForScripts() const;
    virtual void stopParsing();
    virtual bool processingData() const;
    // Returns the m_executingScript counter as is.
    virtual int executingScript() const { return m_executingScript; }

    virtual int lineNumber() const { return m_lineNumber; }
    // Column tracking is not implemented here: this always reports 1.
    virtual int columnNumber() const { return 1; }

    // Forwards to m_src.excludeLineNumbers().
    bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); }
    
    virtual void executeScriptsWaitingForStylesheets();
    
    virtual bool isHTMLTokenizer() const { return true; }
    virtual HTMLTokenizer* asHTMLTokenizer() { return this; }
    // Non-owning access to the parser held in m_parser.
    HTMLParser* htmlParser() const { return m_parser.get(); }

private:
    class State;

    // NOTE(review): the historical comment on the next two declarations read
    // "Where we are in parsing a tag", which does not describe begin()/end();
    // their behavior is defined in the implementation file.
    void begin();
    void end();

    void reset();

    PassRefPtr<Node> processToken();
    void processDoctypeToken();

    // The parse* helpers take the current State by value and return the
    // updated State. Note that processListing takes its SegmentedString by
    // value while the others take it by reference.
    State processListing(SegmentedString, State);
    State parseComment(SegmentedString&, State);
    State parseDoctype(SegmentedString&, State);
    State parseServer(SegmentedString&, State);
    State parseText(SegmentedString&, State);
    State parseNonHTMLText(SegmentedString&, State);
    State parseTag(SegmentedString&, State);
    State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag);
    State parseProcessingInstruction(SegmentedString&, State);
    State scriptHandler(State);
    State scriptExecution(const ScriptSourceCode&, State);
    void setSrc(const SegmentedString&);
 
    // Check if we have enough space in the buffer; if not, enlarge it.
    // enlargeBuffer(len) is called when the number of characters already
    // written (m_dest - m_buffer) exceeds m_bufferSize - len.
    inline void checkBuffer(int len = 10)
    {
        if ((m_dest - m_buffer) > m_bufferSize - len)
            enlargeBuffer(len);
    }

    // Same idea for the script buffer: enlargeScriptBuffer(len) is called
    // when m_scriptCodeSize + len reaches or exceeds m_scriptCodeCapacity.
    inline void checkScriptBuffer(int len = 10)
    {
        if (m_scriptCodeSize + len >= m_scriptCodeCapacity)
            enlargeScriptBuffer(len);
    }

    void enlargeBuffer(int len);
    void enlargeScriptBuffer(int len);

    bool continueProcessing(int& processedCount, double startTime, State&);
    // Callback signature matching Timer<HTMLTokenizer>; see m_timer.
    void timerFired(Timer<HTMLTokenizer>*);
    void allDataProcessed();

    // from CachedResourceClient
    void notifyFinished(CachedResource*);

    void executeExternalScriptsIfReady();
    // Callback signature matching Timer<HTMLTokenizer>; see m_externalScriptsTimer.
    void executeExternalScriptsTimerFired(Timer<HTMLTokenizer>*);
    bool continueExecutingExternalScripts(double startTime);

    // Internal buffers
    ///////////////////
    // m_dest points into m_buffer (checkBuffer() measures m_dest - m_buffer
    // against m_bufferSize).
    UChar* m_buffer;
    int m_bufferSize;
    UChar* m_dest;

    Token m_currentToken;

    // This buffer holds the raw characters we've seen between the beginning of
    // the attribute name and the first character of the attribute value.
    Vector<UChar, 32> m_rawAttributeBeforeValue;

    // Tokenizer flags
    //////////////////
    // are we in quotes within a html tag
    enum { NoQuote, SingleQuote, DoubleQuote } tquote;

    // Are we in a &... character entity description?
    // Values must fit in the three bits selected by State::EntityMask.
    enum EntityState {
        NoEntity = 0,
        SearchEntity = 1,
        NumericSearch = 2,
        Hexadecimal = 3,
        Decimal = 4,
        EntityName = 5,
        SearchSemicolon = 6
    };
    unsigned EntityUnicodeValue;

    // Values must fit in the four bits selected by State::TagMask.
    enum TagState {
        NoTag = 0,
        TagName = 1,
        SearchAttribute = 2,
        AttributeName = 3,
        SearchEqual = 4,
        SearchValue = 5,
        QuotedValue = 6,
        Value = 7,
        SearchEnd = 8
    };

    // Packs the whole tokenizer state into one unsigned word: bits 0-3 hold
    // the TagState, bits 4-6 hold the EntityState, and each higher bit listed
    // in StateBits is an independent boolean flag. A default-constructed
    // State has every bit cleared.
    class State {
    public:
        State() : m_bits(0) { }

        TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); }
        void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; }
        EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); }
        void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); }

        // One getter/setter pair per flag bit.
        bool inScript() const { return testBit(InScript); }
        void setInScript(bool v) { setBit(InScript, v); }
        bool inStyle() const { return testBit(InStyle); }
        void setInStyle(bool v) { setBit(InStyle, v); }
        bool inXmp() const { return testBit(InXmp); }
        void setInXmp(bool v) { setBit(InXmp, v); }
        bool inTitle() const { return testBit(InTitle); }
        void setInTitle(bool v) { setBit(InTitle, v); }
        bool inIFrame() const { return testBit(InIFrame); }
        void setInIFrame(bool v) { setBit(InIFrame, v); }
        bool inPlainText() const { return testBit(InPlainText); }
        void setInPlainText(bool v) { setBit(InPlainText, v); }
        bool inProcessingInstruction() const { return testBit(InProcessingInstruction); }
        // Plain call, like every other setter (previously spelled "return setBit(...)").
        void setInProcessingInstruction(bool v) { setBit(InProcessingInstruction, v); }
        bool inComment() const { return testBit(InComment); }
        void setInComment(bool v) { setBit(InComment, v); }
        bool inDoctype() const { return testBit(InDoctype); }
        void setInDoctype(bool v) { setBit(InDoctype, v); }
        bool inTextArea() const { return testBit(InTextArea); }
        void setInTextArea(bool v) { setBit(InTextArea, v); }
        bool escaped() const { return testBit(Escaped); }
        void setEscaped(bool v) { setBit(Escaped, v); }
        bool inServer() const { return testBit(InServer); }
        void setInServer(bool v) { setBit(InServer, v); }
        bool skipLF() const { return testBit(SkipLF); }
        void setSkipLF(bool v) { setBit(SkipLF, v); }
        bool startTag() const { return testBit(StartTag); }
        void setStartTag(bool v) { setBit(StartTag, v); }
        bool discardLF() const { return testBit(DiscardLF); }
        void setDiscardLF(bool v) { setBit(DiscardLF, v); }
        bool allowYield() const { return testBit(AllowYield); }
        void setAllowYield(bool v) { setBit(AllowYield, v); }
        bool loadingExtScript() const { return testBit(LoadingExtScript); }
        void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); }
        bool forceSynchronous() const { return testBit(ForceSynchronous); }
        void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); }

        // True when any of the script/style/xmp/textarea/title/iframe bits is set.
        bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); }
        // True when the stored TagState is anything other than NoTag (0).
        bool hasTagState() const { return m_bits & TagMask; }
        // True when the stored EntityState is anything other than NoEntity (0).
        bool hasEntityState() const { return m_bits & EntityMask; }

        // True when any bit in the listed set is on; the remaining flags
        // (Escaped, SkipLF, DiscardLF, AllowYield, LoadingExtScript,
        // ForceSynchronous) are deliberately not part of the test.
        bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); }

    private:
        // Position of the lowest EntityMask bit.
        static const int EntityShift = 4;
        enum StateBits {
            TagMask = (1 << 4) - 1,            // bits 0-3
            EntityMask = (1 << 7) - (1 << 4),  // bits 4-6
            InScript = 1 << 7,
            InStyle = 1 << 8,
            // Bit 9 unused
            InXmp = 1 << 10,
            InTitle = 1 << 11,
            InPlainText = 1 << 12,
            InProcessingInstruction = 1 << 13,
            InComment = 1 << 14,
            InTextArea = 1 << 15,
            Escaped = 1 << 16,
            InServer = 1 << 17,
            SkipLF = 1 << 18,
            StartTag = 1 << 19,
            DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard
            AllowYield = 1 << 21,
            LoadingExtScript = 1 << 22,
            ForceSynchronous = 1 << 23,
            InIFrame = 1 << 24,
            InDoctype = 1 << 25
        };

        // Sets or clears the given bit(s) without touching the others.
        void setBit(StateBits bit, bool value)
        {
            if (value)
                m_bits |= bit;
            else
                m_bits &= ~bit;
        }
        bool testBit(StateBits bit) const { return m_bits & bit; }

        unsigned m_bits;
    };

    State m_state;
    
    DoctypeToken m_doctypeToken;
    int m_doctypeSearchCount;
    int m_doctypeSecondarySearchCount;

    bool m_brokenServer;

    // Name of an attribute that we just scanned.
    AtomicString m_attrName;

    // Used to store the code of a scripting sequence
    UChar* m_scriptCode;
    // Size of the script sequence stored in m_scriptCode
    int m_scriptCodeSize;
    // Maximal size that can be stored in m_scriptCode
    int m_scriptCodeCapacity;
    // resync point of script code size
    int m_scriptCodeResync;

    // Stores characters if we are scanning for a string like "</script>"
    UChar searchBuffer[10];
    
    // Counts where we are in the string we are scanning for
    int searchCount;
    // the stopper string
    const char* m_searchStopper;
    int m_searchStopperLength;
    
    // if no more data is coming, just parse what we have (including ext scripts that
    // may be still downloading) and finish
    bool m_noMoreData;
    // URL to get source code of script from
    String m_scriptTagSrcAttrValue;
    String m_scriptTagCharsetAttrValue;
    // the HTML code we will parse after the external script we are waiting for has loaded
    SegmentedString m_pendingSrc;

    // the HTML code we will parse after this particular script has
    // loaded, but before all pending HTML
    SegmentedString* m_currentPrependingSrc;

    // Non-zero while we are executing a script while parsing a document (this is a
    // counter, not a flag; see executingScript()). This causes the parsing of
    // the output of the script to be postponed until after the script has finished executing
    int m_executingScript;
    Deque<CachedResourceHandle<CachedScript> > m_pendingScripts;
    RefPtr<HTMLScriptElement> m_scriptNode;

    bool m_requestingScript;
    bool m_hasScriptsWaitingForStylesheets;

    // if we found one broken comment, there are most likely others as well
    // store a flag to get rid of the O(n^2) behaviour in such a case.
    bool m_brokenComments;
    // current line number
    int m_lineNumber;
    int m_currentScriptTagStartLineNumber;
    int m_currentTagStartLineNumber;

    double m_tokenizerTimeDelay;
    int m_tokenizerChunkSize;

    // The timer for continued processing.
    Timer<HTMLTokenizer> m_timer;

    // The timer for continued executing external scripts.
    Timer<HTMLTokenizer> m_externalScriptsTimer;

// This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
// So any fixed number might be too small, but rather than rewriting all usage of this buffer
// we'll just make it large enough to handle all imaginable cases.
// Note: CBUFLEN is a preprocessor macro, so it stays visible to every file
// that includes this header after this point.
#define CBUFLEN 1024
    UChar m_cBuffer[CBUFLEN + 2];
    unsigned int m_cBufferPos;

    SegmentedString m_src;
    Document* m_doc;
    OwnPtr<HTMLParser> m_parser;
    bool m_inWrite;
    bool m_fragment;
    FragmentScriptingPermission m_scriptingPermission;

    OwnPtr<PreloadScanner> m_preloadScanner;
};

// Free functions declared here and defined in the implementation file.

// NOTE(review): presumably parses the given markup string into the supplied
// DocumentFragment, with the scripting permission defaulting to
// FragmentScriptingAllowed - confirm the exact behavior in the .cpp.
void parseHTMLDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);

// NOTE(review): presumably maps an entity name (C string) to its character;
// the value returned for an unknown name is not visible here - confirm.
UChar decodeNamedEntity(const char*);

} // namespace WebCore

#endif // HTMLTokenizer_h