SourceHTMLTokenizer.re2js   [plain text]


/*
 * Copyright (C) 2009 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// Generate js file as follows:
//
// re2c -isc WebCore/inspector/front-end/SourceHTMLTokenizer.re2js \
// | sed 's|^yy\([^:]*\)*\:|case \1:|' \
// | sed 's|[*]cursor[+][+]|this._charAt(cursor++)|' \
// | sed 's|[[*][+][+]cursor|this._charAt(++cursor)|' \
// | sed 's|[*]cursor|this._charAt(cursor)|' \
// | sed 's|yych = \*\([^;]*\)|yych = this._charAt\1|' \
// | sed 's|goto case \([^;]*\)|{ gotoCase = \1; continue; }|' \
// | sed 's|unsigned\ int|var|' \
// | sed 's|var\ yych|case 1: var yych|'

WebInspector.SourceHTMLTokenizer = function()
{
    WebInspector.SourceTokenizer.call(this);

    // The order is determined by the generated code.
    this._lexConditions = {
        INITIAL: 0,
        COMMENT: 1,
        DOCTYPE: 2,
        TAG: 3,
        DSTRING: 4,
        SSTRING: 5
    };
    this.case_INITIAL = 1000;
    this.case_COMMENT = 1001;
    this.case_DOCTYPE = 1002;
    this.case_TAG = 1003;
    this.case_DSTRING = 1004;
    this.case_SSTRING = 1005;

    this._parseConditions = {
        INITIAL: 0,
        ATTRIBUTE: 1,
        ATTRIBUTE_VALUE: 2,
        LINKIFY: 4,
        A_NODE: 8,
        SCRIPT: 16
    };

    this.initialCondition = { lexCondition: this._lexConditions.INITIAL, parseCondition: this._parseConditions.INITIAL };
    this.condition = this.initialCondition;
}

WebInspector.SourceHTMLTokenizer.prototype = {
    set line(line) {
        if (this._internalJavaScriptTokenizer) {
            var match = /<\/script/i.exec(line);
            if (match) {
                this._internalJavaScriptTokenizer.line = line.substring(0, match.index);
            } else
                this._internalJavaScriptTokenizer.line = line;
        }
        this._line = line;
    },

    _isExpectingAttribute: function()
    {
        return this._condition.parseCondition & this._parseConditions.ATTRIBUTE;
    },

    _isExpectingAttributeValue: function()
    {
        return this._condition.parseCondition & this._parseConditions.ATTRIBUTE_VALUE;
    },

    _setExpectingAttribute: function()
    {
        if (this._isExpectingAttributeValue())
            this._condition.parseCondition ^= this._parseConditions.ATTRIBUTE_VALUE;
        this._condition.parseCondition |= this._parseConditions.ATTRIBUTE;
    },

    _setExpectingAttributeValue: function()
    {
        if (this._isExpectingAttribute())
            this._condition.parseCondition ^= this._parseConditions.ATTRIBUTE;
        this._condition.parseCondition |= this._parseConditions.ATTRIBUTE_VALUE;
    },

    _stringToken: function(cursor, stringEnds)
    {
        if (!this._isExpectingAttributeValue()) {
            this.tokenType = null;
            return cursor;
        }
        this.tokenType = this._attrValueTokenType();
        if (stringEnds)
            this._setExpectingAttribute();
        return cursor;
    },

    _attrValueTokenType: function()
    {
        if (this._condition.parseCondition & this._parseConditions.LINKIFY) {
            if (this._condition.parseCondition & this._parseConditions.A_NODE)
                return "html-external-link";
            return "html-resource-link";
        }
        return "html-attribute-value";
    },

    nextToken: function(cursor)
    {
        if (this._internalJavaScriptTokenizer) {
            // Re-set line to force </script> detection first.
            this.line = this._line;
            if (cursor !== this._internalJavaScriptTokenizer._line.length) {
                // Tokenizer is stateless, so restore its condition before tokenizing and save it after.
                this._internalJavaScriptTokenizer.condition = this._condition.internalJavaScriptTokenizerCondition;
                var result = this._internalJavaScriptTokenizer.nextToken(cursor);
                this.tokenType = this._internalJavaScriptTokenizer.tokenType;
                this._condition.internalJavaScriptTokenizerCondition = this._internalJavaScriptTokenizer.condition;
                return result;
            } else if (cursor !== this._line.length)
                delete this._internalJavaScriptTokenizer;
        }

        var cursorOnEnter = cursor;
        var gotoCase = 1;
        while (1) {
            switch (gotoCase)
            // Following comment is replaced with generated state machine.
            /*!re2c
                re2c:define:YYCTYPE  = "var";
                re2c:define:YYCURSOR = cursor;
                re2c:define:YYGETCONDITION = "this.getLexCondition";
                re2c:define:YYSETCONDITION = "this.setLexCondition";
                re2c:condprefix = "case this.case_";
                re2c:condenumprefix = "this._lexConditions.";
                re2c:yyfill:enable = 0;
                re2c:labelprefix = "case ";
                re2c:indent:top = 2;
                re2c:indent:string = "    ";

                CommentContent = ([^-\r\n] | ("--" [^>]))*;
                Comment = "<!--" CommentContent "-->";
                CommentStart = "<!--" CommentContent [\r\n];
                CommentEnd = CommentContent "-->";

                DocTypeStart = "<!" [Dd] [Oo] [Cc] [Tt] [Yy] [Pp] [Ee];
                DocTypeContent = [^\r\n>]*;

                ScriptStart = "<" [Ss] [Cc] [Rr] [Ii] [Pp] [Tt];
                ScriptEnd = "</" [Ss] [Cc] [Rr] [Ii] [Pp] [Tt];

                LT = "<" | "</";
                GT = ">";
                EqualSign = "=";

                DoubleStringContent = [^\r\n\"]*;
                SingleStringContent = [^\r\n\']*;
                StringLiteral = "\"" DoubleStringContent "\"" | "'" SingleStringContent "'";
                DoubleStringStart = "\"" DoubleStringContent [\r\n];
                DoubleStringEnd = DoubleStringContent "\"";
                SingleStringStart = "'" SingleStringContent [\r\n];
                SingleStringEnd = SingleStringContent "'";

                Identifier = [^ \r\n"'<>\[\]=]+;

                <INITIAL> Comment { this.tokenType = "html-comment"; return cursor; }
                <INITIAL> CommentStart => COMMENT { this.tokenType = "html-comment"; return cursor; }
                <COMMENT> CommentContent => COMMENT { this.tokenType = "html-comment"; return cursor; }
                <COMMENT> CommentEnd => INITIAL { this.tokenType = "html-comment"; return cursor; }

                <INITIAL> DocTypeStart => DOCTYPE { this.tokenType = "html-doctype"; return cursor; }
                <DOCTYPE> DocTypeContent => DOCTYPE { this.tokenType = "html-doctype"; return cursor; }
                <DOCTYPE> GT => INITIAL { this.tokenType = "html-doctype"; return cursor; }

                <INITIAL> ScriptStart => TAG
                {
                    if (this._condition.parseCondition & this._parseConditions.SCRIPT) {
                        // Do not tokenize script tag contents, keep lexer state although processing "<".
                        this.setLexCondition(this._lexConditions.INITIAL);
                        this.tokenType = null;
                        return cursor;
                    }
                    this.tokenType = "html-tag";
                    this._condition.parseCondition = this._parseConditions.SCRIPT;
                    this._setExpectingAttribute();
                    return cursor;
                }

                <INITIAL> ScriptEnd => TAG
                {
                    this.tokenType = "html-tag";
                    this._condition.parseCondition = this._parseConditions.INITIAL;
                    return cursor;
                }

                <INITIAL> LT => TAG
                {
                    if (this._condition.parseCondition & this._parseConditions.SCRIPT) {
                        // Do not tokenize script tag contents, keep lexer state although processing "<".
                        this.setLexCondition(this._lexConditions.INITIAL);
                        this.tokenType = null;
                        return cursor;
                    }

                    this._condition.parseCondition = this._parseConditions.INITIAL;
                    this.tokenType = "html-tag";
                    return cursor;
                }
  
                <TAG> GT => INITIAL
                {
                    this.tokenType = "html-tag";
                    if (this._condition.parseCondition & this._parseConditions.SCRIPT) {
                        if (!this._internalJavaScriptTokenizer) {
                            this._internalJavaScriptTokenizer = WebInspector.SourceTokenizer.Registry.getInstance().getTokenizer("text/javascript");
                            this._condition.internalJavaScriptTokenizerCondition = this._internalJavaScriptTokenizer.initialCondition;
                        }
                        // Do not tokenize script tag contents.
                        return cursor;
                    }

                    this._condition.parseCondition = this._parseConditions.INITIAL;
                    return cursor;
                }

                <TAG> StringLiteral { return this._stringToken(cursor, true); }
                <TAG> DoubleStringStart => DSTRING { return this._stringToken(cursor); }
                <DSTRING> DoubleStringContent => DSTRING { return this._stringToken(cursor); }
                <DSTRING> DoubleStringEnd => TAG { return this._stringToken(cursor, true); }
                <TAG> SingleStringStart => SSTRING { return this._stringToken(cursor); }
                <SSTRING> SingleStringContent => SSTRING { return this._stringToken(cursor); }
                <SSTRING> SingleStringEnd => TAG { return this._stringToken(cursor, true); }

                <TAG> EqualSign => TAG
                {
                    if (this._isExpectingAttribute())
                        this._setExpectingAttributeValue();
                    this.tokenType = null;
                    return cursor;
                }

                <TAG> Identifier
                {
                    if (this._condition.parseCondition === this._parseConditions.SCRIPT) {
                        // Fall through if expecting attributes.
                        this.tokenType = null;
                        return cursor;
                    }

                    if (this._condition.parseCondition === this._parseConditions.INITIAL) {
                        this.tokenType = "html-tag";
                        this._setExpectingAttribute();
                        var token = this._line.substring(cursorOnEnter, cursor);
                        if (token === "a")
                            this._condition.parseCondition |= this._parseConditions.A_NODE;
                        else if (this._condition.parseCondition & this._parseConditions.A_NODE)
                            this._condition.parseCondition ^= this._parseConditions.A_NODE;
                    } else if (this._isExpectingAttribute()) {
                        var token = this._line.substring(cursorOnEnter, cursor);
                        if (token === "href" || token === "src")
                            this._condition.parseCondition |= this._parseConditions.LINKIFY;
                        else if (this._condition.parseCondition |= this._parseConditions.LINKIFY)
                            this._condition.parseCondition ^= this._parseConditions.LINKIFY;
                        this.tokenType = "html-attribute-name";
                    } else if (this._isExpectingAttributeValue())
                        this.tokenType = this._attrValueTokenType();
                    else
                        this.tokenType = null;
                    return cursor;
                }
                <*> [^] { this.tokenType = null; return cursor; }
            */
        }
    }
}

WebInspector.SourceHTMLTokenizer.prototype.__proto__ = WebInspector.SourceTokenizer.prototype;